In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the Titanic dataset from a raw GitHub URL.
# NOTE: this needs network access; the CSV is downloaded each time the cell runs.
url = "https://raw.githubusercontent.com/MyungKyuYi/AI-class/main/titanic.csv"
data = pd.read_csv(url)
# Bare last expression so the notebook renders the DataFrame as the cell output.
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [21]:
# Inspect the column names of the loaded DataFrame.
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [24]:
# Columns the rest of the notebook relies on (the target plus the model features).
required_columns = {'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch'}

# Stop immediately, naming the absent columns, if the loaded frame lacks any of them.
if not required_columns <= set(data.columns):
    missing_cols = required_columns.difference(set(data.columns))
    raise ValueError(f"Error: Missing columns in dataset - {missing_cols}")

In [25]:
# Drop everything except the required columns.
# `required_columns` is a set, and the iteration order of a set of strings can
# change between kernel sessions (string hash randomization), so the names are
# sorted to give `data` the same column order on every run.
data = data[sorted(required_columns)]

# Feature matrix: an independent copy, so the encoding and imputation applied
# to X in later cells do not modify `data`.
X = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']].copy()

# Show the raw distribution of 'Sex' before it is encoded.
print("Sex column value counts before encoding:")
print(data['Sex'].value_counts())

Sex column value counts before encoding:
Sex
male      577
female    314
Name: count, dtype: int64


In [26]:
# Integer-encode the categorical feature columns with a LabelEncoder.
# The same encoder instance is refit for each listed column (only 'Sex' here).
label_encoder = LabelEncoder()
columns_to_encode = ['Sex']
for column in columns_to_encode:
    X[column] = label_encoder.fit_transform(X[column])

# Distribution of 'Sex' after encoding, for comparison with the raw counts.
print("Sex column value counts after encoding:", X['Sex'].value_counts(), sep="\n")

# Per-column count of missing entries, taken before any imputation.
missing_values = X.isna().sum()
print("Missing values before handling:", missing_values, sep="\n")

Sex column value counts after encoding:
Sex
1    577
0    314
Name: count, dtype: int64
Missing values before handling:
Pclass      0
Sex         0
Age       177
SibSp       0
Parch       0
dtype: int64


In [27]:
# Impute missing 'Age' values with the column mean. 'Age' is the only feature
# with missing entries, so no other column is filled here.
#
# The result is assigned back to X['Age'] instead of calling
# X['Age'].fillna(..., inplace=True): that form operates on a temporary column
# object, emits a FutureWarning on pandas 2.x, and leaves X unchanged once
# Copy-on-Write is active.
#
# NOTE(review): the mean is computed over all rows, i.e. before the train/test
# split performed in a later cell, so test-set ages contribute to the imputed
# value. Consider imputing after the split using training rows only.
X['Age'] = X['Age'].fillna(X['Age'].mean())

# Re-count missing entries per column to confirm the imputation took effect.
missing_values_after = X.isnull().sum()
print("Missing values after handling:")
print(missing_values_after)

Missing values after handling:
Pclass    0
Sex       0
Age       0
SibSp     0
Parch     0
dtype: int64


In [28]:
# Target labels come from the 'Survived' column.
y = data['Survived']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Standardize the features: the scaler learns its statistics from the training
# rows only, and the same fitted transform is then applied to the test rows.
# From here on X_train / X_test are the scaler's array outputs.
scaler = StandardScaler()
X_train = scaler.fit(X_train).transform(X_train)
X_test = scaler.transform(X_test)

# Candidate classifiers, keyed by the display name used in the printed report.
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "SVM": SVC(kernel='linear'),
    "KNN": KNeighborsClassifier(n_neighbors=5),
}

# Fit each classifier on the training split and report its accuracy, per-class
# metrics and confusion matrix on the held-out test split.
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))
    print(f"Confusion Matrix for {name}:")
    print(confusion_matrix(y_test, y_pred))

Logistic Regression Accuracy: 0.8212
              precision    recall  f1-score   support

           0       0.83      0.88      0.85       105
           1       0.81      0.74      0.77        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179

Confusion Matrix for Logistic Regression:
[[92 13]
 [19 55]]
Random Forest Accuracy: 0.8268
              precision    recall  f1-score   support

           0       0.84      0.87      0.85       105
           1       0.80      0.77      0.79        74

    accuracy                           0.83       179
   macro avg       0.82      0.82      0.82       179
weighted avg       0.83      0.83      0.83       179

Confusion Matrix for Random Forest:
[[91 14]
 [17 57]]
Decision Tree Accuracy: 0.7598
              precision    recall  f1-score   support

           0       0.78      0.82      0.80       105
           1       0.7