# Taitanic 생존 예측
`titanic.ipynb`

In [1]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

import warnings

warnings.filterwarnings("ignore")

df = sns.load_dataset("titanic")

In [2]:
df["sex"] = df["sex"].map({"female": 0, "male": 1})

# 1. sibsp와 parch를 합쳐 family 칼럼 만들기
df["family"] = df["sibsp"] + df["parch"]

# 2. class 칼럼 문자열을 숫자로 변환 (First:1, Second:2, Third:3)
df["class"] = df["class"].map({"First": 1, "Second": 2, "Third": 3})

# 3. adult_male 칼럼을 1과 0으로 변환 (True:1, False:0)
df["adult_male"] = df["adult_male"].astype(int)

# 4. alone 칼럼을 1과 0으로 변환 (True:1, False:0)
df["alone"] = df["alone"].astype(int)

df = df.drop(columns=["sibsp", "parch", "embarked", "deck", "embark_town", "alive"])

In [3]:
df.head()
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   survived    891 non-null    int64   
 1   pclass      891 non-null    int64   
 2   sex         891 non-null    int64   
 3   age         714 non-null    float64 
 4   fare        891 non-null    float64 
 5   class       891 non-null    category
 6   who         891 non-null    object  
 7   adult_male  891 non-null    int64   
 8   alone       891 non-null    int64   
 9   family      891 non-null    int64   
dtypes: category(1), float64(2), int64(6), object(1)
memory usage: 63.8+ KB


Unnamed: 0,survived,pclass,sex,age,fare,adult_male,alone,family
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.647587,29.699118,32.204208,0.602694,0.602694,0.904602
std,0.486592,0.836071,0.47799,14.526497,49.693429,0.489615,0.489615,1.613459
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,20.125,7.9104,0.0,0.0,0.0
50%,0.0,3.0,1.0,28.0,14.4542,1.0,1.0,0.0
75%,1.0,3.0,1.0,38.0,31.0,1.0,1.0,1.0
max,1.0,3.0,1.0,80.0,512.3292,1.0,1.0,10.0


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# 가정: df는 이미 로드된 Titanic DataFrame

# 1. age 결측치 여부 컬럼 생성
df["age_missing"] = df["age"].isna()

# 2. age가 결측치가 아닌 데이터만 추출하여 학습용 데이터셋 구성
train_df = df[df["age_missing"] == False].copy()

# 3. 특성(X)와 목표(y) 분리
feature_cols = ["pclass", "sex", "fare", "class", "adult_male", "alone", "family"]
# 범주형 sex, class 컬럼을 수치형으로 변환 (예: one-hot 또는 레이블 인코딩)
train_X = pd.get_dummies(train_df[feature_cols], drop_first=True)
train_y = train_df["age"]

# 4. 학습:테스트 분리 (훈련 80%, 테스트 20%)
X_train, X_valid, y_train, y_valid = train_test_split(
    train_X, train_y, test_size=0.2, random_state=42
)

# 5. 의사결정트리 회귀 모델 학습
dt_reg = DecisionTreeRegressor(random_state=42)
dt_reg.fit(X_train, y_train)

# 6. 성능 평가 (옵션)
print("Validation R^2:", dt_reg.score(X_valid, y_valid))

# 7. 전체 학습 데이터로 재학습 (필요 시)
dt_reg_full = DecisionTreeRegressor(random_state=42)
dt_reg_full.fit(train_X, train_y)

# 8. age가 결측치인 행에 대해 동일한 전처리 후 예측
predict_df = df[df["age_missing"] == True].copy()
predict_X = pd.get_dummies(predict_df[feature_cols], drop_first=True)

# 학습 시 사용된 더미 변수 컬럼과 맞추기
for col in train_X.columns:
    if col not in predict_X.columns:
        predict_X[col] = 0
predict_X = predict_X[train_X.columns]

# 9. 결측치 예측 및 채우기
predicted_ages = dt_reg_full.predict(predict_X)
df.loc[df["age_missing"], "age"] = predicted_ages

# 10. 불필요한 컬럼 제거
df.drop(columns=["age_missing"], inplace=True)

# 결과 확인
print(df["age"].isna().sum(), "missing ages after imputation")  # 0이어야 함

Validation R^2: -0.2983395096435024
0 missing ages after imputation


In [5]:
from sklearn.model_selection import train_test_split

# 데이터 분리
X = df[
    ["pclass", "sex", "age", "fare", "class", "adult_male", "alone", "family"]
].to_numpy()
y = df["survived"].to_numpy()

# 훈련:테스트 = 8:2
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape

# 스케일링
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 가정: df는 이미 로드된 Titanic DataFrame

# 1. 특성(X)와 목표(y) 분리
feature_cols = [
    "pclass",
    "sex",
    "age",
    "fare",
    "class",
    "adult_male",
    "alone",
    "family",
    "who",
]
X = df[feature_cols].copy()
y = df["survived"]

# 2. 범주형 변수 인코딩
X = pd.get_dummies(X, drop_first=True)

# 3. 훈련:테스트 분리 (80% 훈련, 20% 테스트)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4. 특성 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 5. 의사결정트리 분류기 학습 criterion='entropy',
dt_clf = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=5,  # 최대 깊이 8까지 허용
    min_samples_split=2,  # 분할하려면 최소 5개 샘플만 필요
    min_samples_leaf=2,  # 리프 노드에 최소 2개 샘플만 필요
    random_state=42,
)
dt_clf.fit(X_train_scaled, y_train)

# 6. 예측
y_train_pred = dt_clf.predict(X_train_scaled)
y_test_pred = dt_clf.predict(X_test_scaled)

# 7. 성능 평가
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy:     {test_acc:.4f}\n")

print("Classification Report (Test Set):")
print(classification_report(y_test, y_test_pred))

print("Confusion Matrix (Test Set):")
print(confusion_matrix(y_test, y_test_pred))

Training Accuracy: 0.8371
Test Accuracy:     0.8212

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.83      0.88      0.85       105
           1       0.81      0.74      0.77        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179

Confusion Matrix (Test Set):
[[92 13]
 [19 55]]


## KNN 분류


## Logistic회귀 분류


## 결정트리 분류
