In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score
)
import matplotlib.pyplot as plt
import seaborn as sns

plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False
plt.figure(figsize=(10, 6))

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

In [2]:
def preprocess(file_path='data/titanic.csv'):
    """빠른 데이터 전처리"""
    df = pd.read_csv(file_path)
    
    # 필수 전처리만 수행
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Embarked'] = df['Embarked'].fillna('S')
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    
    # 특성 선택
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    X = df[features]
    y = df['Survived']
    
    return X, y

In [None]:
def evaluate_model(model, X_test, y_test, model_name="model"):
    # 예측
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    # 기본 지표    
    accuracy = accuracy_score(y_test, y_pred)    
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    
    # 혼동 행렬
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()
    
    # 결과 출력
    print(f"정확도: {accuracy}")
    print(f"정밀도: {precision}")
    print(f"재현율: {recall}")
    print(f"f1: {f1}")
    print(f"ROC-AUC: {roc_auc}")
    print(f"- TN: {tn} FP : {fp}")
    print(f"- FN: {fn} TP : {tp}")

    # TODO: return 하셔야 됩니다.

In [None]:
X, y = preprocess("data/titanic.csv")

# stratify
X_train, X_test, y_train, y_test = train_test_split(
    X,y, test_size=0.2, random_state=42, stratify=y
)

rf = RandomForestClassifier(n_estimators=100, random_state=42)

rf.fit(X_train, y_train)
evaluate_model(rf, X_test, y_test, "Random Forest")

정확도: 0.8156424581005587
정밀도: 0.78125
재현율: 0.7246376811594203
f1: 0.7518796992481203
ROC-AUC: 0.7986824769433466
- TN: 96 FP : 14
- FN: 19 TP : 50
