In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Decision thresholds to compare: 0.1, 0.2, ..., 0.9.
thresholds = [i/10 for i in range(1, 10)]
fig, axes = plt.subplots(3, 3, figsize=(12, 12))

# One confusion matrix per threshold; `axes.flat` walks the 3x3 grid row by row,
# so the n-th threshold lands in row n//3, column n%3.
for ax, thr in zip(axes.flat, thresholds):
    # Probabilities at or above the threshold become class 1, the rest class 0.
    y_pred = (y_proba >= thr).astype(int)
    sns.heatmap(
        confusion_matrix(y_valid, y_pred),
        annot=True,
        fmt="d",
        cbar=False,
        ax=ax,
    )
    ax.set_title(f"Threshold={thr:.1f}")
    ax.set_xlabel("Pred")
    ax.set_ylabel("True")

plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, roc_curve, precision_recall_curve, auc

# --- Accuracy vs Threshold ---
# Binarise the probabilities at every threshold and score each against y_valid.
accs = [
    accuracy_score(y_valid, (y_proba >= thr).astype(int))
    for thr in thresholds
]

acc_fig, acc_ax = plt.subplots(figsize=(6, 4))
acc_ax.plot(thresholds, accs, marker="o")
acc_ax.set_xlabel("Threshold")
acc_ax.set_ylabel("Accuracy")
acc_ax.set_title("Accuracy vs Threshold (RandomForest)")
acc_ax.grid(True)
plt.show()

# --- ROC ---
fpr, tpr, _ = roc_curve(y_valid, y_proba)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(6, 4))
roc_ax.plot(fpr, tpr, label=f"ROC AUC={roc_auc:.3f}")
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
roc_ax.plot([0, 1], [0, 1], "--", color="gray")
roc_ax.set_xlabel("FPR")
roc_ax.set_ylabel("TPR")
roc_ax.set_title("ROC Curve (RandomForest)")
roc_ax.legend()
roc_ax.grid(True)
plt.show()

# --- PR ---
precision, recall, _ = precision_recall_curve(y_valid, y_proba)
# NOTE(review): this is the trapezoidal area under the PR curve; scikit-learn's
# average_precision_score is a non-interpolated alternative and will generally
# give a slightly different number — confirm which one should be reported.
pr_auc = auc(recall, precision)

pr_fig, pr_ax = plt.subplots(figsize=(6, 4))
pr_ax.plot(recall, precision, label=f"PR AUC={pr_auc:.3f}")
pr_ax.set_xlabel("Recall")
pr_ax.set_ylabel("Precision")
pr_ax.set_title("Precision-Recall Curve (RandomForest)")
pr_ax.legend()
pr_ax.grid(True)
plt.show()

print("ROC AUC:", roc_auc)
print("PR AUC:", pr_auc)

In [None]:
import shap
shap.initjs()

# Explain a reproducible random subset of the validation rows (at most 200).
sample_size = min(200, len(X_valid))
X_sample = X_valid.sample(n=sample_size, random_state=42)

explainer = shap.TreeExplainer(rf)
sv = explainer(X_sample)

# A 3-D values array has an extra trailing axis; keep only index 1 on it
# (presumably the positive class — confirm against the classifier's class order).
vals = sv.values[:, :, 1] if sv.values.ndim == 3 else sv.values

# Bar-style summary plot of the selected SHAP values.
shap.summary_plot(vals, X_sample, plot_type="bar")

In [None]:
# Pick just three rows (the first three labels of the SHAP sample).
idxs = X_sample.index[:3]
picked_rows = X_valid.loc[idxs]
sv_single = explainer(picked_rows)

for i in range(len(idxs)):
    vals_i = sv_single.values[i]
    base = sv_single.base_values[i]
    if vals_i.ndim == 2:  # (p, classes): keep column 1 of both values and base
        vals_i, base = vals_i[:, 1], base[1]

    # Re-wrap the single-row slice so the waterfall plot gets 1-D values,
    # a scalar base value, the raw feature values and the column names.
    row_explanation = shap.Explanation(
        values=vals_i,
        base_values=base,
        data=picked_rows.iloc[i].values,
        feature_names=X_valid.columns,
    )
    shap.plots.waterfall(row_explanation)

In [None]:
# Print the true label next to the model's predicted probability for three rows.
idxs = X_sample.index[:3]

for idx in idxs:
    true_y = y_valid.loc[idx]
    # List-style .loc keeps the selection 2-D, which predict_proba requires.
    single_row = X_valid.loc[[idx]]
    proba = rf.predict_proba(single_row)[0, 1]
    print(f"index={idx} | true={true_y} | proba_survived={proba:.3f}")

In [None]:
# Take column 1 of predict_proba and cut it at 0.5 to get hard 0/1 labels.
test_proba = rf.predict_proba(test_proc)[:, 1]
test_pred = (test_proba >= 0.5).astype(int)

# Two-column submission table: passenger id alongside the predicted label.
submission = pd.DataFrame(
    {
        "PassengerId": test["PassengerId"],
        "Survived": test_pred,
    }
)

submission_path = "/content/submission_rf.csv"
submission.to_csv(submission_path, index=False)
print(f"Saved: {submission_path}")
submission.head()

In [None]:
from google.colab import files
# Hand the saved submission CSV to the Colab download helper.
submission_file = "/content/submission_rf.csv"
files.download(submission_file)

In [None]:
import os

# Write `data` to the interim folder, creating the folder if it is missing.
interim_dir = "data/interim"
os.makedirs(interim_dir, exist_ok=True)
data.to_csv(f"{interim_dir}/titanic_cleaned.csv", index=False)

In [None]:
import os

# Create the processed-data folder (no error if it already exists).
processed_dir = "data/processed"
os.makedirs(processed_dir, exist_ok=True)

# Save the model-input feature data (Survived excluded).
features_path = f"{processed_dir}/titanic_features.csv"
X.to_csv(features_path, index=False)

print(f"Saved: {features_path}")