1. Build pipelines for all the machine learning models you have built since you started this course.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import os

# Load the passenger table and prepend a 1-based row identifier column.
df = pd.read_csv('titanic.csv')
df.insert(loc=0, column='passengerId', value=range(1, len(df) + 1))

# Restrict the frame to the identifier plus the columns used further down.
cols_keep = [
    "passengerId",
    "sex",
    "age",
    "sibsp",
    "parch",
    "fare",
    "embarked",
    "survived",
    "class",
]
df = df[cols_keep]
# print(df.columns)

# Preprocessing setup
# Column groups routed to the two sub-pipelines below.
numeric_features = ["age", "sibsp", "parch", "fare"]
categorical_features = ["sex", "embarked"]

# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen at fit time are ignored.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

# NOTE(review): "class" is kept in cols_keep but appears in neither feature
# list, so ColumnTransformer's default remainder="drop" discards it; confirm
# that this is intended.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# Prepare X, y
X = df.drop(columns=["passengerId","survived"])
y = df["survived"]

# Split the raw rows BEFORE fitting the preprocessor so that the imputers,
# scaler and encoder learn their statistics from the training rows only.
# Fitting them on the full data set first leaks test-set information into
# the features the models are evaluated on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

x_train = preprocessor.fit_transform(X_train_raw)
x_test = preprocessor.transform(X_test_raw)

# Whole data set encoded with the train-fitted preprocessor; kept for any
# downstream code that reads x_pre.
x_pre = preprocessor.transform(X)

# Output column order of the ColumnTransformer is "num" then "cat", so the
# names are assembled in that same order.
cat_names = preprocessor.named_transformers_["cat"]["onehot"].get_feature_names_out(categorical_features)
feature_names = numeric_features + list(cat_names)

# Models
lr = LogisticRegression(max_iter=1000, random_state=42)
rf = RandomForestClassifier(n_estimators=200, random_state=42)

lr.fit(x_train, y_train)
rf.fit(x_train, y_train)

# Hard labels plus the score of the second class (column 1 of predict_proba)
# on the held-out rows.
y_pred_lr = lr.predict(x_test)
y_proba_lr = lr.predict_proba(x_test)[:, 1]
y_pred_rf = rf.predict(x_test)
y_proba_rf = rf.predict_proba(x_test)[:, 1]

def eval_model(name, y_true, y_pred, y_proba=None, *, plot_roc=False):
    """Print accuracy, ROC AUC, classification report and confusion matrix.

    Args:
        name: Label prefixed to the printed accuracy and ROC AUC lines.
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_proba: Optional scores passed to ``roc_auc_score`` / ``roc_curve``.
            When omitted, the ROC AUC line and the ROC plot are skipped
            instead of raising inside ``roc_auc_score``.
        plot_roc: Keyword-only. When true and ``y_proba`` is given, draw the
            ROC curve with matplotlib. Off by default, so nothing is plotted
            unless requested.

    Returns:
        None. All results are printed (and optionally plotted).
    """
    has_scores = y_proba is not None

    acc = accuracy_score(y_true, y_pred)
    print(f"{name} Accuracy: {acc:.4f}")

    # ROC AUC is only defined when scores were supplied.
    roc_auc = roc_auc_score(y_true, y_proba) if has_scores else None
    if roc_auc is not None:
        print(f"{name} ROC AUC: {roc_auc:.4f}")

    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))

    if plot_roc and has_scores:
        fpr, tpr, _ = roc_curve(y_true, y_proba)
        plt.figure()
        plt.plot(fpr, tpr, label=f'{name} (area = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve - {name}')
        plt.legend(loc='lower right')
        plt.show()

# Hold-out evaluation of both fitted models.
eval_model("Logistic Regression", y_test, y_pred_lr, y_proba_lr)
eval_model("Random Forest", y_test, y_pred_rf, y_proba_rf)

# Cross-validate complete pipelines on the raw feature frame so that the
# imputers, scaler and encoder are re-fitted inside every training fold.
# Scoring a bare estimator on an already-transformed matrix lets each
# held-out fold influence the preprocessing statistics (data leakage).
# cross_val_score clones the estimator it receives, so the fitted
# preprocessor, lr and rf objects are left untouched.
lr_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", lr)])
rf_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", rf)])

print("\nCross-val (5-fold) accuracy:")
print("LogReg:", cross_val_score(lr_pipeline, X, y, cv=5, scoring="accuracy").mean().round(4))
print("RandomForest:", cross_val_score(rf_pipeline, X, y, cv=5, scoring="accuracy").mean().round(4))

# Pair each transformed feature name with the random forest's importance
# score and order the table from most to least important.
importances = rf.feature_importances_
feat_imp = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feat_imp = feat_imp.sort_values(by='Importance', ascending=False)
# Optional bar chart of the ranking (disabled):
# plt.figure(figsize=(10,6))
# plt.barh(feat_imp['Feature'], feat_imp['Importance'])
# plt.xlabel('Importance')
# plt.title('Feature Importances from Random Forest')
# plt.show()

# ROC curve points for both models on the held-out rows; the thresholds
# returned by roc_curve are not used.
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_proba_rf)
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_proba_lr)
# Optional side-by-side ROC plot (disabled):
# plt.figure()
# plt.plot(fpr_lr, tpr_lr, label=f'Logistic Regression (area = {roc_auc_score(y_test, y_proba_lr):.2f})')
# plt.plot(fpr_rf, tpr_rf, label=f'Random Forest (area = {roc_auc_score(y_test, y_proba_rf):.2f})')
# plt.plot([0, 1], [0, 1], 'k--')
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('ROC Curve Comparison')
# plt.legend(loc='lower right')
# plt.show()

# Score every row of df with the fitted random forest; column 1 of
# predict_proba is the probability of the second class in rf.classes_.
all_rows = df.drop(columns=["passengerId", "survived"])
x_full_pre = preprocessor.transform(all_rows)
proba_all_rf = rf.predict_proba(x_full_pre)[:, 1]

# Attach the probability to the original columns via the passenger id and
# write the combined table to disk without the index.
rf_scores = pd.DataFrame({'passengerId': df['passengerId'], 'survived_Prob_RF': proba_all_rf})
output = df.merge(rf_scores, on='passengerId')
output.to_csv('results.csv', index=False)
# Alternative target kept from an earlier run:
# output.to_csv('titanic_survival_probabilities.csv', index=False)

print("\nPredictions saved to 'results.csv'")