## Cell 1: Install + Imports

In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
                             confusion_matrix, classification_report)
from sklearn.ensemble import RandomForestClassifier
import joblib

## Data Loading (5 Marks)

In [25]:
df = sns.load_dataset("titanic")  # loads Titanic dataset
print(df.shape)
df.head()


(891, 15)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## save CSV to later upload to GitHub/HF:

In [26]:
df.to_csv("titanic.csv", index=False)

## Data Preprocessing (10 Marks) — at least 5 steps

In [27]:
data = df.copy()

# Step 1: drop columns not needed / redundant
drop_cols = ["alive", "who", "adult_male", "deck", "embark_town", "class"]
data = data.drop(columns=[c for c in drop_cols if c in data.columns])

# Step 4: feature engineering
data["family_size"] = data["sibsp"].fillna(0) + data["parch"].fillna(0) + 1
data["is_alone"] = (data["family_size"] == 1).astype(int)

# Target + features
y = data["survived"].astype(int)
X = data.drop(columns=["survived"])

X.head()


Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,alone,family_size,is_alone
0,3,male,22.0,1,0,7.25,S,False,2,0
1,1,female,38.0,1,0,71.2833,C,False,2,0
2,3,female,26.0,0,0,7.925,S,True,1,1
3,1,female,35.0,1,0,53.1,S,False,2,0
4,3,male,35.0,0,0,8.05,S,True,1,1


In [28]:
# Outlier clip transformer (Step 3)

def clip_outliers(X_num_array):
    # X_num_array is expected to be a numpy array
    X_num_clipped = X_num_array.copy()
    # clip numeric columns by percentile (winsorize-like)
    for i in range(X_num_clipped.shape[1]): # Iterate through columns by index
        col_data = X_num_clipped[:, i]
        lo, hi = np.nanpercentile(col_data, [1, 99])
        X_num_clipped[:, i] = np.clip(col_data, lo, hi)
    return X_num_clipped

clipper = FunctionTransformer(clip_outliers, feature_names_out="one-to-one")

## Pipeline Creation (10 Marks)

In [29]:
num_features = X.select_dtypes(include=["int64","float64"]).columns.tolist()
cat_features = X.select_dtypes(include=["object","category","bool"]).columns.tolist()

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),       # Step 2: missing values
    ("clipper", clipper),                                # Step 3: outlier clipping
    ("scaler", StandardScaler())                         # Step 6: scaling
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")), # Step 2: missing values
    ("onehot", OneHotEncoder(handle_unknown="ignore"))    # Step 5: encoding
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features),
    ]
)

model = RandomForestClassifier(random_state=42)

pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", model)
])

pipe


## Primary Model Selection (5 Marks) — Justification

Why RandomForest?

*   Works well on mixed tabular data (numeric + categorical after encoding)
*   Captures nonlinear patterns and feature interactions
*   Less sensitive to scaling/noise, strong baseline for classification




## Model Training (10 Marks)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

pipe.fit(X_train, y_train)
print("Trained!")


Trained!


## Cross-Validation (10 Marks)

In [31]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="accuracy")

print("CV Accuracy Mean:", scores.mean())
print("CV Accuracy Std :", scores.std())

CV Accuracy Mean: 0.7921304048064611
CV Accuracy Std : 0.02466329848960756


## Hyperparameter Tuning (10 Marks)

In [32]:
param_grid = {
    "model__n_estimators": [100, 200, 400],
    "model__max_depth": [None, 5, 10, 20],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4]
}

grid = GridSearchCV(
    pipe,
    param_grid=param_grid,
    cv=cv,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train, y_train)

print("Best Params:", grid.best_params_)
print("Best CV Score:", grid.best_score_)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Params: {'model__max_depth': None, 'model__min_samples_leaf': 2, 'model__min_samples_split': 10, 'model__n_estimators': 200}
Best CV Score: 0.8188417216586231


## Best Model Selection (10 Marks)

In [33]:
best_model = grid.best_estimator_
best_model


## Model Performance Evaluation (10 Marks)

In [34]:
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred))
print("F1       :", f1_score(y_test, y_pred))
print("ROC-AUC  :", roc_auc_score(y_test, y_proba))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy : 0.8212290502793296
Precision: 0.8627450980392157
Recall   : 0.6376811594202898
F1       : 0.7333333333333333
ROC-AUC  : 0.8425559947299077

Confusion Matrix:
 [[103   7]
 [ 25  44]]

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.94      0.87       110
           1       0.86      0.64      0.73        69

    accuracy                           0.82       179
   macro avg       0.83      0.79      0.80       179
weighted avg       0.83      0.82      0.81       179



## Save model for GitHub + Hugging Face

In [35]:
joblib.dump(best_model, "model.joblib")
print("Saved: model.joblib")


Saved: model.joblib
