# In [None]:
# =============================================================
# Titanic - Binary Classification
# Author: Yassine KHERBOUCHE (@yassinemimis)
# =============================================================

# ── 1. IMPORTS ───────────────────────────────────────────────
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (classification_report, confusion_matrix,
                             f1_score, roc_auc_score)
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
# Silence all warnings for the rest of the run. No category filter is given,
# so this also hides deprecation notices raised by the imported libraries.
warnings.filterwarnings('ignore')

# Reproducibility: SEED is reused further down as `random_state`, and the
# global NumPy and stdlib random generators are seeded with the same value.
import random
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# 2. LOAD DATA
# Paths are relative to the current working directory; train.csv and
# test.csv are expected under ../data.
train_df = pd.read_csv('../data/train.csv')
test_df  = pd.read_csv('../data/test.csv')

print("Train shape:", train_df.shape)
print("Test shape :", test_df.shape)
# A bare `train_df.head()` is only rendered when it is the last expression of
# a notebook cell; print it so the preview is shown wherever this code runs.
print(train_df.head())

# 3. EXPLORATORY DATA ANALYSIS
print("\nMissing values:\n", train_df.isnull().sum())
# Built-in round() accepts NumPy scalars as well as plain Python floats.
print("\nSurvival rate:", round(train_df['Survived'].mean(), 3))

# Survival by Sex
# Draw on a dedicated figure and close it afterwards: with a non-interactive
# backend plt.show() does not discard the figure, so any later plot that uses
# the "current" axes would otherwise be drawn on top of this one.
fig, ax = plt.subplots()
sns.barplot(x='Sex', y='Survived', data=train_df, ax=ax)
ax.set_title('Survival Rate by Sex')
fig.savefig('../data/survival_by_sex.png')
plt.show()
plt.close(fig)

# 4. PREPROCESSING
def preprocess(df, is_train=True):
    """Turn a raw Titanic passenger frame into a numeric feature matrix.

    The input frame is not modified. Missing ``Age`` and ``Fare`` values are
    filled with the median, and missing ``Embarked`` values with the most
    frequent value, of the frame that is passed in. ``FamilySize``,
    ``IsAlone`` and ``Title`` are derived, and ``Sex``, ``Embarked`` and
    ``Title`` are converted to integer codes.

    The codes come from fixed tables instead of an encoder that is re-fit on
    every call, so two frames (e.g. train and test) are always encoded the
    same way even when one of them lacks a category. The tables equal what an
    alphabetical label encoding yields when every category is present.

    Any title that is not Master/Miss/Mr/Mrs after normalisation (including
    names with no recognisable title) is encoded as ``Rare``.

    Args:
        df: DataFrame with the columns ``Pclass``, ``Sex``, ``Age``, ``Fare``,
            ``Embarked``, ``SibSp``, ``Parch`` and ``Name`` (plus ``Survived``
            when ``is_train`` is true).
        is_train: When true, also return the ``Survived`` column as target.

    Returns:
        ``(X, y)`` when ``is_train`` is true, otherwise only ``X``.

    Raises:
        ValueError: If ``Sex`` or ``Embarked`` holds a value that has no code
            (this includes ``Embarked`` being missing in every row).
    """
    sex_codes = {'female': 0, 'male': 1}
    embarked_codes = {'C': 0, 'Q': 1, 'S': 2}
    title_codes = {'Master': 0, 'Miss': 1, 'Mr': 2, 'Mrs': 3, 'Rare': 4}

    df = df.copy()

    # 4.1 Fill missing Age with median
    df['Age'] = df['Age'].fillna(df['Age'].median())

    # 4.2 Fill missing Embarked with mode (mode() is empty if all are missing)
    embarked_mode = df['Embarked'].mode()
    if not embarked_mode.empty:
        df['Embarked'] = df['Embarked'].fillna(embarked_mode.iloc[0])

    # 4.3 Fill missing Fare with median
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # 4.4 Feature Engineering
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone']    = (df['FamilySize'] == 1).astype(int)
    # The title is the word directly before the first "." that follows a space.
    title = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    title = title.replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
    # Everything else (Lady, Countess, Capt, Col, Don, Dr, Major, Rev, Sir,
    # Jonkheer, Dona, unseen titles, missing titles) is grouped as 'Rare'.
    title = title.where(title.isin(list(title_codes)), 'Rare')

    # 4.5 Encode categoricals with the fixed code tables
    for column, codes in (('Sex', sex_codes), ('Embarked', embarked_codes)):
        encoded = df[column].map(codes)
        unmapped = encoded.isna()
        if unmapped.any():
            unknown = sorted(set(df.loc[unmapped, column].astype(str)))
            raise ValueError(
                "Unexpected values in {!r}: {}".format(column, unknown))
        df[column] = encoded.astype('int64')
    df['Title'] = title.map(title_codes).astype('int64')

    # 4.6 Select features
    features = ['Pclass','Sex','Age','Fare','Embarked',
                'FamilySize','IsAlone','Title']
    X = df[features]

    if is_train:
        y = df['Survived']
        return X, y
    return X

# Build the model inputs: features only for the test file, features plus
# target for the labelled training file.
X_test_final = preprocess(test_df, is_train=False)
X, y = preprocess(train_df, is_train=True)

# 5. TRAIN / VAL SPLIT
# Hold out 20% of the labelled rows, stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)

# Learn the scaling statistics from the training split only, then apply
# them to both splits.
scaler = StandardScaler().fit(X_train)
X_train_s, X_val_s = (scaler.transform(part) for part in (X_train, X_val))

# 6. MODEL SELECTION
# RandomForest was chosen over LogisticRegression and SVM because it:
#   * handles non-linear relationships (Age*Pclass interaction)
#   * is robust to outliers in Fare
#   * provides built-in feature importance
# Alternatives considered: LogisticRegression (too linear),
#                          SVM (sensitive to scale, slower)

# Configure the forest and fit it on the scaled training split in one go
# (fit() returns the estimator itself).
model = RandomForestClassifier(
    random_state=SEED,
    min_samples_split=4,
    max_depth=6,           # most impactful: range [3, 10]
    n_estimators=200,      # tuned: range [50, 500]
).fit(X_train_s, y_train)

# 7. EVALUATION
y_pred = model.predict(X_val_s)
# Column 1 of predict_proba is the second entry of model.classes_
# (Survived == 1 when the labels are 0/1).
y_prob = model.predict_proba(X_val_s)[:, 1]

print("\n── Validation Results ──────────────────────────")
print(classification_report(y_val, y_pred))
# Use the built-in round(): it accepts both NumPy scalars and plain Python
# floats, whereas the `.round()` method exists only on the former.
print("F1  Score :", round(f1_score(y_val, y_pred), 4))
print("ROC-AUC   :", round(roc_auc_score(y_val, y_prob), 4))

# Confusion Matrix
# Draw on a dedicated figure so the heatmap never lands on axes left over
# from an earlier plot, and close it once it has been saved and shown.
cm = confusion_matrix(y_val, y_pred)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Not Survived','Survived'],
            yticklabels=['Not Survived','Survived'],
            ax=ax)
ax.set_title('Confusion Matrix')
fig.savefig('../data/confusion_matrix.png')
plt.show()
plt.close(fig)

# 8. ERROR ANALYSIS
# Attach the true labels and the predictions to a copy of the (unscaled)
# validation features.
val_df = X_val.assign(y_true=y_val.values, y_pred=y_pred)

# False Negatives: the passenger survived but the model predicted death.
fn = val_df.loc[val_df['y_true'].eq(1) & val_df['y_pred'].eq(0)]
print("\nFalse Negatives (survived → predicted dead):")
print(fn.loc[:, ['Pclass','Sex','Age','FamilySize']].iloc[:5])

# 9. CROSS-VALIDATION
# 5-fold F1 of the same estimator configuration on the scaled training split.
cv_scores = cross_val_score(
    estimator=model, X=X_train_s, y=y_train, scoring='f1', cv=5)
print("\n5-Fold CV F1:", np.round(cv_scores, 4))
print("Mean F1     :", np.round(np.mean(cv_scores), 4))

# 10. FEATURE IMPORTANCE
# Pair each importance with its feature name and sort from most to least
# important.
feat_imp = pd.Series(model.feature_importances_,
                     index=X.columns).sort_values(ascending=False)
# Plot on a dedicated figure: without an explicit `ax` the bars can end up
# on axes left over from an earlier plot. Close the figure when done.
fig, ax = plt.subplots()
feat_imp.plot(kind='bar', title='Feature Importance', ax=ax)
fig.tight_layout()
fig.savefig('../data/feature_importance.png')
plt.show()
plt.close(fig)

# 11. SAVE MODEL
# Persist the fitted forest together with the scaler it was trained with;
# both are needed to score new data the same way.
import joblib
joblib.dump(model,  '../data/rf_titanic_v1.pkl')
joblib.dump(scaler, '../data/scaler_v1.pkl')
print("\nModel saved: rf_titanic_v1.pkl")
# Built-in round() works whether f1_score returns a NumPy scalar or a plain
# Python float (the latter has no `.round()` method).
print("Checkpoint : rf_titanic_v1.pkl | F1=",
      round(f1_score(y_val, y_pred), 4))