# Hyperparameter tuning and model persistence overview

This notebook first cross-validates a baseline RandomForest classifier, then tunes its hyperparameters with a randomized search, and finally serializes key outputs for downstream use. We begin by loading a preprocessed training dataset from `../result/processed/titanic_train_preprocessed.csv`, which should include engineered features, an integer target column named `Survived`, and an identifier `PassengerId`. The features `X` exclude both `Survived` and `PassengerId`, ensuring the model does not inadvertently learn from the ID and that the target remains separate. We set a `StratifiedKFold` splitter with 10 folds, shuffled for stability and stratified so each fold keeps the overall class balance, and establish a `SEED` to promote reproducibility across sampling operations.

The parameter search space covers both structural and regularization dimensions: `n_estimators`, `max_depth`, `min_samples_split`, `min_samples_leaf`, and `max_features`, along with `bootstrap`, `oob_score`, and optional class weighting. `RandomizedSearchCV` (with a 10-fold `StratifiedKFold` as `cv`) explores 100 random configurations, scoring by ROC AUC to balance sensitivity across classification thresholds. The outer 10-fold loop fits a fixed baseline forest on each training fold subset; this provides repeated assessments that can expose instability. The random search itself is fitted once, on the training portion of a stratified 80/20 hold-out split. We then evaluate the best estimator on the held-out 20% (a quick sanity check), compute accuracy and ROC AUC, and persist the tuning summary (best parameters and CV ROC AUC) to `../result/processed` and the tuned model itself to `../result/model` via `joblib.dump`.

Downstream notebooks can load `../result/model/rf_best_all_features_<timestamp>.pkl` (a dict holding the fitted `model` and its `features` list) to generate predictions against identically preprocessed test data. If you want a single final model fitted on the entire training set post-search (recommended for submission), you can call `.fit(X, y)` on `randomForest_best` before saving. Keep in mind that run time can be significant with larger search spaces; adjust `n_iter`, the number of CV folds, or the parameter ranges to meet your runtime constraints.

In [None]:
# Hyperparameter tuning with cross-validation
from utils import plot_feature_importances
from sklearn.model_selection import RandomizedSearchCV , StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
from scipy.stats import randint, uniform
import numpy as np
import pandas as pd
import joblib
import os

# random_state passed to the splitters, forests and randomized search below.
SEED = 42
# Directory the preprocessed CSVs are read from; tuning summaries are written here too.
PROCESS_PATH = '../result/processed'
# Directory that receives the pickled models.
MODEL_PATH = '../result/model'
# Directory used for the feature-importance figure file names.
PIC_PATH = '../result/pic'

# Baseline: 10-fold stratified cross-validation of a fixed-configuration
# random forest, reporting accuracy and ROC AUC per fold and in aggregate.

# Use the already prepared train_final (one-hot)
train_final = pd.read_csv(f'{PROCESS_PATH}/titanic_train_preprocessed.csv')
# The target and the row identifier must not be used as features.
X = train_final.drop(columns=['Survived', 'PassengerId'])
y = train_final['Survived']

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

# Per-fold scores, filled in fold order.
accs, aucs = [], []

for fold_idx, (train_idx, valid_idx) in enumerate(kfold.split(X, y), start=1):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # A fresh forest per fold so that no state leaks between folds.
    rf = RandomForestClassifier(
        n_estimators=400,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='sqrt',
        class_weight='balanced',
        n_jobs=-1,
        random_state=SEED,
    )
    rf.fit(X_train, y_train)

    pred = rf.predict(X_valid)
    # Column 1 of predict_proba is the positive-class probability that ROC AUC needs.
    proba = rf.predict_proba(X_valid)[:, 1]

    acc = accuracy_score(y_valid, pred)
    auc = roc_auc_score(y_valid, proba)
    accs.append(acc)
    aucs.append(auc)
    print(f'Fold {fold_idx}: accuracy={acc:.4f}, roc_auc={auc:.4f}')

print('\nCV summary (10-fold):')
# Convert to built-in floats: NumPy >= 2 renders scalars inside a dict as
# "np.float64(0.8123)", which makes the printed summary hard to read.
cv_summary = {
    'accuracy_mean': round(float(np.mean(accs)), 4),
    'accuracy_std': round(float(np.std(accs)), 4),
    'roc_auc_mean': round(float(np.mean(aucs)), 4),
    'roc_auc_std': round(float(np.std(aucs)), 4),
}
print(cv_summary)

# Refit on full data for deployment and save
# NOTE: these hyperparameters duplicate the forest evaluated in the CV loop
# earlier in this cell; change both together so the saved model matches the
# reported CV scores.
Base_rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    class_weight='balanced',
    n_jobs=-1,
    random_state=SEED,
)
Base_rf.fit(X, y)

# Make sure every output directory exists before anything is written.
# Previously only PROCESS_PATH was created, so joblib.dump failed on a fresh
# checkout where MODEL_PATH did not exist yet.
for out_dir in (PROCESS_PATH, MODEL_PATH, PIC_PATH):
    os.makedirs(out_dir, exist_ok=True)

plot_feature_importances(Base_rf, X.columns, top_n=20, fname=f'{PIC_PATH}/rf_feature_importances_top20.png')

import time

# One timestamp per run; the later cells reuse it so that all artifacts of a
# run share the same filename suffix.
timestamp = time.strftime("%Y%m%d-%H%M%S")

# Build the path once so the file written and the path reported cannot diverge.
baseline_model_path = f'{MODEL_PATH}/randomForest_Base_{timestamp}.pkl'
# The feature list is stored next to the model so consumers can align columns.
joblib.dump({'model': Base_rf, 'features': X.columns.tolist()}, baseline_model_path)
print(f'Saved baseline model to {baseline_model_path}')

In [None]:

# Reload the preprocessed training table and separate the target from the features.
train_final = pd.read_csv(f'{PROCESS_PATH}/titanic_train_preprocessed.csv')
y = train_final['Survived']
X = train_final.drop(columns=['Survived', 'PassengerId'])

# Stratified 80/20 hold-out: the search only sees the 80% part, the remaining
# 20% is kept aside for the evaluation that follows.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)

# Search space. scipy's randint(low, high) draws integers from [low, high).
param_dist = dict(
    n_estimators=randint(300, 900),
    max_depth=[None, *range(4, 13)],
    min_samples_split=randint(2, 11),
    min_samples_leaf=randint(1, 5),
    max_features=['sqrt', 'log2', 0.5, 0.7],
    bootstrap=[True],
    class_weight=['balanced', None],
    oob_score=[True, False],
)

# 100 sampled configurations, each scored by ROC AUC over 10 stratified folds;
# refit=True retrains the winning configuration on all of X_train.
randomForest_tuner = RandomForestClassifier(random_state=SEED, n_jobs=-1)
randomSearchCV = RandomizedSearchCV(
    estimator=randomForest_tuner,
    param_distributions=param_dist,
    n_iter=100,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=10),
    refit=True,
    n_jobs=-1,
    verbose=1,
    random_state=SEED,
)
randomSearchCV.fit(X_train, y_train)

randomForest_best = randomSearchCV.best_estimator_
print('Best params:', randomSearchCV.best_params_)
print('Best CV ROC AUC:', round(randomSearchCV.best_score_, 4))

# Evaluate tuned model on the hold-out split that the search never saw
pred2 = randomForest_best.predict(X_valid)
# Column 1 of predict_proba is the positive-class probability that ROC AUC needs.
proba2 = randomForest_best.predict_proba(X_valid)[:, 1]
acc2 = accuracy_score(y_valid, pred2)
auc2 = roc_auc_score(y_valid, proba2)

print(f"Tuned Random Forest - Accuracy: {acc2:.4f}, ROC AUC: {auc2:.4f}")

# Ensure every output directory exists before anything is written
# (PIC_PATH was previously not created before plotting).
for out_dir in (PROCESS_PATH, MODEL_PATH, PIC_PATH):
    os.makedirs(out_dir, exist_ok=True)

plot_feature_importances(randomForest_best, X_train.columns, top_n=20, fname=f'{PIC_PATH}/rf_Best_feature_importances_top20.png')

# `timestamp` is the run timestamp created in the baseline cell; reusing it
# keeps all artifacts of one run under the same filename suffix.

# Save tuning results with a clearer filename.
# The path is built once: the log line used to report MODEL_PATH although the
# file is written under PROCESS_PATH.
tuning_results_path = f"{PROCESS_PATH}/rf_randomized_search_cv_results_{timestamp}.pkl"
joblib.dump(
    {'best_params': randomSearchCV.best_params_, 'best_cv_roc_auc': randomSearchCV.best_score_},
    tuning_results_path,
)
print(f"Saved tuning results to {tuning_results_path}")

# Save the best model. It uses all features (no feature selection) but was
# fitted on the X_train split only, not on the full training set.
tuned_model_path = f"{MODEL_PATH}/rf_best_all_features_{timestamp}.pkl"
joblib.dump({'model': randomForest_best, 'features': X.columns.tolist()}, tuned_model_path)
print(f"Saved tuned model to {tuned_model_path}")

In [None]:

# Load mutual information ranking computed in EDA
mi_df = pd.read_csv(f'{PROCESS_PATH}/eda_mutual_information_top30.csv')
# Address the columns by position and give them fixed names, so the code below
# does not depend on the header the EDA step happened to write.
mi_df = mi_df.rename(columns={mi_df.columns[0]: 'feature', mi_df.columns[1]: 'mi'})

# Choose top-K features (intersection with training columns to be safe)
TOP_K = 25
topk = mi_df.sort_values('mi', ascending=False).head(TOP_K)['feature'].tolist()

train_columns = set(X.columns)

# Report ranked features that are absent from the training frame. This check
# runs on the unfiltered ranking: it used to run on the already-filtered list,
# where it could never trigger.
for c in topk:
    if c not in train_columns:
        print(f'Warning: selected feature {c} not in training data columns.')

selected_cols = [c for c in topk if c in train_columns]
if len(selected_cols) < max(10, TOP_K//2):
    # Fallback: if few overlap (naming drift), additionally keep every training
    # column whose name contains one of these base-feature names.
    base_names = ['Sex', 'Pclass', 'Fare', 'Age', 'FamilySize', 'IsAlone', 'Embarked', 'Title']
    baseline_keep = [c for c in X.columns if any(p in c for p in base_names)]
    # NOTE: sorting replaces the MI ranking order with alphabetical order.
    selected_cols = sorted(set(selected_cols + baseline_keep))
print(f'Selected {len(selected_cols)} features for RF (EDA-informed).')

# Reuse the hold-out split of the tuning cell, restricted to the selected columns.
X_train_sel = X_train[selected_cols].copy()
X_valid_sel = X_valid[selected_cols].copy()

# Random forest on the MI-selected features. The hyperparameters are the best
# ones found by the randomized search (they are not re-tuned for the reduced
# feature set); n_jobs and random_state are fixed as for the other forests.
rf_eda_params = {**randomSearchCV.best_params_, 'n_jobs': -1, 'random_state': SEED}
rf_eda = RandomForestClassifier(**rf_eda_params)
rf_eda.fit(X_train_sel, y_train)

pred = rf_eda.predict(X_valid_sel)
# Column 1 of predict_proba is the positive-class probability that ROC AUC needs.
proba = rf_eda.predict_proba(X_valid_sel)[:, 1]
acc = accuracy_score(y_valid, pred)
auc = roc_auc_score(y_valid, proba)
# oob_score_ only exists when the forest was built with oob_score=True.
oob = rf_eda.oob_score_ if rf_eda.oob_score else np.nan
# Built-in floats keep the printed dict readable under NumPy >= 2, which
# renders scalars inside containers as "np.float64(...)".
print({'mi_randomForest_accuracy': round(float(acc), 4), 'mi_randomForest_roc_auc': round(float(auc), 4)})
if rf_eda.oob_score:
    # The OOB estimate was computed but never reported before.
    print(f'OOB score (EDA RF): {oob:.4f}')
print('\nClassification report (EDA RF):\n', classification_report(y_valid, pred, digits=3))
print('\nConfusion matrix (EDA RF):\n', confusion_matrix(y_valid, pred))

# Make sure every output directory exists before anything is written
# (previously only PROCESS_PATH was created, although the model is dumped
# to MODEL_PATH).
for out_dir in (PROCESS_PATH, MODEL_PATH, PIC_PATH):
    os.makedirs(out_dir, exist_ok=True)

plot_feature_importances(rf_eda, X_train_sel.columns, top_n=20, fname=f'{PIC_PATH}/mi_best25_rf_feature_importances_top20.png')

# Persist model and feature list; `timestamp` is the run timestamp created in
# the baseline cell. The path is built once so dump and log line always agree.
mi_model_path = f"{MODEL_PATH}/mi_randomForest_{TOP_K}_{timestamp}_features.pkl"
joblib.dump({'model': rf_eda, 'features': selected_cols}, mi_model_path)
print(f"Saved EDA-tuned model to {mi_model_path}")

In [None]:
# Echo the run timestamp that is embedded in the artifact filenames written above.
print(timestamp)