In [None]:
## End-to-end sklearn Pipeline: transform test like train and create submission
import os

import joblib
import pandas as pd
from sklearn.base import clone
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import StratifiedKFold



# Directory holding the preprocessed CSVs (read below); other cells also write
# the score history and the submission files here.
PROCESS_PATH = '../result/processed'
# Directory the pickled models are loaded from.
MODEL_PATH = '../result/model'
# Not referenced by the visible cells.
PIC_PATH = '../result/pic'
# Random state for the cross-validation splitter.
SEED = 42
# The timestamp is interpolated into the model file names, so stray whitespace
# from a copy/paste would make every load fail with FileNotFoundError.
timestamp = input('Enter timestamp string: ').strip()

train = pd.read_csv(f'{PROCESS_PATH}/titanic_train_preprocessed.csv')
test = pd.read_csv(f'{PROCESS_PATH}/titanic_test_preprocessed.csv')

In [None]:
# Load the saved models, cross-validate them on train, log the scores, and write test-set submissions
# Load the three persisted artifacts for the run identified by `timestamp`.
# NOTE(security): joblib.load unpickles arbitrary objects; only load files that
# were produced by a trusted run of this project.
randomForest_base_pkl = joblib.load(f"{MODEL_PATH}/randomForest_Base_{timestamp}.pkl")
randomForest_best_all_features_pkl = joblib.load(f"{MODEL_PATH}/rf_best_all_features_{timestamp}.pkl")
# NOTE(review): unlike the two files above, this name ends in "_features.pkl";
# it may hold something other than a bare estimator (e.g. a feature list) —
# confirm against the code that wrote it.
mi_randomForest_25_features_pkl = joblib.load(f"{MODEL_PATH}/mi_randomForest_25_{timestamp}_features.pkl")

# Target column of the training frame, cast to int.
y_full = train['Survived'].astype(int)

# Optionally extract (model, features) if a pickle stores a dict/tuple
def unpack(obj):
    """Split a loaded pickle payload into a ``(model, features)`` pair.

    Accepted layouts:
      * dict  -> ``(obj['model'], obj['features'])``; a missing ``'model'`` key
        yields the dict itself, a missing ``'features'`` key yields ``None``.
      * 2-tuple -> its two elements, in order.
      * anything else -> ``(obj, None)``.
    """
    if isinstance(obj, tuple):
        if len(obj) == 2:
            model, features = obj
            return model, features
    elif isinstance(obj, dict):
        model = obj.get('model', obj)
        features = obj.get('features')
        return model, features
    # Plain estimator (or an unrecognised container): no feature list attached.
    return obj, None

# Split each loaded pickle into its estimator and its (optional) feature list.
rf_base_model, rf_base_feats = unpack(randomForest_base_pkl)
rf_best_model, rf_best_feats = unpack(randomForest_best_all_features_pkl)
mi_model, mi_feats_25 = unpack(mi_randomForest_25_features_pkl)

# Fallback feature set for pickles that carry no feature list. The target and
# the PassengerId identifier are both excluded: the scoring cell selects these
# columns from a frame that has already dropped PassengerId, so keeping it here
# would raise a KeyError there (and would feed a row id to the model when
# predicting).
all_features = [c for c in train.columns if c not in ('Survived', 'PassengerId')]


def _features_or_default(features, default):
    """Return `features` as a list, or a copy of `default` when none were stored.

    Uses an explicit None/length check instead of `features or default`, because
    the truth value of a numpy array or pandas Index is ambiguous and raises.
    """
    if features is None or len(features) == 0:
        return list(default)
    return list(features)


# (name, estimator, feature columns) triples consumed by the scoring and
# submission loops.
models = [
    ("randomForest_base", rf_base_model, _features_or_default(rf_base_feats, all_features)),
    ("rf_best_all_features", rf_best_model, _features_or_default(rf_best_feats, all_features)),
    ("mi_randomForest_25_features", mi_model, _features_or_default(mi_feats_25, all_features)),
]

# Score each model with stratified k-fold cross-validation on the training set.
X = train.drop(columns=['Survived', 'PassengerId'])
y = train['Survived'].astype(int)

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

# accs / aucs: model name -> list of per-fold scores, plus "<name>_mean" -> float.
# save_feature: model name -> [feature list used for that model].
accs, aucs, save_feature = {}, {}, {}

for name, model, features in models:
    X_model = X[features]
    fold_accs, fold_aucs = [], []
    for train_idx, valid_idx in kfold.split(X_model, y):
        X_train, X_valid = X_model.iloc[train_idx], X_model.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        # Fit an unfitted copy per fold so the estimator stored in `models` is
        # not left trained on the last fold's subset when later code uses it.
        fold_model = clone(model)
        fold_model.fit(X_train, y_train)
        y_pred = fold_model.predict(X_valid)

        # ROC AUC needs a continuous score; hard 0/1 labels collapse the curve
        # to a single operating point. Column 1 of predict_proba is the
        # probability of the class with the greater label.
        if hasattr(fold_model, "predict_proba"):
            y_score = fold_model.predict_proba(X_valid)[:, 1]
        elif hasattr(fold_model, "decision_function"):
            y_score = fold_model.decision_function(X_valid)
        else:
            y_score = y_pred

        fold_accs.append(float(accuracy_score(y_valid, y_pred)))
        fold_aucs.append(float(roc_auc_score(y_valid, y_score)))

    mean_acc = sum(fold_accs) / len(fold_accs)
    mean_auc = sum(fold_aucs) / len(fold_aucs)

    accs[name] = fold_accs
    aucs[name] = fold_aucs
    # Stored as a plain list so the record stays JSON-serialisable.
    save_feature.setdefault(name, []).append(list(features))
    print(f"Model: {name} - CV Accuracy: {mean_acc:.4f}, CV ROC AUC: {mean_auc:.4f}")
    # The "_mean" suffix is the key the history-comparison cell looks for.
    accs[name + '_mean'] = mean_acc
    aucs[name + '_mean'] = mean_auc
    
import time
import json
# Append this run's cross-validation results to the JSON history file.
acc_auc_path = f"{PROCESS_PATH}/accs-aucs.json"
start_time = time.strftime("%Y%m%d-%H%M%S")  # label stored with this record
try:
    with open(acc_auc_path, "r") as f:
        content = f.read().strip()
    data = json.loads(content) if content else []
except (FileNotFoundError, json.JSONDecodeError) as e:
    print(f"Warning: initializing {acc_auc_path}. Reason: {e}")
    data = []

# The history is a list of run records; wrap any other JSON root so the append
# below cannot fail and the existing content is kept.
if not isinstance(data, list):
    print(f"Warning: {acc_auc_path} does not contain a list; wrapping existing content.")
    data = [data]

print(f"Loaded {len(data)} previous run(s) from {acc_auc_path}")
data.append({"time": start_time, "accs": accs, "aucs": aucs, "features": save_feature})

# Serialise before touching the file, then swap it in with os.replace: a
# serialisation error or an interrupted write must not truncate the history.
payload = json.dumps(data)
tmp_path = f"{acc_auc_path}.tmp"
with open(tmp_path, "w") as f:
    f.write(payload)
os.replace(tmp_path, acc_auc_path)
# Create submissions: refit every model on the full training set, predict the
# test set, and write one CSV per model.
os.makedirs(PROCESS_PATH, exist_ok=True)  # loop-invariant, so done once
for name, model, features in models:
    X_tr = train[features]
    X_te = test[features]

    # Always fit an unfitted copy on all training rows. Checking for `classes_`
    # is not reliable here: an estimator that was fitted in place during
    # cross-validation already has it, but was trained on one fold's subset.
    final_model = clone(model)
    final_model.fit(X_tr, y_full)

    test_pred = final_model.predict(X_te)
    sub = pd.DataFrame({"PassengerId": test["PassengerId"], "Survived": test_pred.astype(int)})
    sub_path = f"{PROCESS_PATH}/submission_{name}.csv"
    sub.to_csv(sub_path, index=False)
    print(f"Saved submission to {sub_path}")


In [None]:
# Compare historical runs: find best mean accuracy and its features
import json, os

acc_auc_path = f"{PROCESS_PATH}/accs-aucs.json"
try:
    with open(acc_auc_path, "r") as f:
        content = f.read().strip()
    history = json.loads(content) if content else []
except (FileNotFoundError, json.JSONDecodeError) as e:
    print(f"Cannot read {acc_auc_path}: {e}")
    history = []

# A single record stored as a bare object is treated as a one-entry history.
if isinstance(history, dict):
    history = [history]


def find_best_run(history):
    """Return a summary of the highest mean-accuracy result in `history`.

    Each entry is expected to be a dict with "accs", "aucs", "features" and
    "time" keys; inside "accs"/"aucs", keys ending in "_mean" hold the mean
    score of the model named by the key prefix. Entries that are not dicts and
    non-numeric means are skipped. On ties the earliest result wins.

    Returns:
        dict with keys time, run_index, model, mean_accuracy, mean_auc,
        n_features, features — or None when no mean accuracy is recorded.
    """
    suffix = "_mean"
    best = None
    best_acc = None
    for idx, entry in enumerate(history):
        if not isinstance(entry, dict):
            continue
        run_accs = entry.get("accs") or {}
        run_aucs = entry.get("aucs") or {}
        run_feats = entry.get("features") or {}
        for key, value in run_accs.items():
            if not key.endswith(suffix):
                continue
            # bool is a subclass of int; it is never a valid score.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                continue
            mean_acc = float(value)
            if best_acc is not None and mean_acc <= best_acc:
                continue

            model_name = key[:-len(suffix)]
            auc_value = run_aucs.get(model_name + suffix)
            has_auc = isinstance(auc_value, (int, float)) and not isinstance(auc_value, bool)
            # Features are stored as a list of feature lists; use the last one.
            stored = run_feats.get(model_name)
            feature_list = stored[-1] if isinstance(stored, list) and stored else None

            best_acc = mean_acc
            best = {
                "time": entry.get("time"),
                "run_index": idx,
                "model": model_name,
                "mean_accuracy": round(mean_acc, 4),
                "mean_auc": round(float(auc_value), 4) if has_auc else None,
                "n_features": len(feature_list) if isinstance(feature_list, list) else None,
                "features": feature_list,
            }
    return best


if not history:
    print("No history found in accs-aucs.json")
else:
    summary = find_best_run(history)
    if summary is None:
        # Report the absence explicitly rather than printing a placeholder score.
        print("No mean accuracies found in accs-aucs.json")
    else:
        print("Best run across history:")
        print(summary)