In [2]:
# ============================================
# Advanced Spaceship Titanic Solution
# ============================================

import os, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from scipy import stats

!pip -q install lightgbm xgboost catboost
import lightgbm as lgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# ============================================
# 1) Load Data
# ============================================
BASE = "/kaggle/input/spaceship-titanic"
train = pd.read_csv(f"{BASE}/train.csv")
test = pd.read_csv(f"{BASE}/test.csv")

y = train["Transported"].astype(int)
X = train.drop(columns=["Transported"])
test_ids = test["PassengerId"].values

# ============================================
# 2) Advanced Feature Engineering
# ============================================
def engineer_advanced(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    
    # === Basic Parsing ===
    cab = df["Cabin"].astype(str).str.split("/", expand=True)
    df["Deck"] = cab[0].replace("nan", np.nan)
    df["CabinNum"] = pd.to_numeric(cab[1], errors="coerce")
    df["Side"] = cab[2].replace("nan", np.nan)
    
    df["Group"] = df["PassengerId"].str.split("_", expand=True)[0]
    df["PersonNum"] = pd.to_numeric(df["PassengerId"].str.split("_", expand=True)[1], errors="coerce")
    df["Surname"] = df["Name"].astype(str).str.split(" ", expand=True)[1]
    df["FirstName"] = df["Name"].astype(str).str.split(" ", expand=True)[0]
    
    # === Spending Features ===
    spend_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    for c in spend_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    
    df["SpendTotal"] = df[spend_cols].sum(axis=1, skipna=True)
    df["SpendMean"] = df[spend_cols].mean(axis=1, skipna=True)
    df["SpendStd"] = df[spend_cols].std(axis=1, skipna=True).fillna(0)
    df["SpendMax"] = df[spend_cols].max(axis=1, skipna=True)
    df["SpendNonZero"] = (df[spend_cols].fillna(0) > 0).sum(axis=1)
    df["NoSpendFlag"] = (df["SpendTotal"].fillna(0) == 0).astype(int)
    
    # Spending ratios
    for c in spend_cols:
        df[f"{c}_Ratio"] = df[c] / (df["SpendTotal"] + 1)
    
    # Log transforms for skewed spending
    df["SpendTotal_Log"] = np.log1p(df["SpendTotal"])
    
    # === Group Features ===
    df["GroupSize"] = df.groupby("Group")["PassengerId"].transform("count")
    df["IsAlone"] = (df["GroupSize"] == 1).astype(int)
    df["SpendPerGroup"] = df["SpendTotal"] / df["GroupSize"]
    
    # Group aggregates
    for col in ["Age", "SpendTotal", "VIP", "CryoSleep"] + spend_cols:
        if col in df.columns:
            df[f"Group_{col}_Mean"] = df.groupby("Group")[col].transform("mean")
            df[f"Group_{col}_Std"] = df.groupby("Group")[col].transform("std").fillna(0)
            df[f"Group_{col}_Max"] = df.groupby("Group")[col].transform("max")
            df[f"Group_{col}_Min"] = df.groupby("Group")[col].transform("min")
    
    # === Surname Features ===
    df["SurnameSize"] = df.groupby("Surname")["PassengerId"].transform("count").fillna(1)
    df["Surname_Age_Mean"] = df.groupby("Surname")["Age"].transform("mean")
    df["Surname_Spend_Mean"] = df.groupby("Surname")["SpendTotal"].transform("mean")
    
    # === CryoSleep Logic ===
    df["CryoSpendContradiction"] = (
        df["CryoSleep"].fillna(False).astype(bool) & (df["SpendTotal"].fillna(0) > 0)
    ).astype(int)
    
    # === Age Features ===
    df["AgeBucket"] = pd.cut(df["Age"], bins=[-0.01, 12, 18, 25, 35, 50, 120], labels=False)
    df["IsChild"] = (df["Age"] < 18).astype(int)
    df["IsYoungAdult"] = ((df["Age"] >= 18) & (df["Age"] < 30)).astype(int)
    df["IsSenior"] = (df["Age"] >= 60).astype(int)
    
    # === Cabin Features ===
    df["CabinNumBucket"] = pd.qcut(df["CabinNum"], q=20, duplicates="drop", labels=False)
    df["CabinNum_Mod10"] = df["CabinNum"] % 10
    df["CabinNum_IsEven"] = (df["CabinNum"] % 2 == 0).astype(int)
    
    # === Interactions ===
    df["Deck_Side"] = df["Deck"].astype(str) + "_" + df["Side"].astype(str)
    df["Planet_Dest"] = df["HomePlanet"].astype(str) + "_" + df["Destination"].astype(str)
    df["Deck_Planet"] = df["Deck"].astype(str) + "_" + df["HomePlanet"].astype(str)
    df["Age_Deck"] = df["AgeBucket"].astype(str) + "_" + df["Deck"].astype(str)
    df["VIP_Planet"] = df["VIP"].astype(str) + "_" + df["HomePlanet"].astype(str)
    df["Cryo_Planet"] = df["CryoSleep"].astype(str) + "_" + df["HomePlanet"].astype(str)
    
    # === Destination-based Features ===
    df["Dest_Age_Mean"] = df.groupby("Destination")["Age"].transform("mean")
    df["Dest_Spend_Mean"] = df.groupby("Destination")["SpendTotal"].transform("mean")
    df["Planet_Age_Mean"] = df.groupby("HomePlanet")["Age"].transform("mean")
    df["Planet_Spend_Mean"] = df.groupby("HomePlanet")["SpendTotal"].transform("mean")
    
    # === Deck-based Features ===
    df["Deck_Cryo_Rate"] = df.groupby("Deck")["CryoSleep"].transform("mean")
    df["Deck_VIP_Rate"] = df.groupby("Deck")["VIP"].transform("mean")
    df["Deck_Age_Mean"] = df.groupby("Deck")["Age"].transform("mean")
    
    # === Missing value indicators ===
    for col in ["Age", "CabinNum", "HomePlanet", "Destination", "CryoSleep", "VIP"]:
        df[f"{col}_Missing"] = df[col].isna().astype(int)
    
    return df.drop(columns=["Name", "Cabin"], errors="ignore")

# Engineer features
all_df = pd.concat([X, test], ignore_index=True)
all_fe = engineer_advanced(all_df)

X_fe = all_fe.iloc[:len(X)].copy()
T_fe = all_fe.iloc[len(X):].copy()

# ============================================
# 3) Advanced Preprocessing
# ============================================
# Identify column types
num_cols = [c for c in X_fe.columns if X_fe[c].dtype in ['int64', 'float64'] 
            and c not in ['CryoSleep', 'VIP', 'IsAlone', 'PassengerId']]
bool_cols = ['CryoSleep', 'VIP', 'IsAlone', 'IsChild', 'IsYoungAdult', 'IsSenior', 
             'NoSpendFlag', 'CryoSpendContradiction', 'CabinNum_IsEven']
cat_cols = ['HomePlanet', 'Destination', 'Deck', 'Side', 'Deck_Side', 'Group', 
            'Surname', 'Planet_Dest', 'Deck_Planet', 'Age_Deck', 'VIP_Planet', 'Cryo_Planet']

# Remove high-cardinality categoricals
cat_cols = [c for c in cat_cols if c in X_fe.columns and X_fe[c].nunique() < 500]
num_cols = [c for c in num_cols if c in X_fe.columns]
bool_cols = [c for c in bool_cols if c in X_fe.columns]

X_fe[bool_cols] = X_fe[bool_cols].astype("float")
T_fe[bool_cols] = T_fe[bool_cols].astype("float")

for c in ["PassengerId", "FirstName"]:
    if c in X_fe.columns: X_fe = X_fe.drop(columns=[c])
    if c in T_fe.columns: T_fe = T_fe.drop(columns=[c])

# Pipelines with robust scaling
num_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
])
boo_pipe = Pipeline([("imp", SimpleImputer(strategy="most_frequent"))])
cat_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="constant", fill_value="missing")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False, max_categories=50))
])

prep = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("boo", boo_pipe, bool_cols),
        ("cat", cat_pipe, cat_cols),
    ],
    remainder="drop"
)

X_mat = prep.fit_transform(X_fe)
T_mat = prep.transform(T_fe)

print(f"Feature matrix shape: {X_mat.shape}")

# ============================================
# 4) Advanced Model Configuration
# ============================================
SEED = 42
N_FOLDS = 10  # More folds for better generalization

# Optimized models
lgbm = LGBMClassifier(
    n_estimators=3000, learning_rate=0.015, num_leaves=48,
    subsample=0.8, colsample_bytree=0.8, reg_alpha=0.1, reg_lambda=1.0,
    min_child_samples=20, random_state=SEED, n_jobs=-1, verbose=-1
)

xgb = XGBClassifier(
    n_estimators=2500, learning_rate=0.015, max_depth=5,
    subsample=0.8, colsample_bytree=0.8, reg_alpha=0.1, reg_lambda=1.5,
    min_child_weight=3, gamma=0.1,
    objective="binary:logistic", eval_metric="logloss",
    tree_method="hist", random_state=SEED, n_jobs=-1
)

cb = CatBoostClassifier(
    iterations=3500, learning_rate=0.015, depth=6, 
    l2_leaf_reg=8, bagging_temperature=0.2,
    loss_function="Logloss", eval_metric="Accuracy",
    random_seed=SEED, verbose=False, od_type="Iter", od_wait=150
)

rf = RandomForestClassifier(
    n_estimators=500, max_depth=15, min_samples_split=10,
    min_samples_leaf=4, max_features='sqrt',
    random_state=SEED, n_jobs=-1
)

# ============================================
# 5) Stratified K-Fold CV with Stacking
# ============================================
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

oof_lgb = np.zeros(len(X_mat))
oof_xgb = np.zeros(len(X_mat))
oof_cb = np.zeros(len(X_mat))
oof_rf = np.zeros(len(X_mat))

test_lgb = np.zeros(len(T_mat))
test_xgb = np.zeros(len(T_mat))
test_cb = np.zeros(len(T_mat))
test_rf = np.zeros(len(T_mat))

print("\n=== Training Base Models ===")
for fold, (tr, va) in enumerate(skf.split(X_mat, y), 1):
    X_tr, X_va = X_mat[tr], X_mat[va]
    y_tr, y_va = y.iloc[tr], y.iloc[va]
    
    # LightGBM
    lgbm.fit(X_tr, y_tr)
    oof_lgb[va] = lgbm.predict_proba(X_va)[:, 1]
    test_lgb += lgbm.predict_proba(T_mat)[:, 1] / N_FOLDS
    
    # XGBoost
    xgb.fit(X_tr, y_tr)
    oof_xgb[va] = xgb.predict_proba(X_va)[:, 1]
    test_xgb += xgb.predict_proba(T_mat)[:, 1] / N_FOLDS
    
    # CatBoost
    cb.fit(X_tr, y_tr)
    oof_cb[va] = cb.predict_proba(X_va)[:, 1]
    test_cb += cb.predict_proba(T_mat)[:, 1] / N_FOLDS
    
    # Random Forest
    rf.fit(X_tr, y_tr)
    oof_rf[va] = rf.predict_proba(X_va)[:, 1]
    test_rf += rf.predict_proba(T_mat)[:, 1] / N_FOLDS
    
    acc_lgb = accuracy_score(y_va, (oof_lgb[va] >= 0.5).astype(int))
    acc_xgb = accuracy_score(y_va, (oof_xgb[va] >= 0.5).astype(int))
    acc_cb = accuracy_score(y_va, (oof_cb[va] >= 0.5).astype(int))
    acc_rf = accuracy_score(y_va, (oof_rf[va] >= 0.5).astype(int))
    
    print(f"Fold {fold:2d} | LGBM: {acc_lgb:.4f} | XGB: {acc_xgb:.4f} | CB: {acc_cb:.4f} | RF: {acc_rf:.4f}")

# Base model OOF scores
print(f"\n=== Base Model OOF Scores ===")
print(f"LGBM: {accuracy_score(y, (oof_lgb >= 0.5).astype(int)):.4f}")
print(f"XGB:  {accuracy_score(y, (oof_xgb >= 0.5).astype(int)):.4f}")
print(f"CB:   {accuracy_score(y, (oof_cb >= 0.5).astype(int)):.4f}")
print(f"RF:   {accuracy_score(y, (oof_rf >= 0.5).astype(int)):.4f}")

# ============================================
# 6) Meta-Model Stacking
# ============================================
meta_features_train = np.column_stack([oof_lgb, oof_xgb, oof_cb, oof_rf])
meta_features_test = np.column_stack([test_lgb, test_xgb, test_cb, test_rf])

# Train meta-learner
meta_model = LogisticRegression(C=0.1, random_state=SEED, max_iter=1000)
meta_model.fit(meta_features_train, y)

oof_meta = meta_model.predict_proba(meta_features_train)[:, 1]
test_meta = meta_model.predict_proba(meta_features_test)[:, 1]

print(f"Meta-Model OOF: {accuracy_score(y, (oof_meta >= 0.5).astype(int)):.4f}")

# ============================================
# 7) Ensemble with Threshold Optimization
# ============================================
# Weighted ensemble of meta-model and best base models
oof_ensemble = 0.6 * oof_meta + 0.25 * oof_cb + 0.15 * oof_lgb
test_ensemble = 0.6 * test_meta + 0.25 * test_cb + 0.15 * test_lgb

# Find optimal threshold
ths = np.linspace(0.35, 0.65, 61)
best_t, best_acc = 0.5, 0.0
for t in ths:
    acc = accuracy_score(y, (oof_ensemble >= t).astype(int))
    if acc > best_acc:
        best_t, best_acc = t, acc

print(f"\n=== Final Results ===")
print(f"Best OOF Accuracy: {best_acc*100:.2f}% @ threshold={best_t:.3f}")
print(f"AUC-ROC: {roc_auc_score(y, oof_ensemble):.4f}")

# ============================================
# 8) Generate Submission
# ============================================
test_pred = (test_ensemble >= best_t).astype(bool)

out_path = "/kaggle/working/submission.csv"
pd.DataFrame({
    "PassengerId": test_ids, 
    "Transported": test_pred
}).to_csv(out_path, index=False)

print(f"\nSubmission saved to: {out_path}")
print(f"Predicted True: {test_pred.sum()} ({100*test_pred.mean():.1f}%)")
print(f"Predicted False: {(~test_pred).sum()} ({100*(~test_pred).mean():.1f}%)")

Feature matrix shape: (8693, 233)

=== Training Base Models ===
Fold  1 | LGBM: 0.8092 | XGB: 0.8023 | CB: 0.8023 | RF: 0.7897
Fold  2 | LGBM: 0.8103 | XGB: 0.8276 | CB: 0.8333 | RF: 0.8218
Fold  3 | LGBM: 0.8184 | XGB: 0.8092 | CB: 0.8207 | RF: 0.8115
Fold  4 | LGBM: 0.7871 | XGB: 0.7837 | CB: 0.7906 | RF: 0.7710
Fold  5 | LGBM: 0.8170 | XGB: 0.8147 | CB: 0.8124 | RF: 0.8078
Fold  6 | LGBM: 0.7975 | XGB: 0.8124 | CB: 0.8101 | RF: 0.7952
Fold  7 | LGBM: 0.8090 | XGB: 0.8113 | CB: 0.8239 | RF: 0.8021
Fold  8 | LGBM: 0.8113 | XGB: 0.8124 | CB: 0.8205 | RF: 0.8021
Fold  9 | LGBM: 0.8021 | XGB: 0.8021 | CB: 0.8136 | RF: 0.7986
Fold 10 | LGBM: 0.7871 | XGB: 0.7940 | CB: 0.7952 | RF: 0.7940

=== Base Model OOF Scores ===
LGBM: 0.8049
XGB:  0.8070
CB:   0.8123
RF:   0.7994
Meta-Model OOF: 0.8116

=== Final Results ===
Best OOF Accuracy: 81.35% @ threshold=0.460
AUC-ROC: 0.9046

Submission saved to: /kaggle/working/submission.csv
Predicted True: 2259 (52.8%)
Predicted False: 2018 (47.2%)
