In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier

pd.set_option("display.max_columns", 50)

# Load data
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# ----- Feature prep helpers -----
def _split_cabin(x):
    if pd.isna(x):
        return pd.Series({"CabinDeck": np.nan, "CabinNum": np.nan, "CabinSide": np.nan})
    parts = str(x).split("/")
    deck = parts[0] if len(parts) > 0 else np.nan
    num  = pd.to_numeric(parts[1], errors="coerce") if len(parts) > 1 else np.nan
    side = parts[2] if len(parts) > 2 else np.nan
    return pd.Series({"CabinDeck": deck, "CabinNum": num, "CabinSide": side})

def prepare(df):
    df = df.copy()
    # Cabin parsed
    df = pd.concat([df, df["Cabin"].apply(_split_cabin)], axis=1)
    # Total spend
    spend_cols = ["RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"]
    df["TotalSpend"] = df[spend_cols].sum(axis=1, min_count=1)
    # Columns
    drop_cols = ["PassengerId","Name","Cabin","Transported"]
    feat_cols = [c for c in df.columns if c not in drop_cols]
    cat_cols  = [c for c in feat_cols if df[c].dtype == "object" or df[c].dtype == bool]
    return df, feat_cols, cat_cols

# Prep
train_prep, feat_cols, cat_cols = prepare(train)
test_prep,  _,        _        = prepare(test)

X = train_prep[feat_cols]
y = train["Transported"].astype(int)

cat_idx = [X.columns.get_loc(c) for c in cat_cols]

print("X shape:", X.shape, "| test shape:", test_prep[feat_cols].shape)
print("n_features:", len(feat_cols), "| n_categorical:", len(cat_cols))
print("first 3 feature columns:", feat_cols[:3])
X.head(3)


X shape: (8693, 14) | test shape: (4277, 14)
n_features: 14 | n_categorical: 6
first 3 feature columns: ['HomePlanet', 'CryoSleep', 'Destination']


Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CabinDeck,CabinNum,CabinSide,TotalSpend
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,B,0.0,P,0.0
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,F,0.0,S,736.0
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,A,0.0,S,10383.0


In [2]:
# 5-fold Stratified CV with CatBoost
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import numpy as np

# CatBoost needs categorical NaNs converted to strings
X_cv = X.copy()
for c in cat_cols:
    X_cv[c] = X_cv[c].astype("object").fillna("Missing").astype(str)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs, best_ns = [], []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_cv, y), 1):
    X_tr, X_va = X_cv.iloc[tr_idx], X_cv.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    cat_idx = [X_tr.columns.get_loc(c) for c in cat_cols]

    model = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="Accuracy",
        depth=8,
        learning_rate=0.08,
        n_estimators=2000,
        l2_leaf_reg=3,
        random_seed=42,
        verbose=100
    )

    model.fit(
        X_tr, y_tr,
        cat_features=cat_idx,
        eval_set=(X_va, y_va),
        use_best_model=True,
        early_stopping_rounds=200
    )

    pred_va = model.predict(X_va)
    acc = accuracy_score(y_va, pred_va)
    accs.append(acc)
    best_ns.append(model.get_best_iteration() or model.tree_count_)
    print(f"Fold {fold}: acc={acc:.5f}, best_iter={best_ns[-1]}")

print("\nCV Accuracy: mean={:.5f}, std={:.5f}".format(np.mean(accs), np.std(accs)))
print("Best iters per fold:", best_ns)


0:	learn: 0.7773943	test: 0.7734330	best: 0.7734330 (0)	total: 263ms	remaining: 8m 46s
100:	learn: 0.8402358	test: 0.8251869	best: 0.8251869 (100)	total: 13.3s	remaining: 4m 10s
200:	learn: 0.8845269	test: 0.8211616	best: 0.8257619 (102)	total: 26.7s	remaining: 3m 58s
300:	learn: 0.9105551	test: 0.8228867	best: 0.8257619 (102)	total: 40.2s	remaining: 3m 47s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8257619321
bestIteration = 102

Shrink model to first 103 iterations.
Fold 1: acc=0.82576, best_iter=102
0:	learn: 0.7824274	test: 0.7659574	best: 0.7659574 (0)	total: 123ms	remaining: 4m 5s
100:	learn: 0.8457003	test: 0.8056354	best: 0.8085106 (90)	total: 12.7s	remaining: 3m 59s
200:	learn: 0.8805004	test: 0.8050604	best: 0.8096607 (143)	total: 25.7s	remaining: 3m 49s
300:	learn: 0.9020708	test: 0.8050604	best: 0.8096607 (143)	total: 39.6s	remaining: 3m 43s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8096607246
bestIteration = 143

Shrink 

In [3]:
# Refit on ALL data using CV-informed n_estimators (median of best per fold)
import numpy as np
from catboost import CatBoostClassifier
import pandas as pd

best_n = int(np.median(best_ns))  # best_ns came from the CV cell
print("Using n_estimators:", best_n)

# Build full matrices
X_all  = train_prep[feat_cols].copy()
X_test = test_prep[feat_cols].copy()

# Sanitize categoricals for CatBoost
for c in cat_cols:
    X_all[c]  = X_all[c].astype("object").fillna("Missing").astype(str)
    X_test[c] = X_test[c].astype("object").fillna("Missing").astype(str)

cat_idx_all = [X_all.columns.get_loc(c) for c in cat_cols]

model_full_cv = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="Accuracy",
    depth=8,
    learning_rate=0.08,
    n_estimators=best_n,
    l2_leaf_reg=3,
    random_seed=42,
    verbose=False
)
model_full_cv.fit(X_all, y, cat_features=cat_idx_all)

proba = model_full_cv.predict_proba(X_test)[:, 1]
pred_bool = proba >= 0.5

sub_cv = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Transported": pred_bool
})
sub_cv.to_csv("submission_catboost_cv.csv", index=False)
sub_cv.head()


Using n_estimators: 145


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False


In [4]:
# Build group features from PassengerId and rebuild matrices
import pandas as pd
import numpy as np

# counts across train+test (safe: uses IDs only, no target)
all_ids = pd.concat([train["PassengerId"], test["PassengerId"]], ignore_index=True)
gid_counts = all_ids.str.split("_", n=1, expand=True)[0].value_counts()

def prepare2(df):
    df = df.copy()
    # Cabin parsed
    df = pd.concat([df, df["Cabin"].apply(_split_cabin)], axis=1)
    # Total spend
    spend_cols = ["RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"]
    df["TotalSpend"] = df[spend_cols].sum(axis=1, min_count=1)
    # Group features
    gid = df["PassengerId"].str.split("_", n=1, expand=True)[0]
    df["GroupID"] = gid
    df["GroupSize"] = gid.map(gid_counts).astype("Int64")
    df["IsAlone"] = (df["GroupSize"].fillna(1) == 1)
    # Feature lists
    drop_cols = ["PassengerId","Name","Cabin","Transported"]
    feat_cols = [c for c in df.columns if c not in drop_cols]
    cat_cols  = [c for c in feat_cols if df[c].dtype == "object" or df[c].dtype == bool]
    return df, feat_cols, cat_cols

# rebuild with new features
train_prep, feat_cols, cat_cols = prepare2(train)
test_prep,  _,        _        = prepare2(test)
X = train_prep[feat_cols]
y = train["Transported"].astype(int)

print("X shape:", X.shape, "| test shape:", test_prep[feat_cols].shape)
print("now including:", [c for c in ["GroupID","GroupSize","IsAlone"] if c in feat_cols])
X[["GroupID","GroupSize","IsAlone"]].head()


X shape: (8693, 17) | test shape: (4277, 17)
now including: ['GroupID', 'GroupSize', 'IsAlone']


Unnamed: 0,GroupID,GroupSize,IsAlone
0,1,1,True
1,2,1,True
2,3,2,False
3,3,2,False
4,4,1,True


In [5]:
# 5-fold CV with GroupID/GroupSize/IsAlone included
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import numpy as np

X_cv = X.copy()
for c in cat_cols:
    X_cv[c] = X_cv[c].astype("object").fillna("Missing").astype(str)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs, best_ns = [], []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_cv, y), 1):
    X_tr, X_va = X_cv.iloc[tr_idx], X_cv.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
    cat_idx = [X_tr.columns.get_loc(c) for c in cat_cols]

    model = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="Accuracy",
        depth=8,
        learning_rate=0.08,
        n_estimators=2000,
        l2_leaf_reg=3,
        random_seed=42,
        verbose=100
    )
    model.fit(
        X_tr, y_tr,
        cat_features=cat_idx,
        eval_set=(X_va, y_va),
        use_best_model=True,
        early_stopping_rounds=200
    )

    pred_va = model.predict(X_va)
    accs.append(accuracy_score(y_va, pred_va))
    best_ns.append(model.get_best_iteration() or model.tree_count_)
    print(f"Fold {fold}: acc={accs[-1]:.5f}, best_iter={best_ns[-1]}")

print("\nCV Accuracy: mean={:.5f}, std={:.5f}".format(np.mean(accs), np.std(accs)))
print("Best iters per fold:", best_ns)


0:	learn: 0.7342537	test: 0.7441058	best: 0.7441058 (0)	total: 42.6ms	remaining: 1m 25s
100:	learn: 0.8455565	test: 0.8234618	best: 0.8234618 (94)	total: 13.9s	remaining: 4m 21s
200:	learn: 0.8901352	test: 0.8177113	best: 0.8234618 (94)	total: 28.1s	remaining: 4m 11s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8234617596
bestIteration = 94

Shrink model to first 95 iterations.
Fold 1: acc=0.82346, best_iter=94
0:	learn: 0.7496405	test: 0.7360552	best: 0.7360552 (0)	total: 128ms	remaining: 4m 16s
100:	learn: 0.8498706	test: 0.8050604	best: 0.8050604 (97)	total: 13.7s	remaining: 4m 16s
200:	learn: 0.8861087	test: 0.8079356	best: 0.8119609 (172)	total: 27.8s	remaining: 4m 9s
300:	learn: 0.9114179	test: 0.8073606	best: 0.8119609 (172)	total: 41.8s	remaining: 3m 55s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8119608971
bestIteration = 172

Shrink model to first 173 iterations.
Fold 2: acc=0.81196, best_iter=172
0:	learn: 0.7660339	test: 0.7

In [6]:
# Refit using median best iteration from CV and export submission
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier

best_n = int(np.median(best_ns))  # e.g., from [94,172,201,255,255] -> 201
print("Using n_estimators:", best_n)

X_all  = train_prep[feat_cols].copy()
X_test = test_prep[feat_cols].copy()

for c in cat_cols:
    X_all[c]  = X_all[c].astype("object").fillna("Missing").astype(str)
    X_test[c] = X_test[c].astype("object").fillna("Missing").astype(str)

cat_idx_all = [X_all.columns.get_loc(c) for c in cat_cols]

model_full_groups = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="Accuracy",
    depth=8,
    learning_rate=0.08,
    n_estimators=best_n,
    l2_leaf_reg=3,
    random_seed=42,
    verbose=False
)
model_full_groups.fit(X_all, y, cat_features=cat_idx_all)

proba = model_full_groups.predict_proba(X_test)[:, 1]
pred_bool = proba >= 0.5

sub_groups = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Transported": pred_bool
})
sub_groups.to_csv("submission_catboost_groups_cv.csv", index=False)
sub_groups.head()


Using n_estimators: 201


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [7]:
# Find best probability threshold via 5-fold CV (uses best_ns from last CV)
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier

# 1) Build CV matrices with sanitized categoricals
X_cv = X.copy()
for c in cat_cols:
    X_cv[c] = X_cv[c].astype("object").fillna("Missing").astype(str)

probas_val = np.zeros(len(X))
y_true = y.values.copy()

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for i, (tr_idx, va_idx) in enumerate(skf.split(X_cv, y), 0):
    X_tr, X_va = X_cv.iloc[tr_idx], X_cv.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
    cat_idx = [X_tr.columns.get_loc(c) for c in cat_cols]
    n_estimators = best_ns[i] if 'best_ns' in globals() and len(best_ns)==5 else 200

    m = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="Accuracy",
        depth=8,
        learning_rate=0.08,
        n_estimators=n_estimators,
        l2_leaf_reg=3,
        random_seed=42,
        verbose=False
    )
    m.fit(X_tr, y_tr, cat_features=cat_idx)
    probas_val[va_idx] = m.predict_proba(X_va)[:, 1]

# 2) Grid search threshold for max accuracy
grid = np.linspace(0.35, 0.65, 61)  # 0.35..0.65 step 0.005
acc_05 = accuracy_score(y_true, (probas_val >= 0.5).astype(int))
best_t, best_acc = 0.5, acc_05
for t in grid:
    acc = accuracy_score(y_true, (probas_val >= t).astype(int))
    if acc > best_acc:
        best_t, best_acc = float(t), acc

print(f"Best threshold: {best_t:.3f}")
print(f"CV acc at best_t: {best_acc:.5f}")
print(f"CV acc at 0.500: {acc_05:.5f}")


Best threshold: 0.500
CV acc at best_t: 0.81353
CV acc at 0.500: 0.81353


In [8]:
# Add AgeBin, LogTotalSpend, AnySpend and rebuild features
import pandas as pd
import numpy as np

def prepare3(df):
    df = df.copy()
    # Cabin parsed
    df = pd.concat([df, df["Cabin"].apply(_split_cabin)], axis=1)
    # Spend features
    spend_cols = ["RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"]
    df["TotalSpend"] = df[spend_cols].sum(axis=1, min_count=1)
    df["LogTotalSpend"] = np.log1p(df["TotalSpend"])
    df["AnySpend"] = df["TotalSpend"].fillna(0) > 0
    # Group features (from earlier)
    gid_all = pd.concat([train["PassengerId"], test["PassengerId"]], ignore_index=True).str.split("_", n=1, expand=True)[0]
    gid_counts = gid_all.value_counts()
    gid = df["PassengerId"].str.split("_", n=1, expand=True)[0]
    df["GroupID"] = gid
    df["GroupSize"] = gid.map(gid_counts).astype("Int64")
    df["IsAlone"] = (df["GroupSize"].fillna(1) == 1)
    # Age bin (categorical)
    bins = [0, 12, 18, 25, 40, 60, 120]
    labels = ["child","teen","youngA","adult","mid","senior"]
    df["AgeBin"] = pd.cut(df["Age"], bins=bins, labels=labels, right=True, include_lowest=True)
    # Feature lists
    drop_cols = ["PassengerId","Name","Cabin","Transported"]
    feat_cols = [c for c in df.columns if c not in drop_cols]
    cat_cols  = [c for c in feat_cols if df[c].dtype == "object" or df[c].dtype == bool]
    return df, feat_cols, cat_cols

# rebuild
train_prep, feat_cols, cat_cols = prepare3(train)
test_prep,  _,        _        = prepare3(test)
X = train_prep[feat_cols]
y = train["Transported"].astype(int)

print("X shape:", X.shape, "| test shape:", test_prep[feat_cols].shape)
print("new features present:", [c for c in ["LogTotalSpend","AnySpend","AgeBin"] if c in feat_cols])


X shape: (8693, 20) | test shape: (4277, 20)
new features present: ['LogTotalSpend', 'AnySpend', 'AgeBin']


In [10]:
# compute cat_cols to include pandas 'category' dtype
def recompute_cat_cols(df, feature_columns):
    cats = []
    for c in feature_columns:
        dt = df[c].dtype
        if (dt == 'object') or (dt == bool) or (str(dt) == 'category'):
            cats.append(c)
    return cats

cat_cols = recompute_cat_cols(train_prep, feat_cols)
print("n_categorical:", len(cat_cols), "| includes AgeBin?", "AgeBin" in cat_cols)


n_categorical: 9 | includes AgeBin? True


In [11]:
# 5-fold CV after adding LogTotalSpend / AnySpend / AgeBin
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import numpy as np

X_cv = X.copy()
for c in cat_cols:
    X_cv[c] = X_cv[c].astype("object").fillna("Missing").astype(str)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs, best_ns = [], []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_cv, y), 1):
    X_tr, X_va = X_cv.iloc[tr_idx], X_cv.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
    cat_idx = [X_tr.columns.get_loc(c) for c in cat_cols]

    model = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="Accuracy",
        depth=8,
        learning_rate=0.08,
        n_estimators=2000,
        l2_leaf_reg=3,
        random_seed=42,
        verbose=100
    )
    model.fit(
        X_tr, y_tr,
        cat_features=cat_idx,
        eval_set=(X_va, y_va),
        use_best_model=True,
        early_stopping_rounds=200
    )

    pred_va = model.predict(X_va)
    accs.append(accuracy_score(y_va, pred_va))
    best_ns.append(model.get_best_iteration() or model.tree_count_)
    print(f"Fold {fold}: acc={accs[-1]:.5f}, best_iter={best_ns[-1]}")

print("\nCV Accuracy: mean={:.5f}, std={:.5f}".format(np.mean(accs), np.std(accs)))
print("Best iters per fold:", best_ns)


  X_cv[c] = X_cv[c].astype("object").fillna("Missing").astype(str)


0:	learn: 0.7349727	test: 0.7458309	best: 0.7458309 (0)	total: 121ms	remaining: 4m 2s
100:	learn: 0.8458441	test: 0.8165612	best: 0.8228867 (75)	total: 13.7s	remaining: 4m 16s
200:	learn: 0.8886972	test: 0.8205865	best: 0.8246118 (169)	total: 28.2s	remaining: 4m 12s
300:	learn: 0.9137187	test: 0.8177113	best: 0.8251869 (244)	total: 42.8s	remaining: 4m 1s
400:	learn: 0.9318378	test: 0.8234618	best: 0.8251869 (244)	total: 57.2s	remaining: 3m 48s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.825186889
bestIteration = 244

Shrink model to first 245 iterations.
Fold 1: acc=0.82519, best_iter=244
0:	learn: 0.7382801	test: 0.7326049	best: 0.7326049 (0)	total: 76ms	remaining: 2m 31s
100:	learn: 0.8518838	test: 0.8033353	best: 0.8079356 (66)	total: 14.4s	remaining: 4m 29s
200:	learn: 0.8912856	test: 0.8056354	best: 0.8079356 (66)	total: 29s	remaining: 4m 19s
300:	learn: 0.9108427	test: 0.8073606	best: 0.8102358 (240)	total: 43.7s	remaining: 4m 6s
400:	learn: 0.9292494	tes

In [12]:
# Refit on ALL data using median(best iters) from the last CV and save submission
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier

best_n = int(np.median(best_ns))  # uses the best_ns from your last CV cell
print("Using n_estimators:", best_n)

X_all  = train_prep[feat_cols].copy()
X_test = test_prep[feat_cols].copy()

# Sanitize categoricals
for c in cat_cols:
    X_all[c]  = X_all[c].astype("object").fillna("Missing").astype(str)
    X_test[c] = X_test[c].astype("object").fillna("Missing").astype(str)

cat_idx_all = [X_all.columns.get_loc(c) for c in cat_cols]

model_full_bins = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="Accuracy",
    depth=8,
    learning_rate=0.08,
    n_estimators=best_n,
    l2_leaf_reg=3,
    random_seed=42,
    verbose=False
)
model_full_bins.fit(X_all, y, cat_features=cat_idx_all)

proba = model_full_bins.predict_proba(X_test)[:, 1]
pred_bool = proba >= 0.5

sub_bins = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Transported": pred_bool
})
sub_bins.to_csv("submission_catboost_bins_groups.csv", index=False)
sub_bins.head()


Using n_estimators: 340


  X_all[c]  = X_all[c].astype("object").fillna("Missing").astype(str)
  X_test[c] = X_test[c].astype("object").fillna("Missing").astype(str)


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [13]:
# Show top features from the most recently trained full model
import pandas as pd

# pick whichever model exists
model_candidate = None
for name in ["model_full_bins", "model_full_groups", "model_full_cv", "model_full"]:
    if name in globals():
        model_candidate = globals()[name]
        break
assert model_candidate is not None, "No trained full model found; run the latest fit cell first."

# get feature names safely
try:
    feature_names = list(X_all.columns)
except NameError:
    feature_names = list(train_prep[feat_cols].columns)

imp = model_candidate.get_feature_importance(type="FeatureImportance")
fi = pd.DataFrame({"feature": feature_names, "importance": imp}).sort_values("importance", ascending=False)
fi.head(20)


Unnamed: 0,feature,importance
11,CabinNum,12.182749
10,CabinDeck,8.891193
9,VRDeck,7.898089
6,FoodCourt,7.685353
8,Spa,7.416362
14,LogTotalSpend,6.850397
0,HomePlanet,6.524278
7,ShoppingMall,5.9647
12,CabinSide,5.793055
1,CryoSleep,5.719798


In [14]:
# OOF target mean encoding for GroupID + 5-fold CV
import pandas as pd, numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier

# 1) OOF target-encoding for GroupID (prevents leakage)
global_mean = y.mean()
train_gid = train_prep["GroupID"].astype(str)
test_gid  = test_prep["GroupID"].astype(str)

oof_te = pd.Series(np.nan, index=train.index, dtype=float)
skf_enc = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for tr_idx, va_idx in skf_enc.split(train_gid, y):
    gmeans = pd.DataFrame({"gid": train_gid.iloc[tr_idx], "y": y.iloc[tr_idx]}).groupby("gid")["y"].mean()
    oof_te.iloc[va_idx] = train_gid.iloc[va_idx].map(gmeans)

oof_te.fillna(global_mean, inplace=True)
gmeans_full = pd.DataFrame({"gid": train_gid, "y": y}).groupby("gid")["y"].mean()
test_te = test_gid.map(gmeans_full).fillna(global_mean)

train_prep["GroupID_TE"] = oof_te
test_prep["GroupID_TE"]  = test_te

# 2) Rebuild matrices
drop_cols = ["PassengerId","Name","Cabin","Transported"]
feat_cols = [c for c in train_prep.columns if c not in drop_cols]

def recompute_cat_cols(df, feature_columns):
    cats = []
    for c in feature_columns:
        dt = df[c].dtype
        if (dt == "object") or (dt == bool) or (str(dt) == "category"):
            cats.append(c)
    return cats

cat_cols = recompute_cat_cols(train_prep, feat_cols)

X = train_prep[feat_cols]
X_test = test_prep[feat_cols]

# 3) 5-fold CV with CatBoost
X_cv = X.copy()
for c in cat_cols:
    X_cv[c] = X_cv[c].astype("object").fillna("Missing").astype(str)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs, best_ns = [], []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_cv, y), 1):
    X_tr, X_va = X_cv.iloc[tr_idx], X_cv.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
    cat_idx = [X_tr.columns.get_loc(c) for c in cat_cols]

    model = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="Accuracy",
        depth=8,
        learning_rate=0.08,
        n_estimators=2000,
        l2_leaf_reg=3,
        random_seed=42,
        verbose=100
    )
    model.fit(
        X_tr, y_tr,
        cat_features=cat_idx,
        eval_set=(X_va, y_va),
        use_best_model=True,
        early_stopping_rounds=200
    )

    pred_va = model.predict(X_va)
    accs.append(accuracy_score(y_va, pred_va))
    best_ns.append(model.get_best_iteration() or model.tree_count_)
    print(f"Fold {fold}: acc={accs[-1]:.5f}, best_iter={best_ns[-1]}")

print("\nCV Accuracy with GroupID_TE: mean={:.5f}, std={:.5f}".format(np.mean(accs), np.std(accs)))
print("Best iters per fold:", best_ns)


  X_cv[c] = X_cv[c].astype("object").fillna("Missing").astype(str)


0:	learn: 0.7424504	test: 0.7567568	best: 0.7567568 (0)	total: 152ms	remaining: 5m 4s
100:	learn: 0.8528904	test: 0.8194365	best: 0.8217366 (91)	total: 14.7s	remaining: 4m 36s
200:	learn: 0.8990509	test: 0.8217366	best: 0.8234618 (193)	total: 29.9s	remaining: 4m 27s
300:	learn: 0.9275237	test: 0.8217366	best: 0.8240368 (261)	total: 45.3s	remaining: 4m 15s
400:	learn: 0.9459304	test: 0.8200115	best: 0.8240368 (261)	total: 1m	remaining: 4m 1s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8240368028
bestIteration = 261

Shrink model to first 262 iterations.
Fold 1: acc=0.82404, best_iter=261
0:	learn: 0.7627265	test: 0.7498562	best: 0.7498562 (0)	total: 137ms	remaining: 4m 33s
100:	learn: 0.8629566	test: 0.8033353	best: 0.8073606 (84)	total: 15s	remaining: 4m 42s
200:	learn: 0.8953121	test: 0.8131110	best: 0.8131110 (180)	total: 30.4s	remaining: 4m 32s
300:	learn: 0.9211964	test: 0.8154112	best: 0.8177113 (271)	total: 46s	remaining: 4m 19s
400:	learn: 0.9381651	test

In [15]:
# Refit using CV-informed trees and GroupID target-encoding; save submission
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier

best_n = int(np.median(best_ns))   # e.g., median of [261,380,397,233,267] -> 267
print("Using n_estimators:", best_n)

X_all  = train_prep[feat_cols].copy()
X_test = test_prep[feat_cols].copy()

# Sanitize categoricals for CatBoost
for c in cat_cols:
    X_all[c]  = X_all[c].astype("object").fillna("Missing").astype(str)
    X_test[c] = X_test[c].astype("object").fillna("Missing").astype(str)

cat_idx_all = [X_all.columns.get_loc(c) for c in cat_cols]

model_full_te = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="Accuracy",
    depth=8,
    learning_rate=0.08,
    n_estimators=best_n,
    l2_leaf_reg=3,
    random_seed=42,
    verbose=False
)
model_full_te.fit(X_all, y, cat_features=cat_idx_all)

proba = model_full_te.predict_proba(X_test)[:, 1]
pred_bool = proba >= 0.5

sub_te = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Transported": pred_bool
})
sub_te.to_csv("submission_catboost_te.csv", index=False)
sub_te.head()


Using n_estimators: 267


  X_all[c]  = X_all[c].astype("object").fillna("Missing").astype(str)
  X_test[c] = X_test[c].astype("object").fillna("Missing").astype(str)


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [16]:
# LightGBM 5-fold CV on the same features, then blend with CatBoost (model_full_te) and export
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import lightgbm as lgb

# 1) Build LGBM-ready matrices (categoricals as 'category' dtype)
X_lgb = train_prep[feat_cols].copy()
T_lgb = test_prep[feat_cols].copy()
for c in cat_cols:
    X_lgb[c] = X_lgb[c].astype("category")
    T_lgb[c] = T_lgb[c].astype("category")

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs_lgb, best_iters = [], []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_lgb, y), 1):
    X_tr, X_va = X_lgb.iloc[tr_idx], X_lgb.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    m = lgb.LGBMClassifier(
        objective="binary",
        learning_rate=0.06,
        num_leaves=64,
        feature_fraction=0.9,
        bagging_fraction=0.9,
        bagging_freq=1,
        n_estimators=3000,
        random_state=42
    )
    m.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric="binary_logloss",
        callbacks=[lgb.early_stopping(200, verbose=False)]
    )
    best_iters.append(m.best_iteration_)
    preds = (m.predict_proba(X_va, num_iteration=m.best_iteration_)[:,1] >= 0.5).astype(int)
    acc = accuracy_score(y_va, preds)
    accs_lgb.append(acc)
    print(f"Fold {fold}: acc={acc:.5f}, best_iter={m.best_iteration_}")

print("\nLGBM CV: mean={:.5f}, std={:.5f}".format(np.mean(accs_lgb), np.std(accs_lgb)))
best_n_lgb = int(np.median(best_iters))
print("Using n_estimators for full LGBM:", best_n_lgb)

# 2) Refit LGBM on ALL data
m_lgb_full = lgb.LGBMClassifier(
    objective="binary",
    learning_rate=0.06,
    num_leaves=64,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=1,
    n_estimators=best_n_lgb,
    random_state=42
)
m_lgb_full.fit(X_lgb, y)
proba_lgb = m_lgb_full.predict_proba(T_lgb)[:,1]

# 3) Get CatBoost test probabilities from the latest full model (model_full_te if present, else fall back)
def cat_proba_on_test():
    # build CatBoost-style matrices (categoricals as strings, NaNs -> "Missing")
    X_test_cat = test_prep[feat_cols].copy()
    for c in cat_cols:
        X_test_cat[c] = X_test_cat[c].astype("object").fillna("Missing").astype(str)
    # prefer model_full_te; fall back to model_full_bins/model_full_groups/model_full_cv/model_full
    for name in ["model_full_te", "model_full_bins", "model_full_groups", "model_full_cv", "model_full"]:
        if name in globals():
            return globals()[name].predict_proba(X_test_cat)[:,1]
    raise RuntimeError("No CatBoost full model found. Re-run a full CatBoost fit cell.")

proba_cat = cat_proba_on_test()

# 4) Simple 50/50 blend and export
proba_blend = 0.5 * proba_cat + 0.5 * proba_lgb
pred_bool = proba_blend >= 0.5

sub_blend = pd.DataFrame({"PassengerId": test["PassengerId"], "Transported": pred_bool})
sub_blend.to_csv("submission_blend_cat_lgb.csv", index=False)
sub_blend.head()


[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002374 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2559
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503595 -> initscore=0.014380
[LightGBM] [Info] Start training from score 0.014380
Fold 1: acc=0.82174, best_iter=99
[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2565
[LightGBM] [Info] Number of data points in the train set: 6954, number of used featu

  X_test_cat[c] = X_test_cat[c].astype("object").fillna("Missing").astype(str)


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [17]:
# Optimize blend weight alpha for CatBoost + LightGBM using 5-fold OOF predictions
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from catboost import CatBoostClassifier

y_arr = y.values

# 1) Build CatBoost-friendly and LGB-friendly matrices
X_cb = train_prep[feat_cols].copy()
T_cb = test_prep[feat_cols].copy()
for c in cat_cols:
    X_cb[c] = X_cb[c].astype("object").fillna("Missing").astype(str)
    T_cb[c] = T_cb[c].astype("object").fillna("Missing").astype(str)

X_lgb = train_prep[feat_cols].copy()
T_lgb = test_prep[feat_cols].copy()
for c in cat_cols:
    X_lgb[c] = X_lgb[c].astype("category")
    T_lgb[c] = T_lgb[c].astype("category")

# 2) OOF probabilities for both models
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_cat = np.zeros(len(X_cb))
oof_lgb = np.zeros(len(X_lgb))

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_cb, y_arr), 1):
    # CatBoost
    cat_idx = [X_cb.columns.get_loc(c) for c in cat_cols]
    m_cb = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="Accuracy",
        depth=8,
        learning_rate=0.08,
        n_estimators=2000,
        l2_leaf_reg=3,
        random_seed=42,
        verbose=False
    )
    m_cb.fit(X_cb.iloc[tr_idx], y_arr[tr_idx], cat_features=cat_idx,
             eval_set=(X_cb.iloc[va_idx], y_arr[va_idx]), use_best_model=True, early_stopping_rounds=200)
    oof_cat[va_idx] = m_cb.predict_proba(X_cb.iloc[va_idx])[:,1]

    # LightGBM
    m_lgb = lgb.LGBMClassifier(
        objective="binary",
        learning_rate=0.06,
        num_leaves=64,
        feature_fraction=0.9,
        bagging_fraction=0.9,
        bagging_freq=1,
        n_estimators=3000,
        random_state=42
    )
    m_lgb.fit(X_lgb.iloc[tr_idx], y_arr[tr_idx],
              eval_set=[(X_lgb.iloc[va_idx], y_arr[va_idx])],
              eval_metric="binary_logloss",
              callbacks=[lgb.early_stopping(200, verbose=False)])
    oof_lgb[va_idx] = m_lgb.predict_proba(X_lgb.iloc[va_idx], num_iteration=m_lgb.best_iteration_)[:,1]

# 3) Search alpha in [0.30 .. 0.70] to maximize CV accuracy
alphas = np.linspace(0.30, 0.70, 41)  # step 0.01
best_alpha, best_acc = 0.50, accuracy_score(y_arr, (0.5*oof_cat + 0.5*oof_lgb >= 0.5).astype(int))
for a in alphas:
    acc = accuracy_score(y_arr, ((a*oof_cat + (1-a)*oof_lgb) >= 0.5).astype(int))
    if acc > best_acc:
        best_alpha, best_acc = float(a), acc

print(f"Best alpha (CatBoost weight): {best_alpha:.2f}")
print(f"OOF accuracy at best alpha: {best_acc:.5f}")


  X_cb[c] = X_cb[c].astype("object").fillna("Missing").astype(str)
  T_cb[c] = T_cb[c].astype("object").fillna("Missing").astype(str)


[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001495 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2559
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503595 -> initscore=0.014380
[LightGBM] [Info] Start training from score 0.014380
[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001472 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2565
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 21
[LightGBM] [Info] [binary:

In [18]:
# Use alpha = 0.68 (CatBoost weight) to blend CatBoost + LightGBM and export submission
import numpy as np, pandas as pd
import lightgbm as lgb

alpha = 0.68  # from the OOF search

# Ensure CatBoost-style test matrix exists
try:
    T_cb
except NameError:
    T_cb = test_prep[feat_cols].copy()
    for c in cat_cols:
        T_cb[c] = T_cb[c].astype("object").fillna("Missing").astype(str)

# Ensure LGBM-style test matrix exists
try:
    T_lgb
except NameError:
    T_lgb = test_prep[feat_cols].copy()
    for c in cat_cols:
        T_lgb[c] = T_lgb[c].astype("category")

# 1) CatBoost probabilities (use the best full model we trained)
assert "model_full_te" in globals(), "Run the cell that fits model_full_te first."
proba_cat = model_full_te.predict_proba(T_cb)[:, 1]

# 2) LightGBM full model — reuse if available, otherwise fit quickly
if "m_lgb_full" not in globals():
    # quick CV to pick a sensible n_estimators, then refit full
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    X_lgb = train_prep[feat_cols].copy()
    for c in cat_cols:
        X_lgb[c] = X_lgb[c].astype("category")
    y_arr = y.values
    iters = []
    for tr, va in skf.split(X_lgb, y_arr):
        m = lgb.LGBMClassifier(
            objective="binary",
            learning_rate=0.06,
            num_leaves=64,
            feature_fraction=0.9,
            bagging_fraction=0.9,
            bagging_freq=1,
            n_estimators=3000,
            random_state=42
        )
        m.fit(X_lgb.iloc[tr], y_arr[tr],
              eval_set=[(X_lgb.iloc[va], y_arr[va])],
              eval_metric="binary_logloss",
              callbacks=[lgb.early_stopping(200, verbose=False)])
        iters.append(m.best_iteration_ or 300)
    best_n_lgb = int(np.median(iters))
    m_lgb_full = lgb.LGBMClassifier(
        objective="binary",
        learning_rate=0.06,
        num_leaves=64,
        feature_fraction=0.9,
        bagging_fraction=0.9,
        bagging_freq=1,
        n_estimators=best_n_lgb,
        random_state=42
    )
    m_lgb_full.fit(train_prep[feat_cols].astype({c:"category" for c in cat_cols}), y)

proba_lgb = m_lgb_full.predict_proba(T_lgb)[:, 1]

# 3) Alpha-weighted blend and save
proba_blend = alpha * proba_cat + (1 - alpha) * proba_lgb
pred_bool = proba_blend >= 0.5

sub_alpha = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Transported": pred_bool
})
fname = f"submission_blend_alpha_{int(alpha*100):02d}.csv"  # e.g., 68 -> submission_blend_alpha_68.csv
sub_alpha.to_csv(fname, index=False)
print("Saved:", fname)
sub_alpha.head()


Saved: submission_blend_alpha_68.csv


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [19]:
# Stacking: train a LogisticRegression meta-model on OOF probs of CatBoost + LightGBM,
# then predict on test and write a submission.

import numpy as np, pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from catboost import CatBoostClassifier

y_arr = y.values

# --- Ensure we have OOF probabilities for both models ---
need_oof = ('oof_cat' not in globals()) or ('oof_lgb' not in globals())

if need_oof:
    print("Recomputing OOF predictions for stacking...")
    # CatBoost-friendly matrices
    X_cb = train_prep[feat_cols].copy()
    for c in cat_cols:
        X_cb[c] = X_cb[c].astype("object").fillna("Missing").astype(str)
    # LightGBM-friendly matrices
    X_lgb = train_prep[feat_cols].copy()
    for c in cat_cols:
        X_lgb[c] = X_lgb[c].astype("category")

    oof_cat = np.zeros(len(X_cb))
    oof_lgb = np.zeros(len(X_lgb))
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for tr_idx, va_idx in skf.split(X_cb, y_arr):
        # CatBoost fold
        cat_idx = [X_cb.columns.get_loc(c) for c in cat_cols]
        m_cb = CatBoostClassifier(
            loss_function="Logloss",
            eval_metric="Accuracy",
            depth=8,
            learning_rate=0.08,
            n_estimators=2000,
            l2_leaf_reg=3,
            random_seed=42,
            verbose=False
        )
        m_cb.fit(X_cb.iloc[tr_idx], y_arr[tr_idx], cat_features=cat_idx,
                 eval_set=(X_cb.iloc[va_idx], y_arr[va_idx]),
                 use_best_model=True, early_stopping_rounds=200)
        oof_cat[va_idx] = m_cb.predict_proba(X_cb.iloc[va_idx])[:,1]

        # LightGBM fold
        m_lgb = lgb.LGBMClassifier(
            objective="binary",
            learning_rate=0.06,
            num_leaves=64,
            feature_fraction=0.9,
            bagging_fraction=0.9,
            bagging_freq=1,
            n_estimators=3000,
            random_state=42
        )
        m_lgb.fit(X_lgb.iloc[tr_idx], y_arr[tr_idx],
                  eval_set=[(X_lgb.iloc[va_idx], y_arr[va_idx])],
                  eval_metric="binary_logloss",
                  callbacks=[lgb.early_stopping(200, verbose=False)])
        oof_lgb[va_idx] = m_lgb.predict_proba(X_lgb.iloc[va_idx], num_iteration=m_lgb.best_iteration_)[:,1]

# --- Train meta-model on OOF probs ---
X_meta = np.column_stack([oof_cat, oof_lgb])
meta = LogisticRegression(max_iter=1000)
meta.fit(X_meta, y_arr)
oof_meta = meta.predict_proba(X_meta)[:,1]
print("Stack OOF acc:", accuracy_score(y_arr, (oof_meta >= 0.5).astype(int)))
print("Meta weights (CatBoost, LightGBM):", meta.coef_.ravel(), "intercept:", meta.intercept_)

# --- Prepare test probs from our fitted full models ---
# CatBoost test probs
if 'model_full_te' in globals():
    T_cb = test_prep[feat_cols].copy()
    for c in cat_cols:
        T_cb[c] = T_cb[c].astype("object").fillna("Missing").astype(str)
    P_cat = model_full_te.predict_proba(T_cb)[:,1]
else:
    raise RuntimeError("Run the cell that fits model_full_te first.")

# LightGBM test probs (reuse m_lgb_full if we already trained it)
if 'm_lgb_full' in globals():
    T_lgb = test_prep[feat_cols].copy()
    for c in cat_cols:
        T_lgb[c] = T_lgb[c].astype("category")
    P_lgb = m_lgb_full.predict_proba(T_lgb)[:,1]
else:
    # quick fit if not present
    T_lgb = test_prep[feat_cols].copy()
    for c in cat_cols:
        T_lgb[c] = T_lgb[c].astype("category")
    m_lgb_full = lgb.LGBMClassifier(
        objective="binary", learning_rate=0.06, num_leaves=64,
        feature_fraction=0.9, bagging_fraction=0.9, bagging_freq=1,
        n_estimators=300, random_state=42
    )
    m_lgb_full.fit(train_prep[feat_cols].astype({c:"category" for c in cat_cols}), y)
    P_lgb = m_lgb_full.predict_proba(T_lgb)[:,1]

# --- Meta predictions on test ---
X_test_meta = np.column_stack([P_cat, P_lgb])
proba_stack = meta.predict_proba(X_test_meta)[:,1]
pred_bool = proba_stack >= 0.5

sub_stack = pd.DataFrame({"PassengerId": test["PassengerId"], "Transported": pred_bool})
sub_stack.to_csv("submission_stack_lr_cat_lgb.csv", index=False)
sub_stack.head()


Stack OOF acc: 0.8238812837915565
Meta weights (CatBoost, LightGBM): [4.16347131 1.88300188] intercept: [-2.93603347]


  T_cb[c] = T_cb[c].astype("object").fillna("Missing").astype(str)


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [20]:
# === Adding a few light interaction features ===
import pandas as pd
def add_interactions(df):
    # 1) Deck+Side combo (captures location on ship)
    df["DeckSide"] = (df["CabinDeck"].astype(str) + "_" + df["CabinSide"].astype(str)).astype("category")
    # 2) HomePlanet + Destination combo (routes)
    df["HP_Dest"] = (df["HomePlanet"].astype(str) + "_" + df["Destination"].astype(str)).astype("category")
    # 3) Cabin number quantile bin (coarse location bucket)
    df["CabinNumBin"] = pd.qcut(df["CabinNum"], q=10, labels=False, duplicates="drop")
    df["CabinNumBin"] = df["CabinNumBin"].astype("category")
    return df

train_prep = add_interactions(train_prep)
test_prep  = add_interactions(test_prep)

# === Rebuild feature list & categorical list ===
drop_cols = ["PassengerId","Name","Cabin","Transported"]
feat_cols = [c for c in train_prep.columns if c not in drop_cols]

def recompute_cat_cols(df, feature_columns):
    cats = []
    for c in feature_columns:
        dt = df[c].dtype
        if (dt == "object") or (dt == bool) or (str(dt) == "category"):
            cats.append(c)
    return cats

cat_cols = recompute_cat_cols(train_prep, feat_cols)

# === 5-fold CV with CatBoost (same params as before) ===
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import numpy as np

X = train_prep[feat_cols].copy()
for c in cat_cols:
    X[c] = X[c].astype("object").fillna("Missing").astype(str)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs, best_ns = [], []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
    cat_idx = [X_tr.columns.get_loc(c) for c in cat_cols]

    model = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="Accuracy",
        depth=8,
        learning_rate=0.08,
        n_estimators=2000,
        l2_leaf_reg=3,
        random_seed=42,
        verbose=100
    )
    model.fit(
        X_tr, y_tr,
        cat_features=cat_idx,
        eval_set=(X_va, y_va),
        use_best_model=True,
        early_stopping_rounds=200
    )


  X[c] = X[c].astype("object").fillna("Missing").astype(str)


0:	learn: 0.7532355	test: 0.7579068	best: 0.7579068 (0)	total: 166ms	remaining: 5m 32s
100:	learn: 0.8530342	test: 0.8240368	best: 0.8251869 (99)	total: 15.6s	remaining: 4m 53s
200:	learn: 0.9062410	test: 0.8280621	best: 0.8332375 (170)	total: 31.7s	remaining: 4m 44s
300:	learn: 0.9351452	test: 0.8315124	best: 0.8332375 (170)	total: 48s	remaining: 4m 31s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8332374928
bestIteration = 170

Shrink model to first 171 iterations.
0:	learn: 0.7704918	test: 0.7521564	best: 0.7521564 (0)	total: 193ms	remaining: 6m 25s
100:	learn: 0.8661202	test: 0.8102358	best: 0.8131110 (97)	total: 16.3s	remaining: 5m 5s
200:	learn: 0.9035088	test: 0.8182864	best: 0.8194365 (197)	total: 32.3s	remaining: 4m 49s
300:	learn: 0.9279551	test: 0.8148361	best: 0.8194365 (197)	total: 48.4s	remaining: 4m 33s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8194364577
bestIteration = 197

Shrink model to first 198 iterations.
0:	lear

In [21]:
# Refit CatBoost on ALL data using the new interactions (DeckSide, HP_Dest, CabinNumBin)
# Use the CV-derived best number of trees
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier

best_n = int(np.median(best_ns))  # use the median of the best iterations from the CV you just ran
print("Using n_estimators:", best_n)

# Build full train/test matrices and sanitize categoricals for CatBoost
X_all  = train_prep[feat_cols].copy()
X_test = test_prep[feat_cols].copy()
for c in cat_cols:
    X_all[c]  = X_all[c].astype("object").fillna("Missing").astype(str)
    X_test[c] = X_test[c].astype("object").fillna("Missing").astype(str)

cat_idx_all = [X_all.columns.get_loc(c) for c in cat_cols]

model_full_inter = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="Accuracy",
    depth=8,
    learning_rate=0.08,
    n_estimators=best_n,
    l2_leaf_reg=3,
    random_seed=42,
    verbose=False
)
model_full_inter.fit(X_all, y, cat_features=cat_idx_all)

proba = model_full_inter.predict_proba(X_test)[:, 1]
pred_bool = proba >= 0.5

sub_inter = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Transported": pred_bool
})
sub_inter.to_csv("submission_catboost_interactions.csv", index=False)
sub_inter.head()


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


ValueError: cannot convert float NaN to integer

In [22]:
# Refit CatBoost on ALL data with interaction features, using CV median if available,
# otherwise fall back to a solid default.
import numpy as np, pandas as pd
from catboost import CatBoostClassifier

# 1) pick n_estimators safely
fallback_best_n = 170  # good default based on your recent CV logs
if 'best_ns' in globals() and isinstance(best_ns, (list, tuple)) and len(best_ns) > 0:
    med = float(np.nanmedian(best_ns))
    best_n = int(med) if np.isfinite(med) else fallback_best_n
else:
    best_n = fallback_best_n

print("Using n_estimators:", best_n)

# 2) build matrices
X_all  = train_prep[feat_cols].copy()
X_test = test_prep[feat_cols].copy()
cat_cols_current = []
for c in feat_cols:
    dt = X_all[c].dtype
    if (dt == "object") or (dt == bool) or (str(dt) == "category"):
        cat_cols_current.append(c)

for c in cat_cols_current:
    X_all[c]  = X_all[c].astype("object").fillna("Missing").astype(str)
    X_test[c] = X_test[c].astype("object").fillna("Missing").astype(str)

cat_idx_all = [X_all.columns.get_loc(c) for c in cat_cols_current]

# 3) fit full model
model_full_inter = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="Accuracy",
    depth=8,
    learning_rate=0.08,
    n_estimators=best_n,
    l2_leaf_reg=3,
    random_seed=42,
    verbose=False
)
model_full_inter.fit(X_all, y, cat_features=cat_idx_all)

# 4) predict & save
proba = model_full_inter.predict_proba(X_test)[:, 1]
pred_bool = proba >= 0.5

sub_inter = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Transported": pred_bool
})
sub_inter.to_csv("submission_catboost_interactions.csv", index=False)
sub_inter.head()


Using n_estimators: 170


  X_all[c]  = X_all[c].astype("object").fillna("Missing").astype(str)
  X_test[c] = X_test[c].astype("object").fillna("Missing").astype(str)


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [23]:
# Seed-averaged CatBoost + LightGBM, dropping bad interaction cols, then 50/50 blend
import numpy as np, pandas as pd
from catboost import CatBoostClassifier
import lightgbm as lgb

# 0) drop the interaction columns if they exist (they hurt LB)
bad_cols = ["DeckSide", "HP_Dest", "CabinNumBin"]
train_ens = train_prep.drop(columns=[c for c in bad_cols if c in train_prep.columns]).copy()
test_ens  = test_prep.drop(columns=[c for c in bad_cols if c in test_prep.columns]).copy()

drop_cols = ["PassengerId","Name","Cabin","Transported"]
feat_ens = [c for c in train_ens.columns if c not in drop_cols]

# rebuild cat list from dtypes
def get_cat_cols(df, cols):
    out = []
    for c in cols:
        dt = df[c].dtype
        if (dt == "object") or (dt == bool) or (str(dt) == "category"):
            out.append(c)
    return out

cat_cols_ens = get_cat_cols(train_ens, feat_ens)

# 1) matrices for each library
# CatBoost expects strings for categorical
X_cb = train_ens[feat_ens].copy()
T_cb = test_ens[feat_ens].copy()
for c in cat_cols_ens:
    X_cb[c] = X_cb[c].astype("object").fillna("Missing").astype(str)
    T_cb[c] = T_cb[c].astype("object").fillna("Missing").astype(str)
cat_idx = [X_cb.columns.get_loc(c) for c in cat_cols_ens]

# LightGBM expects category dtype
X_lgb = train_ens[feat_ens].copy()
T_lgb = test_ens[feat_ens].copy()
for c in cat_cols_ens:
    X_lgb[c] = X_lgb[c].astype("category")
    T_lgb[c] = T_lgb[c].astype("category")

y_arr = y.values

# 2) pick sensible tree counts (fallbacks if not defined earlier)
cat_n  = 270  # median from your good CV earlier (approx 261–397 range)
lgb_n  = int(globals().get("best_n_lgb", 300))

seeds = [42, 202, 777]

# 3) train several seeds and average probs
proba_cat_all, proba_lgb_all = [], []

for sd in seeds:
    # CatBoost full fit
    m_cb = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="Accuracy",
        depth=8,
        learning_rate=0.08,
        n_estimators=cat_n,
        l2_leaf_reg=3,
        random_seed=sd,
        verbose=False
    )
    m_cb.fit(X_cb, y_arr, cat_features=cat_idx)
    proba_cat_all.append(m_cb.predict_proba(T_cb)[:,1])

    # LightGBM full fit
    m_lgb = lgb.LGBMClassifier(
        objective="binary",
        learning_rate=0.06,
        num_leaves=64,
        feature_fraction=0.9,
        bagging_fraction=0.9,
        bagging_freq=1,
        n_estimators=lgb_n,
        random_state=sd
    )
    m_lgb.fit(X_lgb, y_arr)
    proba_lgb_all.append(m_lgb.predict_proba(T_lgb)[:,1])

proba_cat = np.mean(np.vstack(proba_cat_all), axis=0)
proba_lgb = np.mean(np.vstack(proba_lgb_all), axis=0)

# 4) 50/50 blend (your best LB so far)
proba_blend = 0.5*proba_cat + 0.5*proba_lgb
pred_bool = proba_blend >= 0.5

sub_ens = pd.DataFrame({"PassengerId": test["PassengerId"], "Transported": pred_bool})
sub_ens.to_csv("submission_seed_ens_blend_3x.csv", index=False)
print("Saved: submission_seed_ens_blend_3x.csv")
sub_ens.head()


  X_cb[c] = X_cb[c].astype("object").fillna("Missing").astype(str)
  T_cb[c] = T_cb[c].astype("object").fillna("Missing").astype(str)


[LightGBM] [Info] Number of positive: 4378, number of negative: 4315
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001574 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2756
[LightGBM] [Info] Number of data points in the train set: 8693, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503624 -> initscore=0.014495
[LightGBM] [Info] Start training from score 0.014495
[LightGBM] [Info] Number of positive: 4378, number of negative: 4315
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001258 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2756
[LightGBM] [Info] Number of data points in the train set: 8693, number of used features: 21
[LightGBM] [Info] [binary:

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [24]:
# CatBoost-weighted blend (alpha = 0.60)
import pandas as pd

alpha = 0.60
proba_blend_060 = alpha*proba_cat + (1-alpha)*proba_lgb
pred_bool_060 = proba_blend_060 >= 0.5

sub_060 = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Transported": pred_bool_060
})
sub_060.to_csv("submission_seed_ens_alpha_060.csv", index=False)
sub_060.head()


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [25]:
# CatBoost-weighted blend (alpha = 0.62)
import pandas as pd

alpha = 0.62
proba_blend_062 = alpha*proba_cat + (1-alpha)*proba_lgb
pred_bool_062 = proba_blend_062 >= 0.5

sub_062 = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Transported": pred_bool_062
})
sub_062.to_csv("submission_seed_ens_alpha_062.csv", index=False)
sub_062.head()


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
