In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier

pd.set_option("display.max_columns", 50)

# Load data
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# ----- Feature prep helpers -----
def _split_cabin(x):
    if pd.isna(x):
        return pd.Series({"CabinDeck": np.nan, "CabinNum": np.nan, "CabinSide": np.nan})
    parts = str(x).split("/")
    deck = parts[0] if len(parts) > 0 else np.nan
    num  = pd.to_numeric(parts[1], errors="coerce") if len(parts) > 1 else np.nan
    side = parts[2] if len(parts) > 2 else np.nan
    return pd.Series({"CabinDeck": deck, "CabinNum": num, "CabinSide": side})

def prepare(df):
    df = df.copy()
    # Cabin parsed
    df = pd.concat([df, df["Cabin"].apply(_split_cabin)], axis=1)
    # Total spend
    spend_cols = ["RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"]
    df["TotalSpend"] = df[spend_cols].sum(axis=1, min_count=1)
    # Columns
    drop_cols = ["PassengerId","Name","Cabin","Transported"]
    feat_cols = [c for c in df.columns if c not in drop_cols]
    cat_cols  = [c for c in feat_cols if df[c].dtype == "object" or df[c].dtype == bool]
    return df, feat_cols, cat_cols

# Prep
train_prep, feat_cols, cat_cols = prepare(train)
test_prep,  _,        _        = prepare(test)

X = train_prep[feat_cols]
y = train["Transported"].astype(int)

cat_idx = [X.columns.get_loc(c) for c in cat_cols]

print("X shape:", X.shape, "| test shape:", test_prep[feat_cols].shape)
print("n_features:", len(feat_cols), "| n_categorical:", len(cat_cols))
print("first 3 feature columns:", feat_cols[:3])
X.head(3)


X shape: (8693, 14) | test shape: (4277, 14)
n_features: 14 | n_categorical: 6
first 3 feature columns: ['HomePlanet', 'CryoSleep', 'Destination']


Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CabinDeck,CabinNum,CabinSide,TotalSpend
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,B,0.0,P,0.0
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,F,0.0,S,736.0
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,A,0.0,S,10383.0


In [2]:
# 5-fold Stratified CV with CatBoost
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import numpy as np

# CatBoost needs categorical NaNs converted to strings
X_cv = X.copy()
for c in cat_cols:
    X_cv[c] = X_cv[c].astype("object").fillna("Missing").astype(str)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs, best_ns = [], []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_cv, y), 1):
    X_tr, X_va = X_cv.iloc[tr_idx], X_cv.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    cat_idx = [X_tr.columns.get_loc(c) for c in cat_cols]

    model = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="Accuracy",
        depth=8,
        learning_rate=0.08,
        n_estimators=2000,
        l2_leaf_reg=3,
        random_seed=42,
        verbose=100
    )

    model.fit(
        X_tr, y_tr,
        cat_features=cat_idx,
        eval_set=(X_va, y_va),
        use_best_model=True,
        early_stopping_rounds=200
    )

    pred_va = model.predict(X_va)
    acc = accuracy_score(y_va, pred_va)
    accs.append(acc)
    best_ns.append(model.get_best_iteration() or model.tree_count_)
    print(f"Fold {fold}: acc={acc:.5f}, best_iter={best_ns[-1]}")

print("\nCV Accuracy: mean={:.5f}, std={:.5f}".format(np.mean(accs), np.std(accs)))
print("Best iters per fold:", best_ns)


0:	learn: 0.7773943	test: 0.7734330	best: 0.7734330 (0)	total: 263ms	remaining: 8m 46s
100:	learn: 0.8402358	test: 0.8251869	best: 0.8251869 (100)	total: 13.3s	remaining: 4m 10s
200:	learn: 0.8845269	test: 0.8211616	best: 0.8257619 (102)	total: 26.7s	remaining: 3m 58s
300:	learn: 0.9105551	test: 0.8228867	best: 0.8257619 (102)	total: 40.2s	remaining: 3m 47s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8257619321
bestIteration = 102

Shrink model to first 103 iterations.
Fold 1: acc=0.82576, best_iter=102
0:	learn: 0.7824274	test: 0.7659574	best: 0.7659574 (0)	total: 123ms	remaining: 4m 5s
100:	learn: 0.8457003	test: 0.8056354	best: 0.8085106 (90)	total: 12.7s	remaining: 3m 59s
200:	learn: 0.8805004	test: 0.8050604	best: 0.8096607 (143)	total: 25.7s	remaining: 3m 49s
300:	learn: 0.9020708	test: 0.8050604	best: 0.8096607 (143)	total: 39.6s	remaining: 3m 43s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8096607246
bestIteration = 143

Shrink 

In [3]:
# Refit on ALL data using CV-informed n_estimators (median of best per fold)
import numpy as np
from catboost import CatBoostClassifier
import pandas as pd

best_n = int(np.median(best_ns))  # best_ns came from the CV cell
print("Using n_estimators:", best_n)

# Build full matrices
X_all  = train_prep[feat_cols].copy()
X_test = test_prep[feat_cols].copy()

# Sanitize categoricals for CatBoost
for c in cat_cols:
    X_all[c]  = X_all[c].astype("object").fillna("Missing").astype(str)
    X_test[c] = X_test[c].astype("object").fillna("Missing").astype(str)

cat_idx_all = [X_all.columns.get_loc(c) for c in cat_cols]

model_full_cv = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="Accuracy",
    depth=8,
    learning_rate=0.08,
    n_estimators=best_n,
    l2_leaf_reg=3,
    random_seed=42,
    verbose=False
)
model_full_cv.fit(X_all, y, cat_features=cat_idx_all)

proba = model_full_cv.predict_proba(X_test)[:, 1]
pred_bool = proba >= 0.5

sub_cv = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Transported": pred_bool
})
sub_cv.to_csv("submission_catboost_cv.csv", index=False)
sub_cv.head()


Using n_estimators: 145


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False


In [4]:
# Build group features from PassengerId and rebuild matrices
import pandas as pd
import numpy as np

# counts across train+test (safe: uses IDs only, no target)
all_ids = pd.concat([train["PassengerId"], test["PassengerId"]], ignore_index=True)
gid_counts = all_ids.str.split("_", n=1, expand=True)[0].value_counts()

def prepare2(df):
    df = df.copy()
    # Cabin parsed
    df = pd.concat([df, df["Cabin"].apply(_split_cabin)], axis=1)
    # Total spend
    spend_cols = ["RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"]
    df["TotalSpend"] = df[spend_cols].sum(axis=1, min_count=1)
    # Group features
    gid = df["PassengerId"].str.split("_", n=1, expand=True)[0]
    df["GroupID"] = gid
    df["GroupSize"] = gid.map(gid_counts).astype("Int64")
    df["IsAlone"] = (df["GroupSize"].fillna(1) == 1)
    # Feature lists
    drop_cols = ["PassengerId","Name","Cabin","Transported"]
    feat_cols = [c for c in df.columns if c not in drop_cols]
    cat_cols  = [c for c in feat_cols if df[c].dtype == "object" or df[c].dtype == bool]
    return df, feat_cols, cat_cols

# rebuild with new features
train_prep, feat_cols, cat_cols = prepare2(train)
test_prep,  _,        _        = prepare2(test)
X = train_prep[feat_cols]
y = train["Transported"].astype(int)

print("X shape:", X.shape, "| test shape:", test_prep[feat_cols].shape)
print("now including:", [c for c in ["GroupID","GroupSize","IsAlone"] if c in feat_cols])
X[["GroupID","GroupSize","IsAlone"]].head()


X shape: (8693, 17) | test shape: (4277, 17)
now including: ['GroupID', 'GroupSize', 'IsAlone']


Unnamed: 0,GroupID,GroupSize,IsAlone
0,1,1,True
1,2,1,True
2,3,2,False
3,3,2,False
4,4,1,True


In [5]:
# 5-fold CV with GroupID/GroupSize/IsAlone included
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import numpy as np

X_cv = X.copy()
for c in cat_cols:
    X_cv[c] = X_cv[c].astype("object").fillna("Missing").astype(str)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs, best_ns = [], []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_cv, y), 1):
    X_tr, X_va = X_cv.iloc[tr_idx], X_cv.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
    cat_idx = [X_tr.columns.get_loc(c) for c in cat_cols]

    model = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="Accuracy",
        depth=8,
        learning_rate=0.08,
        n_estimators=2000,
        l2_leaf_reg=3,
        random_seed=42,
        verbose=100
    )
    model.fit(
        X_tr, y_tr,
        cat_features=cat_idx,
        eval_set=(X_va, y_va),
        use_best_model=True,
        early_stopping_rounds=200
    )

    pred_va = model.predict(X_va)
    accs.append(accuracy_score(y_va, pred_va))
    best_ns.append(model.get_best_iteration() or model.tree_count_)
    print(f"Fold {fold}: acc={accs[-1]:.5f}, best_iter={best_ns[-1]}")

print("\nCV Accuracy: mean={:.5f}, std={:.5f}".format(np.mean(accs), np.std(accs)))
print("Best iters per fold:", best_ns)


0:	learn: 0.7342537	test: 0.7441058	best: 0.7441058 (0)	total: 42.6ms	remaining: 1m 25s
100:	learn: 0.8455565	test: 0.8234618	best: 0.8234618 (94)	total: 13.9s	remaining: 4m 21s
200:	learn: 0.8901352	test: 0.8177113	best: 0.8234618 (94)	total: 28.1s	remaining: 4m 11s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8234617596
bestIteration = 94

Shrink model to first 95 iterations.
Fold 1: acc=0.82346, best_iter=94
0:	learn: 0.7496405	test: 0.7360552	best: 0.7360552 (0)	total: 128ms	remaining: 4m 16s
100:	learn: 0.8498706	test: 0.8050604	best: 0.8050604 (97)	total: 13.7s	remaining: 4m 16s
200:	learn: 0.8861087	test: 0.8079356	best: 0.8119609 (172)	total: 27.8s	remaining: 4m 9s
300:	learn: 0.9114179	test: 0.8073606	best: 0.8119609 (172)	total: 41.8s	remaining: 3m 55s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8119608971
bestIteration = 172

Shrink model to first 173 iterations.
Fold 2: acc=0.81196, best_iter=172
0:	learn: 0.7660339	test: 0.7

In [6]:
# Refit using median best iteration from CV and export submission
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier

best_n = int(np.median(best_ns))  # e.g., from [94,172,201,255,255] -> 201
print("Using n_estimators:", best_n)

X_all  = train_prep[feat_cols].copy()
X_test = test_prep[feat_cols].copy()

for c in cat_cols:
    X_all[c]  = X_all[c].astype("object").fillna("Missing").astype(str)
    X_test[c] = X_test[c].astype("object").fillna("Missing").astype(str)

cat_idx_all = [X_all.columns.get_loc(c) for c in cat_cols]

model_full_groups = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="Accuracy",
    depth=8,
    learning_rate=0.08,
    n_estimators=best_n,
    l2_leaf_reg=3,
    random_seed=42,
    verbose=False
)
model_full_groups.fit(X_all, y, cat_features=cat_idx_all)

proba = model_full_groups.predict_proba(X_test)[:, 1]
pred_bool = proba >= 0.5

sub_groups = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Transported": pred_bool
})
sub_groups.to_csv("submission_catboost_groups_cv.csv", index=False)
sub_groups.head()


Using n_estimators: 201


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [7]:
# Find best probability threshold via 5-fold CV (uses best_ns from last CV)
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier

# 1) Build CV matrices with sanitized categoricals
X_cv = X.copy()
for c in cat_cols:
    X_cv[c] = X_cv[c].astype("object").fillna("Missing").astype(str)

probas_val = np.zeros(len(X))
y_true = y.values.copy()

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for i, (tr_idx, va_idx) in enumerate(skf.split(X_cv, y), 0):
    X_tr, X_va = X_cv.iloc[tr_idx], X_cv.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
    cat_idx = [X_tr.columns.get_loc(c) for c in cat_cols]
    n_estimators = best_ns[i] if 'best_ns' in globals() and len(best_ns)==5 else 200

    m = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="Accuracy",
        depth=8,
        learning_rate=0.08,
        n_estimators=n_estimators,
        l2_leaf_reg=3,
        random_seed=42,
        verbose=False
    )
    m.fit(X_tr, y_tr, cat_features=cat_idx)
    probas_val[va_idx] = m.predict_proba(X_va)[:, 1]

# 2) Grid search threshold for max accuracy
grid = np.linspace(0.35, 0.65, 61)  # 0.35..0.65 step 0.005
acc_05 = accuracy_score(y_true, (probas_val >= 0.5).astype(int))
best_t, best_acc = 0.5, acc_05
for t in grid:
    acc = accuracy_score(y_true, (probas_val >= t).astype(int))
    if acc > best_acc:
        best_t, best_acc = float(t), acc

print(f"Best threshold: {best_t:.3f}")
print(f"CV acc at best_t: {best_acc:.5f}")
print(f"CV acc at 0.500: {acc_05:.5f}")


Best threshold: 0.500
CV acc at best_t: 0.81353
CV acc at 0.500: 0.81353


In [8]:
# Add AgeBin, LogTotalSpend, AnySpend and rebuild features
import pandas as pd
import numpy as np

def prepare3(df):
    df = df.copy()
    # Cabin parsed
    df = pd.concat([df, df["Cabin"].apply(_split_cabin)], axis=1)
    # Spend features
    spend_cols = ["RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"]
    df["TotalSpend"] = df[spend_cols].sum(axis=1, min_count=1)
    df["LogTotalSpend"] = np.log1p(df["TotalSpend"])
    df["AnySpend"] = df["TotalSpend"].fillna(0) > 0
    # Group features (from earlier)
    gid_all = pd.concat([train["PassengerId"], test["PassengerId"]], ignore_index=True).str.split("_", n=1, expand=True)[0]
    gid_counts = gid_all.value_counts()
    gid = df["PassengerId"].str.split("_", n=1, expand=True)[0]
    df["GroupID"] = gid
    df["GroupSize"] = gid.map(gid_counts).astype("Int64")
    df["IsAlone"] = (df["GroupSize"].fillna(1) == 1)
    # Age bin (categorical)
    bins = [0, 12, 18, 25, 40, 60, 120]
    labels = ["child","teen","youngA","adult","mid","senior"]
    df["AgeBin"] = pd.cut(df["Age"], bins=bins, labels=labels, right=True, include_lowest=True)
    # Feature lists
    drop_cols = ["PassengerId","Name","Cabin","Transported"]
    feat_cols = [c for c in df.columns if c not in drop_cols]
    cat_cols  = [c for c in feat_cols if df[c].dtype == "object" or df[c].dtype == bool]
    return df, feat_cols, cat_cols

# rebuild
train_prep, feat_cols, cat_cols = prepare3(train)
test_prep,  _,        _        = prepare3(test)
X = train_prep[feat_cols]
y = train["Transported"].astype(int)

print("X shape:", X.shape, "| test shape:", test_prep[feat_cols].shape)
print("new features present:", [c for c in ["LogTotalSpend","AnySpend","AgeBin"] if c in feat_cols])


X shape: (8693, 20) | test shape: (4277, 20)
new features present: ['LogTotalSpend', 'AnySpend', 'AgeBin']


In [10]:
# compute cat_cols to include pandas 'category' dtype
def recompute_cat_cols(df, feature_columns):
    cats = []
    for c in feature_columns:
        dt = df[c].dtype
        if (dt == 'object') or (dt == bool) or (str(dt) == 'category'):
            cats.append(c)
    return cats

cat_cols = recompute_cat_cols(train_prep, feat_cols)
print("n_categorical:", len(cat_cols), "| includes AgeBin?", "AgeBin" in cat_cols)


n_categorical: 9 | includes AgeBin? True


In [11]:
# 5-fold CV after adding LogTotalSpend / AnySpend / AgeBin
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import numpy as np

X_cv = X.copy()
for c in cat_cols:
    X_cv[c] = X_cv[c].astype("object").fillna("Missing").astype(str)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs, best_ns = [], []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_cv, y), 1):
    X_tr, X_va = X_cv.iloc[tr_idx], X_cv.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
    cat_idx = [X_tr.columns.get_loc(c) for c in cat_cols]

    model = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="Accuracy",
        depth=8,
        learning_rate=0.08,
        n_estimators=2000,
        l2_leaf_reg=3,
        random_seed=42,
        verbose=100
    )
    model.fit(
        X_tr, y_tr,
        cat_features=cat_idx,
        eval_set=(X_va, y_va),
        use_best_model=True,
        early_stopping_rounds=200
    )

    pred_va = model.predict(X_va)
    accs.append(accuracy_score(y_va, pred_va))
    best_ns.append(model.get_best_iteration() or model.tree_count_)
    print(f"Fold {fold}: acc={accs[-1]:.5f}, best_iter={best_ns[-1]}")

print("\nCV Accuracy: mean={:.5f}, std={:.5f}".format(np.mean(accs), np.std(accs)))
print("Best iters per fold:", best_ns)


  X_cv[c] = X_cv[c].astype("object").fillna("Missing").astype(str)


0:	learn: 0.7349727	test: 0.7458309	best: 0.7458309 (0)	total: 121ms	remaining: 4m 2s
100:	learn: 0.8458441	test: 0.8165612	best: 0.8228867 (75)	total: 13.7s	remaining: 4m 16s
200:	learn: 0.8886972	test: 0.8205865	best: 0.8246118 (169)	total: 28.2s	remaining: 4m 12s
300:	learn: 0.9137187	test: 0.8177113	best: 0.8251869 (244)	total: 42.8s	remaining: 4m 1s
400:	learn: 0.9318378	test: 0.8234618	best: 0.8251869 (244)	total: 57.2s	remaining: 3m 48s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.825186889
bestIteration = 244

Shrink model to first 245 iterations.
Fold 1: acc=0.82519, best_iter=244
0:	learn: 0.7382801	test: 0.7326049	best: 0.7326049 (0)	total: 76ms	remaining: 2m 31s
100:	learn: 0.8518838	test: 0.8033353	best: 0.8079356 (66)	total: 14.4s	remaining: 4m 29s
200:	learn: 0.8912856	test: 0.8056354	best: 0.8079356 (66)	total: 29s	remaining: 4m 19s
300:	learn: 0.9108427	test: 0.8073606	best: 0.8102358 (240)	total: 43.7s	remaining: 4m 6s
400:	learn: 0.9292494	tes

In [12]:
# Refit on ALL data using median(best iters) from the last CV and save submission
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier

best_n = int(np.median(best_ns))  # uses the best_ns from your last CV cell
print("Using n_estimators:", best_n)

X_all  = train_prep[feat_cols].copy()
X_test = test_prep[feat_cols].copy()

# Sanitize categoricals
for c in cat_cols:
    X_all[c]  = X_all[c].astype("object").fillna("Missing").astype(str)
    X_test[c] = X_test[c].astype("object").fillna("Missing").astype(str)

cat_idx_all = [X_all.columns.get_loc(c) for c in cat_cols]

model_full_bins = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="Accuracy",
    depth=8,
    learning_rate=0.08,
    n_estimators=best_n,
    l2_leaf_reg=3,
    random_seed=42,
    verbose=False
)
model_full_bins.fit(X_all, y, cat_features=cat_idx_all)

proba = model_full_bins.predict_proba(X_test)[:, 1]
pred_bool = proba >= 0.5

sub_bins = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Transported": pred_bool
})
sub_bins.to_csv("submission_catboost_bins_groups.csv", index=False)
sub_bins.head()


Using n_estimators: 340


  X_all[c]  = X_all[c].astype("object").fillna("Missing").astype(str)
  X_test[c] = X_test[c].astype("object").fillna("Missing").astype(str)


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
