In [1]:
import pandas as pd
import numpy as np

# Allow up to 50 columns in rendered frames before pandas truncates the view.
pd.set_option("display.max_columns", 50)

# Read the three CSV files (paths are relative to the working directory).
train, test, sample = map(
    pd.read_csv, ("train.csv", "test.csv", "sample_submission.csv")
)

# Quick orientation: frame sizes, column names, then the first rows of train.
print("train shape:", train.shape, "| test shape:", test.shape)
print("columns:", list(train.columns))
train.head()


train shape: (8693, 14) | test shape: (4277, 13)
columns: ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'Transported']


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [2]:
# Class balance of the target, followed by the per-column share of missing values.
target = train["Transported"]

print("Transported counts:")
print(target.value_counts(dropna=False))  # dropna=False also counts NaN labels, if any
print("\nTransported class balance (%):")
print(target.value_counts(normalize=True).mul(100).round(2))

# Percentage of NaN per column, largest first; only columns with gaps are shown.
miss = train.isna().mean().mul(100).sort_values(ascending=False)
print("\nColumns with missing values (%):")
display(miss.loc[miss > 0].head(12))


Transported counts:
Transported
True     4378
False    4315
Name: count, dtype: int64

Transported class balance (%):
Transported
True     50.36
False    49.64
Name: proportion, dtype: float64

Columns with missing values (%):


CryoSleep       2.496261
ShoppingMall    2.392730
VIP             2.335212
HomePlanet      2.312205
Name            2.300702
Cabin           2.289198
VRDeck          2.162660
Spa             2.105142
FoodCourt       2.105142
Destination     2.093639
RoomService     2.082135
Age             2.059128
dtype: float64

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def _split_cabin(x):
    """Split one ``Cabin`` value of the form ``deck/num/side`` into three fields.

    Parameters
    ----------
    x : object
        A single cabin value. Missing values (per ``pd.isna``) are accepted;
        anything else is converted with ``str`` and split on ``/``.

    Returns
    -------
    pandas.Series
        Indexed by ``CabinDeck``, ``CabinNum`` and ``CabinSide``. ``CabinNum``
        is parsed with ``pd.to_numeric(..., errors="coerce")``, so a
        non-numeric token becomes NaN. Fields that are absent from ``x`` are NaN.
    """
    if pd.isna(x):
        return pd.Series({"CabinDeck": np.nan, "CabinNum": np.nan, "CabinSide": np.nan})
    parts = str(x).split("/")
    # str.split always yields at least one element, so the deck needs no guard.
    deck = parts[0]
    num = pd.to_numeric(parts[1], errors="coerce") if len(parts) > 1 else np.nan
    side = parts[2] if len(parts) > 2 else np.nan
    return pd.Series({"CabinDeck": deck, "CabinNum": num, "CabinSide": side})


def _is_categorical(col):
    """Return True when ``col`` holds boolean, object, string or categorical data."""
    types = pd.api.types
    return bool(
        types.is_bool_dtype(col)
        or types.is_object_dtype(col)
        or types.is_string_dtype(col)
        or isinstance(col.dtype, pd.CategoricalDtype)
    )


def prepare(df):
    """Add engineered features to a copy of ``df`` and list the model columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Cabin`` and the five spend columns (``RoomService``,
        ``FoodCourt``, ``ShoppingMall``, ``Spa``, ``VRDeck``). The input frame
        is not modified.

    Returns
    -------
    tuple
        ``(df, feat_cols, cat_cols)`` where ``df`` is the copy with
        ``CabinDeck``/``CabinNum``/``CabinSide`` and ``TotalSpend`` appended,
        ``feat_cols`` is every column except ``PassengerId``, ``Name``,
        ``Cabin`` and ``Transported`` (in frame order), and ``cat_cols`` is the
        subset of ``feat_cols`` with a non-numeric (categorical-like) dtype.
    """
    df = df.copy()
    # Cabin features
    df = pd.concat([df, df["Cabin"].apply(_split_cabin)], axis=1)
    # Total spend; min_count=1 keeps the total NaN when every spend value is missing
    spend_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    df["TotalSpend"] = df[spend_cols].sum(axis=1, min_count=1)
    # Feature lists
    drop_cols = ["PassengerId", "Name", "Cabin", "Transported"]
    feat_cols = [c for c in df.columns if c not in drop_cols]
    # Classify by dtype family rather than comparing against "object"/bool only:
    # pandas string, categorical and nullable-boolean columns are categorical
    # too and would otherwise be passed to the model as numeric features.
    cat_cols = [c for c in feat_cols if _is_categorical(df[c])]
    return df, feat_cols, cat_cols

# Engineer features for both frames. The feature and categorical lists come
# from the training frame; the test frame is indexed with the same lists later.
train_prep, feat_cols, cat_cols = prepare(train)
# Index the result instead of unpacking into `_`, which IPython uses for the
# most recent cell output.
test_prep = prepare(test)[0]

# Fail fast if the test frame lacks any column the model will be trained on.
missing_in_test = [c for c in feat_cols if c not in test_prep.columns]
assert not missing_in_test, f"test is missing feature columns: {missing_in_test}"

print("n_features:", len(feat_cols))
print("categorical:", cat_cols)

X = train_prep[feat_cols]
y = train["Transported"].astype(int)  # 0/1 for classifier
# 80/20 hold-out split, stratified on the target so both parts keep its balance.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)
print("train/valid shapes:", X_train.shape, X_valid.shape)


n_features: 14
categorical: ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'CabinDeck', 'CabinSide']
train/valid shapes: (6954, 14) (1739, 14)


In [5]:
# CatBoost rejects NaN in categorical features, so each categorical column of
# both splits gets NaN replaced by the token "Missing" and is cast to str.
for frame in (X_train, X_valid):
    for c in cat_cols:
        as_object = frame[c].astype("object")  # object first so the string token can be stored
        frame[c] = as_object.fillna("Missing").astype(str)


In [6]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Positional indices of the categorical columns, passed to CatBoost as `cat_features`.
cat_idx = list(map(X_train.columns.get_loc, cat_cols))

# Hyperparameters for the validation run; progress is logged every 100 iterations.
catboost_params = dict(
    loss_function="Logloss",
    eval_metric="Accuracy",
    depth=8,
    learning_rate=0.08,
    n_estimators=2000,
    l2_leaf_reg=3,
    random_seed=42,
    verbose=100,
)
model = CatBoostClassifier(**catboost_params)

# Train against the hold-out set: stop after 200 rounds without improvement
# and keep the model from the best validation iteration.
model.fit(
    X_train,
    y_train,
    cat_features=cat_idx,
    eval_set=(X_valid, y_valid),
    use_best_model=True,
    early_stopping_rounds=200,
)

# Accuracy of the retained model on the same hold-out set.
pred_valid = model.predict(X_valid)
acc = accuracy_score(y_valid, pred_valid)
print(f"Validation accuracy: {acc:.5f}")


0:	learn: 0.7805580	test: 0.7872340	best: 0.7872340 (0)	total: 352ms	remaining: 11m 44s
100:	learn: 0.8383664	test: 0.8056354	best: 0.8062105 (98)	total: 12.1s	remaining: 3m 47s
200:	learn: 0.8833765	test: 0.8113859	best: 0.8125359 (182)	total: 24.9s	remaining: 3m 42s
300:	learn: 0.9101237	test: 0.8217366	best: 0.8217366 (299)	total: 38.4s	remaining: 3m 36s
400:	learn: 0.9314064	test: 0.8177113	best: 0.8217366 (299)	total: 51.6s	remaining: 3m 25s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8217366302
bestIteration = 299

Shrink model to first 300 iterations.
Validation accuracy: 0.82174


In [7]:
# Refit on ALL training data with the best iteration and create a submission
from catboost import CatBoostClassifier
import numpy as np
import pandas as pd

# 1) Full matrices (copies, so the prepared frames themselves are not modified)
X_all = train_prep[feat_cols].copy()
X_test = test_prep[feat_cols].copy()

# 2) Sanitize categoricals for CatBoost: NaN -> "Missing", everything cast to str
for c in cat_cols:
    X_all[c] = X_all[c].astype("object").fillna("Missing").astype(str)
    X_test[c] = X_test[c].astype("object").fillna("Missing").astype(str)

cat_idx_all = [X_all.columns.get_loc(c) for c in cat_cols]

# 3) Number of trees found by early stopping.
#    get_best_iteration() is a zero-based index: the training log reports
#    bestIteration = 299 while the model is shrunk to its first 300 iterations,
#    so the tree count is index + 1. A best iteration of 0 is valid (one tree).
best_iter = model.get_best_iteration()
best_n = best_iter + 1 if best_iter is not None else model.tree_count_

# 4) Refit on ALL training data with the same hyperparameters as the validation run
model_full = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="Accuracy",
    depth=8,
    learning_rate=0.08,
    n_estimators=best_n,
    l2_leaf_reg=3,
    random_seed=42,
    verbose=False
)
model_full.fit(X_all, y, cat_features=cat_idx_all)

# 5) Predict test and save submission (positive class at probability >= 0.5)
proba = model_full.predict_proba(X_test)[:, 1]
pred_bool = proba >= 0.5

submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Transported": pred_bool
})

submission.to_csv("submission_catboost_baseline.csv", index=False)
submission.head()


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
