In [1]:
import pandas as pd
pd.set_option('future.no_silent_downcasting', True)
import numpy as np
import optuna
import logging
import json
import os
from catboost import CatBoostClassifier, Pool, cv
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from category_encoders import TargetEncoder

# Configure logging
os.makedirs("..data", exist_ok=True)
logging.basicConfig(filename='../data/log_file.log', level=logging.INFO)

In [2]:
# Load training data
df = pd.read_csv("../data/train.csv")

# Drop unnecessary columns
df.drop(["PassengerId", "Name"], axis=1, inplace=True, errors="ignore")

# Define numerical and categorical columns
numerical_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
categorical_cols = ["HomePlanet", "CryoSleep", "Destination", "VIP"]

# Handle missing values
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].mean())
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Feature Engineering: Split "Cabin" into "Deck", "CabinNumber", "Side"
df[["Deck", "CabinNumber", "Side"]] = df["Cabin"].str.split("/", expand=True)
df.drop("Cabin", axis=1, inplace=True)
df["CabinNumber"] = pd.to_numeric(df["CabinNumber"], errors="coerce").fillna(0)

# Update categorical columns list
categorical_cols.extend(["Deck", "Side"])

# Target Encoding
target_encoding_mappings = {col: df.groupby(col)["Transported"].mean() for col in categorical_cols}
for col in categorical_cols:
    df[f"{col}_encoded"] = df[col].map(target_encoding_mappings[col])
df.drop(categorical_cols, axis=1, inplace=True)

# Feature Engineering
df["TotalSpending"] = df[["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]].sum(axis=1)
df["Age_Spending"] = df["Age"] * df["TotalSpending"]
df["High_Cabin"] = (df["CabinNumber"] > df["CabinNumber"].median()).astype(int)
df.drop("CabinNumber", axis=1, inplace=True)

# Log Transformations for Spending Columns
spending_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
for col in spending_cols:
    df[f"{col}_log"] = np.log1p(df[col])
df.drop(spending_cols, axis=1, inplace=True)

# Train-Test Split
X = df.drop("Transported", axis=1)
y = df["Transported"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training Data Preprocessing Completed!")

Training Data Preprocessing Completed!


In [3]:
def objective(trial):
    # Hyperparameters to tune
    params = {
        "iterations": trial.suggest_int("iterations", 200, 1000),
        "depth": trial.suggest_int("depth", 4, 9),
        "learning_rate": trial.suggest_float("learning_rate", 0.02, 0.2, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 5, log=True),
        "border_count": trial.suggest_int("border_count", 64, 255),
        "random_strength": trial.suggest_float("random_strength", 1e-3, 5, log=True),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        "grow_policy": trial.suggest_categorical("grow_policy", ["SymmetricTree", "Depthwise", "Lossguide"]),
        "loss_function": "Logloss",
        "verbose": False,
        "early_stopping_rounds": trial.suggest_int("early_stopping_rounds", 10, 50),  # Early stopping to prevent overfitting
    }

    # Prepare the training pool for CatBoost
    train_pool = Pool(X_train, label=y_train)

    # Run cross-validation with early stopping and all available threads
    cv_results = cv(
        params=params,
        pool=train_pool,
        fold_count=5,  # 5-fold cross-validation
        shuffle=True,
        partition_random_seed=42,
        stratified=True,
        early_stopping_rounds=params["early_stopping_rounds"],
        verbose_eval=False,
    )

    # Extract the best test metric (Logloss)
    best_metric = [col for col in cv_results.columns if "test" in col and "mean" in col][0]  # Select the correct column
    return cv_results[best_metric].min()  # We want to minimize Logloss, so return the minimum value

logging.info("Starting Optuna optimization...")
# Run Optuna with parallel trials
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50, n_jobs=-1)
best_params = study.best_params
with open("../mlops_hw1/best_params.json", "w") as f:
    json.dump(best_params, f)

logging.info(f"Optimization complete. Best Parameters: {best_params}")
print("Best Parameters:", study.best_params)

[I 2025-03-20 17:13:08,043] A new study created in memory with name: no-name-5d2ee80d-56cd-456d-a69b-11515f84fa07


Training on fold [0/5]
Training on fold [0/5]
Training on fold [0/5]
Training on fold [0/5]
Training on fold [0/5]
Training on fold [0/5]
Training on fold [0/5]
Training on fold [0/5]
Training on fold [0/5]
Training on fold [0/5]
Training on fold [0/5]
Training on fold [0/5]
Training on fold [0/5]
Training on fold [0/5]
Training on fold [0/5]
Training on fold [0/5]

bestTest = 0.4085607518
bestIteration = 32

Training on fold [1/5]

bestTest = 0.4014642329
bestIteration = 64

Training on fold [1/5]

bestTest = 0.4084762782
bestIteration = 97

Training on fold [1/5]

bestTest = 0.407055551
bestIteration = 96


bestTest = 0.4051441918
bestIteration = 134

Training on fold [1/5]
Training on fold [1/5]

bestTest = 0.4193942336
bestIteration = 36

Training on fold [2/5]

bestTest = 0.4045947748
bestIteration = 136

Training on fold [1/5]

bestTest = 0.4125388192
bestIteration = 60

Training on fold [1/5]

bestTest = 0.4105509378
bestIteration = 73

Training on fold [2/5]

bestTest = 0.38169

[I 2025-03-20 17:13:18,596] Trial 1 finished with value: 0.4123901523621859 and parameters: {'iterations': 700, 'depth': 7, 'learning_rate': 0.14812157065843756, 'l2_leaf_reg': 0.06715653808550483, 'border_count': 229, 'random_strength': 0.05301032495033394, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'SymmetricTree', 'early_stopping_rounds': 27}. Best is trial 1 with value: 0.4123901523621859.



bestTest = 0.4251995016
bestIteration = 48

Training on fold [0/5]

bestTest = 0.3660728572
bestIteration = 130

Training on fold [3/5]

bestTest = 0.4094503536
bestIteration = 123

Training on fold [2/5]

bestTest = 0.4067061105
bestIteration = 77

Training on fold [4/5]

bestTest = 0.3679129703
bestIteration = 176


bestTest = 0.3804859453
bestIteration = 112

Training on fold [3/5]
Training on fold [3/5]

bestTest = 0.4256060906
bestIteration = 111

Training on fold [2/5]

bestTest = 0.4058628461
bestIteration = 73

Training on fold [4/5]

bestTest = 0.404383052
bestIteration = 211

Training on fold [1/5]


[I 2025-03-20 17:13:21,674] Trial 9 finished with value: 0.40258535505719617 and parameters: {'iterations': 425, 'depth': 4, 'learning_rate': 0.14193644532462843, 'l2_leaf_reg': 3.126945377787767, 'border_count': 77, 'random_strength': 0.06128562873814301, 'bootstrap_type': 'MVS', 'grow_policy': 'Lossguide', 'early_stopping_rounds': 17}. Best is trial 9 with value: 0.40258535505719617.



bestTest = 0.4019956517
bestIteration = 99


bestTest = 0.4174286467
bestIteration = 76

Training on fold [4/5]
Training on fold [0/5]


[I 2025-03-20 17:13:22,501] Trial 2 finished with value: 0.407410131171535 and parameters: {'iterations': 775, 'depth': 7, 'learning_rate': 0.07648831971952472, 'l2_leaf_reg': 0.007617041822299834, 'border_count': 131, 'random_strength': 0.05276886167346704, 'bootstrap_type': 'MVS', 'grow_policy': 'SymmetricTree', 'early_stopping_rounds': 11}. Best is trial 9 with value: 0.40258535505719617.



bestTest = 0.4240110118
bestIteration = 62

Training on fold [0/5]

bestTest = 0.4023429163
bestIteration = 136

Training on fold [4/5]

bestTest = 0.4094045743
bestIteration = 351

Training on fold [2/5]

bestTest = 0.4065037708
bestIteration = 54

Training on fold [1/5]

bestTest = 0.4015898315
bestIteration = 168

Training on fold [4/5]


[I 2025-03-20 17:13:24,781] Trial 13 finished with value: 0.4007431130814313 and parameters: {'iterations': 430, 'depth': 4, 'learning_rate': 0.11459518111888063, 'l2_leaf_reg': 1.888762848595786, 'border_count': 148, 'random_strength': 0.8012234483350534, 'bootstrap_type': 'MVS', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 18}. Best is trial 13 with value: 0.4007431130814313.



bestTest = 0.4173462054
bestIteration = 105

Training on fold [0/5]


[I 2025-03-20 17:13:26,242] Trial 11 finished with value: 0.40519171129639603 and parameters: {'iterations': 294, 'depth': 5, 'learning_rate': 0.12985166162407938, 'l2_leaf_reg': 0.0475123638881375, 'border_count': 90, 'random_strength': 0.007576660097431133, 'bootstrap_type': 'Bayesian', 'grow_policy': 'SymmetricTree', 'early_stopping_rounds': 36}. Best is trial 13 with value: 0.4007431130814313.



bestTest = 0.4054310517
bestIteration = 54

Training on fold [1/5]

bestTest = 0.4285050717
bestIteration = 90

Training on fold [0/5]

bestTest = 0.4218792778
bestIteration = 43

Training on fold [1/5]

bestTest = 0.3807364881
bestIteration = 134

Training on fold [3/5]

bestTest = 0.3703917879
bestIteration = 136

Training on fold [3/5]

bestTest = 0.4112271451
bestIteration = 448

Training on fold [2/5]

bestTest = 0.4015614327
bestIteration = 154

Training on fold [2/5]


[I 2025-03-20 17:13:28,065] Trial 14 finished with value: 0.4025477873526627 and parameters: {'iterations': 386, 'depth': 6, 'learning_rate': 0.0658295667293329, 'l2_leaf_reg': 0.9041134042275795, 'border_count': 215, 'random_strength': 0.23817782530574633, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'SymmetricTree', 'early_stopping_rounds': 10}. Best is trial 13 with value: 0.4007431130814313.



bestTest = 0.4164893556
bestIteration = 146

Training on fold [0/5]

bestTest = 0.413231319
bestIteration = 34

Training on fold [1/5]

bestTest = 0.4067672569
bestIteration = 52

Training on fold [1/5]

bestTest = 0.4067991446
bestIteration = 493

Training on fold [1/5]

bestTest = 0.4060001406
bestIteration = 197

Training on fold [1/5]

bestTest = 0.4169164823
bestIteration = 124

Training on fold [2/5]

bestTest = 0.4062455049
bestIteration = 57

Training on fold [2/5]

bestTest = 0.371406865
bestIteration = 338

Training on fold [3/5]

bestTest = 0.422623858
bestIteration = 66

Training on fold [2/5]

bestTest = 0.4078556803
bestIteration = 98

Training on fold [4/5]

bestTest = 0.4045119903
bestIteration = 128

Training on fold [4/5]

bestTest = 0.4210608796
bestIteration = 52

Training on fold [2/5]

bestTest = 0.4066730427
bestIteration = 268

Training on fold [1/5]

bestTest = 0.3984844438
bestIteration = 459

Training on fold [1/5]

bestTest = 0.3710970002
bestIteration = 10

[I 2025-03-20 17:13:39,333] Trial 10 finished with value: 0.4107829159307876 and parameters: {'iterations': 883, 'depth': 9, 'learning_rate': 0.030602343116585037, 'l2_leaf_reg': 0.03483438490033142, 'border_count': 77, 'random_strength': 0.05907028067876343, 'bootstrap_type': 'Bayesian', 'grow_policy': 'SymmetricTree', 'early_stopping_rounds': 39}. Best is trial 13 with value: 0.4007431130814313.



bestTest = 0.4296415601
bestIteration = 137

Training on fold [0/5]

bestTest = 0.3846425992
bestIteration = 56

Training on fold [3/5]

bestTest = 0.3990367215
bestIteration = 351

Training on fold [4/5]

bestTest = 0.4105467554
bestIteration = 286

Training on fold [2/5]

bestTest = 0.3725270187
bestIteration = 212

Training on fold [3/5]


[I 2025-03-20 17:13:41,646] Trial 15 finished with value: 0.40408957031377557 and parameters: {'iterations': 319, 'depth': 8, 'learning_rate': 0.08159383888114964, 'l2_leaf_reg': 0.13721839993151522, 'border_count': 89, 'random_strength': 1.366036248621742, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Lossguide', 'early_stopping_rounds': 14}. Best is trial 13 with value: 0.4007431130814313.



bestTest = 0.4167056854
bestIteration = 151


bestTest = 0.4138277357
bestIteration = 327

Training on fold [2/5]
Training on fold [0/5]

bestTest = 0.3972646567
bestIteration = 88

Training on fold [4/5]

bestTest = 0.3996279291
bestIteration = 257

Training on fold [1/5]

bestTest = 0.3986372447
bestIteration = 165

Training on fold [1/5]

bestTest = 0.4073807087
bestIteration = 37

Training on fold [1/5]

bestTest = 0.424706737
bestIteration = 25

Training on fold [2/5]

bestTest = 0.4142428372
bestIteration = 49

Training on fold [4/5]

bestTest = 0.3731613676
bestIteration = 225

Training on fold [3/5]

bestTest = 0.3765266476
bestIteration = 52

Training on fold [3/5]


[I 2025-03-20 17:13:46,305] Trial 18 finished with value: 0.40190685227329387 and parameters: {'iterations': 378, 'depth': 9, 'learning_rate': 0.11041721342349768, 'l2_leaf_reg': 2.1482991527967688, 'border_count': 157, 'random_strength': 0.026665390403587494, 'bootstrap_type': 'MVS', 'grow_policy': 'Lossguide', 'early_stopping_rounds': 24}. Best is trial 13 with value: 0.4007431130814313.



bestTest = 0.4024293287
bestIteration = 169

Training on fold [4/5]

bestTest = 0.3750055797
bestIteration = 182


bestTest = 0.4205117552
bestIteration = 45

Training on fold [3/5]
Training on fold [0/5]

bestTest = 0.3818690975
bestIteration = 267

Training on fold [3/5]

bestTest = 0.4102349713
bestIteration = 141

Training on fold [2/5]

bestTest = 0.4035167498
bestIteration = 337


bestTest = 0.4085624748
bestIteration = 491

Training on fold [4/5]
Training on fold [2/5]

bestTest = 0.4153016496
bestIteration = 49

Training on fold [4/5]

bestTest = 0.4069586597
bestIteration = 54

Training on fold [4/5]


[I 2025-03-20 17:13:48,586] Trial 5 finished with value: 0.4009711707068588 and parameters: {'iterations': 352, 'depth': 5, 'learning_rate': 0.047157751458553575, 'l2_leaf_reg': 0.4344587142324955, 'border_count': 114, 'random_strength': 0.5638703653589995, 'bootstrap_type': 'Bayesian', 'grow_policy': 'SymmetricTree', 'early_stopping_rounds': 21}. Best is trial 13 with value: 0.4007431130814313.



bestTest = 0.4194982608
bestIteration = 328

Training on fold [0/5]


[I 2025-03-20 17:13:49,328] Trial 23 finished with value: 0.40854540733499134 and parameters: {'iterations': 806, 'depth': 8, 'learning_rate': 0.1399027422909522, 'l2_leaf_reg': 0.982152568161932, 'border_count': 249, 'random_strength': 0.006015759746618973, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'SymmetricTree', 'early_stopping_rounds': 11}. Best is trial 13 with value: 0.4007431130814313.
[I 2025-03-20 17:13:49,382] Trial 19 finished with value: 0.4125899573654543 and parameters: {'iterations': 561, 'depth': 8, 'learning_rate': 0.10811514287674268, 'l2_leaf_reg': 0.005132929090819179, 'border_count': 236, 'random_strength': 0.0733852765111541, 'bootstrap_type': 'MVS', 'grow_policy': 'Lossguide', 'early_stopping_rounds': 46}. Best is trial 13 with value: 0.4007431130814313.



bestTest = 0.4343906723
bestIteration = 68

Training on fold [2/5]

bestTest = 0.4219082731
bestIteration = 35


bestTest = 0.42780429
bestIteration = 43

Training on fold [0/5]
Training on fold [0/5]

bestTest = 0.4180657318
bestIteration = 60

Training on fold [2/5]

bestTest = 0.4143519839
bestIteration = 68

Training on fold [1/5]

bestTest = 0.4008801255
bestIteration = 147

Training on fold [4/5]

bestTest = 0.399453384
bestIteration = 165

Training on fold [4/5]

bestTest = 0.4071912484
bestIteration = 283

Training on fold [4/5]

bestTest = 0.4189725263
bestIteration = 61

Training on fold [2/5]

bestTest = 0.3680555142
bestIteration = 182

Training on fold [3/5]


[I 2025-03-20 17:13:53,101] Trial 4 finished with value: 0.4037182678677172 and parameters: {'iterations': 758, 'depth': 4, 'learning_rate': 0.03187115685826606, 'l2_leaf_reg': 4.363106821780925, 'border_count': 205, 'random_strength': 0.018057164231544572, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'Lossguide', 'early_stopping_rounds': 12}. Best is trial 13 with value: 0.4007431130814313.



bestTest = 0.4221780078
bestIteration = 218

Training on fold [0/5]

bestTest = 0.4232325559
bestIteration = 204


bestTest = 0.3997500288
bestIteration = 273

Training on fold [1/5]
Training on fold [4/5]

bestTest = 0.4000269468
bestIteration = 159

Training on fold [1/5]


[I 2025-03-20 17:13:54,175] Trial 6 finished with value: 0.4005758807087821 and parameters: {'iterations': 318, 'depth': 7, 'learning_rate': 0.09592704186515337, 'l2_leaf_reg': 2.9438720562992997, 'border_count': 181, 'random_strength': 1.5568016356796235, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Lossguide', 'early_stopping_rounds': 35}. Best is trial 6 with value: 0.4005758807087821.



bestTest = 0.4194145283
bestIteration = 116

Training on fold [0/5]

bestTest = 0.3846985028
bestIteration = 81

Training on fold [3/5]


[I 2025-03-20 17:13:55,752] Trial 16 finished with value: 0.411918199278911 and parameters: {'iterations': 603, 'depth': 7, 'learning_rate': 0.03989817989801063, 'l2_leaf_reg': 0.007400596969617585, 'border_count': 231, 'random_strength': 0.014319606986740693, 'bootstrap_type': 'MVS', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 33}. Best is trial 6 with value: 0.4005758807087821.



bestTest = 0.4254631202
bestIteration = 69

Training on fold [0/5]

bestTest = 0.4031810619
bestIteration = 93

Training on fold [4/5]

bestTest = 0.4335644641
bestIteration = 204

Training on fold [2/5]

bestTest = 0.4028721543
bestIteration = 387

Training on fold [1/5]


[I 2025-03-20 17:13:58,121] Trial 17 finished with value: 0.40298540711941 and parameters: {'iterations': 540, 'depth': 5, 'learning_rate': 0.05064866820560538, 'l2_leaf_reg': 0.06172235087656636, 'border_count': 203, 'random_strength': 0.00638756650042611, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 41}. Best is trial 6 with value: 0.4005758807087821.



bestTest = 0.4182259778
bestIteration = 151

Training on fold [0/5]

bestTest = 0.4117140904
bestIteration = 91

Training on fold [4/5]


[I 2025-03-20 17:13:59,059] Trial 21 finished with value: 0.4022530245481889 and parameters: {'iterations': 955, 'depth': 4, 'learning_rate': 0.06083490400500798, 'l2_leaf_reg': 1.9648325422069643, 'border_count': 186, 'random_strength': 0.0011741561213153163, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 21}. Best is trial 6 with value: 0.4005758807087821.



bestTest = 0.4176998537
bestIteration = 251

Training on fold [0/5]

bestTest = 0.4131198144
bestIteration = 48

Training on fold [1/5]


[I 2025-03-20 17:13:59,584] Trial 7 finished with value: 0.40368976697832626 and parameters: {'iterations': 746, 'depth': 6, 'learning_rate': 0.037929151319309753, 'l2_leaf_reg': 0.011248123737321081, 'border_count': 97, 'random_strength': 0.008904066410234395, 'bootstrap_type': 'MVS', 'grow_policy': 'Lossguide', 'early_stopping_rounds': 39}. Best is trial 6 with value: 0.4005758807087821.
[I 2025-03-20 17:13:59,616] Trial 22 finished with value: 0.40299103210657955 and parameters: {'iterations': 406, 'depth': 6, 'learning_rate': 0.07212435786846565, 'l2_leaf_reg': 0.014918282151878834, 'border_count': 201, 'random_strength': 0.03298988122590917, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'SymmetricTree', 'early_stopping_rounds': 42}. Best is trial 6 with value: 0.4005758807087821.



bestTest = 0.4154097429
bestIteration = 145


bestTest = 0.4263636318
bestIteration = 115

Training on fold [0/5]

bestTest = 0.4129174764
bestIteration = 129

Training on fold [2/5]
Training on fold [0/5]

bestTest = 0.4035353657
bestIteration = 248

Training on fold [1/5]

bestTest = 0.4427269905
bestIteration = 48

Training on fold [1/5]


[I 2025-03-20 17:14:01,161] Trial 20 finished with value: 0.40400578436773005 and parameters: {'iterations': 965, 'depth': 4, 'learning_rate': 0.03766226233779883, 'l2_leaf_reg': 0.03396830024901833, 'border_count': 251, 'random_strength': 0.0031121996952896943, 'bootstrap_type': 'MVS', 'grow_policy': 'SymmetricTree', 'early_stopping_rounds': 13}. Best is trial 6 with value: 0.4005758807087821.



bestTest = 0.4175196062
bestIteration = 417



[I 2025-03-20 17:14:01,639] Trial 25 finished with value: 0.4149807481551848 and parameters: {'iterations': 543, 'depth': 4, 'learning_rate': 0.18603556248652967, 'l2_leaf_reg': 0.0019693760257262975, 'border_count': 174, 'random_strength': 4.998917234352786, 'bootstrap_type': 'MVS', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 48}. Best is trial 6 with value: 0.4005758807087821.


Training on fold [0/5]

bestTest = 0.4319726806
bestIteration = 92

Training on fold [0/5]

bestTest = 0.4044663313
bestIteration = 63

Training on fold [1/5]

bestTest = 0.4022526053
bestIteration = 204

Training on fold [3/5]

bestTest = 0.4064773986
bestIteration = 215

Training on fold [1/5]

bestTest = 0.4183187748
bestIteration = 65

Training on fold [1/5]

bestTest = 0.424745911
bestIteration = 54

Training on fold [2/5]

bestTest = 0.4290269668
bestIteration = 56

Training on fold [2/5]

bestTest = 0.4297502179
bestIteration = 212


bestTest = 0.4125961629
bestIteration = 46

Training on fold [1/5]
Training on fold [2/5]

bestTest = 0.4038363531
bestIteration = 137

Training on fold [1/5]

bestTest = 0.4202339148
bestIteration = 70

Training on fold [2/5]

bestTest = 0.371717231
bestIteration = 494

Training on fold [3/5]

bestTest = 0.4287182437
bestIteration = 204

Training on fold [4/5]

bestTest = 0.4094376273
bestIteration = 237

Training on fold [2/5]

bestTest = 0.412765

[I 2025-03-20 17:14:11,527] Trial 27 finished with value: 0.424580095492726 and parameters: {'iterations': 205, 'depth': 4, 'learning_rate': 0.03820196363091829, 'l2_leaf_reg': 0.2556583323066677, 'border_count': 126, 'random_strength': 4.430223428880904, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 21}. Best is trial 6 with value: 0.4005758807087821.



bestTest = 0.4351326084
bestIteration = 204


bestTest = 0.4137780911
bestIteration = 57

Training on fold [4/5]
Training on fold [0/5]

bestTest = 0.4363571757
bestIteration = 212

Training on fold [2/5]

bestTest = 0.4112905233
bestIteration = 216

Training on fold [2/5]

bestTest = 0.3774387772
bestIteration = 63

Training on fold [3/5]

bestTest = 0.412249357
bestIteration = 69

Training on fold [4/5]

bestTest = 0.407922838
bestIteration = 124

Training on fold [2/5]

bestTest = 0.3839574625
bestIteration = 79

Training on fold [3/5]

bestTest = 0.4041142802
bestIteration = 68

Training on fold [4/5]

bestTest = 0.406138841
bestIteration = 245


bestTest = 0.407501066
bestIteration = 73

Training on fold [4/5]
Training on fold [2/5]

bestTest = 0.4062805826
bestIteration = 94

Training on fold [1/5]


[I 2025-03-20 17:14:15,897] Trial 31 finished with value: 0.4225893353285904 and parameters: {'iterations': 213, 'depth': 6, 'learning_rate': 0.19856605426754945, 'l2_leaf_reg': 0.0014534061754860653, 'border_count': 171, 'random_strength': 4.171537401501354, 'bootstrap_type': 'MVS', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 29}. Best is trial 6 with value: 0.4005758807087821.



bestTest = 0.4363257716
bestIteration = 54


bestTest = 0.4305059737
bestIteration = 58



[I 2025-03-20 17:14:16,077] Trial 35 finished with value: 0.41454347712895023 and parameters: {'iterations': 206, 'depth': 5, 'learning_rate': 0.1746903281750491, 'l2_leaf_reg': 0.0019071265705229196, 'border_count': 168, 'random_strength': 4.740669633832034, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 33}. Best is trial 6 with value: 0.4005758807087821.


Training on fold [0/5]
Training on fold [0/5]

bestTest = 0.4195628188
bestIteration = 229

Training on fold [2/5]

bestTest = 0.3999818052
bestIteration = 162

Training on fold [4/5]

bestTest = 0.4078917373
bestIteration = 80


bestTest = 0.3708577799
bestIteration = 297

Training on fold [1/5]
Training on fold [3/5]


[I 2025-03-20 17:14:19,271] Trial 32 finished with value: 0.40887942251358816 and parameters: {'iterations': 201, 'depth': 6, 'learning_rate': 0.189868676640034, 'l2_leaf_reg': 0.258392626398891, 'border_count': 170, 'random_strength': 4.3484567339713385, 'bootstrap_type': 'MVS', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 30}. Best is trial 6 with value: 0.4005758807087821.



bestTest = 0.400342923
bestIteration = 212

Training on fold [3/5]

bestTest = 0.4282684131
bestIteration = 55

Training on fold [0/5]

bestTest = 0.4146772644
bestIteration = 103

Training on fold [2/5]


[I 2025-03-20 17:14:20,598] Trial 30 finished with value: 0.41259256156592805 and parameters: {'iterations': 212, 'depth': 6, 'learning_rate': 0.19064264880903262, 'l2_leaf_reg': 0.26118763831663855, 'border_count': 183, 'random_strength': 4.439019110421854, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 41}. Best is trial 6 with value: 0.4005758807087821.



bestTest = 0.4196172494
bestIteration = 73


bestTest = 0.3742029635
bestIteration = 524

Training on fold [0/5]
Training on fold [3/5]

bestTest = 0.3716698092
bestIteration = 128

Training on fold [3/5]

bestTest = 0.3753013818
bestIteration = 216

Training on fold [3/5]

bestTest = 0.4083903571
bestIteration = 102

Training on fold [2/5]

bestTest = 0.4092749541
bestIteration = 158

Training on fold [1/5]

bestTest = 0.3828778959
bestIteration = 229

Training on fold [3/5]

bestTest = 0.400794267
bestIteration = 137

Training on fold [1/5]


[I 2025-03-20 17:14:27,505] Trial 24 finished with value: 0.397811063928694 and parameters: {'iterations': 864, 'depth': 7, 'learning_rate': 0.06889471573522549, 'l2_leaf_reg': 0.37265269690298647, 'border_count': 121, 'random_strength': 1.9939575799005238, 'bootstrap_type': 'MVS', 'grow_policy': 'Lossguide', 'early_stopping_rounds': 11}. Best is trial 24 with value: 0.397811063928694.



bestTest = 0.4111909285
bestIteration = 178

Training on fold [0/5]

bestTest = 0.3701642609
bestIteration = 146

Training on fold [3/5]

bestTest = 0.427217535
bestIteration = 212

Training on fold [4/5]

bestTest = 0.3700839693
bestIteration = 92

Training on fold [3/5]

bestTest = 0.4003286693
bestIteration = 484


bestTest = 0.4032543355
bestIteration = 229

Training on fold [4/5]
Training on fold [4/5]

bestTest = 0.3996662459
bestIteration = 143

Training on fold [4/5]

bestTest = 0.4048335288
bestIteration = 79

Training on fold [4/5]

bestTest = 0.4087362439
bestIteration = 135

Training on fold [2/5]

bestTest = 0.4034465133
bestIteration = 101

Training on fold [4/5]

bestTest = 0.4134096692
bestIteration = 101

Training on fold [2/5]

bestTest = 0.4004745388
bestIteration = 111

Training on fold [1/5]

bestTest = 0.4040613192
bestIteration = 216

Training on fold [4/5]

bestTest = 0.3982462375
bestIteration = 465


bestTest = 0.4085060301
bestIteration = 229

Training on fo

[I 2025-03-20 17:14:35,625] Trial 34 finished with value: 0.4266172730010105 and parameters: {'iterations': 213, 'depth': 5, 'learning_rate': 0.023510319541851215, 'l2_leaf_reg': 0.001263225968729408, 'border_count': 166, 'random_strength': 4.861698606745094, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 29}. Best is trial 24 with value: 0.397811063928694.



bestTest = 0.4393885903
bestIteration = 211

Training on fold [0/5]

bestTest = 0.4221809089
bestIteration = 55

Training on fold [4/5]


[I 2025-03-20 17:14:37,244] Trial 38 finished with value: 0.4026761683030914 and parameters: {'iterations': 453, 'depth': 5, 'learning_rate': 0.09666135675569737, 'l2_leaf_reg': 0.43629710643735126, 'border_count': 140, 'random_strength': 0.6043110612794358, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 17}. Best is trial 24 with value: 0.397811063928694.



bestTest = 0.4178486246
bestIteration = 132



[I 2025-03-20 17:14:37,586] Trial 33 finished with value: 0.40216090426877094 and parameters: {'iterations': 225, 'depth': 6, 'learning_rate': 0.0941741208134333, 'l2_leaf_reg': 0.21027556390646426, 'border_count': 163, 'random_strength': 4.204196693163761, 'bootstrap_type': 'MVS', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 30}. Best is trial 24 with value: 0.397811063928694.


Training on fold [0/5]

bestTest = 0.4204503045
bestIteration = 118

Training on fold [0/5]

bestTest = 0.365911569
bestIteration = 135

Training on fold [3/5]

bestTest = 0.4063390437
bestIteration = 72

Training on fold [1/5]

bestTest = 0.4123767956
bestIteration = 104

Training on fold [2/5]


[I 2025-03-20 17:14:40,132] Trial 37 finished with value: 0.4028635064511688 and parameters: {'iterations': 204, 'depth': 5, 'learning_rate': 0.0925343225350629, 'l2_leaf_reg': 0.4323830863139545, 'border_count': 147, 'random_strength': 0.61366287773347, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 30}. Best is trial 24 with value: 0.397811063928694.



bestTest = 0.3657345889
bestIteration = 131


bestTest = 0.4108334737
bestIteration = 190

Training on fold [3/5]

bestTest = 0.4007981252
bestIteration = 386

Training on fold [0/5]
Training on fold [1/5]


[I 2025-03-20 17:14:41,433] Trial 28 finished with value: 0.4001957035497222 and parameters: {'iterations': 533, 'depth': 5, 'learning_rate': 0.05903847440846082, 'l2_leaf_reg': 0.6009141187657571, 'border_count': 120, 'random_strength': 4.3870872736422655, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 21}. Best is trial 24 with value: 0.397811063928694.



bestTest = 0.411495802
bestIteration = 337

Training on fold [0/5]


[I 2025-03-20 17:14:42,121] Trial 36 finished with value: 0.40984866022620636 and parameters: {'iterations': 230, 'depth': 5, 'learning_rate': 0.02083384316268622, 'l2_leaf_reg': 0.37053161158596404, 'border_count': 133, 'random_strength': 0.7171520752908315, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 29}. Best is trial 24 with value: 0.397811063928694.



bestTest = 0.4267850885
bestIteration = 227

Training on fold [0/5]


[I 2025-03-20 17:14:43,823] Trial 29 finished with value: 0.4028030458846075 and parameters: {'iterations': 217, 'depth': 6, 'learning_rate': 0.0496612411268966, 'l2_leaf_reg': 0.271556738457855, 'border_count': 178, 'random_strength': 4.7520931200982295, 'bootstrap_type': 'MVS', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 32}. Best is trial 24 with value: 0.397811063928694.



bestTest = 0.4167211262
bestIteration = 215

Training on fold [0/5]

bestTest = 0.4114262682
bestIteration = 97

Training on fold [2/5]


[I 2025-03-20 17:14:45,979] Trial 26 finished with value: 0.4012379682349663 and parameters: {'iterations': 546, 'depth': 4, 'learning_rate': 0.04008175647359076, 'l2_leaf_reg': 0.3501783801136771, 'border_count': 150, 'random_strength': 2.7369254079797507, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 21}. Best is trial 24 with value: 0.397811063928694.



bestTest = 0.3952615989
bestIteration = 143

Training on fold [4/5]

bestTest = 0.4156809585
bestIteration = 446


bestTest = 0.3958940524
bestIteration = 161

Training on fold [1/5]

bestTest = 0.3637232531
bestIteration = 132

Training on fold [3/5]

bestTest = 0.3953270837
bestIteration = 146

Training on fold [4/5]


[I 2025-03-20 17:14:49,776] Trial 0 finished with value: 0.4012770572789268 and parameters: {'iterations': 495, 'depth': 5, 'learning_rate': 0.02032469933278671, 'l2_leaf_reg': 0.6052031073501143, 'border_count': 211, 'random_strength': 1.9248966215245957, 'bootstrap_type': 'MVS', 'grow_policy': 'Lossguide', 'early_stopping_rounds': 22}. Best is trial 24 with value: 0.397811063928694.



bestTest = 0.4185231569
bestIteration = 494



[I 2025-03-20 17:14:53,211] Trial 39 finished with value: 0.40066458949064165 and parameters: {'iterations': 462, 'depth': 5, 'learning_rate': 0.09534466510724621, 'l2_leaf_reg': 0.4480029634338219, 'border_count': 130, 'random_strength': 0.5913190762763267, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Lossguide', 'early_stopping_rounds': 18}. Best is trial 24 with value: 0.397811063928694.
[I 2025-03-20 17:14:54,795] Trial 40 finished with value: 0.400139189647082 and parameters: {'iterations': 474, 'depth': 5, 'learning_rate': 0.09234479645216151, 'l2_leaf_reg': 0.7263638844493493, 'border_count': 141, 'random_strength': 0.5369954105637048, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Lossguide', 'early_stopping_rounds': 19}. Best is trial 24 with value: 0.397811063928694.
[I 2025-03-20 17:14:54,920] Trial 12 finished with value: 0.4100780707937576 and parameters: {'iterations': 329, 'depth': 9, 'learning_rate': 0.03600951182280689, 'l2_leaf_reg': 0.04682936724714622, 'border_count

Best Parameters: {'iterations': 657, 'depth': 7, 'learning_rate': 0.055239378192584745, 'l2_leaf_reg': 1.0018746025288165, 'border_count': 113, 'random_strength': 1.413719482197632, 'bootstrap_type': 'MVS', 'grow_policy': 'Lossguide', 'early_stopping_rounds': 16}


In [4]:
final_model = CatBoostClassifier(**study.best_params, verbose=0)
final_model.fit(X_train, y_train)
print("Final Model Accuracy:", accuracy_score(y_val, final_model.predict(X_val)))
final_model.save_model("../model/final_model.cbm")

Final Model Accuracy: 0.8016101207590569


In [5]:
# Load test data
df_test = pd.read_csv("../data/test.csv")
passenger_ids = df_test["PassengerId"]
df_test.drop(["PassengerId", "Name"], axis=1, inplace=True, errors="ignore")

# Handle missing numerical values
for col in numerical_cols:
    df_test[col] = df_test[col].fillna(df_test[col].mean())

# Feature Engineering: Split "Cabin" into "Deck", "CabinNumber", "Side"
if "Cabin" in df_test.columns:
    df_test[["Deck", "CabinNumber", "Side"]] = df_test["Cabin"].str.split("/", expand=True)
    df_test.drop("Cabin", axis=1, inplace=True)
else:
    df_test["Deck"], df_test["CabinNumber"], df_test["Side"] = "Unknown", 0, "Unknown"

df_test["CabinNumber"] = pd.to_numeric(df_test["CabinNumber"], errors="coerce").fillna(0)

# Ensure all categorical columns exist
for col in categorical_cols:
    if col not in df_test.columns:
        df_test[col] = "Unknown"

# Apply target encoding using saved mappings (with fallback for unseen categories)
for col in categorical_cols:
    df_test[f"{col}_encoded"] = df_test[col].map(target_encoding_mappings[col]).fillna(y.mean())

df_test.drop(categorical_cols, axis=1, inplace=True)

# Feature Engineering
df_test["TotalSpending"] = df_test[["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]].sum(axis=1)
df_test["Age_Spending"] = df_test["Age"] * df_test["TotalSpending"]

# Ensure you calculate "High_Cabin" before dropping "CabinNumber"
df_test["High_Cabin"] = (df_test["CabinNumber"] > df_test["CabinNumber"].median()).astype(int)

# Drop "CabinNumber" after creating "High_Cabin"
df_test.drop("CabinNumber", axis=1, inplace=True)


# Log Transformations
for col in spending_cols:
    df_test[f"{col}_log"] = np.log1p(df_test[col])
df_test.drop(spending_cols, axis=1, inplace=True)

# Ensure test data matches training data columns
df_test = df_test.reindex(columns=X_train.columns, fill_value=0)

print("Test Data Preprocessing Completed!")

Test Data Preprocessing Completed!


In [6]:
final_model = CatBoostClassifier()
final_model.load_model("../model/final_model.cbm")
predictions = final_model.predict(df_test)
submission = pd.DataFrame({"PassengerId": passenger_ids, "Transported": predictions})
submission.to_csv("../data/submission.csv", index=False)
print("Submission saved!")

Submission saved!
