In [1]:
import pandas as pd
pd.set_option('future.no_silent_downcasting', True)
import numpy as np
import optuna
from clearml import Task
import logging
import json
import os
from catboost import CatBoostClassifier, Pool, cv
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from category_encoders import TargetEncoder

# Configure logging
os.makedirs("../data", exist_ok=True)
logging.basicConfig(
    filename='../data/log_file.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logging.info("Logging initialized.")

In [2]:
# Load training data
logging.info("Loading training data...")
df = pd.read_csv("../data/train.csv")
logging.info(f"Training data loaded. Shape: {df.shape}")

# Drop unnecessary columns
logging.info("Dropping unnecessary columns...")
df.drop(["PassengerId", "Name"], axis=1, inplace=True, errors="ignore")
logging.info(f"Columns after dropping: {df.columns.tolist()}")

# Define numerical and categorical columns
numerical_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
categorical_cols = ["HomePlanet", "CryoSleep", "Destination", "VIP"]
logging.info(f"Numerical columns: {numerical_cols}")
logging.info(f"Categorical columns: {categorical_cols}")

# Handle missing values
logging.info("Handling missing values...")
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].mean())
    logging.info(f"Filled missing values in {col} with mean: {df[col].mean()}")
for col in categorical_cols:
    mode_value = df[col].mode()[0]
    df[col] = df[col].fillna(mode_value)
    logging.info(f"Filled missing values in {col} with mode: {mode_value}")

# Feature Engineering: Split "Cabin" into "Deck", "CabinNumber", "Side"
logging.info("Splitting 'Cabin' column...")
df[["Deck", "CabinNumber", "Side"]] = df["Cabin"].str.split("/", expand=True)
df.drop("Cabin", axis=1, inplace=True)
df["CabinNumber"] = pd.to_numeric(df["CabinNumber"], errors="coerce").fillna(0)
logging.info(f"New columns after splitting 'Cabin': {df.columns.tolist()}")

# Update categorical columns list
categorical_cols.extend(["Deck", "Side"])
logging.info(f"Updated categorical columns: {categorical_cols}")

# Target Encoding
logging.info("Applying target encoding...")
target_encoding_mappings = {col: df.groupby(col)["Transported"].mean() for col in categorical_cols}
for col in categorical_cols:
    df[f"{col}_encoded"] = df[col].map(target_encoding_mappings[col])
df.drop(categorical_cols, axis=1, inplace=True)
logging.info(f"Columns after target encoding: {df.columns.tolist()}")

# Feature Engineering
logging.info("Creating interaction features...")
df["TotalSpending"] = df[["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]].sum(axis=1)
df["Age_Spending"] = df["Age"] * df["TotalSpending"]
df["High_Cabin"] = (df["CabinNumber"] > df["CabinNumber"].median()).astype(int)
df.drop("CabinNumber", axis=1, inplace=True)
logging.info(f"Columns after feature engineering: {df.columns.tolist()}")

# Log Transformations for Spending Columns
logging.info("Applying log transformations...")
spending_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
for col in spending_cols:
    df[f"{col}_log"] = np.log1p(df[col])
df.drop(spending_cols, axis=1, inplace=True)
logging.info(f"Columns after log transformations: {df.columns.tolist()}")

# Train-Test Split
logging.info("Splitting data into training and validation sets...")
X = df.drop("Transported", axis=1)
y = df["Transported"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
logging.info(f"Training data shape: {X_train.shape}, Validation data shape: {X_val.shape}")

In [3]:
# Initialize ClearML Task
logging.info("Initializing ClearML task...")
task = Task.init(project_name="MLops_HW1", task_name="Optuna_HPO")
logging.info("ClearML task initialized.")

def objective(trial):
    try:
        logging.info(f"Starting trial {trial.number}...")
        # Hyperparameters to tune
        params = {
            "iterations": trial.suggest_int("iterations", 200, 1000),
            "depth": trial.suggest_int("depth", 4, 9),
            "learning_rate": trial.suggest_float("learning_rate", 0.02, 0.2, log=True),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 5, log=True),
            "border_count": trial.suggest_int("border_count", 64, 255),
            "random_strength": trial.suggest_float("random_strength", 1e-3, 5, log=True),
            "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
            "grow_policy": trial.suggest_categorical("grow_policy", ["SymmetricTree", "Depthwise", "Lossguide"]),
            "loss_function": "Logloss",
            "verbose": False,
            "early_stopping_rounds": trial.suggest_int("early_stopping_rounds", 10, 50),
        }
        logging.info(f"Trial {trial.number} parameters: {params}")

        # Prepare the training pool for CatBoost
        train_pool = Pool(X_train, label=y_train)

        # Run cross-validation
        cv_results = cv(
            params=params,
            pool=train_pool,
            fold_count=5,
            shuffle=True,
            partition_random_seed=42,
            stratified=True,
            early_stopping_rounds=params["early_stopping_rounds"],
            verbose_eval=False,
        )
        logging.info(f"Trial {trial.number} CV results: {cv_results}")

        # Extract the best test metric (Logloss)
        best_metric = [col for col in cv_results.columns if "test" in col and "mean" in col][0]
        best_value = cv_results[best_metric].min()
        logging.info(f"Trial {trial.number} best metric: {best_metric}, value: {best_value}")
        return best_value

    except Exception as e:
        logging.error(f"Error in trial {trial.number}: {e}")
        return float("inf")  # Return a high value to indicate a failed trial

# Run Optuna with parallel trials
logging.info("Starting Optuna optimization...")
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10, n_jobs=2)  # Start with fewer trials and cores
logging.info("Optuna optimization complete.")

# Log and save the best parameters
best_params = study.best_params
logging.info(f"Best Parameters: {best_params}")
task.get_logger().report_text(f"Best Parameters: {best_params}")
with open("../mlops_hw1/best_params.json", "w") as f:
    json.dump(best_params, f)
logging.info("Best parameters saved to JSON file.")

ClearML Task: overwriting (reusing) task id=8fa16091c0d64f47b3fa58aab60abde5
2025-03-23 00:35:11,446 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/fde1cfe19b89449da6ba2b22765e31d4/experiments/8fa16091c0d64f47b3fa58aab60abde5/output/log


[I 2025-03-23 00:35:18,675] A new study created in memory with name: no-name-f528c837-6bbd-400f-b6cf-2c0a212943cb


Training on fold [0/5]
Training on fold [0/5]

bestTest = 0.40257424
bestIteration = 97

Training on fold [1/5]

bestTest = 0.4053509376
bestIteration = 136

Training on fold [1/5]

bestTest = 0.4120848449
bestIteration = 93

Training on fold [2/5]

bestTest = 0.4172694442
bestIteration = 104

Training on fold [2/5]

bestTest = 0.3699957122
bestIteration = 89

Training on fold [3/5]

bestTest = 0.4082643453
bestIteration = 74

Training on fold [4/5]


[I 2025-03-23 00:35:40,121] Trial 1 finished with value: 0.40234408633837904 and parameters: {'iterations': 568, 'depth': 6, 'learning_rate': 0.12196860804327339, 'l2_leaf_reg': 1.579809089407408, 'border_count': 130, 'random_strength': 1.4552971541336908, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 34}. Best is trial 1 with value: 0.40234408633837904.



bestTest = 0.413776512
bestIteration = 99


bestTest = 0.3774589196
bestIteration = 207

Training on fold [0/5]

bestTest = 0.4019672977
bestIteration = 201

Training on fold [1/5]

bestTest = 0.4108607488
bestIteration = 153

Training on fold [4/5]

bestTest = 0.4157837439
bestIteration = 152

Training on fold [2/5]

bestTest = 0.3732827495
bestIteration = 170

Training on fold [3/5]


[I 2025-03-23 00:35:55,046] Trial 0 finished with value: 0.4070703940362529 and parameters: {'iterations': 596, 'depth': 6, 'learning_rate': 0.04220021873417702, 'l2_leaf_reg': 0.0012427320601689133, 'border_count': 249, 'random_strength': 0.001352469403432699, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Lossguide', 'early_stopping_rounds': 29}. Best is trial 1 with value: 0.40234408633837904.



bestTest = 0.4191687704
bestIteration = 172

Training on fold [0/5]

bestTest = 0.4124393995
bestIteration = 61

Training on fold [1/5]

bestTest = 0.3950126692
bestIteration = 220

Training on fold [4/5]

bestTest = 0.4184781874
bestIteration = 78

Training on fold [2/5]


[I 2025-03-23 00:36:01,378] Trial 2 finished with value: 0.40129428560946556 and parameters: {'iterations': 461, 'depth': 4, 'learning_rate': 0.0636825766942291, 'l2_leaf_reg': 0.011856026180762574, 'border_count': 164, 'random_strength': 1.0400683377343107, 'bootstrap_type': 'MVS', 'grow_policy': 'Lossguide', 'early_stopping_rounds': 12}. Best is trial 2 with value: 0.40129428560946556.



bestTest = 0.418375342
bestIteration = 151

Training on fold [0/5]

bestTest = 0.3820632737
bestIteration = 53

Training on fold [3/5]

bestTest = 0.4046207377
bestIteration = 50

Training on fold [4/5]


[I 2025-03-23 00:36:05,851] Trial 3 finished with value: 0.4084335936442022 and parameters: {'iterations': 327, 'depth': 5, 'learning_rate': 0.09406157776119957, 'l2_leaf_reg': 0.0015118150462235774, 'border_count': 218, 'random_strength': 0.011275556968637968, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 31}. Best is trial 2 with value: 0.40129428560946556.



bestTest = 0.4177357511
bestIteration = 52

Training on fold [0/5]

bestTest = 0.4235316542
bestIteration = 24

Training on fold [1/5]

bestTest = 0.4257152794
bestIteration = 29

Training on fold [2/5]

bestTest = 0.3985497388
bestIteration = 210

Training on fold [1/5]

bestTest = 0.3901793996
bestIteration = 25

Training on fold [3/5]

bestTest = 0.4061777564
bestIteration = 33

Training on fold [4/5]

bestTest = 0.4040497655
bestIteration = 287

Training on fold [2/5]


[I 2025-03-23 00:36:33,239] Trial 4 finished with value: 0.4173879521840071 and parameters: {'iterations': 762, 'depth': 9, 'learning_rate': 0.17385474869598416, 'l2_leaf_reg': 0.21159112345703837, 'border_count': 142, 'random_strength': 2.278561113035634, 'bootstrap_type': 'MVS', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 41}. Best is trial 2 with value: 0.40129428560946556.



bestTest = 0.4317176067
bestIteration = 24

Training on fold [0/5]

bestTest = 0.3704776951
bestIteration = 251

Training on fold [3/5]

bestTest = 0.4177997963
bestIteration = 36

Training on fold [1/5]

bestTest = 0.4276835846
bestIteration = 33

Training on fold [2/5]

bestTest = 0.3993778918
bestIteration = 223

Training on fold [4/5]

bestTest = 0.3789785214
bestIteration = 60

Training on fold [3/5]


[I 2025-03-23 00:37:01,010] Trial 5 finished with value: 0.39823299289006836 and parameters: {'iterations': 290, 'depth': 9, 'learning_rate': 0.03786010055600511, 'l2_leaf_reg': 0.042405878023386065, 'border_count': 133, 'random_strength': 0.478506896646785, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'Lossguide', 'early_stopping_rounds': 24}. Best is trial 5 with value: 0.39823299289006836.



bestTest = 0.4165366771
bestIteration = 258

Training on fold [0/5]

bestTest = 0.4287176516
bestIteration = 30

Training on fold [4/5]


[I 2025-03-23 00:37:07,430] Trial 6 finished with value: 0.4223657282178178 and parameters: {'iterations': 708, 'depth': 8, 'learning_rate': 0.049846310830541474, 'l2_leaf_reg': 0.008400787288611843, 'border_count': 242, 'random_strength': 0.00875561669877461, 'bootstrap_type': 'MVS', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 41}. Best is trial 5 with value: 0.39823299289006836.



bestTest = 0.446371795
bestIteration = 35

Training on fold [0/5]

bestTest = 0.4065089764
bestIteration = 74

Training on fold [1/5]

bestTest = 0.4029687798
bestIteration = 91

Training on fold [2/5]

bestTest = 0.3682173482
bestIteration = 96

Training on fold [3/5]

bestTest = 0.4000093357
bestIteration = 469

Training on fold [1/5]

bestTest = 0.4028804208
bestIteration = 119

Training on fold [4/5]


[I 2025-03-23 00:37:27,466] Trial 8 finished with value: 0.40134825113310235 and parameters: {'iterations': 422, 'depth': 8, 'learning_rate': 0.060491578419171355, 'l2_leaf_reg': 0.06845539096030959, 'border_count': 215, 'random_strength': 0.07449914481483448, 'bootstrap_type': 'MVS', 'grow_policy': 'SymmetricTree', 'early_stopping_rounds': 44}. Best is trial 5 with value: 0.39823299289006836.



bestTest = 0.4180343803
bestIteration = 104

Training on fold [0/5]

bestTest = 0.4048449936
bestIteration = 333

Training on fold [1/5]

bestTest = 0.4112421482
bestIteration = 353

Training on fold [2/5]

bestTest = 0.4104156079
bestIteration = 380

Training on fold [2/5]

bestTest = 0.367394723
bestIteration = 641

Training on fold [3/5]

bestTest = 0.3736303565
bestIteration = 414

Training on fold [3/5]

bestTest = 0.3983881235
bestIteration = 496

Training on fold [4/5]


[I 2025-03-23 00:38:10,219] Trial 9 finished with value: 0.3994988076547271 and parameters: {'iterations': 642, 'depth': 4, 'learning_rate': 0.027162724021251212, 'l2_leaf_reg': 0.010019833565632661, 'border_count': 160, 'random_strength': 0.3678283088205616, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 42}. Best is trial 5 with value: 0.39823299289006836.



bestTest = 0.4149991972
bestIteration = 538



[I 2025-03-23 00:38:22,325] Trial 7 finished with value: 0.40037489282553923 and parameters: {'iterations': 719, 'depth': 6, 'learning_rate': 0.020561601461881383, 'l2_leaf_reg': 1.144883772640748, 'border_count': 80, 'random_strength': 0.011200674038948817, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Lossguide', 'early_stopping_rounds': 19}. Best is trial 5 with value: 0.39823299289006836.


Best Parameters: {'iterations': 290, 'depth': 9, 'learning_rate': 0.03786010055600511, 'l2_leaf_reg': 0.042405878023386065, 'border_count': 133, 'random_strength': 0.478506896646785, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'Lossguide', 'early_stopping_rounds': 24}
ClearML Monitor: Could not detect iteration reporting, falling back to iterations as seconds-from-start


In [4]:
# Train final model
logging.info("Training final model...")
final_model = CatBoostClassifier(**study.best_params, verbose=0)
final_model.fit(X_train, y_train)
logging.info("Final model trained.")

# Evaluate final model
val_accuracy = accuracy_score(y_val, final_model.predict(X_val))
logging.info(f"Final Model Accuracy: {val_accuracy}")
print("Final Model Accuracy:", val_accuracy)

# Save final model
logging.info("Saving final model...")
final_model.save_model("../model/final_model.cbm")
logging.info("Final model saved.")

Final Model Accuracy: 0.7981598619896493
2025-03-23 00:38:25,565 - clearml.frameworks - INFO - Found existing registered model id=ea8195242e9b4e0b983a49f29e9964cf [/home/yusuf/mlops_hw1/model/final_model.cbm] reusing it.


In [6]:
# Load test data
logging.info("Loading test data...")
df_test = pd.read_csv("../data/test.csv")
logging.info(f"Test data loaded. Shape: {df_test.shape}")

# Preprocess test data
logging.info("Preprocessing test data...")
passenger_ids = df_test["PassengerId"]
df_test.drop(["PassengerId", "Name"], axis=1, inplace=True, errors="ignore")

# Handle missing numerical values
for col in numerical_cols:
    df_test[col] = df_test[col].fillna(df_test[col].mean())
    logging.info(f"Filled missing values in {col} with mean: {df_test[col].mean()}")

# Feature Engineering: Split "Cabin" into "Deck", "CabinNumber", "Side"
if "Cabin" in df_test.columns:
    df_test[["Deck", "CabinNumber", "Side"]] = df_test["Cabin"].str.split("/", expand=True)
    df_test.drop("Cabin", axis=1, inplace=True)
else:
    df_test["Deck"], df_test["CabinNumber"], df_test["Side"] = "Unknown", 0, "Unknown"
    logging.info("'Cabin' column not found. Filled with default values.")

df_test["CabinNumber"] = pd.to_numeric(df_test["CabinNumber"], errors="coerce").fillna(0)

# Ensure all categorical columns exist
for col in categorical_cols:
    if col not in df_test.columns:
        df_test[col] = "Unknown"
        logging.info(f"Added missing categorical column: {col}")

# Apply target encoding using saved mappings (with fallback for unseen categories)
for col in categorical_cols:
    df_test[f"{col}_encoded"] = df_test[col].map(target_encoding_mappings[col]).fillna(y.mean())
    logging.info(f"Applied target encoding to {col}")

df_test.drop(categorical_cols, axis=1, inplace=True)

# Feature Engineering
df_test["TotalSpending"] = df_test[["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]].sum(axis=1)
df_test["Age_Spending"] = df_test["Age"] * df_test["TotalSpending"]

# Ensure you calculate "High_Cabin" before dropping "CabinNumber"
df_test["High_Cabin"] = (df_test["CabinNumber"] > df_test["CabinNumber"].median()).astype(int)

# Drop "CabinNumber" after creating "High_Cabin"
df_test.drop("CabinNumber", axis=1, inplace=True)

# Log Transformations
for col in spending_cols:
    df_test[f"{col}_log"] = np.log1p(df_test[col])
df_test.drop(spending_cols, axis=1, inplace=True)

# Ensure test data matches training data columns
df_test = df_test.reindex(columns=X_train.columns, fill_value=0)
logging.info("Test data preprocessing completed.")

In [8]:
# Load final model
logging.info("Loading final model...")
final_model = CatBoostClassifier()
final_model.load_model("../model/final_model.cbm")
logging.info("Final model loaded.")

# Make predictions
logging.info("Making predictions on test data...")
predictions = final_model.predict(df_test)
logging.info("Predictions completed.")

# Save submission
logging.info("Saving submission file...")
submission = pd.DataFrame({"PassengerId": passenger_ids, "Transported": predictions})
submission.to_csv("../data/submission.csv", index=False)
logging.info("Submission file saved.")
print("Submission saved!")

Submission saved!
