In [14]:
import pandas as pd
pd.set_option('future.no_silent_downcasting', True)
import numpy as np
import optuna
from clearml import Task
import logging
import json
import os
from catboost import CatBoostClassifier, Pool, cv
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from category_encoders import TargetEncoder

# Configure logging
os.makedirs("../data", exist_ok=True)
logging.basicConfig(
    filename='../data/log_file.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logging.info("Logging initialized.")

In [15]:
# Load training data
logging.info("Loading training data...")
df = pd.read_csv("../data/train.csv")
logging.info(f"Training data loaded. Shape: {df.shape}")

# Drop unnecessary columns
logging.info("Dropping unnecessary columns...")
df.drop(["PassengerId", "Name"], axis=1, inplace=True, errors="ignore")
logging.info(f"Columns after dropping: {df.columns.tolist()}")

# Define numerical and categorical columns
numerical_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
categorical_cols = ["HomePlanet", "CryoSleep", "Destination", "VIP"]
logging.info(f"Numerical columns: {numerical_cols}")
logging.info(f"Categorical columns: {categorical_cols}")

# Handle missing values
logging.info("Handling missing values...")
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].mean())
    logging.info(f"Filled missing values in {col} with mean: {df[col].mean()}")
for col in categorical_cols:
    mode_value = df[col].mode()[0]
    df[col] = df[col].fillna(mode_value)
    logging.info(f"Filled missing values in {col} with mode: {mode_value}")

# Feature Engineering: Split "Cabin" into "Deck", "CabinNumber", "Side"
logging.info("Splitting 'Cabin' column...")
df[["Deck", "CabinNumber", "Side"]] = df["Cabin"].str.split("/", expand=True)
df.drop("Cabin", axis=1, inplace=True)
df["CabinNumber"] = pd.to_numeric(df["CabinNumber"], errors="coerce").fillna(0)
logging.info(f"New columns after splitting 'Cabin': {df.columns.tolist()}")

# Update categorical columns list
categorical_cols.extend(["Deck", "Side"])
logging.info(f"Updated categorical columns: {categorical_cols}")

# Target Encoding
logging.info("Applying target encoding...")
target_encoding_mappings = {col: df.groupby(col)["Transported"].mean() for col in categorical_cols}
for col in categorical_cols:
    df[f"{col}_encoded"] = df[col].map(target_encoding_mappings[col])
df.drop(categorical_cols, axis=1, inplace=True)
logging.info(f"Columns after target encoding: {df.columns.tolist()}")

# Feature Engineering
logging.info("Creating interaction features...")
df["TotalSpending"] = df[["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]].sum(axis=1)
df["Age_Spending"] = df["Age"] * df["TotalSpending"]
df["High_Cabin"] = (df["CabinNumber"] > df["CabinNumber"].median()).astype(int)
df.drop("CabinNumber", axis=1, inplace=True)
logging.info(f"Columns after feature engineering: {df.columns.tolist()}")

# Log Transformations for Spending Columns
logging.info("Applying log transformations...")
spending_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
for col in spending_cols:
    df[f"{col}_log"] = np.log1p(df[col])
df.drop(spending_cols, axis=1, inplace=True)
logging.info(f"Columns after log transformations: {df.columns.tolist()}")

# Train-Test Split
logging.info("Splitting data into training and validation sets...")
X = df.drop("Transported", axis=1)
y = df["Transported"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
logging.info(f"Training data shape: {X_train.shape}, Validation data shape: {X_val.shape}")

FileNotFoundError: [Errno 2] No such file or directory: '../data/train.csv'

In [6]:
# Initialize ClearML Task
logging.info("Initializing ClearML task...")
task = Task.init(project_name="MLops_HW1", task_name="Optuna_HPO")
logging.info("ClearML task initialized.")

def objective(trial):
    try:
        logging.info(f"Starting trial {trial.number}...")
        # Hyperparameters to tune
        params = {
            "iterations": trial.suggest_int("iterations", 200, 1000),
            "depth": trial.suggest_int("depth", 4, 9),
            "learning_rate": trial.suggest_float("learning_rate", 0.02, 0.2, log=True),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 5, log=True),
            "border_count": trial.suggest_int("border_count", 64, 255),
            "random_strength": trial.suggest_float("random_strength", 1e-3, 5, log=True),
            "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
            "grow_policy": trial.suggest_categorical("grow_policy", ["SymmetricTree", "Depthwise", "Lossguide"]),
            "loss_function": "Logloss",
            "verbose": False,
            "early_stopping_rounds": trial.suggest_int("early_stopping_rounds", 10, 50),
        }
        logging.info(f"Trial {trial.number} parameters: {params}")

        # Prepare the training pool for CatBoost
        train_pool = Pool(X_train, label=y_train)

        # Run cross-validation
        cv_results = cv(
            params=params,
            pool=train_pool,
            fold_count=5,
            shuffle=True,
            partition_random_seed=42,
            stratified=True,
            early_stopping_rounds=params["early_stopping_rounds"],
            verbose_eval=False,
        )
        logging.info(f"Trial {trial.number} CV results: {cv_results}")

        # Extract the best test metric (Logloss)
        best_metric = [col for col in cv_results.columns if "test" in col and "mean" in col][0]
        best_value = cv_results[best_metric].min()
        logging.info(f"Trial {trial.number} best metric: {best_metric}, value: {best_value}")
        return best_value

    except Exception as e:
        logging.error(f"Error in trial {trial.number}: {e}")
        return float("inf")  # Return a high value to indicate a failed trial

# Run Optuna with parallel trials
logging.info("Starting Optuna optimization...")
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10, n_jobs=2)  # Start with fewer trials and cores
logging.info("Optuna optimization complete.")

# Log and save the best parameters
best_params = study.best_params
logging.info(f"Best Parameters: {best_params}")
task.get_logger().report_text(f"Best Parameters: {best_params}")
with open("../mlops_hw1/best_params.json", "w") as f:
    json.dump(best_params, f)
logging.info("Best parameters saved to JSON file.")

ClearML Task: created new task id=1982f43fbd5b4f6d992d2c75c3697e82
2025-03-23 00:16:09,349 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/fde1cfe19b89449da6ba2b22765e31d4/experiments/1982f43fbd5b4f6d992d2c75c3697e82/output/log


[I 2025-03-23 00:16:13,886] A new study created in memory with name: no-name-483f890c-fc2b-4222-a78b-3d3badb6b9d3
[I 2025-03-23 00:16:13,957] Trial 1 finished with value: inf and parameters: {'iterations': 315, 'depth': 6, 'learning_rate': 0.07125776350426297, 'l2_leaf_reg': 0.6496412779475277, 'border_count': 83, 'random_strength': 0.03555754385324258, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'Lossguide', 'early_stopping_rounds': 29}. Best is trial 1 with value: inf.


Training on fold [0/5]
Training on fold [0/5]

bestTest = 0.4319469444
bestIteration = 24

Training on fold [1/5]

bestTest = 0.4312308594
bestIteration = 14

Training on fold [2/5]

bestTest = 0.4072059785
bestIteration = 17

Training on fold [3/5]

bestTest = 0.3985376637
bestIteration = 541

Training on fold [1/5]

bestTest = 0.4333782513
bestIteration = 17

Training on fold [4/5]


[I 2025-03-23 00:16:40,714] Trial 2 finished with value: 0.43327320232187694 and parameters: {'iterations': 849, 'depth': 9, 'learning_rate': 0.07486984153809682, 'l2_leaf_reg': 0.0035206784572581243, 'border_count': 120, 'random_strength': 0.00251523069679985, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 21}. Best is trial 2 with value: 0.43327320232187694.



bestTest = 0.4574437028
bestIteration = 15

Training on fold [0/5]

bestTest = 0.3990133094
bestIteration = 135

Training on fold [1/5]

bestTest = 0.4056778838
bestIteration = 562

Training on fold [2/5]

bestTest = 0.416246562
bestIteration = 68

Training on fold [2/5]

bestTest = 0.3686307756
bestIteration = 103

Training on fold [3/5]

bestTest = 0.4016002407
bestIteration = 77

Training on fold [4/5]


[I 2025-03-23 00:17:02,899] Trial 3 finished with value: 0.40062215781801214 and parameters: {'iterations': 983, 'depth': 6, 'learning_rate': 0.05974420144231747, 'l2_leaf_reg': 0.6512301478618386, 'border_count': 238, 'random_strength': 0.008976949900411221, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 32}. Best is trial 3 with value: 0.40062215781801214.



bestTest = 0.413633943
bestIteration = 112

Training on fold [0/5]

bestTest = 0.3660784191
bestIteration = 651


bestTest = 0.4142866788
bestIteration = 85

Training on fold [1/5]
Training on fold [3/5]

bestTest = 0.4186271547
bestIteration = 82

Training on fold [2/5]

bestTest = 0.3844945593
bestIteration = 98

Training on fold [3/5]

bestTest = 0.3973920679
bestIteration = 581

Training on fold [4/5]

bestTest = 0.4108896737
bestIteration = 89

Training on fold [4/5]


[I 2025-03-23 00:17:43,698] Trial 0 finished with value: 0.3969901870793472 and parameters: {'iterations': 981, 'depth': 5, 'learning_rate': 0.026624118026934724, 'l2_leaf_reg': 1.7410889208547, 'border_count': 73, 'random_strength': 3.5645739616109258, 'bootstrap_type': 'MVS', 'grow_policy': 'Lossguide', 'early_stopping_rounds': 23}. Best is trial 0 with value: 0.3969901870793472.
[I 2025-03-23 00:17:43,754] Trial 4 finished with value: 0.41099620499489103 and parameters: {'iterations': 557, 'depth': 8, 'learning_rate': 0.022755575982566664, 'l2_leaf_reg': 0.007642784881252323, 'border_count': 211, 'random_strength': 0.05750297407756639, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 13}. Best is trial 0 with value: 0.3969901870793472.



bestTest = 0.4152939223
bestIteration = 464

Training on fold [0/5]

bestTest = 0.4240974902
bestIteration = 87

Training on fold [0/5]

bestTest = 0.4213922626
bestIteration = 56

Training on fold [1/5]

bestTest = 0.4140895994
bestIteration = 59

Training on fold [2/5]

bestTest = 0.3811671971
bestIteration = 73

Training on fold [3/5]

bestTest = 0.4070386387
bestIteration = 51

Training on fold [1/5]

bestTest = 0.4079584182
bestIteration = 68

Training on fold [4/5]


[I 2025-03-23 00:18:02,997] Trial 5 finished with value: 0.41176719711758214 and parameters: {'iterations': 855, 'depth': 7, 'learning_rate': 0.1943826123529642, 'l2_leaf_reg': 0.20699926202665864, 'border_count': 236, 'random_strength': 3.679578476956208, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Lossguide', 'early_stopping_rounds': 34}. Best is trial 0 with value: 0.3969901870793472.



bestTest = 0.4257297839
bestIteration = 63

Training on fold [0/5]

bestTest = 0.4210198042
bestIteration = 122

Training on fold [1/5]

bestTest = 0.4256978695
bestIteration = 37

Training on fold [2/5]

bestTest = 0.4271504741
bestIteration = 82

Training on fold [2/5]

bestTest = 0.3855236764
bestIteration = 86

Training on fold [3/5]

bestTest = 0.4175666941
bestIteration = 92

Training on fold [4/5]


[I 2025-03-23 00:18:18,284] Trial 7 finished with value: 0.4178918830820435 and parameters: {'iterations': 425, 'depth': 9, 'learning_rate': 0.05975841584741659, 'l2_leaf_reg': 0.004781736041358598, 'border_count': 165, 'random_strength': 4.108798170170366, 'bootstrap_type': 'Bayesian', 'grow_policy': 'SymmetricTree', 'early_stopping_rounds': 25}. Best is trial 0 with value: 0.3969901870793472.



bestTest = 0.4301463512
bestIteration = 78

Training on fold [0/5]

bestTest = 0.3897956383
bestIteration = 50

Training on fold [3/5]

bestTest = 0.4070392792
bestIteration = 73

Training on fold [1/5]

bestTest = 0.4189656558
bestIteration = 36

Training on fold [4/5]

bestTest = 0.4129286807
bestIteration = 69

Training on fold [2/5]


[I 2025-03-23 00:18:41,887] Trial 6 finished with value: 0.4155056123931117 and parameters: {'iterations': 955, 'depth': 9, 'learning_rate': 0.03458594506860304, 'l2_leaf_reg': 0.013783903318665691, 'border_count': 133, 'random_strength': 0.12384606538487868, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 49}. Best is trial 0 with value: 0.3969901870793472.



bestTest = 0.430898514
bestIteration = 43

Training on fold [0/5]

bestTest = 0.402306196
bestIteration = 388

Training on fold [1/5]

bestTest = 0.3697480128
bestIteration = 107

Training on fold [3/5]

bestTest = 0.40820895
bestIteration = 308

Training on fold [2/5]

bestTest = 0.407404861
bestIteration = 66

Training on fold [4/5]

bestTest = 0.3707853225
bestIteration = 442

Training on fold [3/5]


[I 2025-03-23 00:19:15,545] Trial 8 finished with value: 0.40402715267666095 and parameters: {'iterations': 204, 'depth': 9, 'learning_rate': 0.049411976791883395, 'l2_leaf_reg': 3.911163909236514, 'border_count': 251, 'random_strength': 0.1387797892800934, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Depthwise', 'early_stopping_rounds': 23}. Best is trial 0 with value: 0.3969901870793472.



bestTest = 0.4183180794
bestIteration = 70

ClearML Monitor: Could not detect iteration reporting, falling back to iterations as seconds-from-start


[I 2025-03-23 00:19:22,751] Trial 9 finished with value: 0.4008614009970793 and parameters: {'iterations': 667, 'depth': 7, 'learning_rate': 0.036349819221252896, 'l2_leaf_reg': 2.192260386231003, 'border_count': 194, 'random_strength': 1.657179570515151, 'bootstrap_type': 'Bayesian', 'grow_policy': 'SymmetricTree', 'early_stopping_rounds': 44}. Best is trial 0 with value: 0.3969901870793472.


Best Parameters: {'iterations': 981, 'depth': 5, 'learning_rate': 0.026624118026934724, 'l2_leaf_reg': 1.7410889208547, 'border_count': 73, 'random_strength': 3.5645739616109258, 'bootstrap_type': 'MVS', 'grow_policy': 'Lossguide', 'early_stopping_rounds': 23}


In [8]:
# Train final model
logging.info("Training final model...")
final_model = CatBoostClassifier(**study.best_params, verbose=0)
final_model.fit(X_train, y_train)
logging.info("Final model trained.")

# Evaluate final model
val_accuracy = accuracy_score(y_val, final_model.predict(X_val))
logging.info(f"Final Model Accuracy: {val_accuracy}")
print("Final Model Accuracy:", val_accuracy)

# Save final model
logging.info("Saving final model...")
final_model.save_model("./model/final_model.cbm")
logging.info("Final model saved.")

Final Model Accuracy: 0.8027602070155262
2025-03-23 00:21:35,328 - clearml.frameworks - INFO - Found existing registered model id=ea8195242e9b4e0b983a49f29e9964cf [/home/yusuf/mlops_hw1/model/final_model.cbm] reusing it.


In [10]:
# Load test data
logging.info("Loading test data...")
df_test = pd.read_csv("./data/test.csv")
logging.info(f"Test data loaded. Shape: {df_test.shape}")

# Preprocess test data
logging.info("Preprocessing test data...")
passenger_ids = df_test["PassengerId"]
df_test.drop(["PassengerId", "Name"], axis=1, inplace=True, errors="ignore")

# Handle missing numerical values
for col in numerical_cols:
    df_test[col] = df_test[col].fillna(df_test[col].mean())
    logging.info(f"Filled missing values in {col} with mean: {df_test[col].mean()}")

# Feature Engineering: Split "Cabin" into "Deck", "CabinNumber", "Side"
if "Cabin" in df_test.columns:
    df_test[["Deck", "CabinNumber", "Side"]] = df_test["Cabin"].str.split("/", expand=True)
    df_test.drop("Cabin", axis=1, inplace=True)
else:
    df_test["Deck"], df_test["CabinNumber"], df_test["Side"] = "Unknown", 0, "Unknown"
    logging.info("'Cabin' column not found. Filled with default values.")

df_test["CabinNumber"] = pd.to_numeric(df_test["CabinNumber"], errors="coerce").fillna(0)

# Ensure all categorical columns exist
for col in categorical_cols:
    if col not in df_test.columns:
        df_test[col] = "Unknown"
        logging.info(f"Added missing categorical column: {col}")

# Apply target encoding using saved mappings (with fallback for unseen categories)
for col in categorical_cols:
    df_test[f"{col}_encoded"] = df_test[col].map(target_encoding_mappings[col]).fillna(y.mean())
    logging.info(f"Applied target encoding to {col}")

df_test.drop(categorical_cols, axis=1, inplace=True)

# Feature Engineering
df_test["TotalSpending"] = df_test[["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]].sum(axis=1)
df_test["Age_Spending"] = df_test["Age"] * df_test["TotalSpending"]

# Ensure you calculate "High_Cabin" before dropping "CabinNumber"
df_test["High_Cabin"] = (df_test["CabinNumber"] > df_test["CabinNumber"].median()).astype(int)

# Drop "CabinNumber" after creating "High_Cabin"
df_test.drop("CabinNumber", axis=1, inplace=True)

# Log Transformations
for col in spending_cols:
    df_test[f"{col}_log"] = np.log1p(df_test[col])
df_test.drop(spending_cols, axis=1, inplace=True)

# Ensure test data matches training data columns
df_test = df_test.reindex(columns=X_train.columns, fill_value=0)
logging.info("Test data preprocessing completed.")

In [12]:
# Load final model
logging.info("Loading final model...")
final_model = CatBoostClassifier()
final_model.load_model("./model/final_model.cbm")
logging.info("Final model loaded.")

# Make predictions
logging.info("Making predictions on test data...")
predictions = final_model.predict(df_test)
logging.info("Predictions completed.")

# Save submission
logging.info("Saving submission file...")
submission = pd.DataFrame({"PassengerId": passenger_ids, "Transported": predictions})
submission.to_csv("../data/submission.csv", index=False)
logging.info("Submission file saved.")
print("Submission saved!")

Submission saved!
