In [1]:
import pandas as pd
pd.set_option('future.no_silent_downcasting', True)
import numpy as np
import optuna
from clearml import Task
from clearml.automation.optuna import OptimizerOptuna
from clearml.automation import (
    UniformIntegerParameterRange, 
    UniformParameterRange, 
    DiscreteParameterRange,
    HyperParameterOptimizer
)
import logging
import json
import os
from catboost import CatBoostClassifier, Pool, cv
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from category_encoders import TargetEncoder
from datetime import datetime

# Configure logging
# The log file is written under ../data, so that directory is created first.
os.makedirs("../data", exist_ok=True)
log_settings = {
    "filename": '../data/log_file.log',
    "level": logging.INFO,
    "format": '%(asctime)s - %(levelname)s - %(message)s',
}
logging.basicConfig(**log_settings)
logging.info("Logging initialized.")

In [2]:
# Load training data
logging.info("Loading training data...")
df = pd.read_csv("../data/train.csv")
logging.info(f"Training data loaded. Shape: {df.shape}")

# Drop unnecessary columns
# errors="ignore" lets the drop succeed even when one of the columns is absent.
logging.info("Dropping unnecessary columns...")
df = df.drop(columns=["PassengerId", "Name"], errors="ignore")
logging.info(f"Columns after dropping: {df.columns.tolist()}")

# Define numerical and categorical columns
numerical_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
categorical_cols = ["HomePlanet", "CryoSleep", "Destination", "VIP"]
logging.info(f"Numerical columns: {numerical_cols}")
logging.info(f"Categorical columns: {categorical_cols}")

# Handle missing values
logging.info("Handling missing values...")

# Numerical columns: replace missing entries with the column mean.
for col in numerical_cols:
    column_mean = df[col].mean()
    df[col] = df[col].fillna(column_mean)
    # The logged value is the mean of the column after filling.
    logging.info(f"Filled missing values in {col} with mean: {df[col].mean()}")

# Categorical columns: replace missing entries with the first reported mode.
for col in categorical_cols:
    column_modes = df[col].mode()
    mode_value = column_modes[0]
    df[col] = df[col].fillna(mode_value)
    logging.info(f"Filled missing values in {col} with mode: {mode_value}")

# Feature Engineering: Split "Cabin" into "Deck", "CabinNumber", "Side"
logging.info("Splitting 'Cabin' column...")
cabin_parts = df["Cabin"].str.split("/", expand=True)
df[["Deck", "CabinNumber", "Side"]] = cabin_parts
df = df.drop(columns="Cabin")

# Cabin numbers that cannot be parsed become NaN and are then replaced by 0.
cabin_number = pd.to_numeric(df["CabinNumber"], errors="coerce")
df["CabinNumber"] = cabin_number.fillna(0)
logging.info(f"New columns after splitting 'Cabin': {df.columns.tolist()}")

# Update categorical columns list
# "Deck" and "Side" are appended in place to the existing list object.
categorical_cols += ["Deck", "Side"]
logging.info(f"Updated categorical columns: {categorical_cols}")

# Target Encoding
logging.info("Applying target encoding...")

# Fit the per-category target means on the training rows only. Computing them
# on every row would leak validation labels into the features and inflate the
# validation score.
#
# The training rows are recovered by splitting row positions with the same
# test_size / random_state as the train/validation split performed later in
# this cell. For a fixed number of rows and a fixed integer seed,
# train_test_split shuffles identically, so these positions are the rows that
# end up in X_train. Keep the two calls in sync if either one changes.
encoding_fit_positions, _ = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)
encoding_fit_rows = df.iloc[encoding_fit_positions]

target_encoding_mappings = {
    col: encoding_fit_rows.groupby(col)["Transported"].mean()
    for col in categorical_cols
}

# Apply the training-derived means to every row. Values with no entry in the
# mapping (missing values, or categories absent from the training rows) map
# to NaN.
for col in categorical_cols:
    df[f"{col}_encoded"] = df[col].map(target_encoding_mappings[col])
df.drop(categorical_cols, axis=1, inplace=True)
logging.info(f"Columns after target encoding: {df.columns.tolist()}")

# Feature Engineering
logging.info("Creating interaction features...")

# Total spend across the five amenity columns, and its product with age.
spend_features = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
df["TotalSpending"] = df[spend_features].sum(axis=1)
df["Age_Spending"] = df["Age"] * df["TotalSpending"]

# Flag rows whose cabin number is above the median, then discard the raw number.
cabin_median = df["CabinNumber"].median()
df["High_Cabin"] = df["CabinNumber"].gt(cabin_median).astype(int)
df = df.drop(columns="CabinNumber")
logging.info(f"Columns after feature engineering: {df.columns.tolist()}")

# Log Transformations for Spending Columns
logging.info("Applying log transformations...")
spending_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

# Build every log1p column first, append them in list order, then drop the
# raw spending columns.
log_features = {f"{col}_log": np.log1p(df[col]) for col in spending_cols}
df = df.assign(**log_features).drop(columns=spending_cols)
logging.info(f"Columns after log transformations: {df.columns.tolist()}")

# Train-Test Split
logging.info("Splitting data into training and validation sets...")

# Features are every column except the target; 20% of rows are held out.
y = df["Transported"]
X = df.drop(columns="Transported")
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts
logging.info(f"Training data shape: {X_train.shape}, Validation data shape: {X_val.shape}")

In [None]:
# Initialize ClearML Task
logging.info("Initializing ClearML task for HPO...")

# The task is registered as an optimizer-type task in the MLops_HW1 project.
task_settings = {
    "project_name": "MLops_HW1",
    "task_name": "Optuna_HPO",
    "task_type": Task.TaskTypes.optimizer,
}
task = Task.init(**task_settings)
logging.info("ClearML task initialized.")

# Define hyperparameter search space
# Integer-valued parameters: name -> (min_value, max_value).
integer_bounds = {
    "iterations": (200, 1000),
    "depth": (4, 9),
    "border_count": (64, 255),
    "early_stopping_rounds": (10, 50),
}
# Categorical parameters: name -> list of candidate values.
categorical_choices = {
    "bootstrap_type": ["Bayesian", "Bernoulli", "MVS"],
    "grow_policy": ["SymmetricTree", "Depthwise", "Lossguide"],
}
hyper_parameters = [
    UniformIntegerParameterRange(name, min_value=low, max_value=high)
    for name, (low, high) in integer_bounds.items()
] + [
    DiscreteParameterRange(name, values=choices)
    for name, choices in categorical_choices.items()
]
# Determine which search strategy to use
# Prefer the Optuna-backed optimizer; fall back to ClearML's RandomSearch when
# the Optuna integration cannot be imported.
try:
    from clearml.automation.optuna import OptimizerOptuna
    optimizer_class = OptimizerOptuna
    logging.info("Using Optuna optimizer")
except ImportError:
    # RandomSearch is not imported anywhere else in this notebook, so it is
    # imported here; without this the fallback raises NameError.
    from clearml.automation import RandomSearch
    optimizer_class = RandomSearch
    logging.info("Optuna not available, using RandomSearch")

# Create optimizer
# NOTE(review): base_task_id is the id of `task`, the optimizer task itself,
# rather than a separate training task; confirm this is the intended template.
optimizer_settings = dict(
    base_task_id=task.id,
    hyper_parameters=hyper_parameters,
    objective_metric_title="Loss/Validation Logloss",
    objective_metric_series="Loss/Validation Logloss",
    objective_metric_sign="min",
    optimizer_class=optimizer_class,  # Use the determined optimizer class
    execution_queue="default",
    max_number_of_concurrent_tasks=2,
    total_max_jobs=20,
    min_iteration_per_job=5,
    max_iteration_per_job=10,
)
optimizer = HyperParameterOptimizer(**optimizer_settings)

# Run the search: start, cap at one hour, wait for completion, then stop.
logging.info("Starting optimization...")
optimizer.start()
optimizer.set_time_limit(in_minutes=60)  # 1 hour limit
optimizer.wait()
optimizer.stop()
logging.info("Optimization complete.")

# Save best parameters
# Any failure here is logged rather than raised.
try:
    top_experiments = optimizer.get_top_experiments(top_k=1)
    if not top_experiments:
        logging.warning("No experiments completed successfully")
    else:
        best_params = top_experiments[0].get_parameters()
        summary = f"Best Parameters: {best_params}"
        logging.info(summary)
        task.get_logger().report_text(summary)

        # Persist the parameters as JSON under ../mlops_hw1.
        os.makedirs("../mlops_hw1", exist_ok=True)
        with open("../mlops_hw1/best_params.json", "w") as params_file:
            json.dump(best_params, params_file)
        logging.info("Best parameters saved to JSON file.")
except Exception as e:
    logging.error(f"Failed to save best parameters: {str(e)}")

ClearML Task: overwriting (reusing) task id=4cc0064454664ab891be4ef1e3a684e2
2025-03-27 19:51:02,750 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/fde1cfe19b89449da6ba2b22765e31d4/experiments/4cc0064454664ab891be4ef1e3a684e2/output/log


[I 2025-03-27 19:51:15,639] A new study created in memory with name: 4cc0064454664ab891be4ef1e3a684e2


Progress report #0 completed, sleeping for 0.25 minutes
2025-03-27 19:51:25,252 - clearml.automation.optimization - INFO - Creating new Task: {'iterations': 634, 'depth': 8, 'border_count': 200, 'early_stopping_rounds': 38, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Depthwise'}
2025-03-27 19:51:32,355 - clearml.automation.optimization - INFO - Creating new Task: {'iterations': 797, 'depth': 8, 'border_count': 190, 'early_stopping_rounds': 44, 'bootstrap_type': 'Bayesian', 'grow_policy': 'SymmetricTree'}
Progress report #1 completed, sleeping for 5.0 minutes


In [None]:
# Train final model
def _catboost_params_from_clearml(raw_params):
    """Convert a flat ClearML parameter dict into CatBoostClassifier kwargs.

    ClearML's Task.get_parameters() returns a flattened dict; keys may carry a
    "Section/" prefix and values are strings unless casting was requested.
    Only the hyperparameters tuned in this notebook are kept; integer-valued
    ones are cast back to int.

    Args:
        raw_params: mapping of parameter name (optionally "Section/name") to value.

    Returns:
        dict of keyword arguments suitable for CatBoostClassifier(**kwargs).
    """
    integer_names = {"iterations", "depth", "border_count", "early_stopping_rounds"}
    string_names = {"bootstrap_type", "grow_policy"}
    cleaned = {}
    for key, value in raw_params.items():
        # Keep only the part after the last "/" so "General/depth" -> "depth".
        name = str(key).rsplit("/", 1)[-1]
        if name in integer_names:
            # int(float(...)) accepts "8", "8.0" and numeric values alike.
            cleaned[name] = int(float(value))
        elif name in string_names:
            cleaned[name] = str(value)
    return cleaned


logging.info("Training final model...")

# `study` is never defined in this notebook, so the best parameters are read
# from the JSON file written to ../mlops_hw1/best_params.json instead.
best_params_path = "../mlops_hw1/best_params.json"
with open(best_params_path) as params_file:
    saved_params = json.load(params_file)
final_model_params = _catboost_params_from_clearml(saved_params)
if not final_model_params:
    logging.warning(
        f"No tuned hyperparameters found in {best_params_path}; using CatBoost defaults."
    )
logging.info(f"Final model parameters: {final_model_params}")

final_model = CatBoostClassifier(**final_model_params, verbose=0)
final_model.fit(X_train, y_train)
logging.info("Final model trained.")

# Ensure the directory exists before saving the model
os.makedirs("../model", exist_ok=True)

# Generate a unique model name
model_name = f"final_model_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
model_path = f"../model/{model_name}.cbm"

# Save final model
final_model.save_model(model_path)
logging.info(f"Final model saved to {model_path}")

# Ensure task is in the correct state before uploading
# mark_started(force=True) is ClearML's documented call for moving a task back
# to the started state. NOTE(review): set_status() does not appear to be part
# of the public Task API; confirm against the installed clearml version.
if task.get_status() != "in_progress":
    task.mark_started(force=True)

# Remove previous artifacts if necessary
task.unregister_artifact("final_model")

# Upload model to ClearML
# The artifact object is the path of the saved .cbm model file.
task.upload_artifact("final_model", artifact_object=model_path)

# Evaluate final model
# Accuracy is measured on the held-out validation rows.
val_predictions = final_model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)
logging.info(f"Final Model Accuracy: {val_accuracy}")
print("Final Model Accuracy:", val_accuracy)

# Close ClearML task
task.close()
logging.info("ClearML task closed.")