<a href="https://colab.research.google.com/github/vanshuwjoshi/Kaggle-Playground-S4E11/blob/main/Optuna_CatBoost_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

In [3]:
# Read the pre-cleaned dataset from the Colab working directory and preview
# its first rows as the cell output.
CLEANED_CSV_PATH = "/content/cleaned_v1.csv"
data = pd.read_csv(CLEANED_CSV_PATH)
data.head()

Unnamed: 0,Work Pressure,CGPA,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Work/Study Hours,Financial Stress,Depression,Gender_Female,Gender_Male,Age_Group,City_encoded,Student,Profession_encoded,suicidal_thoughts,family_history
0,5.0,-1.0,2.0,8.0,0.0,2.0,1.0,2.0,0,1.0,0.0,3,0.19269,0,0.048567,0,0
1,4.0,-1.0,3.0,5.0,2.0,2.0,7.0,3.0,1,0.0,1.0,0,0.142206,0,0.055649,1,0
2,5.0,9.0,2.0,6.0,0.0,2.0,3.0,1.0,1,0.0,1.0,1,0.159196,1,0.585061,1,0
3,5.0,-1.0,1.0,5.0,1.0,2.0,10.0,1.0,1,0.0,1.0,0,0.131293,0,0.055649,1,1
4,1.0,-1.0,1.0,6.0,2.0,2.0,9.0,4.0,0,1.0,0.0,1,0.125739,0,0.05661,1,1


In [4]:
# Split the frame into model features (X) and the binary target (y).
#
# The drop is done by rebinding instead of `inplace=True`, with
# errors="ignore", so re-running this cell does not raise KeyError once
# "Gender_Male" is already gone. That column is removed because
# "Gender_Female" is also present in the frame.
data = data.drop(columns=["Gender_Male"], errors="ignore")

# A missing target still fails loudly here (KeyError).
y = data["Depression"]

# "Unnamed: 0" is the name pandas gives a header-less first CSV column; in the
# preview it holds 0, 1, 2, ... so it is presumably a saved row index rather
# than a real feature (TODO confirm against how cleaned_v1.csv was written).
# It is excluded so the model cannot key on row order.
X = data.drop(columns=["Depression", "Unnamed: 0"], errors="ignore")

In [5]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.6-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [6]:
import optuna

In [7]:
def objective_all_metrics(trial, n_splits=5, early_stopping_rounds=50, seed=42):
    """Optuna objective: mean cross-validated ROC-AUC of a CatBoost classifier.

    Samples one hyperparameter configuration from ``trial``, trains a
    ``CatBoostClassifier`` on each stratified fold of the notebook-level
    ``X`` / ``y``, and scores it on the held-out fold.

    Args:
        trial: The Optuna trial used to sample hyperparameters and to store
            the secondary metrics.
        n_splits: Number of stratified folds.
        early_stopping_rounds: Patience passed to ``CatBoostClassifier.fit``.
        seed: Seed used for both the fold shuffling and CatBoost's
            ``random_seed``.

    Returns:
        The mean ROC-AUC across folds (the value the study optimises).

    Side effects:
        Stores the fold-averaged ``"accuracy"``, ``"roc_auc"`` and
        ``"f1_score"`` on the trial via ``trial.set_user_attr``.

    NOTE(review): each validation fold is used both as the early-stopping
    ``eval_set`` and for scoring, so the reported metrics are likely somewhat
    optimistic. With early stopping active, the sampled ``iterations`` is an
    upper bound on the trees actually used, not the final count.
    """
    # Hyperparameter search space; the last three entries are fixed settings.
    param = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'task_type': 'GPU',  # requires a GPU runtime
        'random_seed': seed,
        'verbose': 0
    }

    # Same seeded splitter on every trial, so trials are compared on
    # identical folds.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    fold_scores = {"accuracy": [], "roc_auc": [], "f1_score": []}

    for train_index, val_index in skf.split(X, y):
        X_strat_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_strat_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Train a CatBoost model on this fold.
        model = CatBoostClassifier(**param)
        model.fit(X_strat_train, y_strat_train,
                  eval_set=(X_val, y_val),
                  early_stopping_rounds=early_stopping_rounds,
                  verbose=False)

        # Hard labels feed accuracy/F1; positive-class probabilities feed ROC-AUC.
        y_pred = model.predict(X_val)
        y_proba = model.predict_proba(X_val)[:, 1]

        fold_scores["accuracy"].append(accuracy_score(y_val, y_pred))
        fold_scores["roc_auc"].append(roc_auc_score(y_val, y_proba))
        fold_scores["f1_score"].append(f1_score(y_val, y_pred))

    # Average each metric once; cast to plain float so stored attributes stay
    # simple Python numbers.
    mean_scores = {
        metric: float(np.mean(values)) for metric, values in fold_scores.items()
    }
    for metric, value in mean_scores.items():
        trial.set_user_attr(metric, value)

    # Optimise for ROC-AUC.
    return mean_scores["roc_auc"]

In [8]:
# Optimize the study for ROC-AUC.
# The sampler is seeded so the sequence of suggested hyperparameters is
# repeatable across "Restart & Run All"; TPESampler is Optuna's default
# sampler, so only the seeding changes.
# NOTE(review): GPU training may still introduce small run-to-run differences
# in the scores themselves.
study = optuna.create_study(
    direction="maximize",  # Maximize ROC-AUC
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective_all_metrics, n_trials=50)

[I 2024-11-26 17:26:32,879] A new study created in memory with name: no-name-d89c1b33-4e12-4e75-8192-6584a8652679
[I 2024-11-26 17:26:44,676] Trial 0 finished with value: 0.9738779199274232 and parameters: {'iterations': 312, 'learning_rate': 0.10234743715826009, 'depth': 9, 'l2_leaf_reg': 2.497495030121348, 'border_count': 96}. Best is trial 0 with value: 0.9738779199274232.
[I 2024-11-26 17:26:52,886] Trial 1 finished with value: 0.9741694831845189 and parameters: {'iterations': 889, 'learning_rate': 0.23651120292755548, 'depth': 7, 'l2_leaf_reg': 3.6181086432910043, 'border_count': 85}. Best is trial 1 with value: 0.9741694831845189.
[I 2024-11-26 17:26:58,719] Trial 2 finished with value: 0.9739809873169907 and parameters: {'iterations': 601, 'learning_rate': 0.25599366031923176, 'depth': 8, 'l2_leaf_reg': 2.380592990569365, 'border_count': 85}. Best is trial 1 with value: 0.9741694831845189.
[I 2024-11-26 17:27:08,203] Trial 3 finished with value: 0.9744605716281809 and parameters

In [9]:
# Report the winning trial: its hyperparameters, the optimised ROC-AUC, and
# the accuracy / F1 averages stored on the trial by the objective.
best_trial = study.best_trial
report_lines = [
    f"Best Parameters: {best_trial.params}",
    f"Best ROC-AUC: {best_trial.value}",
    f"Associated Accuracy: {best_trial.user_attrs['accuracy']}",
    f"Associated F1-Score: {best_trial.user_attrs['f1_score']}",
]
print("\n".join(report_lines))

Best Parameters: {'iterations': 754, 'learning_rate': 0.03847013908859086, 'depth': 4, 'l2_leaf_reg': 7.0030102514762405, 'border_count': 197}
Best ROC-AUC: 0.974575515707877
Associated Accuracy: 0.9381592039800994
Associated F1-Score: 0.826500087686985
