In [1]:
!git clone https://github.com/yugan243/Depression-Analysis.git


Cloning into 'Depression-Analysis'...
remote: Enumerating objects: 44, done.[K
remote: Counting objects: 100% (44/44), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 44 (delta 19), reused 30 (delta 10), pack-reused 0 (from 0)[K
Receiving objects: 100% (44/44), 15.70 MiB | 11.45 MiB/s, done.
Resolving deltas: 100% (19/19), done.


In [2]:
!cd /content/Depression-Analysis/Notebooks

In [3]:
!pip install -q optuna

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/395.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/247.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


### 1. Import Dependencies

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import optuna
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import warnings
import joblib

warnings.filterwarnings('ignore')

In [6]:
import sys

# Add the project's root directory to the system path
# This allows the notebook to find the 'Scripts' package
project_path = '/content/Depression-Analysis/Notebooks'
if project_path not in sys.path:
    sys.path.append(project_path)

### 2. Load Data

In [7]:
train_df = pd.read_csv('/content/Depression-Analysis/Data/Processed/DP_train_encoded.csv')

### 3. Split the data into Train and Test

In [8]:
X = train_df.drop(columns=['Depression', 'id'])
Y = train_df['Depression']


In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

### 4. Model Training

#### 4.1 Random Forest Classifier with Cross Validation

In [None]:
model_rfc = RandomForestClassifier(
                                    n_estimators=200,
                                    max_depth=None,
                                    min_samples_split=2,
                                    min_samples_leaf=1,
                                    random_state=42,
                                    n_jobs=-1
                                    )

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = cross_val_score(model_rfc, X_train, Y_train, cv=skf, scoring='accuracy', n_jobs=-1)

In [None]:
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {np.mean(cv_scores):.4f}")
print(f"Std deviation: {np.std(cv_scores):.4f}")

# 5️⃣ Train final model on full training data
model_rfc.fit(X_train, Y_train)

# 6️⃣ Predict on test set
y_pred = model_rfc.predict(X_test)

Cross-validation scores: [0.93650441 0.93765852 0.93832599 0.93583411 0.93574512]
Mean accuracy: 0.9368
Std deviation: 0.0010


#### Let's use grid search cv for hyperparameter tuning

In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
model_rfc_grid = RandomForestClassifier(random_state=42, n_jobs=-1)

param_grid = {
            "n_estimators": [100, 200, 300],
            "max_depth": [None, 10, 15, 20],
            "min_samples_split": [5, 10],
            "min_samples_leaf": [2, 4]
            }

In [None]:
grid_search = GridSearchCV(
                          estimator=model_rfc_grid,
                          param_grid=param_grid,
                          scoring="accuracy",
                          cv=skf,
                          n_jobs=-1,
                          verbose=2
                          )

grid_search.fit(X_train, Y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits




In [None]:
print("Best Parameters:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)
best_model = grid_search.best_estimator_


Best Parameters: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 200}
Best CV Accuracy: 0.9372230002909644


In [None]:
y_pred = best_model.predict(X_test)

In [None]:
test_accuracy = accuracy_score(Y_test, y_pred)
print("Test Accuracy:", test_accuracy)

print("\nClassification Report:\n", classification_report(Y_test, y_pred))

Test Accuracy: 0.9380250605154492

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.97      0.96     23019
           1       0.85      0.80      0.82      5073

    accuracy                           0.94     28092
   macro avg       0.90      0.88      0.89     28092
weighted avg       0.94      0.94      0.94     28092



### 4.2 XGBoost with Cross validation + Hyperparameter tuning with optuna

In [None]:
def objective(trial):
    # Suggest hyperparameters
    param = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "scale_pos_weight": 1,
        "random_state": 42,
        "use_label_encoder": False,
        "eval_metric": "logloss",
        "tree_method": "gpu_hist",
        "predictor": "gpu_predictor",
    }

  # Model
    model = XGBClassifier(**param)

    # Stratified K-Fold CV
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, Y_train, cv=skf, scoring='accuracy')

    return scores.mean()

#Run optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("Best Trial:")
print(study.best_trial.params)


# Train Final Model with Best Params
best_params = study.best_trial.params
final_model = XGBClassifier(**best_params, random_state=42, use_label_encoder=False, eval_metric="logloss")
final_model.fit(X_train, Y_train)

[I 2025-08-16 13:07:00,996] A new study created in memory with name: no-name-ad35cc4b-aeaa-46c3-abee-1bab5f4a36bb


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-08-16 13:07:07,979] Trial 0 finished with value: 0.93870032270063 and parameters: {'n_estimators': 494, 'max_depth': 7, 'learning_rate': 0.04427290271447705, 'subsample': 0.85604946838349, 'colsample_bytree': 0.9858168975968153, 'gamma': 1.3954664725193877, 'min_child_weight': 1}. Best is trial 0 with value: 0.93870032270063.
[I 2025-08-16 13:07:10,331] Trial 1 finished with value: 0.9388872061125344 and parameters: {'n_estimators': 445, 'max_depth': 3, 'learning_rate': 0.22473796378949956, 'subsample': 0.5880581262285065, 'colsample_bytree': 0.7341337573345873, 'gamma': 4.942161235941965, 'min_child_weight': 9}. Best is trial 1 with value: 0.9388872061125344.
[I 2025-08-16 13:07:13,949] Trial 2 finished with value: 0.9384778326993688 and parameters: {'n_estimators': 868, 'max_depth': 6, 'learning_rate': 0.1569052882308259, 'subsample': 0.9851433298762182, 'colsample_bytree': 0.5123805498041354, 'gamma': 4.477874628374503, 'min_child_weight': 9}. Best is trial 1 with value: 0.9

In [None]:
y_pred = final_model.predict(X_test)
test_acc = accuracy_score(Y_test, y_pred)
print(f"Test Accuracy: {test_acc:.4f}")

Test Accuracy: 0.9402


In [None]:
model_path = "models/xgb_final_model.pkl"
joblib.dump(final_model, model_path)

print(f"Model saved to {model_path}")

Model saved to models/xgb_final_model.pkl


Maximum result gain with 50 optuna trials

### 4.3 CatBoost with Cross validation + Hyperparameter tuning with optuna

In [15]:
def objective(trial):
    # Suggest hyperparameters
    param = {
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "depth": trial.suggest_int("depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "random_strength": trial.suggest_float("random_strength", 1e-9, 10),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        "random_state": 42,
        "verbose": 0,
        "task_type": "GPU",
        "devices": "0",
    }

    # Initialize CatBoost model
    model = CatBoostClassifier(**param)

    # Stratified K-Fold CV
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, Y_train, cv=skf, scoring='accuracy')

    return scores.mean()

# Run the optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("Best trial:")
print(study.best_trial.params)

[I 2025-08-16 14:04:18,856] A new study created in memory with name: no-name-18f65c5a-cf4b-4f05-8cb2-a6909de818cd


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-08-16 14:04:55,557] Trial 0 finished with value: 0.9389584050354657 and parameters: {'iterations': 511, 'depth': 9, 'learning_rate': 0.022037947288785976, 'l2_leaf_reg': 2.4811433207258724, 'border_count': 229, 'random_strength': 9.150364759394414, 'bagging_temperature': 0.36880588054671826}. Best is trial 0 with value: 0.9389584050354657.
[I 2025-08-16 14:05:48,297] Trial 1 finished with value: 0.9386736228075353 and parameters: {'iterations': 414, 'depth': 10, 'learning_rate': 0.017444087300226327, 'l2_leaf_reg': 4.644109394075595, 'border_count': 212, 'random_strength': 1.5838858445977935, 'bagging_temperature': 0.7881157142825866}. Best is trial 0 with value: 0.9389584050354657.
[I 2025-08-16 14:06:04,329] Trial 2 finished with value: 0.9385846318757535 and parameters: {'iterations': 475, 'depth': 4, 'learning_rate': 0.025736417115103466, 'l2_leaf_reg': 7.597547762025554, 'border_count': 230, 'random_strength': 0.17047223247592241, 'bagging_temperature': 0.34620794323746296

In [16]:
# train the catboost model
best_params = study.best_trial.params
final_model = CatBoostClassifier(**best_params, random_state=42, verbose=0)
final_model.fit(X_train, Y_train)


<catboost.core.CatBoostClassifier at 0x7bf5bd545c10>

In [17]:
# Evaluate the model
accuracy = final_model.score(X_train, Y_train)
print(f"Train Accuracy: {accuracy:.4f}")

Train Accuracy: 0.9434


In [22]:
%cd /content/Depression-Analysis
final_model.save_model("models/catboost_model.cbm")



/content/Depression-Analysis


maximum result of 0.9434 result gain with the 50 optuna trals
