In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv("../data/cleaned_v1.csv")
data.head()

Unnamed: 0,Work Pressure,CGPA,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Work/Study Hours,Financial Stress,Depression,Gender_Female,Gender_Male,Age_Group,City_encoded,Student,Profession_encoded,suicidal_thoughts,family_history
0,5.0,-1.0,2.0,8.0,0.0,2.0,1.0,2.0,0,1.0,0.0,3,0.19269,0,0.048567,0,0
1,4.0,-1.0,3.0,5.0,2.0,2.0,7.0,3.0,1,0.0,1.0,0,0.142206,0,0.055649,1,0
2,5.0,9.0,2.0,6.0,0.0,2.0,3.0,1.0,1,0.0,1.0,1,0.159196,1,0.585061,1,0
3,5.0,-1.0,1.0,5.0,1.0,2.0,10.0,1.0,1,0.0,1.0,0,0.131293,0,0.055649,1,1
4,1.0,-1.0,1.0,6.0,2.0,2.0,9.0,4.0,0,1.0,0.0,1,0.125739,0,0.05661,1,1


In [3]:
data.drop(["Gender_Male"], axis=1, inplace=True)
y = data["Depression"]
X = data.drop(["Depression"], axis=1)

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [5]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix


def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_prob = (
        model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    )

    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    if y_prob is not None:
        print(f"ROC-AUC Score: {roc_auc_score(y_test, y_prob)}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print(f"Accuracy score: {accuracy_score(y_test, y_pred)} ")

In [6]:
best_params = {
    "eval_metric": "logloss",
    "subsample": 0.5,
    "n_estimators": 332,
    "min_child_weight": 3,
    "max_depth": 3,
    "learning_rate": 0.074972461438185,
    "colsample_bytree": 0.7,
}
xgboost_tuned = XGBClassifier(objective="binary:logistic", **best_params)
xgboost_tuned.fit(X_train, y_train)
evaluate_model(xgboost_tuned, X_test, y_test)

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96     22986
           1       0.84      0.81      0.83      5154

    accuracy                           0.94     28140
   macro avg       0.90      0.89      0.89     28140
weighted avg       0.94      0.94      0.94     28140

ROC-AUC Score: 0.9759520110451061
Confusion Matrix:
[[22212   774]
 [  973  4181]]
Accuracy score: 0.9379175550817341 


In [7]:
params = {
    "random_strength": 5,
    "learning_rate": 0.03847013908859086,
    "l2_leaf_reg": 7.0030102514762405,
    "iterations": 754,
    "depth": 4,
    "border_count": 197,
    "bagging_temperature": 0,
}

In [6]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from catboost import CatBoostClassifier

In [9]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
models = []
accuracy_scores = []
for train_index, val_index in skf.split(X, y):
    # Split the data into train and validation sets
    X_strat_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_strat_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Initialize CatBoost Classifier
    cat_model = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="Accuracy",
        verbose=0,  # Suppress training output
        random_seed=42,
        **params,
    )

    # Fit the model
    cat_model.fit(
        X_strat_train, y_strat_train, eval_set=(X_val, y_val), use_best_model=True
    )

    models.append(cat_model)

    # Make predictions
    y_pred = cat_model.predict(X_val)

    # Calculate accuracy
    acc = accuracy_score(y_val, y_pred)
    accuracy_scores.append(acc)
    print(f"Fold Accuracy: {acc}")

Fold Accuracy: 0.938272921108742
Fold Accuracy: 0.9372423596304194
Fold Accuracy: 0.9382018479033405
Fold Accuracy: 0.9404051172707889
Fold Accuracy: 0.9380597014925374


In [10]:
mean_accuracy = np.mean(accuracy_scores)
print(f"\nMean Accuracy Across {5} Folds: {mean_accuracy}")


Mean Accuracy Across 5 Folds: 0.9384363894811656


In [7]:
from sklearn.ensemble import StackingClassifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [12]:
params_catboost = {
    "random_strength": 5,
    "learning_rate": 0.03847013908859086,
    "l2_leaf_reg": 7.0030102514762405,
    "iterations": 754,
    "depth": 4,
    "border_count": 197,
    "bagging_temperature": 0,
}
params_xgb = {
    "eval_metric": "logloss",
    "subsample": 0.5,
    "n_estimators": 332,
    "min_child_weight": 3,
    "max_depth": 3,
    "learning_rate": 0.074972461438185,
    "colsample_bytree": 0.7,
}
params_logistic = {
    "C": 3.6348739977385867,
    "max_iter": 5000,
    "penalty": "l2",
    "solver": "newton-cg",
}

In [13]:
xgb_model = XGBClassifier(objective="binary:logistic", **params_xgb)
cat_model = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="Accuracy",
    verbose=0,  # Suppress training output
    random_seed=42,
    **params_catboost,
)
log_reg = LogisticRegression(**params_logistic)

# Define the stacking classifier
stacking_model = StackingClassifier(
    estimators=[("xgb", xgb_model), ("cat", cat_model), ("logreg", log_reg)],
    final_estimator=LogisticRegression(),
    cv=5,  # Use 5-fold cross-validation
)

In [14]:
stacking_model.fit(X_train, y_train)

In [15]:
# Evaluate on the test set
y_pred = stacking_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Stacking Classifier Accuracy:", accuracy)

Stacking Classifier Accuracy: 0.937775408670931


In [8]:
params_catboost = {
    "random_strength": 5,
    "learning_rate": 0.03847013908859086,
    "l2_leaf_reg": 7.0030102514762405,
    "iterations": 754,
    "depth": 4,
    "border_count": 197,
    "bagging_temperature": 0,
}
cat_model = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="Accuracy",
    verbose=0,  # Suppress training output
    random_seed=42,
    **params_catboost,
)
stacking_model = StackingClassifier(
    estimators=[("cat", cat_model)],
    final_estimator=cat_model,
    cv=5,  # Use 5-fold cross-validation
)
stacking_model.fit(X_train, y_train)

In [9]:
# Evaluate on the test set
y_pred = stacking_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Stacking Classifier Accuracy:", accuracy)

Stacking Classifier Accuracy: 0.9384506041222459


In [10]:
params_xgb = {
    "eval_metric": "logloss",
    "subsample": 0.5,
    "n_estimators": 332,
    "min_child_weight": 3,
    "max_depth": 3,
    "learning_rate": 0.074972461438185,
    "colsample_bytree": 0.7,
}

xgb_model = XGBClassifier(objective="binary:logistic", **params_xgb)

# Define the stacking classifier
stacking_model = StackingClassifier(
    estimators=[("xgb", xgb_model)],
    final_estimator=xgb_model,
    cv=5,  # Use 5-fold cross-validation
)

In [11]:
stacking_model.fit(X_train, y_train)

In [12]:
# Evaluate on the test set
y_pred = stacking_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Stacking Classifier Accuracy:", accuracy)

Stacking Classifier Accuracy: 0.9378109452736318


In [13]:
params_catboost = {
    "random_strength": 5,
    "learning_rate": 0.03847013908859086,
    "l2_leaf_reg": 7.0030102514762405,
    "iterations": 754,
    "depth": 4,
    "border_count": 197,
    "bagging_temperature": 0,
}
params_xgb = {
    "eval_metric": "logloss",
    "subsample": 0.5,
    "n_estimators": 332,
    "min_child_weight": 3,
    "max_depth": 3,
    "learning_rate": 0.074972461438185,
    "colsample_bytree": 0.7,
}
params_logistic = {
    "C": 3.6348739977385867,
    "max_iter": 5000,
    "penalty": "l2",
    "solver": "newton-cg",
}
xgb_model = XGBClassifier(objective="binary:logistic", **params_xgb)
cat_model = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="Accuracy",
    verbose=0,  # Suppress training output
    random_seed=42,
    **params_catboost,
)
log_reg = LogisticRegression(**params_logistic)

In [14]:
stacking_model = StackingClassifier(
    estimators=[("cat", cat_model)],
    final_estimator=xgb_model,
    cv=5,  # Use 5-fold cross-validation
)
stacking_model.fit(X_train, y_train)

In [15]:
# Evaluate on the test set
y_pred = stacking_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Stacking Classifier Accuracy:", accuracy)

Stacking Classifier Accuracy: 0.9378109452736318


In [20]:
stacking_model = StackingClassifier(
    estimators=[("cat", cat_model)],
    final_estimator=log_reg,
    cv=5,  # Use 5-fold cross-validation
)
stacking_model.fit(X_train, y_train)
# Evaluate on the test set
y_pred = stacking_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Stacking Classifier Accuracy:", accuracy)

Stacking Classifier Accuracy: 0.9379175550817341


In [21]:
stacking_model = StackingClassifier(
    estimators=[("xgb", xgb_model)],
    final_estimator=cat_model,
    cv=5,  # Use 5-fold cross-validation
)
stacking_model.fit(X_train, y_train)
# Evaluate on the test set
y_pred = stacking_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Stacking Classifier Accuracy:", accuracy)

Stacking Classifier Accuracy: 0.9376687988628287


In [22]:
stacking_model = StackingClassifier(
    estimators=[("xgb", xgb_model)],
    final_estimator=log_reg,
    cv=5,  # Use 5-fold cross-validation
)
stacking_model.fit(X_train, y_train)
# Evaluate on the test set
y_pred = stacking_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Stacking Classifier Accuracy:", accuracy)

Stacking Classifier Accuracy: 0.9377398720682303
