In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
# NOTE(review): this silences every Python warning for the rest of the session,
# including any deprecation or convergence notices raised by the libraries used
# below — consider narrowing the filter to specific categories or modules.
warnings.filterwarnings('ignore')

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

In [3]:
# Load the pre-cleaned dataset (path is relative to the notebook's working
# directory) and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" header. If that is a real
# column (e.g. a row index written out by an earlier to_csv) it ends up in X as
# a feature — confirm, and if so load with index_col=0.
data = pd.read_csv("../data/cleaned_v1.csv")
data.head()

Unnamed: 0,Work Pressure,CGPA,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Work/Study Hours,Financial Stress,Depression,Gender_Female,Gender_Male,Age_Group,City_encoded,Student,Profession_encoded,suicidal_thoughts,family_history
0,5.0,-1.0,2.0,8.0,0.0,2.0,1.0,2.0,0,1.0,0.0,3,0.19269,0,0.048567,0,0
1,4.0,-1.0,3.0,5.0,2.0,2.0,7.0,3.0,1,0.0,1.0,0,0.142206,0,0.055649,1,0
2,5.0,9.0,2.0,6.0,0.0,2.0,3.0,1.0,1,0.0,1.0,1,0.159196,1,0.585061,1,0
3,5.0,-1.0,1.0,5.0,1.0,2.0,10.0,1.0,1,0.0,1.0,0,0.131293,0,0.055649,1,1
4,1.0,-1.0,1.0,6.0,2.0,2.0,9.0,4.0,0,1.0,0.0,1,0.125739,0,0.05661,1,1


In [4]:
## Remove Gender_Male and keep Gender_Female as the single gender indicator
## (presumably the two are complementary dummy columns — the preview rows show
## them as 0/1 opposites).
## Rebinding `data` instead of dropping in place, together with
## errors="ignore", keeps this cell idempotent: re-running it no longer raises
## KeyError once the column is already gone.
data = data.drop(columns=["Gender_Male"], errors="ignore")

In [5]:
# Separate the label from the predictors: X keeps every column except
# "Depression", y is the "Depression" column itself.
X = data.drop(columns=["Depression"])
y = data["Depression"]

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore every score below) reproducible.
# NOTE(review): the split is not stratified on y — confirm that is intended
# given the class imbalance visible in the evaluation reports.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [7]:
# Sanity check: preview the training features (the target column should be absent).
X_train.head()

Unnamed: 0,Work Pressure,CGPA,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Work/Study Hours,Financial Stress,Gender_Female,Age_Group,City_encoded,Student,Profession_encoded,suicidal_thoughts,family_history
33707,1.0,-1.0,5.0,8.0,2.0,2.0,4.0,2.0,0.0,4,0.1756,0,0.055649,0,1
133592,5.0,-1.0,4.0,5.0,0.0,3.0,11.0,4.0,0.0,4,0.242248,0,0.055649,0,1
113169,4.0,-1.0,3.0,8.0,1.0,3.0,3.0,1.0,0.0,3,0.214019,0,0.055649,0,1
42896,1.0,-1.0,1.0,8.0,2.0,2.0,8.0,5.0,0.0,0,0.144928,0,0.055649,1,0
82457,4.0,-1.0,5.0,6.0,1.0,1.0,0.0,2.0,0.0,2,0.134045,0,0.375086,0,0


In [17]:
def run_models(X_train, X_test, y_train, y_test, models=None):
    """Fit several classifiers and report each one's accuracy on the test split.

    Parameters
    ----------
    X_train, X_test : feature matrices used for fitting and for scoring.
    y_train, y_test : target labels matching ``X_train`` / ``X_test``.
    models : dict[str, estimator], optional
        Mapping of display name to an unfitted estimator exposing ``fit`` and
        ``predict``. When omitted, a default set of baseline classifiers is
        built (logistic regression, decision tree, random forest, XGBoost,
        k-nearest neighbours, Gaussian naive Bayes and a most-frequent dummy
        baseline).

    Returns
    -------
    dict
        ``{name: accuracy}`` for every model that trained and predicted
        successfully; for a model that raised, the value is the string
        ``"Error: <message>"`` instead of a float.
    """
    if models is None:
        models = {
            "Logistic Regression": LogisticRegression(max_iter=1000),
            "Decision Tree": DecisionTreeClassifier(max_depth=5, random_state=42),
            "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
            "XGBoost": XGBClassifier(
                use_label_encoder=False, eval_metric="logloss", random_state=42
            ),
            "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
            "Naive Bayes": GaussianNB(),
            # Majority-class baseline: any useful model must beat this score.
            "Dummy Classifier": DummyClassifier(strategy="most_frequent"),
        }

    results = {}

    for name, model in models.items():
        print("Currently training: " + name)
        try:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
        except Exception as e:
            # Best-effort comparison: one failing model must not abort the
            # others, but the failure is reported immediately instead of being
            # visible only in the returned dict.
            results[name] = f"Error: {e}"
            print("Model " + name + f" failed: {e}")
            continue

        results[name] = accuracy
        print("Model " + name + f" has a accuracy of {accuracy}")

    return results

In [18]:
# Train every baseline model on the training split and collect its test
# accuracy; the returned dict maps model name -> accuracy (or an error string).
results = run_models(X_train, X_test, y_train, y_test)
print(results)

Currently training: Logistic Regression
Model Logistic Regression has a accuracy of 0.9355366027007818
Currently training: Decision Tree
Model Decision Tree has a accuracy of 0.9249466950959488
Currently training: Random Forest
Model Random Forest has a accuracy of 0.9340796019900498
Currently training: XGBoost
Model XGBoost has a accuracy of 0.9357498223169864
Currently training: K-Nearest Neighbors
Model K-Nearest Neighbors has a accuracy of 0.9264392324093816
Currently training: Naive Bayes
Model Naive Bayes has a accuracy of 0.8708955223880597
Currently training: Dummy Classifier
Model Dummy Classifier has a accuracy of 0.8168443496801706
{'Logistic Regression': 0.9355366027007818, 'Decision Tree': 0.9249466950959488, 'Random Forest': 0.9340796019900498, 'XGBoost': 0.9357498223169864, 'K-Nearest Neighbors': 0.9264392324093816, 'Naive Bayes': 0.8708955223880597, 'Dummy Classifier': 0.8168443496801706}


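Models selected for hyperparameter tuning (the cells below first refit each one with its baseline settings for a detailed evaluation):
- Logistic Regression
- Random Forest
- XGBoost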

In [19]:
# Refit the logistic-regression baseline on its own so the fitted estimator
# stays available as `log_reg` for the detailed evaluation further down.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

0.9355366027007818


In [20]:
# Refit the XGBoost baseline with the same settings as the "XGBoost" entry in
# run_models, keeping the fitted estimator as `xgboost` for later evaluation.
# The keyword is `use_label_encoder`; a misspelled name is not recognised as
# that option (presumably it is just passed through as an unknown parameter).
# NOTE(review): recent XGBoost releases deprecate/ignore use_label_encoder —
# confirm against the installed version and drop it in both places if so.
xgboost = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.9357498223169864


In [21]:
# Refit the random-forest baseline (100 trees, fixed seed) and keep the fitted
# estimator as `random_for` for the detailed evaluation further down.
random_for = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred = random_for.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

0.9340796019900498


In [22]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

def evaluate_model(model, X_test, y_test):
    """Print a classification report, ROC-AUC (when available) and confusion matrix.

    Parameters
    ----------
    model : fitted classifier exposing ``predict`` and, optionally,
        ``predict_proba`` or ``decision_function``.
    X_test : features to evaluate on.
    y_test : true labels for ``X_test``.

    Returns
    -------
    None. All results are printed.

    Notes
    -----
    ROC-AUC uses the second column of ``predict_proba`` as the score, which
    assumes a binary problem. Classifiers without ``predict_proba`` fall back
    to ``decision_function`` when it yields one score per sample; otherwise the
    ROC-AUC line is omitted.
    """
    y_pred = model.predict(X_test)

    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        # Margin-based classifiers: roc_auc_score accepts non-thresholded
        # decision values. Only 1-D scores (binary case) are usable here.
        y_score = model.decision_function(X_test)
        if np.ndim(y_score) != 1:
            y_score = None
    else:
        y_score = None

    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    if y_score is not None:
        print(f"ROC-AUC Score: {roc_auc_score(y_test, y_score)}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))


In [24]:
# Detailed metrics for the logistic-regression model fitted above.
print("Evaluation report for Logistic Regression:\n")
evaluate_model(log_reg, X_test, y_test)

Evaluation report for Logistic Regression:

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96     22986
           1       0.84      0.80      0.82      5154

    accuracy                           0.94     28140
   macro avg       0.90      0.88      0.89     28140
weighted avg       0.93      0.94      0.93     28140

ROC-AUC Score: 0.9738835732745625
Confusion Matrix:
[[22200   786]
 [ 1028  4126]]


In [25]:
# Detailed metrics for the XGBoost model fitted above.
print("Evaluation report for XGBoost:\n")
evaluate_model(xgboost, X_test, y_test)

Evaluation report for XGBoost:

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     22986
           1       0.83      0.81      0.82      5154

    accuracy                           0.94     28140
   macro avg       0.89      0.89      0.89     28140
weighted avg       0.94      0.94      0.94     28140

ROC-AUC Score: 0.9738361983493453
Confusion Matrix:
[[22135   851]
 [  957  4197]]


In [26]:
# Detailed metrics for the random-forest model fitted above.
print("Evaluation report for Random Forest:\n")
evaluate_model(random_for, X_test, y_test)

Evaluation report for Random Forest:

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     22986
           1       0.84      0.79      0.82      5154

    accuracy                           0.93     28140
   macro avg       0.90      0.88      0.89     28140
weighted avg       0.93      0.93      0.93     28140

ROC-AUC Score: 0.9714651139407257
Confusion Matrix:
[[22198   788]
 [ 1067  4087]]
