In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

In [16]:
data = pd.read_csv("../data/cleaned_v1.csv")
data.head()

Unnamed: 0,Work Pressure,CGPA,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Work/Study Hours,Financial Stress,Depression,Gender_Female,Gender_Male,Age_Group,City_encoded,Student,Profession_encoded,suicidal_thoughts,family_history
0,5.0,-1.0,2.0,8.0,0.0,2.0,1.0,2.0,0,1.0,0.0,3,0.19269,0,0.048567,0,0
1,4.0,-1.0,3.0,5.0,2.0,2.0,7.0,3.0,1,0.0,1.0,0,0.142206,0,0.055649,1,0
2,5.0,9.0,2.0,6.0,0.0,2.0,3.0,1.0,1,0.0,1.0,1,0.159196,1,0.585061,1,0
3,5.0,-1.0,1.0,5.0,1.0,2.0,10.0,1.0,1,0.0,1.0,0,0.131293,0,0.055649,1,1
4,1.0,-1.0,1.0,6.0,2.0,2.0,9.0,4.0,0,1.0,0.0,1,0.125739,0,0.05661,1,1


In [17]:
## removing Gender_Male
data.drop(["Gender_Male"], axis=1, inplace=True)

In [18]:
y = data["Depression"]
X = data.drop(["Depression"], axis=1)

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [20]:
X_train.head()

Unnamed: 0,Work Pressure,CGPA,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Work/Study Hours,Financial Stress,Gender_Female,Age_Group,City_encoded,Student,Profession_encoded,suicidal_thoughts,family_history
33707,1.0,-1.0,5.0,8.0,2.0,2.0,4.0,2.0,0.0,4,0.1756,0,0.055649,0,1
133592,5.0,-1.0,4.0,5.0,0.0,3.0,11.0,4.0,0.0,4,0.242248,0,0.055649,0,1
113169,4.0,-1.0,3.0,8.0,1.0,3.0,3.0,1.0,0.0,3,0.214019,0,0.055649,0,1
42896,1.0,-1.0,1.0,8.0,2.0,2.0,8.0,5.0,0.0,0,0.144928,0,0.055649,1,0
82457,4.0,-1.0,5.0,6.0,1.0,1.0,0.0,2.0,0.0,2,0.134045,0,0.375086,0,0


In [21]:
def run_models(X_train, X_test, y_train, y_test):
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Decision Tree": DecisionTreeClassifier(max_depth=5, random_state=42),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "XGBoost": XGBClassifier(
            use_label_encoder=False, eval_metric="logloss", random_state=42
        ),
        "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
        "Naive Bayes": GaussianNB(),
        "Dummy Classifier": DummyClassifier(strategy="most_frequent"),
    }

    results = {}

    for name, model in models.items():
        try:
            print(str("Currently training: " + name))
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            results[name] = accuracy
            print(str("Model " + name + f" has a accuracy of {accuracy}"))
        except Exception as e:
            results[name] = f"Error: {e}"

    return results

In [22]:
#results = run_models(X_train, X_test, y_train, y_test)
#print(results)

Hyperparameter tuning for:
- Logistic
- Random Forest
- XGBoost

In [23]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.9355366027007818


In [24]:
xgboost = XGBClassifier(se_label_encoder=False, eval_metric="logloss", random_state=42)
xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.9357498223169864


In [25]:
random_for = RandomForestClassifier(n_estimators=100, random_state=42)
random_for.fit(X_train, y_train)
y_pred = random_for.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.9340796019900498


In [26]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    if y_prob is not None:
        print(f"ROC-AUC Score: {roc_auc_score(y_test, y_prob)}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print(f"Accuracy score: {accuracy_score(y_test, y_pred)} ")


In [27]:
print("Evaluation report for Logistic Regression:\n")
evaluate_model(log_reg, X_test, y_test)

Evaluation report for Logistic Regression:

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96     22986
           1       0.84      0.80      0.82      5154

    accuracy                           0.94     28140
   macro avg       0.90      0.88      0.89     28140
weighted avg       0.93      0.94      0.93     28140

ROC-AUC Score: 0.9738835732745625
Confusion Matrix:
[[22200   786]
 [ 1028  4126]]
Accuracy score: 0.9355366027007818 


In [28]:
print("Evaluation report for XGBoost:\n")
evaluate_model(xgboost, X_test, y_test)

Evaluation report for XGBoost:

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     22986
           1       0.83      0.81      0.82      5154

    accuracy                           0.94     28140
   macro avg       0.89      0.89      0.89     28140
weighted avg       0.94      0.94      0.94     28140

ROC-AUC Score: 0.9738361983493453
Confusion Matrix:
[[22135   851]
 [  957  4197]]
Accuracy score: 0.9357498223169864 


In [29]:
print("Evaluation report for Random Forest:\n")
evaluate_model(random_for, X_test, y_test)

Evaluation report for Random Forest:

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     22986
           1       0.84      0.79      0.82      5154

    accuracy                           0.93     28140
   macro avg       0.90      0.88      0.89     28140
weighted avg       0.93      0.93      0.93     28140

ROC-AUC Score: 0.9714651139407257
Confusion Matrix:
[[22198   788]
 [ 1067  4087]]
Accuracy score: 0.9340796019900498 


Should try to reduce False Negatives, which is very high (close to 1000 for each model), since the data is imbalanced.

In [30]:
y_train.value_counts()

Depression
0    92147
1    20413
Name: count, dtype: int64

In [31]:
y_test.value_counts()

Depression
0    22986
1     5154
Name: count, dtype: int64

Tuning Logistic Regression

In [32]:
from scipy.stats import uniform, loguniform
from sklearn.model_selection import RandomizedSearchCV

# Define the parameter distribution
param_dist = {
    "C": loguniform(1e-4, 1e4),
    "penalty": ["l1", "l2", "elasticnet", "none"],
    "solver": ["lbfgs", "newton-cg", "liblinear", "sag", "saga"],
    "max_iter": [100, 1000, 2500, 5000],
}

# Create a RandomizedSearchCV object
random_search = RandomizedSearchCV(
    LogisticRegression(),
    param_distributions=param_dist,
    n_iter=100,  # Number of parameter settings that are sampled
    cv=5,
    random_state=42,
    n_jobs=-1,  # Use all available cores
)

In [33]:
#random_search.fit(X_train, y_train)

In [34]:
#best_params = random_search.best_params_
#best_model = random_search.best_estimator_

#accuracy = best_model.score(X_test, y_test)
#print(f"Best parameters: {best_params}")
#print(f"Accuracy: {accuracy:.3f}")

In [35]:
best_params = {
    "C": 3.6348739977385867,
    "max_iter": 5000,
    "penalty": "l2",
    "solver": "newton-cg",
}
log_reg_tuned = LogisticRegression(**best_params)
log_reg_tuned.fit(X_train, y_train)
evaluate_model(log_reg_tuned, X_test, y_test)

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96     22986
           1       0.84      0.80      0.82      5154

    accuracy                           0.94     28140
   macro avg       0.90      0.88      0.89     28140
weighted avg       0.93      0.94      0.94     28140

ROC-AUC Score: 0.9738905286310667
Confusion Matrix:
[[22201   785]
 [ 1026  4128]]
Accuracy score: 0.9356432125088842 


Tuning XGBoost

In [36]:
param_dist = {
    "n_estimators": [100, 500, 900, 1100, 1500],
    "max_depth": [2, 3, 5, 10, 15],
    "learning_rate": [0.05, 0.1, 0.15, 0.20],
    "min_child_weight": [1, 2, 3, 4],
    "subsample": np.arange(0.5, 1.0, 0.1),
    "colsample_bytree": np.arange(0.5, 1.0, 0.1),
}
xgb_clf = XGBClassifier(objective="binary:logistic", tree_method="hist")

random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_dist,
    n_iter=25,  # Number of parameter settings sampled
    scoring="accuracy",
    cv=5,  # Number of cross-validation folds
    verbose=1,
    n_jobs=-1,  # Use all available cores
    random_state=42,
)
# random_search.fit(X_train, y_train)

In [37]:
#best_params = random_search.best_params_
#best_model = random_search.best_estimator_

In [38]:
#y_pred = best_model.predict(X_test)
#accuracy = accuracy_score(y_test, y_pred)
#print(f"Best parameters: {best_params}")
#print(f"Accuracy on test set: {accuracy}")

In [39]:
best_params = {
    "subsample": 0.5,
    "n_estimators": 500,
    "min_child_weight": 3,
    "max_depth": 2,
    "learning_rate": 0.1,
    "colsample_bytree": 0.7,
}
xgboost_tuned = XGBClassifier(
    objective="binary:logistic", tree_method="hist", **best_params
)
xgboost_tuned.fit(X_train, y_train)
evaluate_model(xgboost_tuned, X_test, y_test)

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96     22986
           1       0.85      0.81      0.83      5154

    accuracy                           0.94     28140
   macro avg       0.90      0.89      0.90     28140
weighted avg       0.94      0.94      0.94     28140

ROC-AUC Score: 0.9758697707072189
Confusion Matrix:
[[22221   765]
 [  973  4181]]
Accuracy score: 0.9382373845060412 


In [40]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [41]:
from catboost import CatBoostClassifier

In [43]:
X.head()

Unnamed: 0,Work Pressure,CGPA,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Work/Study Hours,Financial Stress,Gender_Female,Age_Group,City_encoded,Student,Profession_encoded,suicidal_thoughts,family_history
0,5.0,-1.0,2.0,8.0,0.0,2.0,1.0,2.0,1.0,3,0.19269,0,0.048567,0,0
1,4.0,-1.0,3.0,5.0,2.0,2.0,7.0,3.0,0.0,0,0.142206,0,0.055649,1,0
2,5.0,9.0,2.0,6.0,0.0,2.0,3.0,1.0,0.0,1,0.159196,1,0.585061,1,0
3,5.0,-1.0,1.0,5.0,1.0,2.0,10.0,1.0,0.0,0,0.131293,0,0.055649,1,1
4,1.0,-1.0,1.0,6.0,2.0,2.0,9.0,4.0,1.0,1,0.125739,0,0.05661,1,1


In [46]:
accuracy_scores = []
for train_index, val_index in cv.split(X, y):
    # Split the data into train and validation sets
    X_strat_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_strat_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Initialize CatBoost Classifier
    cat_model = CatBoostClassifier(
        iterations=500,  # Number of boosting iterations
        learning_rate=0.1,  # Learning rate
        depth=6,  # Tree depth
        loss_function="Logloss",  # Loss function for binary classification
        eval_metric="Accuracy",  # Evaluation metric
        verbose=0,  # Suppress training output
        random_seed=42,
    )

    # Fit the model
    cat_model.fit(X_strat_train, y_strat_train, eval_set=(X_val, y_val), use_best_model=True)

    # Make predictions
    y_pred = cat_model.predict(X_val)

    # Calculate accuracy
    acc = accuracy_score(y_val, y_pred)
    accuracy_scores.append(acc)
    print(f"Fold Accuracy: {acc}")

Fold Accuracy: 0.9378109452736318
Fold Accuracy: 0.9377043354655294
Fold Accuracy: 0.9387348969438521
Fold Accuracy: 0.9401918976545842
Fold Accuracy: 0.9388415067519545


In [47]:
mean_accuracy = np.mean(accuracy_scores)
print(f"\nMean Accuracy Across {5} Folds: {mean_accuracy}")


Mean Accuracy Across 5 Folds: 0.9386567164179104


In [48]:
# Define the parameter grid
param_grid = {
    "iterations": [100, 300, 500, 700, 1000],
    "depth": [4, 6, 8, 10],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "l2_leaf_reg": [1, 3, 5, 7, 9],  # Regularization parameter
    "border_count": [32, 64, 128],  # Number of splits for numeric features
    "bagging_temperature": [0, 1, 2, 3],  # Used for controlling overfitting
    "random_strength": [1, 5, 10],  # Noise to randomize splits
}

# Initialize CatBoostClassifier
cat_model = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="Accuracy",
    verbose=0,  # Suppress training output
    random_seed=42,
)

# Set up StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=cat_model,
    param_distributions=param_grid,
    n_iter=50,  # Number of combinations to try
    scoring="accuracy",  # Optimize for accuracy
    cv=skf,  # Cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1,  # Use all available processors
)

In [49]:
# Fit RandomizedSearchCV
random_search.fit(X, y)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [50]:
# Best parameters and score
print(f"Best Parameters: {random_search.best_params_}")
print(f"Best Accuracy: {random_search.best_score_}")

Best Parameters: {'random_strength': 5, 'learning_rate': 0.1, 'l2_leaf_reg': 1, 'iterations': 300, 'depth': 4, 'border_count': 64, 'bagging_temperature': 0}
Best Accuracy: 0.9382729211087419


In [51]:
params = {
    "random_strength": 5,
    "learning_rate": 0.1,
    "l2_leaf_reg": 1,
    "iterations": 300,
    "depth": 4,
    "border_count": 64,
    "bagging_temperature": 0,
}

In [52]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
models = []
accuracy_scores = []
for train_index, val_index in skf.split(X, y):
    # Split the data into train and validation sets
    X_strat_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_strat_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Initialize CatBoost Classifier
    cat_model = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="Accuracy",
    verbose=0,  # Suppress training output
    random_seed=42,
    **params
)

    # Fit the model
    cat_model.fit(X_strat_train, y_strat_train, eval_set=(X_val, y_val), use_best_model=True)

    models.append(cat_model)

    # Make predictions
    y_pred = cat_model.predict(X_val)

    # Calculate accuracy
    acc = accuracy_score(y_val, y_pred)
    accuracy_scores.append(acc)
    print(f"Fold Accuracy: {acc}")

Fold Accuracy: 0.9380952380952381
Fold Accuracy: 0.9374200426439232
Fold Accuracy: 0.938272921108742
Fold Accuracy: 0.9401918976545842
Fold Accuracy: 0.9383795309168443


In [53]:
mean_accuracy = np.mean(accuracy_scores)
print(f"\nMean Accuracy Across {5} Folds: {mean_accuracy}")


Mean Accuracy Across 5 Folds: 0.9384719260838663
