In [1]:
# Core Libraries
import pandas as pd
import numpy as np
from scipy import stats
import random
import warnings

# Machine Learning Libraries
from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, HistGradientBoostingClassifier

# Set random seed
rs = 42

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Ignore warnings
warnings.filterwarnings("ignore")

In [34]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

In [35]:
df_train.head(2)

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1


In [36]:
df_train = df_train.drop(["id"], axis=1)
df_test = df_test.drop(["id"], axis=1)

In [37]:
## Creating a interaction between age and work pressure
df_train["Age_WorkPressure"] = df_train["Age"] * df_train["Work Pressure"]
df_test["Age_WorkPressure"] = df_test["Age"] * df_test["Work Pressure"]

In [38]:
## Setting up target encoder for City and Profession columns
encoder = TargetEncoder(cols=["City", "Profession"])
df_train[["city_encoded", "profession_encoded"]] = encoder.fit_transform(
    df_train[["City", "Profession"]], df_train["Depression"]
)
df_test[["city_encoded", "profession_encoded"]] = encoder.transform(
    df_test[["City", "Profession"]]
)

In [39]:
## Get X and y from df_train
X_train = df_train.drop(["Depression"], axis=1)
y_train = df_train["Depression"]

In [40]:
X_train.dtypes

Name                                      object
Gender                                    object
Age                                      float64
City                                      object
Working Professional or Student           object
Profession                                object
Academic Pressure                        float64
Work Pressure                            float64
CGPA                                     float64
Study Satisfaction                       float64
Job Satisfaction                         float64
Sleep Duration                            object
Dietary Habits                            object
Degree                                    object
Have you ever had suicidal thoughts ?     object
Work/Study Hours                         float64
Financial Stress                         float64
Family History of Mental Illness          object
Age_WorkPressure                         float64
city_encoded                             float64
profession_encoded  

In [41]:
## Get numerical and categorical columns
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns.to_list()
cat_cols = X_train.select_dtypes(include="object").columns.to_list()

In [42]:
print(num_cols)
print("\n")
print(cat_cols)

['Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress', 'Age_WorkPressure', 'city_encoded', 'profession_encoded']


['Name', 'Gender', 'City', 'Working Professional or Student', 'Profession', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']


In [43]:
## Pre-processing pipeline for numerical columns
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("convert_to_float32", FunctionTransformer(lambda x: x.astype(np.float32))),
    ]
)

## Pre-processing pipeline for categorical columns
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        (
            "ordinal",
            OrdinalEncoder(
                dtype=np.int32, handle_unknown="use_encoded_value", unknown_value=-1
            ),
        ),
    ]
)

## Combining the pipelines
preprocessor = ColumnTransformer(
    transformers=[("num", num_pipeline, num_cols), ("cat", cat_pipeline, cat_cols)]
)

In [44]:
## Apply the preprocessing pipeline
X_train_preprocessed = preprocessor.fit_transform(X_train)
test_preprocessed = preprocessor.transform(df_test)

In [45]:
print("First row:")
print(X_train_preprocessed[0])

First row:
[ 6.95360184e-01 -4.56981733e-02  1.58971357e+00  3.37837301e-02
  1.80130042e-02 -7.72517979e-01 -1.36305749e+00 -6.99616790e-01
  1.88296103e+00  3.28746676e-01 -6.31710649e-01  1.10000000e+01
  0.00000000e+00  5.00000000e+01  1.00000000e+00  1.00000000e+01
  2.90000000e+01  7.00000000e+00  3.30000000e+01  0.00000000e+00
  0.00000000e+00]


In [46]:
# Apply Isolation Forest for outlier detection on the training data
isolation_forest = IsolationForest(contamination=0.04, random_state=rs)
outlier_labels = isolation_forest.fit_predict(X_train_preprocessed)

# Filter out outliers from both X_train_preprocessed and y_train
non_outliers_mask = outlier_labels != -1
X_train_preprocessed = X_train_preprocessed[non_outliers_mask]
y_train = y_train[non_outliers_mask]

In [47]:
# Define parameters
xgb_params = {
    "learning_rate": 0.298913248058474,
    "max_depth": 9,
    "min_child_weight": 3,
    "n_estimators": 673,
    "subsample": 0.5933970249700855,
    "gamma": 2.597137534750985,
    "reg_lambda": 0.11328048420927406,
    "colsample_bytree": 0.1381203919800721,
}

catboost_params = {
    "iterations": 145,
    "depth": 7,
    "learning_rate": 0.29930179265937246,
    "l2_leaf_reg": 1.242352421942431,
    "random_strength": 8.325681754379957,
    "bagging_temperature": 0.7869848919618048,
    "border_count": 139,
}

hgb_params = {
    "learning_rate": 0.16299202834206894,
    "max_iter": 250,
    "max_depth": 4,
    "l2_regularization": 7.1826466833939895,
    "early_stopping": True,
}

In [48]:
xgb_model = XGBClassifier(**xgb_params, use_label_encoder=False, random_state=rs)
catboost_model = CatBoostClassifier(
    **catboost_params, random_state=rs, verbose=0
)
hgb_model = HistGradientBoostingClassifier(**hgb_params, random_state=rs)

In [49]:
## Stacking the models together
stacking_ensemble = StackingClassifier(
    estimators=[("catboost", catboost_model), ("xgb", xgb_model), ("hgb", hgb_model)],
    final_estimator=LogisticRegression(),
    passthrough=False,
)

In [50]:
scoring = make_scorer(accuracy_score)

cv_scores = cross_val_score(
    stacking_ensemble, X_train_preprocessed, y_train, cv=5, scoring=scoring
)

print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation of CV Accuracy: {cv_scores.std():.4f}")

Cross-Validation Scores: [0.94221729 0.943809   0.94328867 0.94369586 0.94573184]
Mean CV Accuracy: 0.9437
Standard Deviation of CV Accuracy: 0.0011
