In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

import warnings 
warnings.filterwarnings('ignore')

In [7]:
data = pd.read_csv("StudentDepression_update_dataset.csv")
data.head()

Unnamed: 0,Gender,Age,City,Profession,Academic Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,Male,33.0,Visakhapatnam,Student,5.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,Female,24.0,Bangalore,Student,2.0,5.9,5.0,0.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
2,Male,31.0,Srinagar,Student,3.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0
3,Female,28.0,Varanasi,Student,3.0,5.59,2.0,0.0,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1
4,Female,25.0,Jaipur,Student,4.0,8.13,3.0,0.0,5-6 hours,Moderate,M.Tech,Yes,1.0,1.0,No,0


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 16 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Gender                                 27901 non-null  object 
 1   Age                                    27901 non-null  float64
 2   City                                   27901 non-null  object 
 3   Profession                             27901 non-null  object 
 4   Academic Pressure                      27901 non-null  float64
 5   CGPA                                   27901 non-null  float64
 6   Study Satisfaction                     27901 non-null  float64
 7   Job Satisfaction                       27901 non-null  float64
 8   Sleep Duration                         27901 non-null  object 
 9   Dietary Habits                         27901 non-null  object 
 10  Degree                                 27901 non-null  object 
 11  Ha

In [None]:
X = data.drop('Depression', axis=1) ## select all columns accepet depression col
y = data['Depression'] ## select only depression column

In [10]:
X.columns

Index(['Gender', 'Age', 'City', 'Profession', 'Academic Pressure', 'CGPA',
       'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration',
       'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?',
       'Work/Study Hours', 'Financial Stress',
       'Family History of Mental Illness'],
      dtype='object')

In [None]:
numeric_cols = X.select_dtypes(exclude='object').columns.to_list() ## exclude object data type columns and save only int and float datatype columns
categoric_cols = X.select_dtypes(include='object').columns.to_list() ## only safe object dtype columns

## Import Required Libraries

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier,GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
numeric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")), ## fill nan values with median value
    ("minmax", MinMaxScaler()) ## scale the features
])

categoric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')), ## fill missing values with mode
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)) ## encode categorical features
])

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
    ("num", numeric_pipe, numeric_cols), ## given a name then pipeline load after that columns will go to pipeline
    ("cat", categoric_pipe, categoric_cols)  ##same as numerical columns work for categorical columns
    ])

In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [22]:
## dictionary of models to train and compare
models = {
    "DecisionTree": DecisionTreeClassifier(),
    "RandomForest": RandomForestClassifier(),
    "XG Boost": XGBClassifier(),
    "Gradient Boost": GradientBoostingClassifier(),
    "Ada Boost": AdaBoostClassifier()
}

In [23]:
## train each model using the preprocessing pipeline
for name, model in models.items():
    print(f"\n===== Training {name} =====")

    ## a pipeline combining preprocessing and the model
    pipeline = Pipeline([
        ("preprocess", preprocessor),
        ("classifier", model)
    ])

    ## train the model
    pipeline.fit(X_train, y_train)

    ## predictions for training and test sets
    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)

    ## training performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred)
    model_train_precision = precision_score(y_train, y_train_pred)
    model_train_recall = recall_score(y_train, y_train_pred)
    model_train_f1 = f1_score(y_train, y_train_pred)

    ## test performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred)
    model_test_precision = precision_score(y_test, y_test_pred)
    model_test_recall = recall_score(y_test, y_test_pred)
    model_test_f1 = f1_score(y_test, y_test_pred)

    print("\nModel performance for Training Set")
    print(f" - Accuracy Score:{model_train_accuracy:.4f}")
    print(f" - Precision Score:{model_train_precision:.4f}")
    print(f" - Recall Score:{model_train_recall:.4f}")
    print(f" - F1 Score:{model_train_f1:.4f}")

    print("\nModel performance for Test Set")
    print(f" - Accuracy Score:{model_test_accuracy:.4f}")
    print(f" - Precision Score:{model_test_precision:.4f}")
    print(f" - Recall Score:{model_test_recall:.4f}")
    print(f" - F1 Score:{model_test_f1:.4f}")

    print("=" * 40)


===== Training DecisionTree =====

Model performance for Training Set
 - Accuracy Score:1.0000
 - Precision Score:1.0000
 - Recall Score:1.0000
 - F1 Score:1.0000

Model performance for Test Set
 - Accuracy Score:0.7703
 - Precision Score:0.8062
 - Recall Score:0.7952
 - F1 Score:0.8007

===== Training RandomForest =====

Model performance for Training Set
 - Accuracy Score:1.0000
 - Precision Score:1.0000
 - Recall Score:1.0000
 - F1 Score:1.0000

Model performance for Test Set
 - Accuracy Score:0.8248
 - Precision Score:0.8381
 - Recall Score:0.8650
 - F1 Score:0.8514

===== Training XG Boost =====

Model performance for Training Set
 - Accuracy Score:0.9207
 - Precision Score:0.9223
 - Recall Score:0.9445
 - F1 Score:0.9333

Model performance for Test Set
 - Accuracy Score:0.8335
 - Precision Score:0.8489
 - Recall Score:0.8675
 - F1 Score:0.8581

===== Training Gradient Boost =====

Model performance for Training Set
 - Accuracy Score:0.8544
 - Precision Score:0.8641
 - Recall Sco