## DATA 780: Final Project Bagging

Features:
* ID: Unique identifier for each student
* Demographics: Age, Gender, City
* Academic Indicators: CGPA, Academic Pressure, Study Satisfaction
* Lifestyle & Wellbeing: Sleep Duration, Dietary Habits, Work Pressure, Job Satisfaction, Work/Study Hours
* Additional Factors: Profession, Degree, Financial Stress, Family History of Mental Illness, and whether the student has ever had suicidal thoughts


Target Variable:
* Depression_Status: A binary indicator (0/1 or Yes/No) that denotes whether a student is experiencing depression

In [39]:
# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import shap

  from .autonotebook import tqdm as notebook_tqdm


In [32]:
data = pd.read_csv('student_depression_dataset.csv')

data.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,'5-6 hours',Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,'5-6 hours',Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,'Less than 5 hours',Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,'7-8 hours',Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,'5-6 hours',Moderate,M.Tech,Yes,1.0,1.0,No,0


In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     27901 non-null  int64  
 1   Gender                                 27901 non-null  object 
 2   Age                                    27901 non-null  float64
 3   City                                   27901 non-null  object 
 4   Profession                             27901 non-null  object 
 5   Academic Pressure                      27901 non-null  float64
 6   Work Pressure                          27901 non-null  float64
 7   CGPA                                   27901 non-null  float64
 8   Study Satisfaction                     27901 non-null  float64
 9   Job Satisfaction                       27901 non-null  float64
 10  Sleep Duration                         27901 non-null  object 
 11  Di

In [34]:
# Drop columns too granular
data = data.drop(columns=['id', 'City'], errors='ignore')


In [35]:
# Summary of survival by sex
# use pd.crosstab()
contingency = pd.crosstab(data["Depression"], data["Gender"])
print(contingency)

Gender      Female  Male
Depression              
0             5133  6432
1             7221  9115


## Feature Engineering

In [36]:
# Convert sleep duration text to integer
def sleep_duration(s):
    s = str(s).strip().replace("","")
    if 'Less than' in s:
        return 4.5
    elif 'More than' in s:
        return 9
    nums = [float(x) for x in s.replace('hours', '').split('-') if s.strip().isdigit()]
    return np.mean(nums) if nums else np.nan

if 'Sleep Duration' in data.columns:
    data['Sleep Duration'] = data['Sleep Duration'].apply(sleep_duration)

In [37]:
# Check
data['Sleep Duration']

0        NaN
1        NaN
2        4.5
3        NaN
4        NaN
        ... 
27896    NaN
27897    4.5
27898    NaN
27899    4.5
27900    4.5
Name: Sleep Duration, Length: 27901, dtype: float64

In [38]:
# Encode all categorcal variables
# use LabelEncoder()
cat_col = data.select_dtypes(include=['object']).columns
label_encoders = {}
for col in cat_col:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

In [None]:
# Handle NaNs
data.fillna(data.median(numeric_only=True), inplace=True)


## Split Data into Training and Testing Sets

In [None]:
# Assign Columns to feature and target variables
X = data.drop('Depression', axis = 1) # Features
y = data['Depression'] # Target

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 99)

In [None]:
# Scale Features
scaler = StandardScaler()
X_train = train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Bagging Classifier with Decision Tree estimator
bagging = BaggingClassifier(
    base_estimator = DecisionTreeClassifier(max_depth=6),
    n_estimators = 100,
    random_state = 99
)

In [None]:
# Fit the Model
bagging.fit(X_train, y_train)

In [None]:
# Predict
y_pred = bagging.predict(X_test)

## Bagging Classifier Evaluation

In [None]:
print("Classification Report:", classification_report(y_test, y_pred))
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred))

## Feature Importance

In [None]:
# Using SHAP Interpreter
explainer = shap.Explainer(bagging.estimators_[0], X_train, feature_names = X.columns)

shap.summary_plot(shapvalues, X_test, show=True)