In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, mutual_info_regression
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

## 1. Load clean data

In [2]:
# Read the already-cleaned dataset and print a quick structural overview.
df = pd.read_csv('clean_depression_dataset.csv')

# One print call; sep='\n' yields the same line-by-line output as separate prints.
print(
    f'Dataset shape: {df.shape}',
    f'\nColumns: {df.columns.tolist()}',
    f'\nData types:\n{df.dtypes}',
    f'\nMissing values:\n{df.isnull().sum()}',
    '\nFirst few rows:',
    sep='\n',
)

# Last expression of the cell, so it is rendered as the preview table.
df.head()

Dataset shape: (157862, 18)

Columns: ['Gender', 'Age', 'City', 'Working Professional or Student', 'Profession', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Work/Study Hours', 'Financial Stress', 'Family History of Mental Illness', 'Depression']

Data types:
Gender                                    object
Age                                        int64
City                                      object
Working Professional or Student           object
Profession                                object
Academic Pressure                        float64
Work Pressure                            float64
CGPA                                     float64
Study Satisfaction                       float64
Job Satisfaction                         float64
Sleep Duration                             int64
Dietary Habits                           float64
Degree               

Unnamed: 0,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,Male,53,Lucknow,Working Professional,Medical/Health,0.0,5.0,0.0,0.0,4.0,0,0.0,Medical/Pharma,1,11.0,3.0,1,0
1,Male,48,Mumbai,Working Professional,Education/Research,0.0,2.0,0.0,0.0,5.0,0,0.0,Medical/Pharma,1,8.0,3.0,1,0
2,Male,24,Srinagar,Student,Student,2.0,0.0,7.09,3.0,0.0,1,1.0,Arts/Law/Education,0,7.0,4.0,1,0
3,Male,37,Patna,Working Professional,Service/Operations,0.0,4.0,0.0,0.0,3.0,2,2.0,Arts/Law/Education,0,7.0,5.0,0,0
4,Female,36,Varanasi,Working Professional,Business/Finance,0.0,1.0,0.0,0.0,4.0,1,2.0,Arts/Law/Education,1,9.0,2.0,1,0


In [3]:
# Column groups used by the preprocessing steps below.

# 0/1 flag columns (this list includes the target, 'Depression').
binary_cols = [
    'Depression',
    'Have you ever had suicidal thoughts ?',
    'Family History of Mental Illness',
]

# Columns that are standardized by scale_data.
numeric_cols = [
    'Age',
    'CGPA',
    'Work/Study Hours',
]

# Text columns that are one-hot encoded.
categorical_cols = [
    'Gender',
    'Working Professional or Student',
    'Profession',
    'Degree',
]

# Text columns that are frequency encoded instead of one-hot encoded.
high_card_cols = [
    'City',
]

In [4]:
# Separate the target column from the predictor columns.
y = df['Depression']
X = df.drop(columns='Depression')

In [5]:
# Fixed seed so the split is reproducible from run to run.
RANDOM_STATE = 42

# Stratify on the target so both partitions keep the same class balance;
# the test fraction is left at train_test_split's default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=RANDOM_STATE,
)

## 2. Numeric features scaling

Scale numerical columns using StandardScaler 

- Age range 1-100

- CGPA range 5-10

- Work/Study Hours range 0-12

In [6]:
def scale_data(X_train, X_test, numeric_cols):
    """Standardize the given columns of both frames in place.

    The scaler is fitted on the training frame only and then applied to the
    training and the test frame, so test-set statistics never influence the
    transformation.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Frames whose ``numeric_cols`` are overwritten with the scaled values.
    numeric_cols : list of str
        Names of the columns to standardize.

    Returns
    -------
    None
        Both frames are modified in place; the fitted scaler is not returned.
    """
    scaler = StandardScaler()
    scaler.fit(X_train[numeric_cols])

    # Train frame first, then test frame, both with the train-fitted scaler.
    for frame in (X_train, X_test):
        frame[numeric_cols] = scaler.transform(frame[numeric_cols])

In [7]:
# Standardize Age, CGPA and Work/Study Hours in place (scaler fitted on the train split).
# NOTE(review): this runs before the feature-engineering step, so Study_Burden and
# Work_Burden are later built from z-scored 'Work/Study Hours' and can be negative.
# Confirm whether raw hours were intended for those interaction features.
scale_data(X_train, X_test, numeric_cols)

## 3. Create new features and interactions

Total_Stress = (Academic Pressure + Work Pressure + Financial Stress) / 3: overall stress level (mean of the three pressure scores), 0-5 scale

Health_Score = (Sleep Duration + Dietary Habits) / 4: 0-1 scale

Life_Satisfaction = (Study Satisfaction + Job Satisfaction) / 10: 0-1 scale

Mental_Risk = Total_Stress / 5 – (Health_Score + Life_Satisfaction) / 2

Study_Burden = Academic Pressure × Work/Study Hours (students only, 0 otherwise): high academic pressure combined with long study hours = overload

Work_Burden = Work Pressure × Work/Study Hours (working professionals only, 0 otherwise): same idea as Study_Burden, but for working professionals

Sleep_Stress = (1 – Sleep_Quality) × Total_Stress, where Sleep_Quality maps the Sleep Duration codes 0 → 0, 1 → 1, 2 → 0.3: poor sleep amplifies the effect of stress, so the worse the sleep (code 0 or 2) and the higher the stress, the larger the impact

Suicide_Stress = Have you ever had suicidal thoughts ? × Total_Stress

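Burnout_Risk = High_Stress (Total_Stress > 75th percentile) + Poor_Sleep (Sleep Duration < 5 hours, encoded as 0) + Overwork (Work/Study Hours > 75th percentile): a 0-3 count, with both percentiles computed on the training set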

In [8]:
# 75th-percentile cut-offs used by Burnout_Risk. They are set by
# build_features(..., is_train=True) and reused by later is_train=False calls.
stress_threshold = None
hours_threshold = None

def build_features(df, is_train = True):
    """Add engineered stress, health and burden features to ``df`` in place.

    Columns added: Total_Stress, Health_Score, Life_Satisfaction, Mental_Risk,
    Study_Burden, Work_Burden, Sleep_Quality, Sleep_Stress, Suicide_Stress
    and Burnout_Risk.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the raw survey columns read below ('Academic Pressure',
        'Work Pressure', 'Financial Stress', 'Sleep Duration', 'Dietary Habits',
        'Study Satisfaction', 'Job Satisfaction',
        'Working Professional or Student', 'Work/Study Hours' and
        'Have you ever had suicidal thoughts ?'). It is modified in place.
    is_train : bool, default True
        True: compute the 75th-percentile thresholds from ``df`` and store them
        in the module-level ``stress_threshold`` / ``hours_threshold``.
        False: reuse the stored thresholds.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``is_train`` is False and no training call has stored thresholds yet.
        The check runs before any column is written, so ``df`` is left
        untouched in that case.

    Notes
    -----
    'Work/Study Hours' is used exactly as found in ``df``. In this notebook the
    column is standardized before this function is called, so Study_Burden and
    Work_Burden can be negative and the hours threshold is in z-score units.
    """
    global stress_threshold, hours_threshold

    # Fail fast: validate before mutating df so a premature test-set call does
    # not leave the frame half-populated with new columns.
    if not is_train and (stress_threshold is None or hours_threshold is None):
        raise ValueError("Must run on train set first to compute thresholds!")

    # 1. Total_Stress: mean of the three pressure scores
    df['Total_Stress'] = (
        df['Academic Pressure']
        + df['Work Pressure']
        + df['Financial Stress']
        ) / 3

    # 2. Health Score: sleep and diet codes summed, then divided by 4
    df['Health_Score'] = (
        df['Sleep Duration'] +
        df['Dietary Habits']
    ) / 4

    # 3. Life Satisfaction: study and job satisfaction summed, then divided by 10
    df['Life_Satisfaction'] = (
        df['Study Satisfaction'] +
        df['Job Satisfaction']
    ) / 10

    # 4. Mental Risk = Total_Stress / 5 - mean(Health_Score, Life_Satisfaction)
    df['Mental_Risk'] = df['Total_Stress'] / 5 - (
        df['Health_Score'] + df['Life_Satisfaction']
    ) / 2

    # 5. Study_Burden = Academic Pressure x Work/Study Hours (students only, else 0)
    df['Study_Burden'] = np.where(
        df['Working Professional or Student'] == 'Student',
        df['Academic Pressure'] * df['Work/Study Hours'],
        0
    )

    # 6. Work_Burden = Work Pressure x Work/Study Hours (working professionals only, else 0)
    df['Work_Burden'] = np.where(
        df['Working Professional or Student'] == 'Working Professional',
        df['Work Pressure'] * df['Work/Study Hours'],
        0
    )

    # 7. Sleep_Stress
    # Sleep quality score (U-shaped: codes 0 and 2 are both bad, 1 is good).
    # Any Sleep Duration value other than 0, 1 or 2 maps to NaN.
    df['Sleep_Quality'] = df['Sleep Duration'].map({0: 0, 1: 1, 2: 0.3})

    df['Sleep_Stress'] = (1 - df['Sleep_Quality']) * df['Total_Stress']

    # 8. Suicide_Stress = suicidal-thoughts flag x Total_Stress
    df['Suicide_Stress'] = df['Have you ever had suicidal thoughts ?'] * df['Total_Stress']

    # 9. Burnout_Risk
    if is_train:
        # Learn the thresholds from the training set
        stress_threshold = df['Total_Stress'].quantile(0.75)
        hours_threshold = df['Work/Study Hours'].quantile(0.75)
        print(f"[Train] Stress threshold: {stress_threshold:.3f}")
        print(f"[Train] Hours threshold: {hours_threshold:.3f}")
    else:
        # The test set reuses the thresholds learned on the training set
        print(f"[Test] Using train thresholds: stress={stress_threshold:.3f}, hours={hours_threshold:.3f}")

    # Count of risk flags per row (0-3)
    df['Burnout_Risk'] = (
        (df['Total_Stress'] > stress_threshold).astype(int) +
        (df['Sleep Duration'] == 0).astype(int) +  # Poor sleep
        (df['Work/Study Hours'] > hours_threshold).astype(int)
    )
    


In [9]:
# Order matters: the train call stores the 75th-percentile thresholds that the
# test call reuses. Both frames are modified in place.
build_features(X_train, is_train=True)
build_features(X_test, is_train=False)

[Train] Stress threshold: 2.333
[Train] Hours threshold: 0.972
[Test] Using train thresholds: stress=2.333, hours=0.972


## 4.  Categorical feature encoding

Categorical columns:

- Gender → Male / Female / Other (3 values, no clear ordering)

- Working Professional or Student → Working Professional / Student (2 values)

- Profession → 11 values, unordered

- Degree → 7 values, unordered

=> Using OneHotEncoder

- City → many distinct values, unordered

=> Using Frequency encoding

In [10]:
from sklearn.base import BaseEstimator, TransformerMixin

class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """Replace each category with its relative frequency in the fitted data.

    ``fit`` records, per column, the share of rows taken by every category.
    ``transform`` substitutes those shares for the raw values; values that were
    not seen during ``fit`` (including missing values) become 0.

    Both methods expect a pandas DataFrame, since they iterate over ``X.columns``.
    """

    def __init__(self):
        # column name -> {category: relative frequency}; rebuilt on every fit()
        self.freq_maps = {}

    def fit(self, X, y=None):
        """Learn the category frequencies of every column of ``X``.

        ``y`` is accepted for pipeline compatibility and ignored. Returns ``self``.
        """
        # Build a fresh mapping so that refitting on different data never keeps
        # columns (or frequencies) left over from an earlier fit.
        self.freq_maps = {
            col: X[col].value_counts(normalize=True).to_dict()
            for col in X.columns
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every value replaced by its fitted frequency.

        Raises ``KeyError`` if ``X`` has a column that was not present at fit time.
        """
        X_transformed = X.copy()
        for col in X_transformed.columns:
            # Categories absent from the fitted mapping map to NaN, then to 0.
            X_transformed[col] = X_transformed[col].map(self.freq_maps[col]).fillna(0)
        return X_transformed


In [11]:
def encode_categorical_features(X_train, X_test, categorical_cols, high_card_cols):
    """Encode the text columns of both frames and return new DataFrames.

    ``categorical_cols`` are one-hot encoded, ``high_card_cols`` are frequency
    encoded, and every other column is passed through unchanged. The encoders
    are fitted on ``X_train`` only and then applied to ``X_test``.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Frames to encode; they are not modified.
    categorical_cols : list of str
        Columns to one-hot encode.
    high_card_cols : list of str
        Columns to frequency encode.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_encoded, X_test_encoded)`` with the original row indexes and
        columns ordered as: one-hot columns, frequency-encoded columns,
        passthrough columns.
    """
    # handle_unknown='ignore' lets transform accept categories unseen during fit.
    one_hot_branch = Pipeline([
        ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
    ])
    frequency_branch = Pipeline([
        ('freq', FrequencyEncoder())
    ])

    column_encoder = ColumnTransformer(
        [
            ('low_card', one_hot_branch, categorical_cols),
            ('high_card', frequency_branch, high_card_cols),
        ],
        remainder='passthrough',
    )

    train_matrix = column_encoder.fit_transform(X_train)
    test_matrix = column_encoder.transform(X_test)

    # Column names are rebuilt by hand, in the same order the transformer
    # emits its blocks: one-hot, then frequency, then the passthrough remainder.
    fitted_onehot = column_encoder.named_transformers_['low_card']['onehot']
    onehot_names = fitted_onehot.get_feature_names_out(categorical_cols)

    encoded_inputs = categorical_cols + high_card_cols
    untouched_names = []
    for name in X_train.columns:
        if name not in encoded_inputs:
            untouched_names.append(name)

    output_names = [*onehot_names, *high_card_cols, *untouched_names]

    train_encoded = pd.DataFrame(train_matrix, columns=output_names, index=X_train.index)
    test_encoded = pd.DataFrame(test_matrix, columns=output_names, index=X_test.index)
    return train_encoded, test_encoded


In [12]:
# Rebind X_train / X_test to the encoded frames (the un-encoded versions are no longer
# reachable under these names), then preview the result.
X_train, X_test = encode_categorical_features(X_train, X_test, categorical_cols, high_card_cols)
X_train.head()

Unnamed: 0,Gender_Female,Gender_Male,Gender_Other,Working Professional or Student_Student,Working Professional or Student_Working Professional,Profession_Business/Finance,Profession_Creative/Media,Profession_Education/Research,Profession_Engineering/Architecture,Profession_Legal,...,Total_Stress,Health_Score,Life_Satisfaction,Mental_Risk,Study_Burden,Work_Burden,Sleep_Quality,Sleep_Stress,Suicide_Stress,Burnout_Risk
37802,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.333333,0.25,0.2,0.041667,0.0,-1.630888,0.0,1.333333,0.0,1.0
36771,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.666667,0.5,0.4,-0.116667,0.0,2.464362,0.3,1.166667,0.0,1.0
99816,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.666667,0.0,0.4,0.133333,0.0,0.711623,0.0,1.666667,1.666667,1.0
140737,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,2.0,0.0,0.5,0.15,0.0,-4.892663,0.0,2.0,0.0,1.0
25102,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,2.333333,0.25,0.5,0.091667,0.0,-2.550152,1.0,0.0,0.0,0.0


## 5. Feature selection

Select features by using:

- Pearson correlation: rank features by absolute linear correlation with target.

- Mutual Information: rank features by non-linear dependency with target.

- VIF: check multicollinearity among features.

- LassoCV: embedded feature selection via regularized linear regression coefficients.

- RandomForest importance: rank features by contribution to tree-based model predictions.

- Aggregate ranking: combine ranks from all methods to get mean rank.

Select top features: choose top 25 features based on aggregated ranking.


In [13]:
# VIF helper (statsmodels is optional: returns None when it is not installed)
def compute_vif(df, features):
    """Compute the variance inflation factor (VIF) of each listed feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns.
    features : list of str
        Columns of ``df`` to evaluate.

    Returns
    -------
    pandas.Series or None
        VIF per feature, sorted in descending order, or None (with a warning)
        when statsmodels cannot be imported.
    """
    try:
        from statsmodels.stats.outliers_influence import variance_inflation_factor
    except ImportError:
        import warnings
        warnings.warn("statsmodels is not installed; skipping VIF computation.")
        return None

    # Append an intercept column so each auxiliary regression includes a constant;
    # the intercept itself is excluded from the reported VIFs.
    X = df[features].assign(constant=1.0)
    # Materialize the array once instead of once per feature.
    design = X.values
    vif_data = {f: variance_inflation_factor(design, i) for i, f in enumerate(X.columns) if f != 'constant'}
    return pd.Series(vif_data).sort_values(ascending=False)

In [14]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Impute once and reuse the same frame for every method below.
X_train_filled = X_train.fillna(0)

# Pearson correlation with target (absolute)
corrs = X_train.corrwith(y_train).abs().sort_values(ascending=False)
print('\nTop 15 features by abs(correlation) with target:')
print(corrs.head(15))

# Mutual Information (captures non-linear relationships).
# The target is a binary class label, so the classification estimator is used;
# the seed makes the (randomized) estimate reproducible.
mi = mutual_info_classif(X_train_filled, y_train, random_state=RANDOM_STATE)
mi_series = pd.Series(mi, index=X_train.columns).sort_values(ascending=False)
print('\nTop 15 features by mutual information:')
print(mi_series.head(15))

# VIF (optional - requires statsmodels)
vif = compute_vif(X_train_filled, X_train.columns.tolist())
if vif is not None:
    print('\nTop VIF values:')
    print(vif.head(15))

# LassoCV for embedded selection.
# Coefficient magnitudes are only comparable across features that share a scale,
# so every column is standardized before fitting. The 0/1 target is treated as a
# continuous response here; the model is used for ranking only.
X_train_std = pd.DataFrame(
    StandardScaler().fit_transform(X_train_filled),
    columns=X_train.columns,
    index=X_train.index,
)
lasso = LassoCV(cv=5, random_state=RANDOM_STATE, n_jobs=-1)
lasso.fit(X_train_std, y_train)
coef_abs = pd.Series(np.abs(lasso.coef_), index=X_train.columns).sort_values(ascending=False)
print('\nTop 15 features by absolute Lasso coefficients:')
print(coef_abs.head(15))

# RandomForest importance (tree splits do not depend on feature scale)
rf = RandomForestRegressor(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1)
rf.fit(X_train_filled, y_train)
rf_imp = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print('\nTop 15 features by RandomForest importance:')
print(rf_imp.head(15))

# Aggregate a simple ranking (average rank across methods)
rank_df = pd.DataFrame({
    'corr_rank': corrs.rank(ascending=False),
    'mi_rank': mi_series.rank(ascending=False),
    'lasso_rank': coef_abs.rank(ascending=False),
    'rf_rank': rf_imp.rank(ascending=False)
}).fillna(1e6) # a missing rank (e.g. NaN correlation) is treated as low importance

rank_df['mean_rank'] = rank_df.mean(axis=1)
ranked = rank_df.sort_values('mean_rank')

print('\nTop 20 features by aggregated mean rank:')
print(ranked.head(20))

# Save selected top features
selected_features = ranked.head(25).index.tolist()
print('\nSelected features (top 25):', selected_features)


Top 15 features by abs(correlation) with target:
Academic Pressure                                       0.588278
Age                                                     0.569466
Profession_Student                                      0.522620
Working Professional or Student_Student                 0.522620
Working Professional or Student_Working Professional    0.522620
CGPA                                                    0.513262
Suicide_Stress                                          0.464831
Job Satisfaction                                        0.435850
Study Satisfaction                                      0.418486
Mental_Risk                                             0.357863
Have you ever had suicidal thoughts ?                   0.351987
Total_Stress                                            0.342124
Degree_High School                                      0.287802
Work Pressure                                           0.259423
Burnout_Risk                            

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f6527928-bbf4-4c9a-8760-1f30ce0e5ba3' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>