### 1. Import Dependencies

In [32]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from category_encoders import OrdinalEncoder

### 2. Load Data

In [33]:
# Read the processed train/test CSVs (missing values already handled, going by
# the file names) into train_df and test_df.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/Processed/DP_train_missing_values_hanlded.csv',
        '../Data/Processed/DP_test_missing_values_hanlded.csv',
    )
)

In [34]:
# Report how many distinct values each object-dtype column of the training
# frame holds; the counts guide the choice of encoder per column below.
object_cols = [name for name in train_df.columns if train_df[name].dtype == 'object']
for name in object_cols:
    n_unique = train_df[name].nunique()
    print(f"{name} - {n_unique} unique values")

Gender - 2 unique values
City - 30 unique values
Working Professional or Student - 2 unique values
Profession - 37 unique values
Sleep Duration - 4 unique values
Dietary Habits - 3 unique values
Degree - 27 unique values
Have you ever had suicidal thoughts ? - 2 unique values
Family History of Mental Illness - 2 unique values


In [35]:
# Show every column name of the training frame as a plain Python list.
list(train_df.columns)

['id',
 'Gender',
 'Age',
 'City',
 'Working Professional or Student',
 'Profession',
 'Academic Pressure',
 'Work Pressure',
 'CGPA',
 'Study Satisfaction',
 'Job Satisfaction',
 'Sleep Duration',
 'Dietary Habits',
 'Degree',
 'Have you ever had suicidal thoughts ?',
 'Work/Study Hours',
 'Financial Stress',
 'Family History of Mental Illness',
 'Depression']

### 3. Encoding Columns

#### 3.1 One-Hot Encoding

In [36]:
# Encode on copies so the loaded train_df / test_df stay untouched.
X_train, X_test = train_df.copy(), test_df.copy()

In [37]:
# Low-cardinality nominal columns (2-3 categories each in the training data)
# that are expanded into indicator columns.
ohe_cols = [
    'Gender',
    'Dietary Habits',
    'Have you ever had suicidal thoughts ?',
    'Family History of Mental Illness',
    'Working Professional or Student',
]

# Fit on the training split only; the same fitted encoder transforms both splits.
ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
ohe.fit(X_train[ohe_cols])
ohe_feature_names = ohe.get_feature_names_out(ohe_cols)


def _one_hot_frame(frame):
    """Return the encoded ``ohe_cols`` of ``frame`` as a DataFrame sharing ``frame``'s index."""
    return pd.DataFrame(
        ohe.transform(frame[ohe_cols]),
        columns=ohe_feature_names,
        index=frame.index,
    )


ohe_train = _one_hot_frame(X_train)
ohe_test = _one_hot_frame(X_test)

# Swap the raw categorical columns for their encoded counterparts.
X_train = pd.concat([X_train.drop(columns=ohe_cols), ohe_train], axis=1)
X_test = pd.concat([X_test.drop(columns=ohe_cols), ohe_test], axis=1)

#### 3.2 Ordinal Encoding

In [38]:
# Sleep Duration labels mapped to increasing integer ranks (shortest = 0).
sleep_map = {
    'Less than 5 hours': 0,
    '5-6 hours': 1,
    '7-8 hours': 2,
    'More than 8 hours': 3,
}

# Same mapping for both splits; a label that is not a key of sleep_map becomes NaN.
for frame in (X_train, X_test):
    frame['Sleep Duration'] = frame['Sleep Duration'].map(sleep_map)

In [39]:
# Preview the first five test rows after one-hot and ordinal encoding.
X_test.head(n=5)

Unnamed: 0,id,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Degree,Work/Study Hours,Financial Stress,Gender_Male,Dietary Habits_Moderate,Dietary Habits_Unhealthy,Have you ever had suicidal thoughts ?_Yes,Family History of Mental Illness_Yes,Working Professional or Student_Working Professional
0,140700,53.0,Visakhapatnam,Judge,2.0,2.0,7.77,3.0,5.0,0,LLB,9.0,3.0,1.0,1.0,0.0,0.0,1.0,1.0
1,140701,58.0,Kolkata,Educational Consultant,2.0,2.0,7.77,3.0,4.0,0,B.Ed,6.0,4.0,0.0,1.0,0.0,0.0,0.0,1.0
2,140702,53.0,Jaipur,Teacher,2.0,4.0,7.77,3.0,1.0,2,B.Arch,12.0,4.0,1.0,1.0,0.0,1.0,0.0,1.0
3,140703,23.0,Rajkot,Not Specified,5.0,3.0,6.84,1.0,3.0,3,BSc,10.0,4.0,0.0,1.0,0.0,1.0,0.0,0.0
4,140704,47.0,Kalyan,Teacher,2.0,5.0,7.77,3.0,5.0,2,BCA,3.0,4.0,1.0,1.0,0.0,1.0,0.0,1.0


#### 3.3 Target Encoding

In [40]:
# Target-encode City: each city is replaced by the mean of Depression over the
# training rows for that city. The mapping is built from X_train only.
# NOTE(review): training rows are encoded with a statistic that includes their
# own target value; consider out-of-fold or smoothed encoding if this leaks.
city_target_map = X_train.groupby("City")["Depression"].mean()

# Fallback for cities the mapping does not contain: without it, Series.map
# leaves NaN for any test city that never occurs in X_train.
city_fallback = X_train["Depression"].mean()

# Apply mapping to X_train and test
X_train["City_TE"] = X_train["City"].map(city_target_map).fillna(city_fallback)
X_test["City_TE"] = X_test["City"].map(city_target_map).fillna(city_fallback)


In [41]:
# Target-encode Profession: each profession is replaced by the mean of
# Depression over the training rows for that profession (built from X_train only).
# NOTE(review): training rows are encoded with a statistic that includes their
# own target value; consider out-of-fold or smoothed encoding if this leaks.
profession_target_map = X_train.groupby("Profession")["Depression"].mean()

# Fallback for professions the mapping does not contain: without it, Series.map
# leaves NaN for any test profession that never occurs in X_train.
profession_fallback = X_train["Depression"].mean()

X_train["Profession_TE"] = X_train["Profession"].map(profession_target_map).fillna(profession_fallback)
X_test["Profession_TE"] = X_test["Profession"].map(profession_target_map).fillna(profession_fallback)


In [42]:
# Target-encode Degree: each degree is replaced by the mean of Depression over
# the training rows for that degree (built from X_train only).
# NOTE(review): training rows are encoded with a statistic that includes their
# own target value; consider out-of-fold or smoothed encoding if this leaks.
degree_target_map = X_train.groupby("Degree")["Depression"].mean()

# Fallback for degrees the mapping does not contain: without it, Series.map
# leaves NaN for any test degree that never occurs in X_train.
degree_fallback = X_train["Depression"].mean()

X_train["Degree_TE"] = X_train["Degree"].map(degree_target_map).fillna(degree_fallback)
X_test["Degree_TE"] = X_test["Degree"].map(degree_target_map).fillna(degree_fallback)


In [43]:
# The raw text columns are superseded by their *_TE counterparts; remove them
# from both splits in place.
target_encoded_sources = ["City", "Profession", "Degree"]
for frame in (X_train, X_test):
    frame.drop(columns=target_encoded_sources, inplace=True)

In [44]:
# Preview the first five rows of the fully encoded training frame.
X_train.head(n=5)

Unnamed: 0,id,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Work/Study Hours,Financial Stress,Depression,Gender_Male,Dietary Habits_Moderate,Dietary Habits_Unhealthy,Have you ever had suicidal thoughts ?_Yes,Family History of Mental Illness_Yes,Working Professional or Student_Working Professional,City_TE,Profession_TE,Degree_TE
0,0,49.0,3.0,5.0,7.77,4.0,2.0,3,1.0,2.0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.192838,0.048235,0.172751
1,1,26.0,3.0,4.0,7.77,3.0,3.0,0,7.0,3.0,1,1.0,0.0,1.0,1.0,0.0,1.0,0.142205,0.055707,0.150943
2,2,33.0,5.0,3.0,8.97,2.0,3.0,1,3.0,1.0,1,1.0,0.0,0.0,1.0,0.0,0.0,0.159095,0.534754,0.119426
3,3,22.0,2.0,5.0,7.77,3.0,1.0,0,10.0,1.0,1,1.0,1.0,0.0,1.0,1.0,1.0,0.131075,0.055707,0.134447
4,4,30.0,4.0,1.0,7.77,3.0,1.0,1,9.0,4.0,0,0.0,0.0,1.0,1.0,1.0,1.0,0.125768,0.056753,0.134447


In [45]:
# Preview the first five rows of the fully encoded test frame.
X_test.head(n=5)

Unnamed: 0,id,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Work/Study Hours,Financial Stress,Gender_Male,Dietary Habits_Moderate,Dietary Habits_Unhealthy,Have you ever had suicidal thoughts ?_Yes,Family History of Mental Illness_Yes,Working Professional or Student_Working Professional,City_TE,Profession_TE,Degree_TE
0,140700,53.0,2.0,2.0,7.77,3.0,5.0,0,9.0,3.0,1.0,1.0,0.0,0.0,1.0,1.0,0.159095,0.108708,0.150943
1,140701,58.0,2.0,2.0,7.77,3.0,4.0,0,6.0,4.0,0.0,1.0,0.0,0.0,0.0,1.0,0.172972,0.074789,0.128337
2,140702,53.0,2.0,4.0,7.77,3.0,1.0,2,12.0,4.0,1.0,1.0,0.0,1.0,0.0,1.0,0.181545,0.055707,0.15772
3,140703,23.0,5.0,3.0,6.84,1.0,3.0,3,10.0,4.0,0.0,1.0,0.0,1.0,0.0,0.0,0.17301,0.534754,0.155409
4,140704,47.0,2.0,5.0,7.77,3.0,5.0,2,3.0,4.0,1.0,1.0,0.0,1.0,0.0,1.0,0.199514,0.055707,0.203839


In [46]:
# Write the encoded splits to CSV without the DataFrame index.
for frame, out_path in (
    (X_train, '../Data/Processed/DP_train_encoded.csv'),
    (X_test, '../Data/Processed/DP_test_encoded.csv'),
):
    frame.to_csv(out_path, index=False)