In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.preprocessing import OneHotEncoder

In [3]:
## Load the raw training data into `data`; every cell below works on this frame.
## NOTE(review): the input is read from "../../data/" while the JSON/CSV outputs
## further down are written under "../data/" - confirm both locations are intended.
data = pd.read_csv(filepath_or_buffer="../../data/train.csv")

In [4]:
## Preview the first rows of the freshly loaded frame
data.head()

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


In [5]:
## One-Hot Encoding for Gender Column
## Fit the encoder on Gender, wrap the dense result in a frame whose columns are
## named by the encoder, and append those columns to `data`.
one_hot_encoder = OneHotEncoder(sparse_output=False)
gender_encoded = one_hot_encoder.fit_transform(data[["Gender"]])
gender_columns = one_hot_encoder.get_feature_names_out(["Gender"])
df_encoded = pd.DataFrame(data=gender_encoded, columns=gender_columns)
## Column-wise concat aligns on the row index
data = pd.concat(objs=[data, df_encoded], axis=1)

In [6]:
## Age column to categorical variable: 5 equal-width bins labelled 0 to 4
## Bins created - [(17.958, 26.4] < (26.4, 34.8] < (34.8, 43.2] < (43.2, 51.6] < (51.6, 60.0]]
## NOTE(review): with an integer bin count the edges are derived from the min/max
## Age of this frame, so a different dataset would get different edges.
data["Age_Group"] = pd.cut(x=data["Age"], bins=5, labels=list(range(5)))

In [7]:
## Cleaning City column
## Values of the City column that are collapsed into "Others": this list is
## passed to name_to_others() below, which overwrites every exact match.
names_to_change = [
    "Vidhi",
    "Ayush",
    "Krishna",
    "Aishwarya",
    "Keshav",
    "Harsha",
    "Nalini",
    "Aditya",
    "Malyansh",
    "Raghavendra",
    "Saanvi",
    "M.Tech",
    "Bhavna",
    "Nandini",
    "M.Com",
    "Plata",
    "Atharv",
    "Pratyush",
    "City",
    "3.0",
    "MCA",
    "Mira",
    "Moreadhyay",
    "Morena",
    "Ishkarsh",
    "Kashk",
    "Mihir",
    "Vidya",
    "Anvi",
    "Krinda",
    "Ayansh",
    "Shrey",
    "Ivaan",
    "Vaanya",
    "Gaurav",
    "Harsh",
    "Reyansh",
    "Kashish",
    "Kibara",
    "Vaishnavi",
    "Chhavi",
    "Parth",
    "Mahi",
    "Tushar",
    "MSc",
    "No",
    "Rashi",
    "ME",
    "Researcher",
    "Kagan",
    "Armaan",
    "Ithal",
    "Nalyan",
    "Dhruv",
    "Galesabad",
    "Itheg",
    "Aaradhya",
    "Pooja",
    "Khushi",
    "Jhanvi",
    "Unirar",
]


def name_to_others(lst, df=None):
    """Overwrite every "City" value listed in ``lst`` with "Others", in place.

    Parameters
    ----------
    lst : iterable of str
        City values to collapse into the "Others" bucket.
    df : pandas.DataFrame, optional
        Frame whose "City" column is modified. When omitted, the notebook-level
        ``data`` frame is used, so existing ``name_to_others(lst)`` calls behave
        as before.

    Returns
    -------
    None
        The frame is modified in place.
    """
    if df is None:
        df = data
    ## One vectorised membership test instead of one full-column scan per name
    df.loc[df["City"].isin(list(lst)), "City"] = "Others"


## Collapse the entries collected in names_to_change into "Others"
name_to_others(names_to_change)

## Misspelled City value -> the spelling it is rewritten to
city_corrections = {
    "Less Delhi": "Delhi",
    "Less than 5 Kalyan": "Kalyan",
    "Tolkata": "Kolkata",
    "Molkata": "Kolkata",
    "Khaziabad": "Ghaziabad",
}
wrong_spelling = list(city_corrections)

for name, corrected in city_corrections.items():
    data.loc[data["City"] == name, "City"] = corrected

## These two values are moved to "Others" as well
for city in ("Ishanabad", "Gurgaon"):
    data.loc[data["City"] == city, "City"] = "Others"

## Mean encoding for the city column:
## each city is replaced by the mean of Depression over the rows of that city
city_mean_encoded = data["Depression"].groupby(data["City"]).mean()
data["City_encoded"] = data["City"].map(city_mean_encoded)

In [9]:
import json

In [12]:
## Persist the city -> mean Depression lookup as JSON
city_mean_encoded.to_json(path_or_buf="../data/city_encoded.json", indent=1)

In [13]:
## Map Working Professional or Student column to 0 or 1
## The result is stored in a new "Student" column (1 if is Student else 0);
## any other value in the source column becomes NaN.
student_flag = {"Working Professional": 0, "Student": 1}
data["Student"] = data["Working Professional or Student"].map(student_flag)

In [14]:
## Fill the null values in "Profession":
## students get "Student", every other row with a missing value gets "Unemployed"
profession_missing = data["Profession"].isnull()
is_student = data["Student"] == 1

data.loc[profession_missing & is_student, "Profession"] = "Student"
data.loc[profession_missing & ~is_student, "Profession"] = "Unemployed"

## Replacing the Professions with similar names to the more common names
## (variant spelling -> name it is rewritten to)
profession_renames = {
    "Finanancial Analyst": "Financial Analyst",
    "Dev": "Software Engineer",
    "City Manager": "Manager",
    "Analyst": "Business Analyst",
    "Medical Doctor": "Doctor",
}
for variant, common_name in profession_renames.items():
    data.loc[data["Profession"] == variant, "Profession"] = common_name

## Replacing the other incorrect values with "Others"
## Every Profession value that exactly matches an entry below is overwritten.
profession_to_change = [
    "B.Com",
    "BE",
    "Yogesh",
    "MBA",
    "LLM",
    "BCA",
    "Academic",
    "Profession",
    "FamilyVirar",
    "BBA",
    "Working Professional",
    "MBBS",
    "Patna",
    "Unveil",
    "B.Ed",
    "Nagpur",
    "Moderate",
    "M.Ed",
    "Pranav",
    "Visakhapatnam",
    "PhD",
    "Yuvraj",
    "Family Consultant",
]
is_invalid_profession = data["Profession"].isin(profession_to_change)
data.loc[is_invalid_profession, "Profession"] = "Others"

## Performing mean encoding on the Profession column:
## each profession is replaced by the mean of Depression over its rows
profession_mean_encoded = data["Depression"].groupby(data["Profession"]).mean()
data["Profession_encoded"] = data["Profession"].map(profession_mean_encoded)

In [15]:
## Persist the profession -> mean Depression lookup as JSON
profession_mean_encoded.to_json(path_or_buf="../data/profession_encoded.json", indent=1)

In [10]:
## Combining Work Pressure with Academic Pressure
## Since majority of null value in Work Pressure are of those who are students and vice-versa
## therefore, we combine them
## The result is assigned back instead of calling fillna(inplace=True) on
## data["Work Pressure"]: that form updates a temporary Series and does not
## reach `data` when pandas Copy-on-Write is active.
data["Work Pressure"] = data["Work Pressure"].fillna(data["Academic Pressure"])

## Replacing the rest of null values with median Pressure
median_pressure = data["Work Pressure"].median()
data["Work Pressure"] = data["Work Pressure"].fillna(median_pressure)

In [11]:
## Combining Job Satisfaction with Study Satisfaction:
## nulls in Job Satisfaction are filled from Study Satisfaction first, and
## whatever is still null afterwards gets the median of the combined column.
## The result is assigned back instead of calling fillna(inplace=True) on
## data["Job Satisfaction"]: that form updates a temporary Series and does not
## reach `data` when pandas Copy-on-Write is active.
data["Job Satisfaction"] = data["Job Satisfaction"].fillna(data["Study Satisfaction"])
median_satisfaction = data["Job Satisfaction"].median()
data["Job Satisfaction"] = data["Job Satisfaction"].fillna(median_satisfaction)

In [12]:
## Replacing null values of CGPA with -1, since they are working professionals
## The result is assigned back instead of calling fillna(inplace=True) on
## data["CGPA"]: that form updates a temporary Series and does not reach `data`
## when pandas Copy-on-Write is active.
data["CGPA"] = data["CGPA"].fillna(-1)

## Round to the nearest whole number (np.round rounds exact halves to even)
data["CGPA"] = np.round(data["CGPA"])

In [13]:
## Cleaning Sleep Duration to make only 4 categories - 5, 6, 7, 8
## Less than 5 is 5 and more than 8 is 8
sleep_duration = {
    "More than 8 hours": 8,
    "5-6 hours": 6,
    "7-8 hours": 8,
    "1-2 hours": 5,  ## less than 5
    "6-8 hours": 7,
    "4-6 hours": 5,
    "6-7 hours": 7,
    "10-11 hours": 8,  ## more than 8
    "8-9 hours": 8,  ## more than 8
    "40-45 hours": 6,  ## weekly
    "9-11 hours": 8,  ## more than 8
    "2-3 hours": 5,  ## less than 5
    "3-4 hours": 5,  ## less than 5
    "Moderate": 7,
    "55-66 hours": 8,  ## weekly
    "4-5 hours": 5,  ## less than 5
    "9-6 hours": 8,
    "1-3 hours": 5,  ## less than 5
    "45": 6,
    "1-6 hours": 6,
    "35-36 hours": 5,  ## less than 5
    "8 hours": 8,
    "10-6 hours": 8,
    "than 5 hours": 5,
    "49 hours": 7,  ## weekly
    "3-6 hours": 5,
    "45-48 hours": 7,  ## weekly
    "9-5": 7,
    "9-5 hours": 7,
    "Less than 5 hours": 5,
}
## Any value that is not a key of the mapping becomes NaN here
data["Sleep Duration"] = data["Sleep Duration"].map(sleep_duration)

## Replacing null values with mode
## The result is assigned back instead of calling fillna(inplace=True) on
## data["Sleep Duration"]: that form updates a temporary Series and does not
## reach `data` when pandas Copy-on-Write is active.
sleep_mode = data["Sleep Duration"].mode()[0]
data["Sleep Duration"] = data["Sleep Duration"].fillna(sleep_mode)

In [14]:
## Cleaning Dietary Habits to 3 categories - 0, 1, 2, 2 being the unhealthy
dietary_habits = {
    "Unhealthy": 2,
    "Moderate": 1,
    "Healthy": 0,
    "More Healthy": 0,
    "No Healthy": 2,
    "Less Healthy": 2,
    "Less than Healthy": 1,
}
## Any value that is not a key of the mapping becomes NaN here
data["Dietary Habits"] = data["Dietary Habits"].map(dietary_habits)

## Replacing null values with Moderate = 1
## The result is assigned back instead of calling fillna(inplace=True) on
## data["Dietary Habits"]: that form updates a temporary Series and does not
## reach `data` when pandas Copy-on-Write is active.
data["Dietary Habits"] = data["Dietary Habits"].fillna(1)

In [15]:
## Mapping suicidal thoughts column to 0 or 1, 1 being Yes
## The encoded values go into a new "suicidal_thoughts" column; values other
## than "No"/"Yes" become NaN.
suicidal_thoughts = dict(No=0, Yes=1)
suicidal_answers = data["Have you ever had suicidal thoughts ?"]
data["suicidal_thoughts"] = suicidal_answers.map(suicidal_thoughts)

In [16]:
## Encoding the Family History column as 0 or 1, 1 being Yes
## The encoded values go into a new "family_history" column; values other
## than "No"/"Yes" become NaN.
family_history = dict(No=0, Yes=1)
family_answers = data["Family History of Mental Illness"]
data["family_history"] = family_answers.map(family_history)

In [None]:
## Replacing null values in Financial Stress column with its median
## The result is assigned back instead of calling fillna(inplace=True) on
## data["Financial Stress"]: that form updates a temporary Series and does not
## reach `data` when pandas Copy-on-Write is active.
median_financial_stress = data["Financial Stress"].median()
data["Financial Stress"] = data["Financial Stress"].fillna(median_financial_stress)

In [17]:
## Combining all degrees starting with 'B' to bachelors and 'M' with masters
## Both lists are collected before any replacement is made
bachelors = data[data["Degree"].str.startswith("B", na=False)]["Degree"].unique()
masters = data[data["Degree"].str.startswith("M", na=False)]["Degree"].unique()
data["Degree"] = data["Degree"].replace(bachelors, "bachelors")
data["Degree"] = data["Degree"].replace(masters, "masters")

## Replacing all the other values to their relevant categories
## Any value that is not a key below becomes NaN
degrees = {
    "bachelors": "bachelors",
    "masters": "masters",
    "PhD": "PhD",
    "LLB": "bachelors",
    "Class 12": "high_school",
    "LLM": "masters",
    "LL.Com": "masters",
    "LLCom": "masters",
    "LLTech": "bachelors",
    "LL B.Ed": "bachelors",
    "Doctor": "PhD",
    "N.Pharm": "masters",
}
data["Degree"] = data["Degree"].map(degrees)

## Replacing the null values with "others"
## The label must match the "others" key of degree_map below exactly; filling
## with "Others" (capital O) never matched that key, so these rows skipped
## code 0 and were given the median instead.
## The result is assigned back instead of calling fillna(inplace=True) on
## data["Degree"], which does not reach `data` under pandas Copy-on-Write.
data["Degree"] = data["Degree"].fillna("others")

## Mapping the degrees to 5 ordinal codes (0 = others ... 4 = PhD)
degree_map = {"others": 0, "high_school": 1, "bachelors": 2, "masters": 3, "PhD": 4}
data["Degree"] = data["Degree"].map(degree_map)

## Safety net: any value still null after the mapping gets the median code
median_degree = data["Degree"].median()
data["Degree"] = data["Degree"].fillna(median_degree)

In [20]:
## List every column now present (original columns plus the engineered ones)
data.columns

Index(['id', 'Name', 'Gender', 'Age', 'City',
       'Working Professional or Student', 'Profession', 'Academic Pressure',
       'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction',
       'Sleep Duration', 'Dietary Habits', 'Degree',
       'Have you ever had suicidal thoughts ?', 'Work/Study Hours',
       'Financial Stress', 'Family History of Mental Illness', 'Depression',
       'Gender_Female', 'Gender_Male', 'Age_Group', 'City_encoded', 'Student',
       'Profession_encoded', 'suicidal_thoughts', 'family_history'],
      dtype='object')

In [21]:
## Dropping the extra columns
## These are the raw columns that were re-encoded or merged into other columns
## above, plus Gender_Male, which leaves Gender_Female as the only gender flag.
cols_to_drop = [
    "id",
    "Name",
    "Gender",
    "Age",
    "City",
    "Working Professional or Student",
    "Profession",
    "Academic Pressure",
    "Study Satisfaction",
    "Have you ever had suicidal thoughts ?",
    "Family History of Mental Illness",
    "Gender_Male",
]

## Raises KeyError if any listed column is already gone (e.g. when re-running this cell)
data.drop(columns=cols_to_drop, inplace=True)

In [22]:
## Preview the first rows of the frame after the column drop
data.head()

Unnamed: 0,Work Pressure,CGPA,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Work/Study Hours,Financial Stress,Depression,Gender_Female,Age_Group,City_encoded,Student,Profession_encoded,suicidal_thoughts,family_history
0,5.0,-1.0,2.0,8.0,0.0,2.0,1.0,2.0,0,1.0,3,0.19269,0,0.048567,0,0
1,4.0,-1.0,3.0,5.0,2.0,2.0,7.0,3.0,1,0.0,0,0.142206,0,0.055649,1,0
2,5.0,9.0,2.0,6.0,0.0,2.0,3.0,1.0,1,0.0,1,0.159196,1,0.585061,1,0
3,5.0,-1.0,1.0,5.0,1.0,2.0,10.0,1.0,1,0.0,0,0.131293,0,0.055649,1,1
4,1.0,-1.0,1.0,6.0,2.0,2.0,9.0,4.0,0,1.0,1,0.125739,0,0.05661,1,1


In [23]:
## Saving the cleaned frame as a csv file, without the row index
data.to_csv(path_or_buf="../data/cleaned_train.csv", index=False)