In [33]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [34]:
# Load the raw test split into `data`; every later cell transforms this frame.
# NOTE(review): this path climbs two directories ("../../data") while the encoding
# JSON files read further down use "../data" - confirm both resolve from the
# notebook's working directory.
data = pd.read_csv("../../data/test.csv")
# Preview the first rows of the untouched input.
data.head()

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness
0,140700,Shivam,Male,53.0,Visakhapatnam,Working Professional,Judge,,2.0,,,5.0,Less than 5 hours,Moderate,LLB,No,9.0,3.0,Yes
1,140701,Sanya,Female,58.0,Kolkata,Working Professional,Educational Consultant,,2.0,,,4.0,Less than 5 hours,Moderate,B.Ed,No,6.0,4.0,No
2,140702,Yash,Male,53.0,Jaipur,Working Professional,Teacher,,4.0,,,1.0,7-8 hours,Moderate,B.Arch,Yes,12.0,4.0,No
3,140703,Nalini,Female,23.0,Rajkot,Student,,5.0,,6.84,1.0,,More than 8 hours,Moderate,BSc,Yes,10.0,4.0,No
4,140704,Shaurya,Male,47.0,Kalyan,Working Professional,Teacher,,5.0,,,5.0,7-8 hours,Moderate,BCA,Yes,3.0,4.0,No


In [35]:
## Gender Column
# One-hot encode Gender into dense indicator columns named "Gender_<category>".
# NOTE(review): the encoder is fitted on this file, so the produced columns depend
# on the categories present here - confirm they match what was used for training.
one_hot_encoder = OneHotEncoder(sparse_output=False)
gender_encoded = one_hot_encoder.fit_transform(data[["Gender"]])
df_encoded = pd.DataFrame(
    gender_encoded,
    columns=one_hot_encoder.get_feature_names_out(["Gender"]),
    # pd.concat(axis=1) aligns on index labels, so reuse data's index instead of
    # relying on both frames happening to share a default RangeIndex.
    index=data.index,
)
data = pd.concat([data, df_encoded], axis=1)

In [36]:
## Age column
def cat_age(age):
    """Bucket an age into one of five ordinal bins, returned as an int in 0..4.

    Bins are defined by inclusive upper bounds:
    0: age <= 26.4, 1: <= 34.8, 2: <= 43.2, 3: <= 51.6, 4: everything above.

    Ages below 18 fall into bin 0 (the youngest bin) rather than dropping through
    to bin 4. NaN compares False against every bound and therefore returns 4.
    """
    upper_bounds = (26.4, 34.8, 43.2, 51.6)
    for bucket, upper in enumerate(upper_bounds):
        if age <= upper:
            return bucket
    return 4

# Replace each raw age with its bucket index, element by element.
# "Age" is listed in cols_to_drop further down, so this column does not reach
# the final feature frame.
data["Age"] = data["Age"].map(cat_age)

In [37]:
## Creating Student column
# Binary flag: 1 for "Student", 0 for "Working Professional".
# Any other label (or a missing value) is left as NaN by .map().
student_flag = {"Working Professional": 0, "Student": 1}
data["Student"] = data["Working Professional or Student"].map(student_flag)

In [38]:
## Work Pressure and Academic Pressure
# Merge the two pressure columns into "Work Pressure": where it is missing, take
# the row's "Academic Pressure"; whatever is still missing gets the median of the
# merged column.
# The result is assigned back instead of calling data[col].fillna(..., inplace=True):
# that chained in-place form acts on a temporary Series, emits a FutureWarning on
# pandas 2.2 and silently leaves `data` unchanged under Copy-on-Write (pandas 3).
work_pressure = data["Work Pressure"].fillna(data["Academic Pressure"])
data["Work Pressure"] = work_pressure.fillna(work_pressure.median())

In [39]:
## Job Satisfaction and Study Satisfaction
# Merge the two satisfaction columns into "Job Satisfaction": where it is missing,
# take the row's "Study Satisfaction"; remaining gaps get the median of the merged
# column.
# Assigned back rather than using the chained data[col].fillna(..., inplace=True),
# which works on a temporary Series and does not update `data` under pandas
# Copy-on-Write.
job_satisfaction = data["Job Satisfaction"].fillna(data["Study Satisfaction"])
median_satisfaction = job_satisfaction.median()
data["Job Satisfaction"] = job_satisfaction.fillna(median_satisfaction)

In [40]:
## CGPA
# Missing CGPA is encoded with the sentinel -1, then every value is rounded to a
# whole number with np.round.
# The fill is assigned back rather than done through the chained
# data["CGPA"].fillna(-1, inplace=True), which acts on a temporary Series and does
# not update `data` under pandas Copy-on-Write.
data["CGPA"] = np.round(data["CGPA"].fillna(-1))

In [41]:
## Creating suicidal_thoughts
# Encode the Yes/No answer as 1/0; answers outside the mapping become NaN.
suicidal_thoughts = dict(No=0, Yes=1)
answer_column = "Have you ever had suicidal thoughts ?"
data["suicidal_thoughts"] = data[answer_column].map(suicidal_thoughts)

In [42]:
## Creating family_history
# Encode the Yes/No answer as 1/0; answers outside the mapping become NaN.
family_history = dict(No=0, Yes=1)
history_column = "Family History of Mental Illness"
data["family_history"] = data[history_column].map(family_history)

In [43]:
## Cleaning City column
# Step 1: rewrite known misspelt / prefixed variants to their canonical city name.
city_fixes = {
    "Malyan": "Kalyan",
    "Mhopal": "Bhopal",
    "Thani": "Thane",
    "Is Kanpur": "Kanpur",
    "Golkata": "Kolkata",
    "Less Delhi": "Delhi",
    "San Vasai-Virar": "Vasai-Virar",
    "More Delhi": "Delhi",
    "Rolkata": "Kolkata",
    "Ghopal": "Bhopal",
}
for misspelt, canonical in city_fixes.items():
    data.loc[data["City"] == misspelt, "City"] = canonical

# Step 2: pool every value on this list into the single category "Others".
others = [
    "Pratyush", "Vidya", "Less than 5 hours", "Aditi", "Keshav", "Nalini", "Avni",
    "Ira", "Vaishnavi", "Bhavna", "Lawyer", "Hrithik", "City", "Unaly", "Sara",
    "Saurav", "Vikram", "Parth", "Siddhesh", "Vaikot", "Leela", "Chemist", "No",
    "Saanvi", "Pratham", "Vidhi", "Abhinav", "No.12",
]
data.loc[data["City"].isin(others), "City"] = "Others"

In [44]:
# Load the city lookup from JSON as a pandas Series (typ="series"). It is used
# below as the mapping for data["City"], so it is presumably keyed by city name
# with a numeric code as value - how those codes were produced is not visible here.
city_encoded = pd.read_json("../data/city_encoded.json", typ="series")

In [45]:
# Translate each cleaned city into its numeric code from the `city_encoded` lookup.
# Cities missing from the lookup come back as NaN from .map(); they fall back to
# the median of the mapped codes, the same rule used for Profession_encoded.
city_codes = data["City"].map(city_encoded)
data["City_encoded"] = city_codes.fillna(city_codes.median())

In [46]:
## Profession column
# Missing profession: rows flagged Student == 1 become "Student"; every other
# missing value becomes "Unemployed".
is_student_without_profession = data["Profession"].isnull() & (data["Student"] == 1)
data.loc[is_student_without_profession, "Profession"] = "Student"
data.loc[data["Profession"].isnull(), "Profession"] = "Unemployed"

# Values on this list are pooled into the single category "Others".
others = [
    "Unhealthy",
    "B.Ed",
    "Working Professional",
    "3M",
    "ME",
    "B.Pharm",
    "24th",
    "Manvi",
    "Yogesh",
    "Samar",
    "Surat",
    "PhD",
    "M.Ed",
    "MD",
    "Name",
    "MCA",
    "Simran",
    "Profession",
    "BBA",
    "M.Tech",
    "LLM",
    "No",
    "Unveil",
    "City Consultant",
    "M.Pharm",
]
data.loc[data["Profession"].isin(others), "Profession"] = "Others"

# Fold two labels into the category they are treated as equivalent to.
data.loc[data["Profession"] == "Analyst", "Profession"] = "Business Analyst"
data.loc[data["Profession"] == "Surgeon", "Profession"] = "Doctor"

# Translate professions into numeric codes from the JSON lookup; professions
# missing from the lookup fall back to the median of the mapped codes.
# The fallback is assigned back rather than applied with the chained
# data[col].fillna(..., inplace=True), which acts on a temporary Series and does
# not update `data` under pandas Copy-on-Write.
profession_encoded = pd.read_json("../data/profession_encoded.json", typ="series")
profession_codes = data["Profession"].map(profession_encoded)
data["Profession_encoded"] = profession_codes.fillna(profession_codes.median())

In [47]:
# Map the free-text "Sleep Duration" answers onto a small set of numeric values
# (5, 6, 7 or 8). Answers not listed here, including missing ones, become 8.
# NOTE(review): several keys look like data-entry noise (e.g. "60-65 hours",
# "20-21 hours"); their assigned values are kept exactly as they were.
sleep_duration = {
    "Less than 5 hours": 5,
    "7-8 hours": 8,
    "More than 8 hours": 8,
    "60-65 hours": 8,
    "1-6 hours": 5,
    "5-6 hours": 6,
    "9-5": 7,
    "9-5 hours": 7,
    "6-7 hours": 7,
    "3-4 hours": 5,
    "8-9 hours": 8,
    "4-5 hours": 5,
    "9-6 hours": 8,
    "1-2 hours": 5,
    "8-89 hours": 8,
    "6-8 hours": 7,
    "4-6 hours": 5,
    "than 5 hours": 5,
    "20-21 hours": 5,
    "10-6 hours": 8,
    "1-3 hours": 5,
    "6 hours": 6,
    "50-75 hours": 8,
    "2-3 hours": 5,
    "9-11 hours": 8,
    "9-10 hours": 8,
    "3-6 hours": 5,
}
# Assign the filled result back: the chained data[col].fillna(8, inplace=True)
# acts on a temporary Series and does not update `data` under pandas Copy-on-Write.
data["Sleep Duration"] = data["Sleep Duration"].map(sleep_duration).fillna(8)

In [48]:
# Map "Dietary Habits" answers onto an ordinal scale: 0 = healthy, 1 = moderate,
# 2 = unhealthy. Answers not listed here, including missing ones, become 1.
dietary_habits = {
    "Unhealthy": 2,
    "Moderate": 1,
    "Healthy": 0,
    "More Healthy": 0,
    "No Healthy": 2,
    "Less Healthy": 2,
    "Less than Healthy": 1,
    "No": 2,
    "5 Healthy": 0,
    "5 Unhealthy": 2,
}
# Assign the filled result back: the chained data[col].fillna(1, inplace=True)
# acts on a temporary Series and does not update `data` under pandas Copy-on-Write.
data["Dietary Habits"] = data["Dietary Habits"].map(dietary_habits).fillna(1)

In [49]:
# Step 1: collapse every raw degree label starting with "B" to "bachelors" and
# every label starting with "M" to "masters" (missing values match neither).
bachelors = data[data["Degree"].str.startswith("B", na=False)]["Degree"].unique()
masters = data[data["Degree"].str.startswith("M", na=False)]["Degree"].unique()
data["Degree"] = data["Degree"].replace(bachelors, "bachelors")
data["Degree"] = data["Degree"].replace(masters, "masters")

# Step 2: assign the remaining known labels to an education level. Anything not
# listed (or missing) becomes "others".
degrees = {
    "bachelors": "bachelors",
    "masters": "masters",
    "PhD": "PhD",
    "LLB": "bachelors",
    "Class 12": "high_school",
    "LLM": "masters",
    "A.Ed": "bachelors",
    "S.Pharm": "bachelors",
    "I.Ed": "bachelors",
    "GCA": "bachelors",
    "G.Ed": "bachelors",
    "RCA": "bachelors",
    "PCA": "bachelors",
    "J.Ed": "bachelors",
    "K.Ed": "bachelors",
}
# The fallback label must match the "others" key of degree_map exactly. The
# previous fill value "Others" (capital O) had no entry there, so unknown degrees
# were turned into NaN and then into 1 (high_school) instead of 0.
# NOTE(review): this changes the code of unknown degrees from 1 to 0 - apply the
# same correction wherever the training data is preprocessed.
degree_level = data["Degree"].map(degrees).fillna("others")

# Step 3: ordinal-encode the education level.
degree_map = {"others": 0, "high_school": 1, "bachelors": 2, "masters": 3, "PhD": 4}
# Results are assigned back instead of using the chained
# data["Degree"].fillna(..., inplace=True), which acts on a temporary Series and
# does not update `data` under pandas Copy-on-Write. The trailing fillna(1) is a
# safety net only: every level produced above has an entry in degree_map.
data["Degree"] = degree_level.map(degree_map).fillna(1)

In [50]:
## Dropping the extra columns
# Remove identifiers, the raw columns that were re-encoded into new columns above,
# and the Gender_Male indicator (Gender_Female is kept). "Age" is removed as well,
# so the age buckets computed earlier are not part of the final frame.
cols_to_drop = [
    "id", "Name", "Gender", "Age", "City",
    "Working Professional or Student", "Profession",
    "Academic Pressure", "Study Satisfaction",
    "Have you ever had suicidal thoughts ?",
    "Family History of Mental Illness",
    "Gender_Male",
]

data.drop(columns=cols_to_drop, inplace=True)

In [51]:
# Preview the final feature frame that remains after the column drop.
data.head()

Unnamed: 0,Work Pressure,CGPA,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Work/Study Hours,Financial Stress,Gender_Female,Student,suicidal_thoughts,family_history,City_encoded,Profession_encoded
0,2.0,-1.0,5.0,5.0,1.0,2.0,9.0,3.0,0.0,0,0,1,0.159196,0.108645
1,2.0,-1.0,4.0,5.0,1.0,2.0,6.0,4.0,1.0,0,0,0,0.173577,0.074684
2,4.0,-1.0,1.0,8.0,1.0,2.0,12.0,4.0,0.0,0,1,0,0.181377,0.055649
3,5.0,7.0,1.0,8.0,1.0,2.0,10.0,4.0,1.0,1,1,0,0.173228,0.585061
4,5.0,-1.0,5.0,8.0,1.0,2.0,3.0,4.0,0.0,0,1,0,0.199636,0.055649
