In [1]:
import pandas as pd
import numpy as np
import warnings
# Installs a blanket "ignore" filter: every Python warning is silenced for the
# rest of the session, which includes pandas deprecation/FutureWarnings.
# NOTE(review): consider narrowing this to specific categories so that library
# deprecations raised by the cleaning cells stay visible.
warnings.filterwarnings('ignore')

In [2]:
## Load the training CSV (path is relative to the notebook's working directory)
train_csv_path = "../../data/train.csv"
data = pd.read_csv(train_csv_path)
## Preview the first rows
data.head()

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


In [3]:
## Number of missing values in each column
data.isna().sum()

id                                            0
Name                                          0
Gender                                        0
Age                                           0
City                                          0
Working Professional or Student               0
Profession                                36630
Academic Pressure                        112803
Work Pressure                             27918
CGPA                                     112802
Study Satisfaction                       112803
Job Satisfaction                          27910
Sleep Duration                                0
Dietary Habits                                4
Degree                                        2
Have you ever had suicidal thoughts ?         0
Work/Study Hours                              0
Financial Stress                              4
Family History of Mental Illness              0
Depression                                    0
dtype: int64

In [6]:
## Cleaning City column
## "City" values that are not cities (apparently person names, degree
## abbreviations and stray tokens); all of them are collapsed into "Others".
names_to_change = [
    "Vidhi",
    "Ayush",
    "Krishna",
    "Aishwarya",
    "Keshav",
    "Harsha",
    "Nalini",
    "Aditya",
    "Malyansh",
    "Raghavendra",
    "Saanvi",
    "M.Tech",
    "Bhavna",
    "Nandini",
    "M.Com",
    "Plata",
    "Atharv",
    "Pratyush",
    "City",
    "3.0",
    "MCA",
    "Mira",
    "Moreadhyay",
    "Morena",
    "Ishkarsh",
    "Kashk",
    "Mihir",
    "Vidya",
    "Anvi",
    "Krinda",
    "Ayansh",
    "Shrey",
    "Ivaan",
    "Vaanya",
    "Gaurav",
    "Harsh",
    "Reyansh",
    "Kashish",
    "Kibara",
    "Vaishnavi",
    "Chhavi",
    "Parth",
    "Mahi",
    "Tushar",
    "MSc",
    "No",
    "Rashi",
    "ME",
    "Researcher",
    "Kagan",
    "Armaan",
    "Ithal",
    "Nalyan",
    "Dhruv",
    "Galesabad",
    "Itheg",
    "Aaradhya",
    "Pooja",
    "Khushi",
    "Jhanvi",
    "Unirar",
]


def name_to_others(lst):
    """Relabel rows of the global ``data`` whose "City" is in ``lst`` as "Others".

    Parameters
    ----------
    lst : iterable of str
        Exact "City" values to collapse into the "Others" category.

    Returns
    -------
    None
        ``data`` is modified in place.
    """
    # A single vectorised membership test replaces one full-column
    # comparison per name.
    data.loc[data["City"].isin(list(lst)), "City"] = "Others"


name_to_others(names_to_change)

## Garbled / misspelled city values and the city each one is corrected to
city_spelling_fixes = {
    "Less Delhi": "Delhi",
    "Less than 5 Kalyan": "Kalyan",
    "Tolkata": "Kolkata",
    "Molkata": "Kolkata",
    "Khaziabad": "Ghaziabad",
}
## Kept as a list under its original name (same values, same order)
wrong_spelling = list(city_spelling_fixes)

## Exact-value replacement; values not listed in the dict are left unchanged
data["City"] = data["City"].replace(city_spelling_fixes)

## Two further values are also folded into "Others"
name_to_others(["Ishanabad", "Gurgaon"])

In [8]:
## Missing "Profession": rows labelled "Student" get "Student" ...
profession_missing = data["Profession"].isna()
is_student = data["Working Professional or Student"].eq("Student")
data.loc[profession_missing & is_student, "Profession"] = "Student"

## ... and whatever is still missing afterwards becomes "Unemployed"
data.loc[data["Profession"].isna(), "Profession"] = "Unemployed"

## Fold misspelled / variant titles into the more common name
## (applied one at a time, in this order)
profession_aliases = {
    "Finanancial Analyst": "Financial Analyst",
    "Dev": "Software Engineer",
    "City Manager": "Manager",
    "Analyst": "Business Analyst",
    "Medical Doctor": "Doctor",
}
for variant, canonical in profession_aliases.items():
    data.loc[data["Profession"].eq(variant), "Profession"] = canonical

## Remaining values that are treated as invalid professions -> "Others"
profession_to_change = [
    "B.Com",
    "BE",
    "Yogesh",
    "MBA",
    "LLM",
    "BCA",
    "Academic",
    "Profession",
    "FamilyVirar",
    "BBA",
    "Working Professional",
    "MBBS",
    "Patna",
    "Unveil",
    "B.Ed",
    "Nagpur",
    "Moderate",
    "M.Ed",
    "Pranav",
    "Visakhapatnam",
    "PhD",
    "Yuvraj",
    "Family Consultant",
]
is_invalid_profession = data["Profession"].isin(profession_to_change)
data["Profession"] = data["Profession"].mask(is_invalid_profession, "Others")

In [9]:
## Fill missing "Work Pressure" with the same row's "Academic Pressure".
## The result is assigned back to the column: calling
## data[col].fillna(..., inplace=True) is chained assignment, which acts on a
## temporary object under pandas Copy-on-Write and would leave `data` unchanged.
data["Work Pressure"] = data["Work Pressure"].fillna(data["Academic Pressure"])

## Replacing the rest of null values with median Pressure
median_pressure = data["Work Pressure"].median()
data["Work Pressure"] = data["Work Pressure"].fillna(median_pressure)

## Doing the same process for Study Satisfaction and Job Satisfaction
data["Job Satisfaction"] = data["Job Satisfaction"].fillna(data["Study Satisfaction"])
median_satisfaction = data["Job Satisfaction"].median()
data["Job Satisfaction"] = data["Job Satisfaction"].fillna(median_satisfaction)

## Round CGPA to the nearest whole number (missing values stay missing)
data["CGPA"] = np.round(data["CGPA"])

In [10]:
## Cleaning Sleep Duration to make only 4 categories - 5, 6, 7, 8
## Less than 5 is 5 and more than 8 is 8
sleep_duration = {
    "More than 8 hours": 8,
    "5-6 hours": 6,
    ## NOTE(review): "7-8 hours" gets the same code as "More than 8 hours";
    ## confirm this merge is intended rather than 7.
    "7-8 hours": 8,
    "1-2 hours": 5,  ## less than 5
    "6-8 hours": 7,
    "4-6 hours": 5,
    "6-7 hours": 7,
    "10-11 hours": 8,  ## more than 8
    "8-9 hours": 8,  ## more than 8
    "40-45 hours": 6,  ## weekly
    "9-11 hours": 8,  ## more than 8
    "2-3 hours": 5,  ## less than 5
    "3-4 hours": 5,  ## less than 5
    "Moderate": 7,
    "55-66 hours": 8,  ## weekly
    "4-5 hours": 5,  ## less than 5
    "9-6 hours": 8,
    "1-3 hours": 5,  ## less than 5
    "45": 6,
    "1-6 hours": 6,
    "35-36 hours": 5,  ## less than 5
    "8 hours": 8,
    "10-6 hours": 8,
    "than 5 hours": 5,
    "49 hours": 7,  ## weekly
    "3-6 hours": 5,
    "45-48 hours": 7,  ## weekly
    "9-5": 7,
    "9-5 hours": 7,
    "Less than 5 hours": 5,
}
## Values that are not keys of the mapping become NaN here
data["Sleep Duration"] = data["Sleep Duration"].map(sleep_duration)

## Replacing null values with mode.
## Assigned back rather than fillna(..., inplace=True) on data[col]: the chained
## in-place form does not update `data` under pandas Copy-on-Write.
sleep_mode = data["Sleep Duration"].mode()[0]
data["Sleep Duration"] = data["Sleep Duration"].fillna(sleep_mode)

In [11]:
## Collapse the "Dietary Habits" spellings into three categories;
## any value that is not a key below becomes NaN after the map.
dietary_habits = {
    "Unhealthy": "Unhealthy",
    "Moderate": "Moderate",
    "Healthy": "Healthy",
    "More Healthy": "Healthy",
    "No Healthy": "Unhealthy",
    "Less Healthy": "Unhealthy",
    "Less than Healthy": "Moderate",
}
data["Dietary Habits"] = data["Dietary Habits"].map(dietary_habits)

## Replacing null values (originally missing or unmapped) with "Moderate".
## Assigned back rather than fillna(..., inplace=True) on data[col]: the chained
## in-place form does not update `data` under pandas Copy-on-Write.
data["Dietary Habits"] = data["Dietary Habits"].fillna("Moderate")

In [12]:
## Fill missing "Financial Stress" with the column median.
## Assigned back rather than fillna(..., inplace=True) on data[col]: the chained
## in-place form does not update `data` under pandas Copy-on-Write.
data["Financial Stress"] = data["Financial Stress"].fillna(
    data["Financial Stress"].median()
)

In [14]:
## Combining all degrees starting with 'B' to bachelors and 'M' with masters
## (both value lists are taken from the column before anything is replaced)
bachelors = data[data["Degree"].str.startswith("B", na=False)]["Degree"].unique()
masters = data[data["Degree"].str.startswith("M", na=False)]["Degree"].unique()
data["Degree"] = data["Degree"].replace(bachelors, "bachelors")
data["Degree"] = data["Degree"].replace(masters, "masters")

## Replacing all the other values to their relevant categories;
## any value that is not a key below becomes NaN after the map.
degrees = {
    "bachelors": "bachelors",
    "masters": "masters",
    "PhD": "PhD",
    "LLB": "bachelors",
    "Class 12": "high_school",
    "LLM": "masters",
    "LL.Com": "masters",
    "LLCom": "masters",
    "LLTech": "bachelors",
    "LL B.Ed": "bachelors",
    "Doctor": "PhD",
    "N.Pharm": "masters",
}
data["Degree"] = data["Degree"].map(degrees)

## Replacing the null values (originally missing or unmapped) with "Others".
## Assigned back rather than fillna(..., inplace=True) on data[col]: the chained
## in-place form does not update `data` under pandas Copy-on-Write.
## The former follow-up fill with "high_school" was removed: no nulls remain
## after this line, so it could never change anything.
data["Degree"] = data["Degree"].fillna("Others")

In [15]:
## Rows whose "Working Professional or Student" value is exactly "Student"
is_student_row = data["Working Professional or Student"].eq("Student")
student_data = data.loc[is_student_row]
student_data.head()

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,Student,5.0,5.0,9.0,2.0,2.0,6.0,Healthy,bachelors,Yes,3.0,1.0,No,1
8,8,Aishwarya,Female,24.0,Bangalore,Student,Student,2.0,2.0,6.0,5.0,5.0,6.0,Moderate,bachelors,No,3.0,2.0,Yes,0
26,26,Aditya,Male,31.0,Srinagar,Student,Student,3.0,3.0,7.0,5.0,5.0,5.0,Healthy,bachelors,No,9.0,1.0,Yes,0
30,30,Prisha,Female,28.0,Varanasi,Student,Student,3.0,3.0,6.0,2.0,2.0,8.0,Moderate,bachelors,Yes,4.0,5.0,Yes,1
32,32,Chhavi,Female,25.0,Jaipur,Student,Student,4.0,4.0,8.0,3.0,3.0,6.0,Moderate,masters,Yes,1.0,1.0,No,0
