# Import Required Libraries
Import the necessary libraries.

In [31]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load Data
Load the train and test data from the data directory

In [32]:
data_dir = '../data/'
# Read both splits from the shared data directory in one pass.
train, test = (
    pd.read_csv(data_dir + csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Define Columns
Define the numerical, quantized, text, and categorical columns.

In [33]:
# Continuous features that are used as plain numbers
numerical_columns = [
    "Age",
    "Work/Study Hours",
]

# Continuous features that get binned into categories later on
quantized_columns = [
    "CGPA",
]

# Free-text / string-valued features
text_columns = [
    "Name",
    "City",
    "Profession",
    "Sleep Duration",
    "Degree",
    "Dietary Habits",
]

# Categorical features; the text columns are treated as categorical too,
# so they are appended at the end of this list.
categorical_columns = [
    "Gender",
    "Working Professional or Student",
    "Have you ever had suicidal thoughts ?",
    "Financial Stress",
    "Family History of Mental Illness",
    "Academic Pressure",
    "Work Pressure",
    "Study Satisfaction",
    "Job Satisfaction",
    *text_columns,
]

# Clean Data
Drop duplicate rows and fill missing (NaN) values.

In [34]:
# Preprocess independent copies so the raw `train` / `test` frames stay untouched
train_prep, test_prep = train.copy(), test.copy()

Remove duplicates

In [35]:
# De-duplicate the working copies: `train_prep` / `test_prep` are the frames
# used by every later step, so de-duplicating the raw `train` / `test` frames
# after the copies were taken would have no effect on the pipeline.
# NOTE: the 'id' column is still present at this point, so only rows that are
# identical in every column (including 'id') are removed.
train_prep = train_prep.drop_duplicates()
test_prep = test_prep.drop_duplicates()

Drop unnecessary columns

In [36]:
# Remove the 'id' column from both frames; errors="ignore" makes this a no-op
# for a frame that does not contain the column.
train_prep = train_prep.drop(columns="id", errors="ignore")
test_prep = test_prep.drop(columns="id", errors="ignore")

Ensure that either Job Satisfaction or Study Satisfaction is filled.

In [37]:
# A missing satisfaction score is replaced by 0 in both frames. Values that are
# already present are left untouched, so rows with one score keep it and get 0
# for the other; rows with neither score end up with 0 in both columns.
for frame in (train_prep, test_prep):
    for satisfaction_col in ('Job Satisfaction', 'Study Satisfaction'):
        frame[satisfaction_col] = frame[satisfaction_col].fillna(0)

Ensure that either Academic Pressure or Work Pressure is filled.

In [38]:
# A missing pressure score is replaced by 0 in both frames. Values that are
# already present are left untouched, so rows with one score keep it and get 0
# for the other; rows with neither score end up with 0 in both columns.
for frame in (train_prep, test_prep):
    for pressure_col in ('Academic Pressure', 'Work Pressure'):
        frame[pressure_col] = frame[pressure_col].fillna(0)

Fill missing values in categorical columns with the mode or 'missing' if the mode is not available.

In [39]:
for col in categorical_columns:
    # The fill value comes from the training data only and is reused for test:
    # the most frequent training value, or "missing" when no mode exists.
    train_modes = train_prep[col].mode()
    if train_modes.empty:
        fill_value = "missing"
    else:
        fill_value = train_modes[0]
    train_prep[col] = train_prep[col].fillna(fill_value)
    test_prep[col] = test_prep[col].fillna(fill_value)


Fill missing values in numerical columns with the median.

In [40]:
for col in numerical_columns + quantized_columns:
    # The training median is the fill value for both the train and test frame
    median_value = train_prep[col].median()
    for frame in (train_prep, test_prep):
        frame[col] = frame[col].fillna(median_value)

Check that there are no missing values.

In [41]:
# Abort the run if any NaN survived the cleaning steps above. The per-column
# NaN counts are printed first so the offending columns are easy to spot.
# `raise AssertionError(...)` is used instead of `assert False` because assert
# statements are removed when Python runs with -O, which would silently
# disable this check.
for split_name, frame in (("train", train_prep), ("test", test_prep)):
    if frame.isnull().values.any():
        print(f"Missing values in {split_name} data")
        print(frame.isnull().sum())
        raise AssertionError(f"Missing values in {split_name} data")

print("Cleaning completed successfully")

Cleaning completed successfully


# Modify Features
Create and modify features

[<span style="color:orange">Original Feature</span>] Age:

In [42]:
# Cast 'Age' to an integer dtype in both frames
for frame in (train_prep, test_prep):
    frame["Age"] = frame["Age"].astype(int)

[<span style="color:orange">Original Feature</span>] CGPA:  

**Description**: Quantize CGPA into bins.

In [43]:
# Quantize CGPA into three equal-width bins. The bin edges are derived from the
# training data only and then reused for the test data: binning each frame
# independently would let "Low" / "Medium" / "High" cover different CGPA ranges
# in train and test.
# (`duplicates="drop"` is not needed: equal-width edges from an integer bin
# count are distinct.)
cgpa_labels = ["Low", "Medium", "High"]
train_prep["CGPA"], cgpa_train_edges = pd.cut(
    train_prep["CGPA"],
    bins=3,
    labels=cgpa_labels,
    retbins=True,
)

# Keep the inner training edges but open the outer ones, so test values outside
# the training range still land in "Low" / "High" instead of becoming NaN.
cgpa_bin_edges = [float("-inf"), *cgpa_train_edges[1:-1], float("inf")]
test_prep["CGPA"] = pd.cut(
    test_prep["CGPA"],
    bins=cgpa_bin_edges,
    labels=cgpa_labels,
)

# Move CGPA from quantized_columns to categorical_columns. The membership
# checks keep the lists consistent if this bookkeeping is executed again
# (a second list.remove would raise, a second append would duplicate).
if "CGPA" in quantized_columns:
    quantized_columns.remove("CGPA")
if "CGPA" not in categorical_columns:
    categorical_columns.append("CGPA")

[<span style="color:blue">New Feature</span>] Pressure:  

**Description**: Addition of columns *'Academic Pressure'* and *'Work Pressure'*.  
**Assumption**: Either *'Academic Pressure'* or *'Work Pressure'* can be set.  
**Feature Range**: [0-5]


In [44]:
# 'Pressure' = academic + work pressure, limited to the range [0, 5]
for frame in (train_prep, test_prep):
    combined_pressure = frame["Academic Pressure"] + frame["Work Pressure"]
    frame["Pressure"] = combined_pressure.clip(0, 5)

# Register the new feature as numerical
numerical_columns.append("Pressure")

[<span style="color:blue">New Feature</span>] Satisfaction:  

**Description**: Addition of columns *'Study Satisfaction'* and *'Job Satisfaction'*.  
**Assumption**: Either *'Study Satisfaction'* or *'Job Satisfaction'* can be set.  
**Feature Range**: [0-5]


In [45]:
# 'Satisfaction' = study + job satisfaction, limited to the range [0, 5]
for frame in (train_prep, test_prep):
    combined_satisfaction = frame["Study Satisfaction"] + frame["Job Satisfaction"]
    frame["Satisfaction"] = combined_satisfaction.clip(0, 5)

# Register the new feature as numerical
numerical_columns.append("Satisfaction")

[<span style="color:blue">New Feature</span>] Pressure_Satisfaction:  

**Description**: Division of columns *'Pressure'* and *'Satisfaction'*.  
**Assumption**: Having high pressure and high satisfaction should reduce overall 'stress'. However, having high pressure and low satisfaction is a bad combination.


In [46]:
# 'Pressure_Satisfaction' = Pressure / Satisfaction.
# 'Satisfaction' is 0 for rows where neither satisfaction score was provided
# (missing scores are filled with 0 during cleaning), and a plain division
# would then produce inf (x / 0) or NaN (0 / 0) after the missing-value check
# has already run. The denominator is therefore floored at 1, which treats
# "no score" like the lowest score and leaves every non-zero score unchanged.
for frame in (train_prep, test_prep):
    safe_satisfaction = frame["Satisfaction"].clip(lower=1)
    frame["Pressure_Satisfaction"] = frame["Pressure"] / safe_satisfaction

# Add new feature 'Pressure_Satisfaction' to numerical columns (only once, so
# re-executing the cell does not duplicate the entry)
if "Pressure_Satisfaction" not in numerical_columns:
    numerical_columns.append("Pressure_Satisfaction")

[<span style="color:blue">New Feature</span>] WorkHours_Pressure:  
[<span style="color:red">ToDo</span>] Need to find a good way to combine the features. And what is our assumption?

**Description**: Multiplication of columns *'Work/Study Hours'* and *'Pressure'*.  
**Assumption**: ?


In [47]:
# 'WorkHours_Pressure' = Work/Study Hours * Pressure
for frame in (train_prep, test_prep):
    frame["WorkHours_Pressure"] = frame["Work/Study Hours"] * frame["Pressure"]

# Register the new feature as numerical
numerical_columns.append("WorkHours_Pressure")

[<span style="color:blue">New Feature</span>] Diet_Sleep:  
[<span style="color:red">ToDo</span>] Need to find a good way to combine the features.  

**Description**: Concatenation of columns *'Dietary Habits'* and *'Sleep Duration'*.  
**Assumption**: Sleep and diet are essential parameters of a healthy lifestyle.


In [48]:
# 'Diet_Sleep' = "<Dietary Habits>_<Sleep Duration>" as a single string label
for frame in (train_prep, test_prep):
    diet_text = frame["Dietary Habits"].astype(str)
    sleep_text = frame["Sleep Duration"].astype(str)
    frame["Diet_Sleep"] = diet_text + "_" + sleep_text

# Register the new feature as categorical
categorical_columns.append("Diet_Sleep")

[<span style="color:blue">New Feature</span>] FinancialStress_WorkHours:  
[<span style="color:red">ToDo</span>] Need to find a good way to combine the features. And what is our assumption?

**Description**: Multiplication of columns *'Financial Stress'* and *'Work/Study Hours'*.  
**Assumption**: ?


In [49]:
# 'FinancialStress_WorkHours' = Financial Stress (cast to int) * Work/Study Hours
for frame in (train_prep, test_prep):
    financial_stress = frame["Financial Stress"].astype(int)
    frame["FinancialStress_WorkHours"] = financial_stress * frame["Work/Study Hours"]

# Register the new feature as numerical
numerical_columns.append("FinancialStress_WorkHours")

[<span style="color:blue">New Feature</span>] WorkHours_Satisfaction:  
[<span style="color:red">ToDo</span>] Is this assumption correct?

**Description**: Division of columns *'Work/Study Hours'* and *'Satisfaction'*.  
**Assumption**: Working long with high satisfaction should be less problematic than working long with low satisfaction.


In [50]:
# 'WorkHours_Satisfaction' = Work/Study Hours / Satisfaction.
# 'Satisfaction' is 0 for rows where neither satisfaction score was provided
# (missing scores are filled with 0 during cleaning), and a plain division
# would then produce inf (x / 0) or NaN (0 / 0) after the missing-value check
# has already run. The denominator is therefore floored at 1, which treats
# "no score" like the lowest score and leaves every non-zero score unchanged.
for frame in (train_prep, test_prep):
    safe_satisfaction = frame["Satisfaction"].clip(lower=1)
    frame["WorkHours_Satisfaction"] = frame["Work/Study Hours"] / safe_satisfaction

# Add new feature 'WorkHours_Satisfaction' to numerical columns (only once, so
# re-executing the cell does not duplicate the entry)
if "WorkHours_Satisfaction" not in numerical_columns:
    numerical_columns.append("WorkHours_Satisfaction")

# Post Process
Encode columns and ensure correct datatypes

Encode categorical features using label encoder.

In [51]:
for col in categorical_columns:
    # Fit one encoder per column on train and test together, so every label
    # that occurs in either frame receives an integer code.
    encoder = LabelEncoder()
    encoder.fit(pd.concat([train_prep[col], test_prep[col]], axis=0))
    for frame in (train_prep, test_prep):
        # Replace labels by their integer codes, then mark the column as categorical
        frame[col] = encoder.transform(frame[col])
        frame[col] = frame[col].astype('category')

# Save data
Save processed data to a file.

In [53]:
# Pickle the processed frames and the final column lists into the data directory
artifacts = {
    'train_prep.pkl': train_prep,
    'test_prep.pkl': test_prep,
    'categorical_columns.pkl': categorical_columns,
    'numerical_columns.pkl': numerical_columns,
}
for file_name, artifact in artifacts.items():
    pd.to_pickle(artifact, data_dir + file_name)


print("Preprocessing completed successfully")

Preprocessing completed successfully
