In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [15]:
# Load the remote-workers mental-health CSV into a DataFrame and preview
# its first five rows.
df = pd.read_csv(filepath_or_buffer="datasets/mental_health_remote_workers.csv")
print(df.head(n=5))

   Employee_ID    Name  Age             Gender    Country         Job_Role  \
0            1  User_1   28             Female     Canada      QA Engineer   
1            2  User_2   41  Prefer not to say  Australia  DevOps Engineer   
2            3  User_3   36               Male    Germany   UI/UX Designer   
3            4  User_4   32  Prefer not to say    Germany  Product Manager   
4            5  User_5   29               Male         UK  DevOps Engineer   

   Experience_Years Work_Mode  Hours_Worked_Per_Week  Productivity_Score  \
0               5.4    Onsite                   45.4                 6.6   
1              10.5    Remote                   40.4                 8.4   
2              11.9    Hybrid                   43.6                 6.6   
3              15.6    Hybrid                   41.3                 9.0   
4               1.8    Hybrid                   42.5                 6.9   

  Mental_Health_Status  Burnout_Score  Sleep_Hours_Per_Day Exercise_Freque

In [16]:
# For every column, report whether it contains at least one missing value.
print(df.isna().any(axis=0))

Employee_ID                         False
Name                                False
Age                                 False
Gender                              False
Country                             False
Job_Role                            False
Experience_Years                    False
Work_Mode                           False
Hours_Worked_Per_Week               False
Productivity_Score                  False
Mental_Health_Status                False
Burnout_Score                       False
Sleep_Hours_Per_Day                 False
Exercise_Frequency                  False
Work_Life_Balance_Rating            False
Has_Access_To_Therapist             False
Willing_To_Return_Onsite            False
Remote_Setup_Satisfaction           False
Internet_Issues_Frequency           False
Team_Communication_Effectiveness    False
dtype: bool


In [17]:
# Print the distinct values of each categorical column, one array per
# column, in the same order as the column names listed here.
for column in (
    'Country',
    'Job_Role',
    'Work_Mode',
    'Mental_Health_Status',
    'Exercise_Frequency',
    'Has_Access_To_Therapist',
    'Willing_To_Return_Onsite',
    'Internet_Issues_Frequency',
):
    print(df[column].unique())


['Canada' 'Australia' 'Germany' 'UK' 'Brazil' 'India' 'USA']
['QA Engineer' 'DevOps Engineer' 'UI/UX Designer' 'Product Manager'
 'Data Scientist' 'Developer']
['Onsite' 'Remote' 'Hybrid']
['Poor' 'Good' 'Moderate']
['Rarely' 'Daily' 'Never' '2-3x/Week']
[ True False]
[False  True]
['Sometimes' 'Often' 'Never']


In [18]:
"""
Gender
Country (drop)
Job_Role
Work_Mode
Mental_Health_Status
Exercise_Frequency
Has_Access_To_Therapist
Willing_To_Return_Onsite
Internet_Issues_Frequency
"""

# Integer code assigned to every category of each categorical column.
# The codes are kept exactly as originally chosen so downstream results do
# not change.
# NOTE(review): the Gender codes skip 2 (0, 1, 3, 4) — confirm whether a
#   category is missing or the gap is accidental.
# NOTE(review): the two boolean columns map True -> 0 and False -> 1, which is
#   the reverse of the usual 0/1 convention — confirm this is intended.
# NOTE(review): Mental_Health_Status, Exercise_Frequency and
#   Internet_Issues_Frequency are not coded in rank order (e.g. 'Good' = 1 but
#   'Moderate' = 2), and Country / Job_Role / Work_Mode have no natural order;
#   if these integers feed a linear model, consider an ordered or one-hot
#   encoding instead.
CATEGORY_CODES = {
    'Gender': {
        'Male': 0,
        'Female': 1,
        'Non-binary': 3,
        'Prefer not to say': 4,
    },
    'Country': {
        'Canada': 0,
        'Australia': 1,
        'Germany': 2,
        'UK': 3,
        'Brazil': 4,
        'India': 5,
        'USA': 6,
    },
    'Job_Role': {
        'QA Engineer': 0,
        'DevOps Engineer': 1,
        'UI/UX Designer': 2,
        'Product Manager': 3,
        'Data Scientist': 4,
        'Developer': 5,
    },
    'Work_Mode': {
        'Onsite': 0,
        'Remote': 1,
        'Hybrid': 2,
    },
    'Mental_Health_Status': {
        'Poor': 0,
        'Good': 1,
        'Moderate': 2,
    },
    'Exercise_Frequency': {
        'Rarely': 0,
        'Daily': 1,
        'Never': 2,
        '2-3x/Week': 3,
    },
    'Has_Access_To_Therapist': {True: 0, False: 1},
    'Willing_To_Return_Onsite': {True: 0, False: 1},
    'Internet_Issues_Frequency': {
        'Sometimes': 0,
        'Often': 1,
        'Never': 2,
    },
}


def encode_categoricals(frame, category_codes):
    """Return a copy of ``frame`` with categorical columns replaced by integer codes.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing every column named in ``category_codes``.
    category_codes : dict
        Maps a column name to a ``{category value: integer code}`` dict.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``frame`` itself is not modified.

    Raises
    ------
    ValueError
        If a column holds a value (including a missing value) that has no
        code. Without this check ``Series.map`` would silently turn such
        values into NaN.
    """
    encoded = frame.copy()
    for column, codes in category_codes.items():
        mapped = frame[column].map(codes)
        # Series.map yields NaN for every value that is not a key of the dict,
        # so a NaN here pinpoints an unmapped (or already missing) value.
        unmapped = frame.loc[mapped.isna(), column].unique()
        if len(unmapped) > 0:
            raise ValueError(
                f"Column {column!r} contains values with no integer code: "
                f"{list(unmapped)}. Extend CATEGORY_CODES, or reload the raw "
                f"data if this cell has already been run."
            )
        encoded[column] = mapped
    return encoded


# Encode on a copy and rebind only on success, so a validation failure leaves
# `df` exactly as it was.
df = encode_categoricals(df, CATEGORY_CODES)

In [19]:
# Preview the first five rows now that the categorical columns hold integer codes.
print(df.head(n=5))

   Employee_ID    Name  Age  Gender  Country  Job_Role  Experience_Years  \
0            1  User_1   28       1        0         0               5.4   
1            2  User_2   41       4        1         1              10.5   
2            3  User_3   36       0        2         2              11.9   
3            4  User_4   32       4        2         3              15.6   
4            5  User_5   29       0        3         1               1.8   

   Work_Mode  Hours_Worked_Per_Week  Productivity_Score  Mental_Health_Status  \
0          0                   45.4                 6.6                     0   
1          1                   40.4                 8.4                     1   
2          2                   43.6                 6.6                     1   
3          2                   41.3                 9.0                     2   
4          2                   42.5                 6.9                     0   

   Burnout_Score  Sleep_Hours_Per_Day  Exercise_Frequenc

In [21]:
# 'Name' only holds a per-row label (User_1, User_2, ...), so remove it.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
df = df.drop(columns=['Name'], errors='ignore')
print(df.head())

   Employee_ID  Age  Gender  Country  Job_Role  Experience_Years  Work_Mode  \
0            1   28       1        0         0               5.4          0   
1            2   41       4        1         1              10.5          1   
2            3   36       0        2         2              11.9          2   
3            4   32       4        2         3              15.6          2   
4            5   29       0        3         1               1.8          2   

   Hours_Worked_Per_Week  Productivity_Score  Mental_Health_Status  \
0                   45.4                 6.6                     0   
1                   40.4                 8.4                     1   
2                   43.6                 6.6                     1   
3                   41.3                 9.0                     2   
4                   42.5                 6.9                     0   

   Burnout_Score  Sleep_Hours_Per_Day  Exercise_Frequency  \
0             44                  5.7      