# WellSAP

## Personal WellBeing Training

### Imports

In [2]:
import joblib
import numpy as np
import pandas as pd
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

### Data Loading

In [3]:
# Load the three source tables. Each CSV has its header on the first row and
# is indexed by its Employee_ID column, which the later concat cells align on.
hr_data, attrition_data, lifestyle_data = (
    pd.read_csv(csv_path, header=0, index_col='Employee_ID')
    for csv_path in (
        './../dataset/hr_data.csv',
        './../dataset/attrition_data.csv',
        './../dataset/lifestyle_data.csv',
    )
)

### Data Pre-Processing

#### Lifestyle Data Pre-Processing

In [4]:
# Split the lifestyle columns into three groups; each group is aggregated
# per employee with a different rule in the next cell.
input_features_classes = lifestyle_data[
    ['FRUITS_VEGGIES', 'DAILY_STRESS', 'BMI_RANGE', 'DONATION']
]
input_features_int_int = lifestyle_data[
    ['PLACES_VISITED', 'CORE_CIRCLE', 'SUPPORTING_OTHERS', 'SOCIAL_NETWORK']
]
input_features_int_float = lifestyle_data[
    ['SLEEP_HOURS', 'WEEKLY_MEDITATION', 'DAILY_SHOUTING']
]

In [5]:
# Collapse the rows of each group to one row per Employee_ID.

# "classes" columns: keep the largest value seen for the employee, as int.
input_features_classes = (
    input_features_classes
    .groupby('Employee_ID')
    .max()
    .astype(int)
)

# "int_int" columns: average per employee, then round up to a whole number.
input_features_int_int = (
    input_features_int_int
    .groupby('Employee_ID')
    .mean()
    .pipe(np.ceil)
    .astype(int)
)

# "int_float" columns: plain per-employee average, left as float.
input_features_int_float = (
    input_features_int_float
    .groupby('Employee_ID')
    .mean()
)

In [6]:
# Place the three aggregated groups side by side; rows are aligned on the
# shared Employee_ID index.
input_features = pd.concat(
    [
        input_features_classes,
        input_features_int_int,
        input_features_int_float,
    ],
    axis='columns',
)
input_features

Unnamed: 0_level_0,FRUITS_VEGGIES,DAILY_STRESS,BMI_RANGE,DONATION,PLACES_VISITED,CORE_CIRCLE,SUPPORTING_OTHERS,SOCIAL_NETWORK,SLEEP_HOURS,WEEKLY_MEDITATION,DAILY_SHOUTING
Employee_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1.0,5,5,2,5,6,7,5,7,7.00,7.28,2.08
2.0,5,5,2,5,5,5,5,7,6.76,6.52,3.88
3.0,5,5,2,5,6,6,6,8,6.92,5.52,3.16
4.0,5,5,2,5,7,6,6,7,7.52,6.20,2.00
5.0,5,5,2,5,5,6,5,6,7.64,5.92,2.92
...,...,...,...,...,...,...,...,...,...,...,...
307.0,5,4,2,5,6,6,5,6,7.12,6.00,1.76
308.0,5,5,2,5,5,6,7,7,6.92,5.80,3.08
309.0,5,5,2,5,5,7,8,7,6.80,4.68,3.08
310.0,5,5,2,5,5,7,7,7,7.08,6.12,3.52


#### HR and Attrition Data Pre-Processing

In [7]:
# One LabelEncoder per column so each keeps its own label <-> code mapping.
gender_encoder = preprocessing.LabelEncoder()
age_encoder = preprocessing.LabelEncoder()

# The encoded integer codes overwrite the original columns in place, so after
# this cell the raw labels are only recoverable through the fitted encoders.
attrition_data['Gender'] = gender_encoder.fit_transform(attrition_data['Gender'])
hr_data['Age_Group'] = age_encoder.fit_transform(hr_data['Age_Group'])

In [8]:
# Extend the lifestyle features with four HR / attrition columns, aligned on
# the Employee_ID index.
input_features = pd.concat(
    [
        input_features,
        hr_data['MaritalStatusID'],
        hr_data['Age_Group'],
        attrition_data["Gender"],
        attrition_data['RelationshipSatisfaction'].astype(int),
    ],
    axis='columns',
)
input_features

Unnamed: 0_level_0,FRUITS_VEGGIES,DAILY_STRESS,BMI_RANGE,DONATION,PLACES_VISITED,CORE_CIRCLE,SUPPORTING_OTHERS,SOCIAL_NETWORK,SLEEP_HOURS,WEEKLY_MEDITATION,DAILY_SHOUTING,MaritalStatusID,Age_Group,Gender,RelationshipSatisfaction
Employee_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1.0,5,5,2,5,6,7,5,7,7.00,7.28,2.08,0,0,1,2
2.0,5,5,2,5,5,5,5,7,6.76,6.52,3.88,1,1,1,1
3.0,5,5,2,5,6,6,6,8,6.92,5.52,3.16,1,0,0,1
4.0,5,5,2,5,7,6,6,7,7.52,6.20,2.00,1,0,0,3
5.0,5,5,2,5,5,6,5,6,7.64,5.92,2.92,2,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307.0,5,4,2,5,6,6,5,6,7.12,6.00,1.76,0,0,1,2
308.0,5,5,2,5,5,6,7,7,6.92,5.80,3.08,0,0,0,3
309.0,5,5,2,5,5,7,8,7,6.80,4.68,3.08,0,1,0,2
310.0,5,5,2,5,5,7,7,7,7.08,6.12,3.52,0,1,0,1


In [30]:
# Columns stored as pandas 'category' dtype rather than plain integers.
categorical_columns = [
    'BMI_RANGE',
    'DONATION',
    'MaritalStatusID',
    'Age_Group',
    'Gender',
    'RelationshipSatisfaction',
]
input_features[categorical_columns] = input_features[categorical_columns].astype('category')

In [34]:
# Print index range, per-column non-null counts and dtypes as a sanity check
# on the assembled feature table.
input_features.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 311 entries, 1.0 to 311.0
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   FRUITS_VEGGIES            311 non-null    int64   
 1   DAILY_STRESS              311 non-null    int64   
 2   BMI_RANGE                 311 non-null    category
 3   DONATION                  311 non-null    category
 4   PLACES_VISITED            311 non-null    int64   
 5   CORE_CIRCLE               311 non-null    int64   
 6   SUPPORTING_OTHERS         311 non-null    int64   
 7   SOCIAL_NETWORK            311 non-null    int64   
 8   SLEEP_HOURS               311 non-null    float64 
 9   WEEKLY_MEDITATION         311 non-null    float64 
 10  DAILY_SHOUTING            311 non-null    float64 
 11  MaritalStatusID           311 non-null    category
 12  Age_Group                 311 non-null    category
 13  Gender                    311 non-null    ca

#### Output Data Pre-Processing

In [9]:
# Target: per-employee mean of WORK_LIFE_BALANCE_SCORE.
output_features = (
    lifestyle_data['WORK_LIFE_BALANCE_SCORE']
    .groupby('Employee_ID')
    .mean()
)

# Standardise to zero mean / unit standard deviation ...
score_mean = output_features.mean()
score_std = output_features.std()
output_features = (output_features - score_mean) / score_std

# ... then rescale and shift so the target has Mean = 2 and Std = 1.
output_features = output_features * 1 + 2

output_features

Employee_ID
1.0      2.694225
2.0      1.455491
3.0      2.154704
4.0      3.248040
5.0      0.017450
           ...   
307.0    3.183919
308.0    2.075879
309.0    2.685648
310.0    2.309086
311.0    3.328499
Name: WORK_LIFE_BALANCE_SCORE, Length: 311, dtype: float64

### Modelling

In [31]:
# Hold out 20% of the employees for evaluation; the fixed random_state makes
# the split reproducible across runs. `train_test_split` is imported in the
# notebook's import cell.
# NOTE: `ouput_train` (sic) is kept as-is because the model-fitting cell reads
# that exact name.
input_train, input_test, ouput_train, output_test = train_test_split(
    input_features,
    output_features,
    test_size=0.2,
    random_state=42,
)

In [32]:
# Fit a linear-kernel support vector regressor on the training split.
# `SVR` is imported in the notebook's import cell.
svm_regression = SVR(kernel='linear')
svm_regression.fit(input_train, ouput_train)

# Report the estimator's `.score` on both splits to compare fit vs. held-out.
train_score = svm_regression.score(input_train, ouput_train)
test_score = svm_regression.score(input_test, output_test)
print(f"Train Score: {train_score}")
print(f"Test Score: {test_score}")

# Persist the fitted model; joblib.dump returns the list of written file
# names, which is shown as the cell output.
joblib.dump(svm_regression, './../models/personal_wellbeing_model.pkl')

Train Score: 0.7034848634093418
Test Score: 0.7173867291316389


['./../models/personal_wellbeing_model.pkl']

In [39]:
# Print the fitted coefficient next to the feature it belongs to.
# coef_ is indexed as coefs[0][i] here, so its first row holds one weight
# per input column, in column order.
columns = input_features.columns
coefs = svm_regression.coef_
for feature_name, weight in zip(columns, coefs[0]):
    print(f"{feature_name} Weight: {weight}\n")

FRUITS_VEGGIES Weight: -0.04322037840087489

DAILY_STRESS Weight: 0.137676203359284

BMI_RANGE Weight: 7.993605777301127e-15

DONATION Weight: 0.21732261820179843

PLACES_VISITED Weight: 0.234853521969832

CORE_CIRCLE Weight: 0.35395559110393826

SUPPORTING_OTHERS Weight: 0.3118908904445199

SOCIAL_NETWORK Weight: 0.3375197013742408

SLEEP_HOURS Weight: 0.25394648813008214

WEEKLY_MEDITATION Weight: 0.45596783543039976

DAILY_SHOUTING Weight: -0.3886729318712807

MaritalStatusID Weight: -0.027513756892920682

Age_Group Weight: 0.2949303355519741

Gender Weight: -0.09364033560906249

RelationshipSatisfaction Weight: 0.010970604557988928

