In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
preprocessed_df= pd.read_csv("preprocessed_absenteeism_data.csv")
preprocessed_df

Unnamed: 0,group 1,group 2,group 3,group 4,Month,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,8
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,3
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,8
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0,2


In [3]:
preprocessed_df['Absenteeism Time in Hours'].median()

3.0

In [4]:
# The median of the absenteeism time in hours (Target Variable) column is 3, which implies that people with less than or 
# equal to 3 hours of absence are moderately absent and the people who are absent for more than 3 hours are excessively absent
# classifiying 0 as moderately absent and 1 as excessively absent

target= np.where(preprocessed_df['Absenteeism Time in Hours']>preprocessed_df['Absenteeism Time in Hours'].median(),1,0)
#target

In [5]:
preprocessed_df['Absenteeism']= target
preprocessed_df=preprocessed_df.drop(['Absenteeism Time in Hours'],axis=1)
preprocessed_df

Unnamed: 0,group 1,group 2,group 3,group 4,Month,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,1
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,0
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,1
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0,0


In [6]:
# Inputs

inputs=preprocessed_df.iloc[:,0:14]
inputs

Unnamed: 0,group 1,group 2,group 3,group 4,Month,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0


In [7]:
inputs.columns.values

array(['group 1', 'group 2', 'group 3', 'group 4', 'Month',
       'Day of the week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [8]:
#Standardizing the inputs
# excluding dummies from scaling 
scaler=StandardScaler()
scaler.fit(inputs[['Month','Day of the week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index','Children', 'Pets']])


In [9]:
scaled_inputs= scaler.transform(inputs[['Month','Day of the week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index','Children', 'Pets']])
scaled_inputs

array([[ 0.03079619, -0.80094984,  1.00584437, ...,  0.76743118,
         0.88046927,  0.26848661],
       [ 0.03079619, -0.80094984, -1.57468098, ...,  1.00263338,
        -0.01928035, -0.58968976],
       [ 0.03079619, -0.23290031, -0.6541427 , ...,  1.00263338,
        -0.91902997, -0.58968976],
       ...,
       [-0.56801869,  0.33514923,  1.62456682, ..., -0.40857982,
        -0.91902997, -0.58968976],
       [-0.56801869,  0.33514923,  0.19094163, ..., -0.40857982,
        -0.91902997, -0.58968976],
       [-0.56801869,  0.33514923,  1.03602595, ..., -0.40857982,
        -0.01928035,  0.26848661]])

In [10]:
unscaled_dummies= inputs[['group 1', 'group 2', 'group 3', 'group 4','Education']].to_numpy()
unscaled_dummies.shape

final_input= np.concatenate((scaled_inputs, unscaled_dummies), axis = 1)
final_input

array([[ 0.03079619, -0.80094984,  1.00584437, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.03079619, -0.80094984, -1.57468098, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.03079619, -0.23290031, -0.6541427 , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.56801869,  0.33514923,  1.62456682, ...,  0.        ,
         0.        ,  1.        ],
       [-0.56801869,  0.33514923,  0.19094163, ...,  0.        ,
         1.        ,  1.        ],
       [-0.56801869,  0.33514923,  1.03602595, ...,  0.        ,
         1.        ,  0.        ]])

In [11]:
# Splitting the data for training and testing 

train_test_split(final_input,target)

[array([[-1.46624102, -0.80094984,  0.04003371, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.62961108,  0.90319876, -1.01632169, ...,  0.        ,
          1.        ,  0.        ],
        [-0.86742613, -1.36899938,  0.04003371, ...,  1.        ,
          0.        ,  0.        ],
        ...,
        [-1.16683357, -0.23290031,  1.00584437, ...,  0.        ,
          1.        ,  0.        ],
        [ 0.33020364,  0.90319876,  0.56821142, ...,  0.        ,
          1.        ,  0.        ],
        [ 0.92901852,  1.4712483 ,  0.68893775, ...,  0.        ,
          0.        ,  1.        ]]),
 array([[-1.76564846,  0.33514923,  0.19094163, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.03079619,  0.90319876,  0.35694034, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.33020364, -0.80094984, -1.57468098, ...,  0.        ,
          1.        ,  0.        ],
        ...,
        [-1.46624102, -0.23290031, -0.6541427 , ...,  

In [12]:
x_train,x_test,y_train,y_test= train_test_split(final_input,target,train_size=0.75,random_state=25)

In [13]:
# Verifying the shapes of the train and test data

print(x_train.shape,y_train.shape,x_test.shape,y_test.shape)

(525, 14) (525,) (175, 14) (175,)


In [14]:
# Traning the model

lg= LogisticRegression()

In [15]:
lg.fit(x_train,y_train)

In [16]:
lg.score(x_train,y_train)

0.7714285714285715

In [17]:
input_copy= inputs.copy()
column_reorder= [ 'Month','Day of the week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index','Children', 'Pets','group 1', 'group 2', 'group 3', 'group 4','Education']
input_copy= input_copy[column_reorder]
input_copy

Unnamed: 0,Month,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Children,Pets,group 1,group 2,group 3,group 4,Education
0,7,1,289,36,33,239.554,30,2,1,0,0,0,1,0
1,7,1,118,13,50,239.554,31,1,0,0,0,0,0,0
2,7,2,179,51,38,239.554,31,0,0,0,0,0,1,0
3,7,3,279,5,39,239.554,24,2,0,1,0,0,0,0
4,7,3,289,36,33,239.554,30,2,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,5,2,179,22,40,237.656,22,2,0,1,0,0,0,1
696,5,2,225,26,28,237.656,24,1,2,1,0,0,0,0
697,5,3,330,16,28,237.656,25,0,0,1,0,0,0,1
698,5,3,235,16,32,237.656,25,0,0,0,0,0,1,1


In [18]:
# Coeffcient table for better understanding how important each feature is

coefficient_table= pd.DataFrame(columns=['Feature'],data= input_copy.columns.values)
coefficient_table['Coefficient']= np.transpose(lg.coef_)
coefficient_table

Unnamed: 0,Feature,Coefficient
0,Month,-0.050551
1,Day of the week,-0.111032
2,Transportation Expense,0.680034
3,Distance to Work,-0.192622
4,Age,-0.318343
5,Daily Work Load Average,-0.100167
6,Body Mass Index,0.331501
7,Children,0.426544
8,Pets,-0.414447
9,group 1,2.854158


In [19]:
# Testing the model


lg.score(x_test,y_test)

0.6971428571428572

In [20]:
predicted_probability= lg.predict_proba(x_test)
#predicted_probability


In [21]:
predicted_values=lg.predict(x_test)
predicted_values.shape

(175,)

In [22]:
prediction_table= pd.DataFrame(x_test,columns=['Month','Day of the week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index','Children', 'Pets','group 1', 'group 2', 'group 3', 'group 4','Education'])

In [25]:
prediction_table['Predicted Values']= predicted_values
prediction_table['Probability']= predicted_probability[:,[1]]
prediction_table

Unnamed: 0,Month,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Children,Pets,group 1,group 2,group 3,group 4,Education,Predicted Values,Probability
0,0.929019,0.335149,-0.654143,-0.263140,-1.006686,-0.643304,-1.819793,-0.919030,-0.589690,0.0,0.0,0.0,1.0,1.0,0,0.124755
1,1.527833,0.903199,0.568211,1.359154,-0.065439,0.769711,-0.878984,2.679969,-0.589690,0.0,0.0,0.0,1.0,0.0,1,0.530249
2,1.228426,0.335149,0.387122,-0.330735,1.660180,-0.082083,1.237836,0.880469,0.268487,1.0,0.0,0.0,0.0,0.0,1,0.823966
3,0.030796,0.903199,-1.574681,-1.344669,0.091435,-1.037971,0.297027,-0.919030,-0.589690,0.0,0.0,0.0,1.0,0.0,0,0.147574
4,-1.765648,0.903199,-0.654143,1.426749,0.248310,1.043433,1.002633,-0.919030,-0.589690,0.0,0.0,0.0,1.0,0.0,0,0.168639
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,-1.466241,-0.800950,-0.578689,0.818389,-1.477309,0.769711,-1.349389,-0.919030,-0.589690,0.0,0.0,0.0,1.0,0.0,0,0.189130
171,0.030796,0.335149,-0.654143,-0.263140,-1.006686,0.087771,-1.819793,-0.919030,-0.589690,0.0,0.0,1.0,0.0,1.0,1,0.540493
172,0.330204,2.039298,0.387122,-0.330735,1.660180,-0.154696,1.237836,0.880469,0.268487,0.0,0.0,0.0,1.0,0.0,0,0.349702
173,-1.166834,-0.232900,0.085306,-1.074287,3.385799,-1.240355,-1.114186,0.880469,0.268487,0.0,0.0,0.0,1.0,0.0,0,0.171548


In [None]:
prediction_table.to_csv('preproce_absenteeism_data.csv', index=False)

### if the probability is below 0.5 it is considered as 0 and if the probability is above 0.5 it is considered as 1. The predicted value 0 implies that an employee is moderatley absent and the probability of him/her being absent is less than 50%. The predicted value 1 implies that an employee is excessively absent and the probability of him/her being absent is more than 50%.