## Creating a logistic regression to predict absenteeism

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf

### Load the data

In [3]:
# Load the preprocessed absenteeism data; the path is relative to the notebook's working directory.
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [4]:
# Preview the first rows to sanity-check the columns that were loaded.
data_preprocessed.head()

Unnamed: 0,# Day of week,Month,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,1.0,7.0,0.0,0.0,0.0,1.0,1.0,36.0,33.0,239,30.0,1.0,2.0,1.0,4.0
1,1.0,7.0,0.0,0.0,0.0,0.0,1.0,13.0,50.0,239,31.0,1.0,1.0,0.0,0.0
2,2.0,7.0,0.0,0.0,0.0,1.0,1.0,51.0,38.0,239,31.0,1.0,0.0,0.0,2.0
3,3.0,7.0,1.0,0.0,0.0,0.0,1.0,5.0,39.0,239,24.0,1.0,2.0,0.0,4.0
4,3.0,7.0,0.0,0.0,0.0,1.0,1.0,36.0,33.0,239,30.0,1.0,2.0,1.0,2.0


In [5]:
# Binary target: 1 when a row's absence hours are strictly above the median, otherwise 0.
absence_hours = data_preprocessed['Absenteeism Time in Hours']
targets = np.where(absence_hours > absence_hours.median(), 1, 0)

In [7]:
# Attach the binary target as a new column. This mutates data_preprocessed in place,
# so the frame previewed in the earlier cell no longer reflects its current columns.
data_preprocessed['Excessive Absenteeism'] = targets
data_preprocessed.head()

Unnamed: 0,# Day of week,Month,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,1.0,7.0,0.0,0.0,0.0,1.0,1.0,36.0,33.0,239,30.0,1.0,2.0,1.0,4.0,1
1,1.0,7.0,0.0,0.0,0.0,0.0,1.0,13.0,50.0,239,31.0,1.0,1.0,0.0,0.0,0
2,2.0,7.0,0.0,0.0,0.0,1.0,1.0,51.0,38.0,239,31.0,1.0,0.0,0.0,2.0,0
3,3.0,7.0,1.0,0.0,0.0,0.0,1.0,5.0,39.0,239,24.0,1.0,2.0,0.0,4.0,1
4,3.0,7.0,0.0,0.0,0.0,1.0,1.0,36.0,33.0,239,30.0,1.0,2.0,1.0,2.0,0


### A comment on the targets

In [8]:
# Share of rows labelled 1 (the mean of a 0/1 array is the fraction of ones).
targets.mean()

0.45571428571428574

In [13]:
# Remove the raw hours column so that only the binary target derived from it remains.
data_with_targets = data_preprocessed.drop('Absenteeism Time in Hours', axis='columns')

In [14]:
# Identity check: expected to be False, i.e. drop() gave back a separate DataFrame object
# instead of data_preprocessed itself.
data_with_targets is data_preprocessed

False

In [15]:
# Preview: the raw hours column has been dropped and the binary target column is kept.
data_with_targets.head()

Unnamed: 0,# Day of week,Month,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,1.0,7.0,0.0,0.0,0.0,1.0,1.0,36.0,33.0,239,30.0,1.0,2.0,1.0,1
1,1.0,7.0,0.0,0.0,0.0,0.0,1.0,13.0,50.0,239,31.0,1.0,1.0,0.0,0
2,2.0,7.0,0.0,0.0,0.0,1.0,1.0,51.0,38.0,239,31.0,1.0,0.0,0.0,0
3,3.0,7.0,1.0,0.0,0.0,0.0,1.0,5.0,39.0,239,24.0,1.0,2.0,0.0,1
4,3.0,7.0,0.0,0.0,0.0,1.0,1.0,36.0,33.0,239,30.0,1.0,2.0,1.0,0


### Select the inputs for the regression

In [16]:
# (rows, columns); the column count still includes the target column.
data_with_targets.shape

(700, 15)

In [21]:
# Inputs are every column except the target. Dropping the target by name (rather than
# slicing off the last column by position) keeps the selection correct even if the column
# order changes or further columns are appended to data_with_targets.
unscaled_inputs = data_with_targets.drop(['Excessive Absenteeism'], axis=1)

In [22]:
from sklearn import preprocessing

In [23]:
# Disabled earlier approach, kept for reference: a one-off call to preprocessing.scale.
# The StandardScaler object fitted further down is used instead.
#scaled_data = preprocessing.scale(unscaled_inputs)

In [29]:
# Disabled: peek at the first scaled row (scaled_data is only defined by the disabled cell above).
#scaled_data[0]

In [27]:
# Build the standardizer and learn per-column mean / standard deviation in one step;
# fit() returns the scaler itself, so it can be bound directly.
# NOTE(review): the scaler is fitted on every row of unscaled_inputs, before the
# train/test split performed later in the notebook, so test-set statistics influence
# the scaling. Consider fitting on the training rows only.
absenteeism_scaler = preprocessing.StandardScaler().fit(unscaled_inputs)
absenteeism_scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [30]:
# Apply the fitted scaler to the same inputs it was fitted on.
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

### Split the data into train & test and shuffle

#### Import the relevant module

In [31]:
from sklearn.model_selection import train_test_split

#### Split

In [32]:
# Use 80% of the rows for training and the remainder for testing; shuffling with a
# fixed seed keeps the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    shuffle=True,
    random_state=20,
)

### Logistic regression with sklearn

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

#### Training the model

In [37]:
# Logistic regression classifier with scikit-learn's default hyperparameters
# (the defaults can differ between scikit-learn versions).
reg = LogisticRegression()

In [38]:
# Fit on the training split and report accuracy on that same split.
# This is training accuracy, not an estimate on held-out data.
reg.fit(x_train, y_train).score(x_train, y_train)



0.7589285714285714

#### Manually check the accuracy

In [39]:
# Predicted class labels for the training rows, used below to recompute the accuracy by hand.
model_outputs = reg.predict(x_train)

In [44]:
# Accuracy by hand: the fraction of training predictions that equal the true labels.
(model_outputs == y_train).mean()

0.7589285714285714

#### Finding the intercept and coefficients

In [45]:
# Fitted intercept (bias) term of the model.
reg.intercept_

array([-0.22981225])

In [46]:
# Fitted weights as a 2-D array with a single row and one entry per input column.
reg.coef_

array([[-0.10554772,  0.24286443,  1.95993977,  0.30230518,  1.41758676,
         1.17459527,  0.        ,  0.07627728, -0.28807026, -0.00722667,
         0.32805566, -0.10653755,  0.572942  , -0.09295354]])

In [47]:
# Names of the input columns, in the order the model's inputs were built from.
unscaled_inputs.columns.values

array(['# Day of week', 'Month', 'Reason_1', 'Reason_2', 'Reason_3',
       'Reason_4', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [48]:
# Keep the input column names so the coefficients can be labelled in the summary table.
feature_name = unscaled_inputs.columns.values

In [49]:
# One row per input feature, pairing its name with the fitted coefficient.
# reg.coef_ has a single row, so its transpose lines up one weight per feature row.
# The intercept is not part of this table.
summary_table = pd.DataFrame(data=feature_name, columns=['Feature name'])
summary_table['Coefficient'] = reg.coef_.T
summary_table

Unnamed: 0,Feature name,Coefficient
0,# Day of week,-0.105548
1,Month,0.242864
2,Reason_1,1.95994
3,Reason_2,0.302305
4,Reason_3,1.417587
5,Reason_4,1.174595
6,Transportation Expense,0.0
7,Distance to Work,0.076277
8,Age,-0.28807
9,Daily Work Load Average,-0.007227


## Save the model

In [50]:
import pickle

In [52]:
# Persist the fitted classifier to a file named 'model' in the working directory.
with open('model', 'wb') as model_file:
    pickle.dump(reg, model_file)

In [53]:
# Persist the fitted scaler alongside the model so new data can be standardized the same way.
with open('scaler', 'wb') as scaler_file:
    pickle.dump(absenteeism_scaler, scaler_file)