# Lesson 42b: Machine Learning - Business Case Example - Absenteeism

## Create a logistic regression

### Import libraries

In [1]:
import pandas as pd
import numpy as np

### Load data

In [4]:
# Read the preprocessed absenteeism dataset from the working directory and preview its first rows.
data_preprocessed = pd.read_csv(filepath_or_buffer="Absenteeism_preprocessed.csv")
data_preprocessed.head(n=5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


## Create the targets

In [5]:
# We use logistic regression because this is a classification problem; it is well suited to a target with
# two classes. We will distinguish normal absenteeism (at or below the median) from excessive absenteeism (above the median).

In [8]:
# Median of the hours-absent column; the next cell uses it as the threshold between the two classes.
data_preprocessed["Absenteeism Time in Hours"].median()

3.0

In [10]:
# Binary target: 1 where the hours absent are strictly above the column median, 0 otherwise
# (rows equal to the median therefore fall into class 0).
absence_hours = data_preprocessed["Absenteeism Time in Hours"]
targets = np.where(absence_hours > absence_hours.median(), 1, 0)

In [11]:
# Attach the binary target to the frame as a new column (this modifies `data_preprocessed` in place)
# and preview the result.
data_preprocessed["Excessive Absenteeism"] = targets
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


In [14]:
# Splitting at the median should give roughly balanced classes.
# Sanity check: the share of rows labelled 1, i.e. the mean of the 0/1 target array.
targets.mean()

0.45571428571428574

In [19]:
# Remove the raw hours column now that it is encoded in the binary target.
# `drop` returns a new frame here, so the identity check on the last line displays False.
data_with_targets = data_preprocessed.drop(columns=["Absenteeism Time in Hours"])
data_with_targets is data_preprocessed

False

## Select inputs

In [21]:
# (rows, columns) after dropping the hours column; the target column is still included.
data_with_targets.shape

(700, 15)

In [23]:
# Feature matrix: every column except the target.
# The target is dropped by name rather than by position (`iloc[:, :-1]`), so the
# selection stays correct even if the column order of `data_with_targets` changes.
unscaled_inputs = data_with_targets.drop(columns=["Excessive Absenteeism"])

## Standardize the data

In [28]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column scaling statistics and apply them in a single call.
# NOTE(review): the scaler is fitted on all rows before the train/test split further
# down, so test-set rows influence the scaling statistics — confirm this is intended.
absenteeism_scaler = StandardScaler()
scaled_inputs = absenteeism_scaler.fit_transform(unscaled_inputs)

scaled_inputs.shape

(700, 14)

## Split the data into train and test, and shuffle

In [29]:
from sklearn.model_selection import train_test_split

In [41]:
# Keep 80% of the rows for training and the rest for testing; the fixed seed makes
# the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    random_state=20,
)

In [42]:
# Show the shape of each split on a single line.
print(*(split.shape for split in (x_train, x_test, y_train, y_test)))

(560, 14) (140, 14) (560,) (140,)


## Logistic regression

In [43]:
# We use sklearn here rather than statsmodels because sklearn is more stable for complex data.

In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

## Training model

In [46]:
# Logistic-regression estimator created with its default settings.
reg = LogisticRegression()

In [50]:
# Fit the model on the training split only.
reg.fit(x_train, y_train)

LogisticRegression()

In [52]:
# Mean accuracy on the training split (the same data the model was fitted on),
# so this is not an estimate of performance on unseen data.
reg.score(x_train, y_train)

0.7839285714285714

## Manually check the accuracy

In [54]:
# Predicted class (0 or 1) for every row of the training split:

model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [55]:
# Actual training labels, shown for comparison with the predictions above.
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [58]:
# Manual training accuracy: the fraction of predictions that match the labels,
# i.e. the mean of the element-wise boolean comparison.
np.mean(model_outputs == y_train)

0.7839285714285714

## Finding coefficients and intercept

In [59]:
# We need a function for external use (e.g. in Tableau), so we need the model's intercept and coefficients (weights):

In [60]:
# Fitted intercept (bias term) of the model, returned as a one-element array.
reg.intercept_

array([-0.22206736])

In [61]:
# Fitted coefficients, one per input column, returned as a single-row 2-D array.
reg.coef_

array([[ 2.07601767,  0.33504757,  1.56162303,  1.32927434,  0.18793677,
        -0.07062253,  0.70639316, -0.03986811, -0.20089491, -0.00456366,
         0.31933564, -0.135508  ,  0.38172443, -0.3332426 ]])

In [66]:
# The input names are taken from `unscaled_inputs`, a DataFrame that keeps its column names.
# `scaled_inputs` is a plain array and carries no names.

feature_name = unscaled_inputs.columns.values

In [69]:
# One row per input feature, pairing its name with the fitted coefficient.
# `reg.coef_` is a single-row 2-D array, so it is transposed into a column first.
summary_table = pd.DataFrame(data=feature_name, columns=["Feature Name"])
summary_table["Coefficients"] = reg.coef_.T
summary_table

Unnamed: 0,Feature Name,Coefficients
0,Reason_1,2.076018
1,Reason_2,0.335048
2,Reason_3,1.561623
3,Reason_4,1.329274
4,Month value,0.187937
5,Day of the week,-0.070623
6,Transportation Expense,0.706393
7,Distance to Work,-0.039868
8,Age,-0.200895
9,Daily Work Load Average,-0.004564


In [70]:
# Adding an intercept in the first row:
# shift every feature row's index label up by one to free label 0, then store the
# intercept under label 0 (the new row is appended at the end of the frame).
# NOTE(review): this mutates `summary_table` in place, so running the cell twice
# shifts the labels again and adds a second intercept row.

summary_table.index = summary_table.index + 1
summary_table.loc[0] = ["Intercept", reg.intercept_[0]]