# Load the data

In [1]:
import numpy as np
import pandas as pd

In [2]:
data_pre = pd.read_csv('data/df_preprocessed.csv')

In [3]:
data_pre.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


# Why logistic regression?

Our goal is to determine whether someone will be moderately absent (class 0) or excessively absent (class 1), which makes this a binary classification problem. Logistic regression outputs a probability between 0 and 1 for the positive class, so it is an appropriate model.

# Create the targets

In [4]:
data_pre['Absenteeism Time in Hours'].median() 

3.0

Moderately absent: 3 hours or below (at or below the median of 3.0). Excessively absent: more than 3 hours.

In [5]:
# Binary target: 1 (excessively absent) when the hours are strictly above the
# column median, 0 (moderately absent) otherwise.
targets = np.where(data_pre['Absenteeism Time in Hours'] > data_pre['Absenteeism Time in Hours'].median() 
                   , 1, 0)

In [6]:
data_pre['Excessive Abseentism'] = targets

Using the median as the cutoff between moderately and excessively absent is effective because it keeps the two classes roughly balanced.

In [7]:
targets.sum() / targets.shape[0] # share of rows labelled 1 (excessively absent): about 45.6%

0.45571428571428574

In [8]:
data_targets = data_pre.copy()

In [9]:
data_targets = data_pre.drop(['Absenteeism Time in Hours'], axis=1)

# Selecting the inputs for regression

In [10]:
# Keep every column except the last one: the last column is the
# 'Excessive Abseentism' target that was appended to data_pre earlier.
unscaled_inputs = data_targets.iloc[:, :-1]

## Scaling inputs by standardization

In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
scaler = StandardScaler()

In [13]:
scaler.fit(unscaled_inputs)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [14]:
scaled_inputs = scaler.transform(unscaled_inputs)

In [15]:
scaled_inputs

array([[-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         0.88046927,  0.26848661],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.01928035, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.91902997, -0.58968976],
       ...,
       [ 1.73205081, -0.09298136, -0.31448545, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.01928035,  0.26848661]])

In [16]:
scaled_inputs.shape

(700, 14)

## Splitting our data into training and testing 

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size=0.8,
                                                   random_state=50)

In [19]:
print(x_train.shape, y_train.shape)

(560, 14) (560,)


In [20]:
print(x_test.shape, y_test.shape)

(140, 14) (140,)


# Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [22]:
reg = LogisticRegression()

In [23]:
reg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
reg.score(x_train, y_train)

0.7678571428571429

## Manually check the accuracy

In [25]:
model_outputs = reg.predict(x_train)

In [26]:
model_outputs == y_train

array([False,  True,  True, False,  True, False,  True, False,  True,
        True,  True, False,  True,  True,  True, False,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
       False,  True,  True,  True,  True,  True, False,  True, False,
        True, False,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True, False,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True, False,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True, False,  True, False,  True,  True,  True,  True,
        True,  True,  True, False,  True, False,  True,  True, False,
        True, False, False,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True, False,  True,  True, False,  True,
        True,  True,

In [27]:
np.sum(model_outputs==y_train)

430

In [28]:
np.sum(model_outputs==y_train)/model_outputs.shape[0]

0.7678571428571429

In [29]:
reg.intercept_

array([-0.19764915])

In [30]:
reg.coef_

array([[ 2.03957066,  0.3674707 ,  1.54115309,  1.33303383,  0.1083976 ,
        -0.14899386,  0.65307256, -0.03355387, -0.1251805 ,  0.01076596,
         0.26951038,  0.10692072,  0.43099212, -0.3583513 ]])

In [31]:
columns = unscaled_inputs.columns.values

In [32]:
feature_name = unscaled_inputs.columns.values

In [33]:
sum_table = pd.DataFrame(columns=['Feature name'], data=columns)
sum_table['Coefficient'] = np.transpose(reg.coef_)
sum_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.039571
1,Reason_2,0.367471
2,Reason_3,1.541153
3,Reason_4,1.333034
4,Month Value,0.108398
5,Day of the Week,-0.148994
6,Transportation Expense,0.653073
7,Distance to Work,-0.033554
8,Age,-0.125181
9,Daily Work Load Average,0.010766


In [34]:
sum_table.index += 1

In [35]:
sum_table.loc[0] = ['Intercept', reg.intercept_[0]]
sum_table = sum_table.sort_index()
sum_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-0.197649
1,Reason_1,2.039571
2,Reason_2,0.367471
3,Reason_3,1.541153
4,Reason_4,1.333034
5,Month Value,0.108398
6,Day of the Week,-0.148994
7,Transportation Expense,0.653073
8,Distance to Work,-0.033554
9,Age,-0.125181


In [36]:
sum_table['odd_ratio'] = np.exp(sum_table.Coefficient)

In [37]:
sum_table.sort_values('odd_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,odd_ratio
1,Reason_1,2.039571,7.687308
3,Reason_3,1.541153,4.669972
4,Reason_4,1.333034,3.792532
7,Transportation Expense,0.653073,1.921435
13,Children,0.430992,1.538783
2,Reason_2,0.367471,1.444077
11,Body Mass Index,0.26951,1.309323
5,Month Value,0.108398,1.114491
12,Education,0.106921,1.112846
10,Daily Work Load Average,0.010766,1.010824


# Creating a custom standard scaler so we do not standardize the dummy variables

In [38]:
from sklearn.base import BaseEstimator, TransformerMixin

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of a DataFrame's columns.

    The columns named in ``columns`` are standardized with a wrapped
    ``StandardScaler``; every other column (e.g. dummy variables) is passed
    through unchanged. The output keeps the input's column order and index.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    copy, with_mean, with_std : bool, default=True
        Forwarded to the wrapped ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler, fitted on ``X[columns]``.
    mean_ : pandas.Series or None
        Per-column mean of the scaled columns, set by ``fit``.
    var_ : pandas.Series or None
        Per-column population variance (ddof=0) of the scaled columns,
        set by ``fit``.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Store every constructor argument under its own name so that
        # BaseEstimator.get_params(), the repr and sklearn.base.clone()
        # report and reproduce the real settings.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # StandardScaler's options must be passed by keyword.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        subset = X[self.columns]
        self.scaler.fit(subset, y)
        # Explicit per-column reductions; np.mean/np.var on a DataFrame can
        # reduce over both axes depending on the pandas version.
        self.mean_ = subset.mean()
        self.var_ = subset.var(ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and are
        not used.
        """
        init_col_order = X.columns
        # Reuse X's index: pd.concat(axis=1) aligns on the index, so a fresh
        # RangeIndex would misalign rows (and introduce NaNs) whenever X does
        # not carry the default 0..n-1 index.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

This custom scaler lets us standardize only the columns that are not dummy variables.

In [39]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [44]:
columns_to_scale = ['Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets']

In [45]:
absenteeism_scaler = CustomScaler(columns_to_scale)

In [46]:
absenteeism_scaler.fit(unscaled_inputs)



CustomScaler(columns=['Month Value', 'Day of the Week',
                      'Transportation Expense', 'Distance to Work', 'Age',
                      'Daily Work Load Average', 'Body Mass Index', 'Education',
                      'Children', 'Pets'],
             copy=None, with_mean=None, with_std=None)

In [47]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [63]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size=0.8,
                                                   random_state=142)

In [64]:
reg = LogisticRegression()

In [65]:
reg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [66]:
reg.score(x_train, y_train)

0.7767857142857143

In [67]:
# Summary table: one row per input feature with its fitted coefficient.
# `columns` holds the feature names taken from unscaled_inputs.columns.values
# in an earlier cell.
sum_table = pd.DataFrame(columns=['Feature name'], data=columns)
sum_table['Coefficient'] = np.transpose(reg.coef_)
# Shift the index by one so that position 0 is free for the intercept row.
sum_table.index += 1
sum_table.loc[0] = ['Intercept', reg.intercept_[0]]
# Sorting by index moves the intercept row to the top.
sum_table = sum_table.sort_index()
# exp(coefficient) is the odds ratio associated with each feature.
sum_table['odd_ratio'] = np.exp(sum_table.Coefficient)
# Display with the largest odds ratios first (sum_table itself is not reordered).
sum_table.sort_values('odd_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,odd_ratio
3,Reason_3,3.126378,22.791273
1,Reason_1,3.011113,20.309989
4,Reason_4,1.034302,2.813143
7,Transportation Expense,0.694326,2.002359
2,Reason_2,0.562109,1.754369
13,Children,0.283649,1.327967
11,Body Mass Index,0.199677,1.221008
5,Month Value,0.049228,1.05046
12,Education,0.034625,1.035231
8,Distance to Work,-0.021385,0.978842


It is interesting to note which features affect absenteeism: mostly pregnancy, poisoning, appointments, transportation expenses, and illnesses, while day of the week and pets carry almost no weight in the model.

For backward elimination, we could remove the features with minimal effect, although removing them rarely changes the model much.

# Test accuracy

In [68]:
reg.score(x_test, y_test)

0.7357142857142858

The model's accuracy dropped by about 4 percentage points relative to the training accuracy (from about 77.7% to about 73.6%).

In [69]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.10157118, 0.89842882],
       [0.67765047, 0.32234953],
       [0.89354481, 0.10645519],
       [0.88928809, 0.11071191],
       [0.12581562, 0.87418438],
       [0.80774212, 0.19225788],
       [0.4982076 , 0.5017924 ],
       [0.60494911, 0.39505089],
       [0.69891256, 0.30108744],
       [0.84606788, 0.15393212],
       [0.24164555, 0.75835445],
       [0.45711108, 0.54288892],
       [0.87054506, 0.12945494],
       [0.32363102, 0.67636898],
       [0.81161917, 0.18838083],
       [0.14338319, 0.85661681],
       [0.74211035, 0.25788965],
       [0.83673804, 0.16326196],
       [0.83224785, 0.16775215],
       [0.16573106, 0.83426894],
       [0.49639699, 0.50360301],
       [0.47967271, 0.52032729],
       [0.69612049, 0.30387951],
       [0.79241241, 0.20758759],
       [0.83673804, 0.16326196],
       [0.590372  , 0.409628  ],
       [0.66859086, 0.33140914],
       [0.32170978, 0.67829022],
       [0.60347617, 0.39652383],
       [0.48418331, 0.51581669],
       [0.

In [70]:
predicted_proba[:,1] # second column only: predicted probability of class 1 (excessively absent)

array([0.89842882, 0.32234953, 0.10645519, 0.11071191, 0.87418438,
       0.19225788, 0.5017924 , 0.39505089, 0.30108744, 0.15393212,
       0.75835445, 0.54288892, 0.12945494, 0.67636898, 0.18838083,
       0.85661681, 0.25788965, 0.16326196, 0.16775215, 0.83426894,
       0.50360301, 0.52032729, 0.30387951, 0.20758759, 0.16326196,
       0.409628  , 0.33140914, 0.67829022, 0.39652383, 0.51581669,
       0.4792219 , 0.64021942, 0.53356096, 0.25542361, 0.52811547,
       0.44144778, 0.5017924 , 0.19945339, 0.18694186, 0.56203814,
       0.25947397, 0.17185357, 0.16326196, 0.4113778 , 0.92035664,
       0.74255666, 0.14021176, 0.68616628, 0.88179308, 0.51208794,
       0.24780174, 0.42795054, 0.19101769, 0.22900533, 0.71414221,
       0.39117356, 0.97481367, 0.47959035, 0.6315512 , 0.16233111,
       0.64464439, 0.12312852, 0.86890575, 0.21851599, 0.10649883,
       0.50463136, 0.09067887, 0.92048756, 0.4778001 , 0.921876  ,
       0.70889301, 0.26585439, 0.45181668, 0.5501731 , 0.47017

# Saving the model

Python object -> file stream.

In [71]:
import pickle

In [72]:
# Serialize the fitted logistic regression (reg) to a binary file named 'model'.
with open('model', 'wb') as file:
    pickle.dump(reg,file)

In [73]:
# Serialize the fitted scaler (absenteeism_scaler) to a binary file named 'scaler'.
with open('scaler', 'wb') as file:
    pickle.dump(absenteeism_scaler,file)