In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the preprocessed absenteeism dataset and preview its first rows
preprocessed_csv_path = 'files/absenteeism_preprocessed.csv'
data_preprocessed = pd.read_csv(preprocessed_csv_path)
data_preprocessed.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


### Create targets
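We create two classes: Moderately Absent (0) and Excessively Absent (1).
The cut-off is the median of 'Absenteeism Time in Hours': rows strictly above the median are labelled Excessively Absent.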

In [3]:
# Median absence duration in hours; used below as the cut-off between the two classes
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [4]:
# Label a row 1 when its absence hours are strictly above the median, otherwise 0
absence_hours = data_preprocessed['Absenteeism Time in Hours']
targets = np.where(absence_hours > absence_hours.median(), 1, 0)

In [5]:
# Attach the binary targets as a new column (this mutates data_preprocessed in place)
data_preprocessed['Excessive Absenteeism'] = targets

In [6]:
# Preview the frame to confirm the 'Excessive Absenteeism' column was added
data_preprocessed.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


In [7]:
# The raw hours column is what the target was derived from, so it must not remain as an input
data_with_targets = data_preprocessed.drop(columns=['Absenteeism Time in Hours'])

### Selecting inputs for regression

In [26]:
# Inputs are every column except the target. Dropping the target by name (rather
# than slicing off "the last column") keeps this correct even if the column order
# of data_with_targets changes.
unscaled_inputs = data_with_targets.drop(columns=['Excessive Absenteeism'])

### Standardize the data

In [27]:
from sklearn.preprocessing import StandardScaler

# A plain StandardScaler (absenteeism_scaler = StandardScaler(); absenteeism_scaler.fit(unscaled_inputs))
# would standardize the dummy variables as well, so a custom scaler that only
# standardizes a chosen list of columns is defined below.

from sklearn.base import BaseEstimator, TransformerMixin

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of a DataFrame's columns.

    A ``StandardScaler`` is fitted on ``columns`` only; every other column
    (e.g. dummy variables) is passed through ``transform`` unchanged.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    copy, with_mean, with_std : bool, default=True
        Forwarded to the wrapped ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler, fitted on ``columns`` by ``fit``.
    mean, var : pandas.Series or None
        Per-column mean and population variance (ddof=0) of the scaled
        columns; ``None`` until ``fit`` has been called.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Store every constructor argument under its own name so that
        # BaseEstimator.get_params() / repr / clone report the real values.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # Pass by keyword: StandardScaler does not accept these positionally
        # in current scikit-learn releases.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean = None
        self.var = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit column-wise statistics, so the result does not depend on how
        # NumPy reductions dispatch on a DataFrame. ddof=0 matches np.var.
        self.mean = selected.mean()
        self.var = selected.var(ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with ``self.columns`` standardized, in the original column order.

        ``y`` and ``copy`` are accepted for interface compatibility and are not used.
        """
        init_col_order = X.columns
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            # Reuse X's row labels; otherwise concat would align on a fresh
            # RangeIndex and misplace rows whenever X has a non-default index.
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [28]:
# List the input column names to decide which ones should be standardized
unscaled_inputs.columns.values

array(['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Month Value',
       'Day of the week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [29]:
# The 'Reason' dummy columns are left as they are; every other input is standardized.
columns_to_omit = ['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4']
columns_to_scale = [col for col in unscaled_inputs.columns.values if col not in columns_to_omit]

In [30]:
# Scaler that standardizes only the columns listed in columns_to_scale
absenteeism_scaler = CustomScaler(columns=columns_to_scale)

In [31]:
# Learn the scaling statistics from the selected columns of the unscaled inputs.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so test rows contribute to the scaling statistics.
absenteeism_scaler.fit(unscaled_inputs)



CustomScaler(columns=['Month Value', 'Day of the week',
                      'Transportation Expense', 'Distance to Work', 'Age',
                      'Daily Work Load Average', 'Body Mass Index', 'Education',
                      'Children', 'Pets'],
             copy=None, with_mean=None, with_std=None)

In [32]:
# Apply the fitted scaler; columns not in columns_to_scale pass through unchanged
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

### Split into train and test data and shuffle

In [33]:
from sklearn.model_selection import train_test_split
 
# Keep 80% of the rows for training and the rest for testing;
# the fixed random_state makes the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    shuffle=True,
    random_state=20,
)

### Logistic Regression 

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [35]:
# Fit a logistic regression with default settings on the training split
reg = LogisticRegression()
reg.fit(X=x_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [36]:
# Accuracy of the fitted model on the training split (re-derived by hand in the next cell)
reg.score(x_train, y_train)

0.7785714285714286

In [37]:
# Manual accuracy check: fraction of training rows whose prediction matches the target
model_outputs = reg.predict(x_train)
n_correct = np.sum(model_outputs == y_train)
n_correct / model_outputs.shape[0]

0.7785714285714286

### Intercept and Coefficients

In [38]:
# Show the fitted intercept and the coefficient array (one weight per input column)
print(reg.intercept_, reg.coef_)

[-1.68969191] [[ 2.80088908e+00  9.37679263e-01  3.09784623e+00  8.55189097e-01
   1.65600560e-01 -8.38855532e-02  6.13417431e-01 -9.52263740e-03
  -1.66485910e-01 -9.39188260e-04  2.69858214e-01 -8.32092336e-02
   3.60649946e-01 -2.86053551e-01]]


In [39]:
# Input column names, used as row labels for the coefficient summary table
feature_name = unscaled_inputs.columns.values

In [40]:
# One row per input feature, paired with its fitted coefficient
summary_table = pd.DataFrame({'Features': feature_name})
summary_table['Coefficient'] = reg.coef_.ravel()
summary_table

Unnamed: 0,Features,Coefficient
0,Reason 1,2.800889
1,Reason 2,0.937679
2,Reason 3,3.097846
3,Reason 4,0.855189
4,Month Value,0.165601
5,Day of the week,-0.083886
6,Transportation Expense,0.613417
7,Distance to Work,-0.009523
8,Age,-0.166486
9,Daily Work Load Average,-0.000939


In [41]:
# Put the intercept in row 0, ahead of the feature coefficients.
# Written so the cell can be re-run safely: any Intercept row from a previous
# run is dropped first, and only the two base columns are kept, so repeated
# execution never duplicates the Intercept row or shifts the index further.
coefficient_rows = summary_table.loc[
    summary_table['Features'] != 'Intercept', ['Features', 'Coefficient']
]
intercept_row = pd.DataFrame(
    {'Features': ['Intercept'], 'Coefficient': [reg.intercept_[0]]}
)
summary_table = pd.concat([intercept_row, coefficient_rows], ignore_index=True)
summary_table

Unnamed: 0,Features,Coefficient
0,Intercept,-1.689692
1,Reason 1,2.800889
2,Reason 2,0.937679
3,Reason 3,3.097846
4,Reason 4,0.855189
5,Month Value,0.165601
6,Day of the week,-0.083886
7,Transportation Expense,0.613417
8,Distance to Work,-0.009523
9,Age,-0.166486


### Interpret the coefficients

In [42]:
# Exponentiate each coefficient to obtain the corresponding odds ratio
summary_table['Odds_ratio'] = np.exp(summary_table['Coefficient'])
summary_table

Unnamed: 0,Features,Coefficient,Odds_ratio
0,Intercept,-1.689692,0.184576
1,Reason 1,2.800889,16.459274
2,Reason 2,0.937679,2.554047
3,Reason 3,3.097846,22.150193
4,Reason 4,0.855189,2.351819
5,Month Value,0.165601,1.180102
6,Day of the week,-0.083886,0.919536
7,Transportation Expense,0.613417,1.846732
8,Distance to Work,-0.009523,0.990523
9,Age,-0.166486,0.846635


In [43]:
# Rank the rows from the largest odds ratio to the smallest
summary_table.sort_values(by='Odds_ratio', ascending=False)

Unnamed: 0,Features,Coefficient,Odds_ratio
3,Reason 3,3.097846,22.150193
1,Reason 1,2.800889,16.459274
2,Reason 2,0.937679,2.554047
4,Reason 4,0.855189,2.351819
7,Transportation Expense,0.613417,1.846732
13,Children,0.36065,1.434261
11,Body Mass Index,0.269858,1.309779
5,Month Value,0.165601,1.180102
10,Daily Work Load Average,-0.000939,0.999061
8,Distance to Work,-0.009523,0.990523


A feature is not particularly important if:
1. its coefficient is around 0, or
2. its odds ratio is around 1 (the two are equivalent, since the odds ratio is exp(coefficient)).