In [1]:
#Import libraries
import pandas as pd
import numpy as np

In [2]:
# Load the preprocessed absenteeism data and preview its first rows
data_preprocessed = pd.read_csv(filepath_or_buffer='Absenteeism_preprocessed.csv')
data_preprocessed.head(n=5)

Unnamed: 0,reason_1,reason_2,reason_3,reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month Value,Day of the Week,Absenteeism Time in Hours
0,0,0,0,1,289,36,33,239.554,30,0,2,1,7,1,4
1,0,0,0,0,118,13,50,239.554,31,0,1,0,7,1,0
2,0,0,0,1,179,51,38,239.554,31,0,0,0,7,2,2
3,1,0,0,0,279,5,39,239.554,24,0,2,0,7,3,4
4,0,0,0,1,289,36,33,239.554,30,0,2,1,7,3,2


In [3]:
#Targets: median of the raw absence hours, which serves as the cut-off for the binary target
data_preprocessed.loc[:, 'Absenteeism Time in Hours'].median()

3.0

In [4]:
# Label a row 1 when its absence hours are strictly above the median, otherwise 0
absence_hours = data_preprocessed['Absenteeism Time in Hours']
median_hours = absence_hours.median()
targets = np.where(absence_hours > median_hours, 1, 0)

# Keep the labels next to the features so they can be inspected together
data_preprocessed['Excessive Absenteeism'] = targets
data_preprocessed.head(10)

Unnamed: 0,reason_1,reason_2,reason_3,reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month Value,Day of the Week,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,289,36,33,239.554,30,0,2,1,7,1,4,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,7,1,0,0
2,0,0,0,1,179,51,38,239.554,31,0,0,0,7,2,2,0
3,1,0,0,0,279,5,39,239.554,24,0,2,0,7,3,4,1
4,0,0,0,1,289,36,33,239.554,30,0,2,1,7,3,2,0
5,0,0,0,1,179,51,38,239.554,31,0,0,0,7,4,2,0
6,0,0,0,1,361,52,28,239.554,27,0,1,4,7,4,8,1
7,0,0,0,1,260,50,36,239.554,23,0,4,0,7,4,4,1
8,0,0,1,0,155,12,34,239.554,25,0,2,0,7,0,40,1
9,0,0,0,1,235,11,37,239.554,29,1,1,1,7,0,8,1


In [5]:
# Share of observations labelled 1 (class balance check)
targets.sum() / len(targets)

0.45571428571428574

In [6]:
# Remove the raw hours column (the target was derived from it) together with
# three features that are left out of the model
dropped_columns = [
    'Absenteeism Time in Hours',
    'Distance to Work',
    'Daily Work Load Average',
    'Day of the Week',
]
data_with_targets = data_preprocessed.drop(columns=dropped_columns)
data_with_targets.head()

Unnamed: 0,reason_1,reason_2,reason_3,reason_4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month Value,Excessive Absenteeism
0,0,0,0,1,289,33,30,0,2,1,7,1
1,0,0,0,0,118,50,31,0,1,0,7,0
2,0,0,0,1,179,38,31,0,0,0,7,0
3,1,0,0,0,279,39,24,0,2,0,7,1
4,0,0,0,1,289,33,30,0,2,1,7,0


In [7]:
#Inputs for the logistic regression: every column except the last one (the target)
input_columns = data_with_targets.columns[:-1]
unscaled_inputs = data_with_targets[input_columns]
unscaled_inputs.head()

Unnamed: 0,reason_1,reason_2,reason_3,reason_4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month Value
0,0,0,0,1,289,33,30,0,2,1,7
1,0,0,0,0,118,50,31,0,1,0,7
2,0,0,0,1,179,38,31,0,0,0,7
3,1,0,0,0,279,39,24,0,2,0,7
4,0,0,0,1,289,33,30,0,2,1,7


In [8]:
#Standardizing the data
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and leave the others untouched.

    A ``StandardScaler`` is fitted on ``columns`` only; ``transform`` returns a
    DataFrame with the same column order and the same index as its input, in
    which only ``columns`` have been replaced by their standardized values.

    Parameters
    ----------
    columns : list
        Labels of the columns to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded unchanged to the wrapped ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler that does the actual work.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance (ddof=0) of the fitted
        columns; ``None`` until ``fit`` has been called.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Keep the constructor arguments as attributes so that BaseEstimator's
        # get_params() / repr / clone can read them back.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # Current scikit-learn only accepts these options as keywords.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit per-column statistics; np.mean/np.var on a DataFrame depend on
        # how the installed pandas treats axis=None and may collapse to a scalar.
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and are not used.
        """
        init_col_order = X.columns
        # Re-attach X's own index: with a fresh RangeIndex the concat below would
        # align on labels and produce NaN rows for any X with a non-default index
        # (e.g. a train/test subset).
        X_scaled = pd.DataFrame(
            np.asarray(self.scaler.transform(X[self.columns])),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]
# Show the available input column names as an array
unscaled_inputs.columns.to_numpy()

array(['reason_1', 'reason_2', 'reason_3', 'reason_4',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month Value'], dtype=object)

In [9]:
# Columns that are passed through without standardization
columns_to_omit = [
    'reason_1',
    'reason_2',
    'reason_3',
    'reason_4',
    'Education',
]

In [10]:
# Standardize every input column that is not listed in columns_to_omit.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed further down - confirm this is intended.
columns_to_scale = [
    column
    for column in unscaled_inputs.columns.values
    if column not in columns_to_omit
]
absenteeism_scaler = CustomScaler(columns_to_scale)
# fit() returns the scaler itself, so fitting and transforming can be chained
scaled_inputs = absenteeism_scaler.fit(unscaled_inputs).transform(unscaled_inputs)
scaled_inputs.head()

Unnamed: 0,reason_1,reason_2,reason_3,reason_4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month Value
0,0,0,0,1,1.005844,-0.536062,0.767431,0,0.880469,0.268487,0.182726
1,0,0,0,0,-1.574681,2.130803,1.002633,0,-0.01928,-0.58969,0.182726
2,0,0,0,1,-0.654143,0.24831,1.002633,0,-0.91903,-0.58969,0.182726
3,1,0,0,0,0.854936,0.405184,-0.643782,0,0.880469,-0.58969,0.182726
4,0,0,0,1,1.005844,-0.536062,0.767431,0,0.880469,0.268487,0.182726


In [11]:
#Splitting the data for training and testing
from sklearn.model_selection import train_test_split
# 80/20 shuffled split; random_state is fixed so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    test_size=0.2,
    random_state=20,
    shuffle=True,
)

In [12]:
#Fitting the Model and Assessing its accuracy
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
##Train the logistic regression, then report its accuracy on the training data
# fit() returns the estimator, so construction and fitting are chained
reg = LogisticRegression().fit(x_train, y_train)
reg.score(x_train, y_train)



0.775

In [13]:
# Manual cross-check of reg.score: share of training rows where prediction == label
model_outputs = reg.predict(x_train)
n_correct = (model_outputs == y_train).sum()
n_correct / model_outputs.shape[0]  # matches the accuracy reported by the model

0.775

In [14]:
# Pair every input column with the coefficient the fitted model holds for it
feature_name = unscaled_inputs.columns.values
summary_table = pd.DataFrame({'Feature name': feature_name})
# coef_ is transposed so its values line up with the rows of the table
summary_table['Coefficient'] = reg.coef_.T
summary_table

Unnamed: 0,Feature name,Coefficient
0,reason_1,2.627499
1,reason_2,0.863386
2,reason_3,2.960507
3,reason_4,0.663907
4,Transportation Expense,0.599798
5,Age,-0.172451
6,Body Mass Index,0.275685
7,Education,-0.234525
8,Children,0.342497
9,Pets,-0.277514


In [15]:
##Put the intercept in the first row of the summary table
# The table is rebuilt from its feature rows on every run: any existing
# 'Intercept' row is dropped first, so re-executing this cell cannot stack
# duplicate intercept rows or keep shifting the index.
intercept_row = pd.DataFrame({'Feature name': ['Intercept'],
                              'Coefficient': [reg.intercept_[0]]})
is_feature_row = summary_table['Feature name'] != 'Intercept'
feature_rows = summary_table.loc[is_feature_row, ['Feature name', 'Coefficient']]
# ignore_index renumbers the rows 0..n with the intercept at position 0
summary_table = pd.concat([intercept_row, feature_rows], ignore_index=True)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.465471
1,reason_1,2.627499
2,reason_2,0.863386
3,reason_3,2.960507
4,reason_4,0.663907
5,Transportation Expense,0.599798
6,Age,-0.172451
7,Body Mass Index,0.275685
8,Education,-0.234525
9,Children,0.342497


In [16]:
#Interpreting the coefficients: exp(coefficient) gives the odds ratio; rank rows by it
summary_table['odds_ratio'] = np.exp(summary_table['Coefficient'])
summary_table.sort_values(by='odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,odds_ratio
3,reason_3,2.960507,19.307751
1,reason_1,2.627499,13.839121
2,reason_2,0.863386,2.371177
4,reason_4,0.663907,1.942367
5,Transportation Expense,0.599798,1.821751
9,Children,0.342497,1.40846
7,Body Mass Index,0.275685,1.317433
11,Month Value,0.154937,1.167585
6,Age,-0.172451,0.841599
8,Education,-0.234525,0.790946


In [17]:
#Testing the model: accuracy of the predictions on the held-out test rows
metrics.accuracy_score(y_test, reg.predict(x_test))

0.75

In [18]:
# Predicted class probabilities for every test row; show the array dimensions
predicted_proba = reg.predict_proba(x_test)
np.shape(predicted_proba)

(140, 2)

In [19]:
# Second column of the probability matrix - presumably P(class 1), i.e. excessive absenteeism
predicted_proba[:, 1]

array([0.28778024, 0.41239991, 0.55662562, 0.22096038, 0.91541657,
       0.66896629, 0.70207504, 0.87043779, 0.21692179, 0.25291341,
       0.50485031, 0.77359703, 0.92969016, 0.26495948, 0.69466915,
       0.44964119, 0.44972574, 0.46069558, 0.59882226, 0.94679318,
       0.30125385, 0.22096038, 0.58365437, 0.58365437, 0.7587085 ,
       0.25682913, 0.48934806, 0.14296697, 0.80065765, 0.22096038,
       0.37028423, 0.68316787, 0.68825755, 0.52694241, 0.22096038,
       0.53492642, 0.22453007, 0.74389237, 0.40329273, 0.60301627,
       0.21343976, 0.45483346, 0.2403088 , 0.4388431 , 0.82622935,
       0.57857132, 0.69461059, 0.28778024, 0.22209028, 0.2061074 ,
       0.57577123, 0.36438663, 0.66896629, 0.27128561, 0.83334736,
       0.43399232, 0.88600663, 0.23396355, 0.37170685, 0.38209505,
       0.69796139, 0.65909803, 0.29392197, 0.79686146, 0.20956093,
       0.2699923 , 0.10399887, 0.22453007, 0.73944244, 0.30081832,
       0.22453007, 0.32688766, 0.90337554, 0.45745729, 0.59997

In [20]:
#Saving the model and Preparing it for Deployment
import pickle

In [21]:
# Serialize the fitted logistic regression to the file 'model'
with open('model', mode='wb') as model_file:
    pickle.dump(reg, model_file)

In [22]:
# Serialize the fitted scaler to the file 'scaler' so the same scaling can be reused later
with open('scaler', mode='wb') as scaler_file:
    pickle.dump(absenteeism_scaler, scaler_file)