In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
# Read Absenteeism_preprocessed.csv into a DataFrame and preview its first five rows.
data_preprocessed = pd.read_csv(filepath_or_buffer="Absenteeism_preprocessed.csv")
data_preprocessed.head(n=5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,DayOfTheWeek,TransportationExpense,DistanceToWork,Age,DailyWorkLoadAverage,BodyMassIndex,Education,Children,Pets,AbsenteeismTimeInHours
0,False,False,False,True,7,1,289,36,33,239.554,30,0,2,1,4
1,False,False,False,False,7,1,118,13,50,239.554,31,0,1,0,0
2,False,False,False,True,7,2,179,51,38,239.554,31,0,0,0,2
3,True,False,False,False,7,3,279,5,39,239.554,24,0,2,0,4
4,False,False,False,True,7,3,289,36,33,239.554,30,0,2,1,2


In [3]:
data_preprocessed["AbsenteeismTimeInHours"].median()

3.0

In [4]:
# Binary target: 1 where the absence hours are strictly above the column median, else 0.
targets = np.where(
    data_preprocessed["AbsenteeismTimeInHours"].gt(
        data_preprocessed["AbsenteeismTimeInHours"].median()
    ),
    1,
    0,
)

In [5]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [6]:
# Attach the binary target as the "ExcessiveAbsenteeism" column and preview the result.
data_preprocessed = data_preprocessed.assign(ExcessiveAbsenteeism=targets)
data_preprocessed.head(n=5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,DayOfTheWeek,TransportationExpense,DistanceToWork,Age,DailyWorkLoadAverage,BodyMassIndex,Education,Children,Pets,AbsenteeismTimeInHours,ExcessiveAbsenteeism
0,False,False,False,True,7,1,289,36,33,239.554,30,0,2,1,4,1
1,False,False,False,False,7,1,118,13,50,239.554,31,0,1,0,0,0
2,False,False,False,True,7,2,179,51,38,239.554,31,0,0,0,2,0
3,True,False,False,False,7,3,279,5,39,239.554,24,0,2,0,4,1
4,False,False,False,True,7,3,289,36,33,239.554,30,0,2,1,2,0


In [7]:
# Drop the raw hours column; the binary target derived from it stays in the frame.
data_targets = data_preprocessed.drop(columns=["AbsenteeismTimeInHours"])

In [8]:
# Model inputs are every column except the last one, which holds the
# "ExcessiveAbsenteeism" target appended earlier in the notebook.
unscaled_inputs = data_targets.iloc[:, : len(data_targets.columns) - 1]

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the rest through.

    A ``StandardScaler`` is fitted on, and applied to, ``columns`` only; every
    other column of the input frame is returned unchanged.

    Parameters
    ----------
    columns : list of str
        Labels of the columns to standardize.

    Attributes
    ----------
    scaler : StandardScaler
        Underlying scaler, fitted on ``columns`` by :meth:`fit`.
    mean_ : ndarray or None
        Per-column means learned by ``scaler``; ``None`` before :meth:`fit`.
    var_ : ndarray or None
        Per-column variances learned by ``scaler``; ``None`` before :meth:`fit`.
    """

    def __init__(self, columns):
        self.scaler = StandardScaler()
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        self.scaler.fit(X[self.columns], y)
        # Reuse the statistics the scaler has just computed rather than calling
        # np.mean / np.var on a DataFrame, which pandas has deprecated.
        self.mean_ = np.array(self.scaler.mean_)
        self.var_ = np.array(self.scaler.var_)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with ``self.columns`` standardized.

        The original column order and row index of ``X`` are preserved.
        ``y`` and ``copy`` are accepted but not used.
        """
        # Carry over X's index: without it the scaled frame gets a fresh
        # RangeIndex and the concat below would align rows by the wrong labels
        # whenever X is not indexed 0..n-1 (e.g. a shuffled subset).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        # Reindex by the incoming columns to restore the original ordering.
        return pd.concat([X_not_scaled, X_scaled], axis=1)[X.columns]

In [10]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'DayOfTheWeek', 'TransportationExpense', 'DistanceToWork', 'Age',
       'DailyWorkLoadAverage', 'BodyMassIndex', 'Education', 'Children',
       'Pets'], dtype=object)

In [11]:
# Columns excluded from standardization: the four Reason_* columns and Education.
columnsToOmit = [f"Reason_{reason_number}" for reason_number in range(1, 5)] + ["Education"]

In [12]:
# Every input column not listed in columnsToOmit, kept in the frame's original order.
columnsToScale = list(
    filter(lambda name: name not in columnsToOmit, unscaled_inputs.columns.values)
)

In [13]:
# Scaler restricted to the columns selected for standardization.
absenteesism_scaler = CustomScaler(columns=columnsToScale)

In [14]:
# Learn the scaling statistics from the input frame.
# NOTE(review): this fits on every row, before the train/test split done later
# in the notebook, so test rows contribute to the scaling statistics. Consider
# fitting on the training rows only.
absenteesism_scaler.fit(X=unscaled_inputs)

  return var(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


In [15]:
# Apply the fitted scaler; columns outside columnsToScale pass through unchanged.
scale_inputs = absenteesism_scaler.transform(X=unscaled_inputs)

In [16]:
scale_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,DayOfTheWeek,TransportationExpense,DistanceToWork,Age,DailyWorkLoadAverage,BodyMassIndex,Education,Children,Pets
0,False,False,False,True,0.182726,-0.683704,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
1,False,False,False,False,0.182726,-0.683704,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,-0.019280,-0.589690
2,False,False,False,True,0.182726,-0.007725,-0.654143,1.426749,0.248310,-0.806331,1.002633,0,-0.919030,-0.589690
3,True,False,False,False,0.182726,0.668253,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,0.880469,-0.589690
4,False,False,False,True,0.182726,0.668253,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,True,False,False,False,-0.388293,-0.007725,-0.654143,-0.533522,0.562059,-0.853789,-1.114186,1,0.880469,-0.589690
696,True,False,False,False,-0.388293,-0.007725,0.040034,-0.263140,-1.320435,-0.853789,-0.643782,0,-0.019280,1.126663
697,True,False,False,False,-0.388293,0.668253,1.624567,-0.939096,-1.320435,-0.853789,-0.408580,1,-0.919030,-0.589690
698,False,False,False,True,-0.388293,0.668253,0.190942,-0.939096,-0.692937,-0.853789,-0.408580,1,-0.919030,-0.589690


In [17]:
scale_inputs.shape

(700, 14)

In [18]:
train_test_split(scale_inputs, targets)

[     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  DayOfTheWeek  \
 191      True     False     False     False    -0.673803     -0.007725   
 174     False     False     False      True    -0.959313     -0.007725   
 300     False     False     False     False     1.039256      1.344231   
 332      True     False     False     False     1.324766     -1.359682   
 460     False     False     False      True     0.182726      0.668253   
 ..        ...       ...       ...       ...          ...           ...   
 57      False     False     False      True     0.753746     -0.007725   
 145      True     False     False     False    -1.244823     -0.683704   
 71      False     False     False      True     1.039256      0.668253   
 670     False     False     False      True    -0.673803     -0.683704   
 697      True     False     False     False    -0.388293      0.668253   
 
      TransportationExpense  DistanceToWork       Age  DailyWorkLoadAverage  \
 191              -

In [19]:
# Hold out 20% of the rows for testing. The fixed random_state pins the shuffle,
# so the scores and coefficients computed from this split are reproducible
# after a kernel restart instead of changing on every run.
x_train, x_test, y_train, y_test = train_test_split(
    scale_inputs,
    targets,
    train_size=0.8,
    shuffle=True,
    random_state=42,
)

In [20]:
# Fit a logistic regression with default settings on the training split;
# the trailing expression displays the fitted estimator.
reg = LogisticRegression().fit(x_train, y_train)
reg

In [21]:
reg.score(x_train, y_train)

0.7660714285714286

In [22]:
reg.intercept_

array([-1.87752957])

In [23]:
reg.coef_

array([[ 3.01396295,  0.21135307,  3.0595357 ,  1.12460199, -0.01380687,
        -0.17959577,  0.71117063, -0.06586354, -0.13989632, -0.04980138,
         0.23517782, -0.07375159,  0.38845147, -0.2745865 ]])

In [24]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'DayOfTheWeek', 'TransportationExpense', 'DistanceToWork', 'Age',
       'DailyWorkLoadAverage', 'BodyMassIndex', 'Education', 'Children',
       'Pets'], dtype=object)

In [25]:
# Column labels of the model inputs, in the same order as the fitted coefficients.
feature_names = unscaled_inputs.columns.to_numpy()

In [26]:
# Start the summary table with one row per input feature.
summary_table = pd.DataFrame({"FeatureName": feature_names})

In [27]:
summary_table

Unnamed: 0,FeatureName
0,Reason_1
1,Reason_2
2,Reason_3
3,Reason_4
4,Month Value
5,DayOfTheWeek
6,TransportationExpense
7,DistanceToWork
8,Age
9,DailyWorkLoadAverage


In [28]:
# reg.coef_ has one row holding every feature's coefficient; transpose it so
# there is one coefficient per table row.
summary_table["Coefficient"] = reg.coef_.T

In [29]:
summary_table

Unnamed: 0,FeatureName,Coefficient
0,Reason_1,3.013963
1,Reason_2,0.211353
2,Reason_3,3.059536
3,Reason_4,1.124602
4,Month Value,-0.013807
5,DayOfTheWeek,-0.179596
6,TransportationExpense,0.711171
7,DistanceToWork,-0.065864
8,Age,-0.139896
9,DailyWorkLoadAverage,-0.049801


In [30]:
# Put the intercept in row 0 by building a new frame, rather than shifting the
# index in place and writing into `.loc[0]`. Any "Intercept" row left by an
# earlier run of this cell is filtered out first, so re-running the cell cannot
# stack duplicate intercept rows or shift the index again.
summary_table = pd.concat(
    [
        pd.DataFrame(
            {"FeatureName": ["Intercept"], "Coefficient": [reg.intercept_[0]]}
        ),
        summary_table[summary_table["FeatureName"] != "Intercept"],
    ],
    ignore_index=True,  # renumber rows 0..n so the intercept is row 0
)
summary_table

Unnamed: 0,FeatureName,Coefficient
0,Intercept,-1.87753
1,Reason_1,3.013963
2,Reason_2,0.211353
3,Reason_3,3.059536
4,Reason_4,1.124602
5,Month Value,-0.013807
6,DayOfTheWeek,-0.179596
7,TransportationExpense,0.711171
8,DistanceToWork,-0.065864
9,Age,-0.139896


In [31]:
# Odds ratio is the exponential of each coefficient.
summary_table = summary_table.assign(
    OddsRatio=lambda table: np.exp(table["Coefficient"])
)
summary_table

Unnamed: 0,FeatureName,Coefficient,OddsRatio
0,Intercept,-1.87753,0.152968
1,Reason_1,3.013963,20.367957
2,Reason_2,0.211353,1.235348
3,Reason_3,3.059536,21.317657
4,Reason_4,1.124602,3.078991
5,Month Value,-0.013807,0.986288
6,DayOfTheWeek,-0.179596,0.835608
7,TransportationExpense,0.711171,2.036374
8,DistanceToWork,-0.065864,0.936259
9,Age,-0.139896,0.869448


In [32]:
# Display the table with the largest odds ratios first; summary_table itself is not modified.
summary_table.sort_values(by="OddsRatio", ascending=False)

Unnamed: 0,FeatureName,Coefficient,OddsRatio
3,Reason_3,3.059536,21.317657
1,Reason_1,3.013963,20.367957
4,Reason_4,1.124602,3.078991
7,TransportationExpense,0.711171,2.036374
13,Children,0.388451,1.474695
11,BodyMassIndex,0.235178,1.265134
2,Reason_2,0.211353,1.235348
5,Month Value,-0.013807,0.986288
10,DailyWorkLoadAverage,-0.049801,0.951418
8,DistanceToWork,-0.065864,0.936259


# Testing the model
---

In [33]:
reg.score(x_test, y_test)

0.7785714285714286

In [34]:
# Per-row class probabilities on the held-out split: column 0 is class 0, column 1 is class 1.
predicted_proba = reg.predict_proba(X=x_test)
predicted_proba

array([[0.69384565, 0.30615435],
       [0.12245164, 0.87754836],
       [0.47378142, 0.52621858],
       [0.83003299, 0.16996701],
       [0.77150919, 0.22849081],
       [0.82396108, 0.17603892],
       [0.19099863, 0.80900137],
       [0.58987532, 0.41012468],
       [0.95194731, 0.04805269],
       [0.78024636, 0.21975364],
       [0.71115408, 0.28884592],
       [0.77948442, 0.22051558],
       [0.64832024, 0.35167976],
       [0.44730743, 0.55269257],
       [0.20793351, 0.79206649],
       [0.57684886, 0.42315114],
       [0.40131935, 0.59868065],
       [0.87622094, 0.12377906],
       [0.10533927, 0.89466073],
       [0.63471008, 0.36528992],
       [0.70868707, 0.29131293],
       [0.28843269, 0.71156731],
       [0.01927953, 0.98072047],
       [0.84589184, 0.15410816],
       [0.25013503, 0.74986497],
       [0.72902216, 0.27097784],
       [0.64109559, 0.35890441],
       [0.26627921, 0.73372079],
       [0.89230391, 0.10769609],
       [0.64184601, 0.35815399],
       [0.

In [40]:
predicted_proba.shape

(140, 2)

In [41]:
predicted_proba[:,1]

array([0.30615435, 0.87754836, 0.52621858, 0.16996701, 0.22849081,
       0.17603892, 0.80900137, 0.41012468, 0.04805269, 0.21975364,
       0.28884592, 0.22051558, 0.35167976, 0.55269257, 0.79206649,
       0.42315114, 0.59868065, 0.12377906, 0.89466073, 0.36528992,
       0.29131293, 0.71156731, 0.98072047, 0.15410816, 0.74986497,
       0.27097784, 0.35890441, 0.73372079, 0.10769609, 0.35815399,
       0.20568693, 0.30837863, 0.30296052, 0.18851961, 0.86188316,
       0.32538251, 0.7656604 , 0.1809524 , 0.41359319, 0.51432527,
       0.78272432, 0.68764043, 0.66916588, 0.28936475, 0.14448621,
       0.17348988, 0.16041665, 0.58300028, 0.33204   , 0.87603133,
       0.74444346, 0.14518902, 0.1861559 , 0.29088427, 0.71891392,
       0.74405723, 0.17858323, 0.8889333 , 0.29058276, 0.90791389,
       0.21586827, 0.22730461, 0.279932  , 0.26107964, 0.26456046,
       0.75448513, 0.35674975, 0.53197439, 0.91865967, 0.12350429,
       0.11235133, 0.92382059, 0.30598951, 0.15547954, 0.67153

# Save the model
---

In [42]:
import pickle

In [43]:
# Serialize the fitted logistic regression to the file "model".
with open(file="model", mode="wb") as file:
    pickle.dump(reg, file)

In [45]:
# Serialize the fitted scaler to the file "scaler".
with open(file='scaler', mode='wb') as file:
    pickle.dump(absenteesism_scaler, file)