In [200]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [201]:
# Read the absenteeism CSV (feature columns plus the `Absenteeism` target
# column) into a DataFrame and display it.
data = pd.read_csv(filepath_or_buffer="data/Absenteeism_with_targets.csv")
data

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day_Of_Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,1
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,0
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,1
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0,0


In [202]:
# Split the frame into the feature matrix (every column except the target)
# and the target vector (the `Absenteeism` column).
inputs = data.drop(columns=["Absenteeism"])
targets = data.loc[:, "Absenteeism"]

In [203]:
import statsmodels.api as sm

# Fit an ordinary least squares model of the target on all features, with an
# explicit intercept column, and show the regression report.
X = sm.add_constant(inputs)
ols_model = sm.OLS(endog=targets, exog=X)
summary_ = ols_model.fit()  # fitted results object, despite the name
summary_.summary()

0,1,2,3
Dep. Variable:,Absenteeism,R-squared:,0.326
Model:,OLS,Adj. R-squared:,0.312
Method:,Least Squares,F-statistic:,23.65
Date:,"Fri, 26 Feb 2021",Prob (F-statistic):,6.58e-50
Time:,16:50:31,Log-Likelihood:,-367.29
No. Observations:,700,AIC:,764.6
Df Residuals:,685,BIC:,832.8
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.6295,0.222,-2.829,0.005,-1.066,-0.193
Reason_1,0.8308,0.076,10.992,0.000,0.682,0.979
Reason_2,0.5493,0.185,2.964,0.003,0.185,0.913
Reason_3,0.8998,0.087,10.339,0.000,0.729,1.071
Reason_4,0.4212,0.072,5.846,0.000,0.280,0.563
Month,0.0063,0.005,1.345,0.179,-0.003,0.015
Day_Of_Week,-0.0215,0.011,-1.964,0.050,-0.043,-9.05e-06
Transportation Expense,0.0019,0.000,6.359,0.000,0.001,0.002
Distance to Work,-0.0009,0.001,-0.768,0.443,-0.003,0.001

0,1,2,3
Omnibus:,51.662,Durbin-Watson:,1.727
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21.183
Skew:,0.186,Prob(JB):,2.51e-05
Kurtosis:,2.234,Cond. No.,5450.0


In [204]:
# Remove the features that are excluded from the classifier. In the OLS report
# above, Month and Distance to Work have p-values above 0.05; the rows for the
# other two columns are not visible there, so presumably they were excluded
# for the same reason -- TODO confirm against the full table.
#
# `inputs` is rebuilt from `data` instead of being mutated in place, so this
# cell is idempotent: re-running it no longer raises KeyError for columns that
# a previous run already removed.
excluded_features = ["Daily Work Load Average", "Distance to Work", "Month", "Education"]
inputs = data.drop(columns=["Absenteeism", *excluded_features])

In [205]:
from sklearn.model_selection import train_test_split # Split the test

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    inputs,
    targets,
    test_size=0.2,
    random_state=19,
)

In [206]:
from sklearn.preprocessing import StandardScaler # Standardize the data

# Learn the scaling parameters from the training split only, then apply the
# same fitted transformation to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [207]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression (default hyper-parameters) on the standardized
# training data, predict the held-out rows, and report accuracy on them.
log = LogisticRegression().fit(X_train_scaled, y_train)
log_preds = log.predict(X_test_scaled)
log.score(X_test_scaled, y_test)

0.7714285714285715

In [208]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=log_preds)

array([[66, 20],
       [12, 42]])

In [209]:
# Intercept (bias) term of the fitted logistic regression.
log.intercept_

array([-0.11802076])

In [210]:
# Coefficients of the fitted logistic regression, one per (standardized) input column.
log.coef_

array([[ 1.91147693,  0.3216123 ,  1.51595957,  1.28209503, -0.24439295,
         0.89334207, -0.25080072,  0.27687632,  0.42661414, -0.48711612]])

In [211]:
# Column labels of the feature matrix, as a NumPy array, in the same order as
# the columns the model was trained on.
feature_name = inputs.columns.to_numpy()
feature_name

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Day_Of_Week',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Children',
       'Pets'], dtype=object)

In [212]:
# Start the coefficient summary with one row per feature name.
summary_table = pd.DataFrame({"Feature name": feature_name})
summary_table

Unnamed: 0,Feature name
0,Reason_1
1,Reason_2
2,Reason_3
3,Reason_4
4,Day_Of_Week
5,Transportation Expense
6,Age
7,Body Mass Index
8,Children
9,Pets


In [213]:
# Attach the fitted coefficient of each feature; `coef_` is a single-row 2-D
# array, so it is flattened to line up with the feature rows.
summary_table["Coefficient"] = log.coef_.ravel()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,1.911477
1,Reason_2,0.321612
2,Reason_3,1.51596
3,Reason_4,1.282095
4,Day_Of_Week,-0.244393
5,Transportation Expense,0.893342
6,Age,-0.250801
7,Body Mass Index,0.276876
8,Children,0.426614
9,Pets,-0.487116


In [214]:
# Add the model intercept as an extra row at the end of the summary table.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0,
# so `pd.concat` is used instead; `ignore_index=True` renumbers the rows so
# the intercept takes the next integer label.
# NOTE: this cell extends `summary_table` itself, so running it twice adds the
# intercept row twice -- rebuild the table first if you need to re-run it.
coef_df = pd.DataFrame(
    [["Interception", log.intercept_[0]]],
    columns=["Feature name", "Coefficient"],
)
summary_table = pd.concat([summary_table, coef_df], ignore_index=True)

In [215]:
# Display the summary table, which now includes the intercept row.
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,1.911477
1,Reason_2,0.321612
2,Reason_3,1.51596
3,Reason_4,1.282095
4,Day_Of_Week,-0.244393
5,Transportation Expense,0.893342
6,Age,-0.250801
7,Body Mass Index,0.276876
8,Children,0.426614
9,Pets,-0.487116


In [216]:
# Odds ratio = exp(coefficient). The model was fitted on standardized inputs,
# so each ratio refers to a one-standard-deviation increase in that feature.
summary_table["Odds_ratio"] = np.exp(summary_table["Coefficient"])
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Reason_1,1.911477,6.76307
1,Reason_2,0.321612,1.37935
2,Reason_3,1.51596,4.553789
3,Reason_4,1.282095,3.604183
4,Day_Of_Week,-0.244393,0.78318
5,Transportation Expense,0.893342,2.443282
6,Age,-0.250801,0.778177
7,Body Mass Index,0.276876,1.319003
8,Children,0.426614,1.532061
9,Pets,-0.487116,0.614396


In [217]:
# Rank the rows from the largest odds ratio to the smallest (returns a sorted
# copy; `summary_table` itself is left unchanged).
summary_table.sort_values("Odds_ratio", ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Reason_1,1.911477,6.76307
2,Reason_3,1.51596,4.553789
3,Reason_4,1.282095,3.604183
5,Transportation Expense,0.893342,2.443282
8,Children,0.426614,1.532061
1,Reason_2,0.321612,1.37935
7,Body Mass Index,0.276876,1.319003
10,Interception,-0.118021,0.888678
4,Day_Of_Week,-0.244393,0.78318
6,Age,-0.250801,0.778177
