In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the absenteeism dataset (feature columns plus the Absenteeism target) from disk.
data = pd.read_csv(filepath_or_buffer="data/Absenteeism_with_targets.csv")
# A bare last expression renders the DataFrame preview below the cell.
data

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day_Of_Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,1
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,0
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,1
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0,0


In [3]:
# Set inputs: every column except the target and three features left out of the model.
inputs = data.drop(
    columns=[
        "Absenteeism",
        "Day_Of_Week",
        "Daily Work Load Average",
        "Distance to Work",
    ]
)
# Set outputs: the Absenteeism column is the prediction target.
targets = data["Absenteeism"]

In [4]:
# List the column names that remain in `inputs` (the model's feature set).
inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [5]:
# Columns that keep their raw values and are NOT standardized:
# the four Reason_* flags and Education.
columns_to_omit = [
    "Reason_1",
    "Reason_2",
    "Reason_3",
    "Reason_4",
    "Education",
]

In [6]:
# Every input column that is not explicitly excluded gets standardized.
# Column order follows the order of `inputs.columns`.
columns_to_scale = list(
    filter(lambda name: name not in columns_to_omit, inputs.columns.values)
)
columns_to_scale

['Month',
 'Transportation Expense',
 'Age',
 'Body Mass Index',
 'Children',
 'Pets']

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of a DataFrame's columns.

    A ``StandardScaler`` is fitted on ``columns``; every other column is
    passed through unchanged. ``transform`` returns a DataFrame with the
    same column order and the same row index as its input.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded to the wrapped ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler, fitted on ``columns`` only.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance (ddof=0) of the fitted
        columns; ``None`` until ``fit`` has been called.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Store every constructor argument under its own name so that
        # BaseEstimator.get_params() / clone() / repr() can read them back.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # StandardScaler's parameters are keyword-only in current
        # scikit-learn; passing them positionally raises TypeError.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit column-wise reductions: np.mean / np.var applied to a whole
        # DataFrame pass axis=None, which newer pandas versions can interpret
        # as "reduce over both axes" instead of per column.
        self.mean_ = selected.mean()
        self.var_ = selected.var(ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with ``self.columns`` standardized and the rest untouched.

        ``y`` and ``copy`` are accepted for signature compatibility and are
        not used.
        """
        init_col_order = X.columns
        # Reuse X's index: without it the scaled frame gets a fresh 0..n-1
        # index and pd.concat(axis=1) would misalign rows (NaNs / extra rows)
        # whenever X does not carry a default RangeIndex, e.g. after a split.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        # Restore the caller's original column order.
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [8]:
# Fit the column-selective scaler on the full input table (fit returns the
# scaler itself, so construction and fitting can be chained) ...
scaler = CustomScaler(columns_to_scale).fit(inputs)
# ... then standardize the selected columns of that same table.
input_scaled = scaler.transform(inputs)



In [9]:
# Preview the transformed features; only the columns listed in
# columns_to_scale were standardized, the others keep their raw values.
input_scaled

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


In [10]:
from sklearn.model_selection import train_test_split # Split the test

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    input_scaled,
    targets,
    test_size=0.2,
    random_state=19,
)

In [11]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier with default settings on the
# training split (fit returns the estimator, so it can be chained).
log = LogisticRegression().fit(X_train, y_train)
# Class predictions for the held-out rows, reused by the confusion matrix.
log_preds = log.predict(X_test)
# Mean accuracy on the held-out split (displayed as the cell output).
log.score(X_test, y_test)

0.7928571428571428

In [12]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=log_preds)

array([[69, 17],
       [12, 42]])

In [13]:
# Fitted intercept (bias term) of the logistic regression, as a one-element array.
log.intercept_

array([-1.52306265])

In [14]:
# Fitted coefficients: one row with one value per input column,
# in the column order of the training data.
log.coef_

array([[ 2.56421769,  0.93578633,  3.03524558,  0.85894991,  0.0333928 ,
         0.75738562, -0.24754871,  0.23792485, -0.08521376,  0.38016023,
        -0.41412691]])

In [15]:
# Feature names in the column order of `inputs`, used to label the coefficients.
feature_name = inputs.columns.values
feature_name

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [16]:
# Start the summary table with a single column holding the feature names.
summary_table = pd.DataFrame({"Feature name": feature_name})
summary_table

Unnamed: 0,Feature name
0,Reason_1
1,Reason_2
2,Reason_3
3,Reason_4
4,Month
5,Transportation Expense
6,Age
7,Body Mass Index
8,Education
9,Children


In [17]:
# log.coef_ has shape (1, n_features); transpose it into one value per
# feature row so it lines up with the "Feature name" column.
summary_table["Coefficient"] = log.coef_.T
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.564218
1,Reason_2,0.935786
2,Reason_3,3.035246
3,Reason_4,0.85895
4,Month,0.033393
5,Transportation Expense,0.757386
6,Age,-0.247549
7,Body Mass Index,0.237925
8,Education,-0.085214
9,Children,0.38016


In [18]:
# Add the model intercept as one more row at the bottom of the summary table.
coef_df = pd.DataFrame(
    [["Interception", log.intercept_[0]]],
    columns=["Feature name", "Coefficient"],
)
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# gives the same result (rows stacked, index renumbered 0..n-1).
# NOTE: re-running this cell adds another intercept row, so run it once per
# freshly built summary_table.
summary_table = pd.concat([summary_table, coef_df], ignore_index=True)

In [19]:
# Display the summary table after the intercept row has been appended.
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.564218
1,Reason_2,0.935786
2,Reason_3,3.035246
3,Reason_4,0.85895
4,Month,0.033393
5,Transportation Expense,0.757386
6,Age,-0.247549
7,Body Mass Index,0.237925
8,Education,-0.085214
9,Children,0.38016


In [20]:
# Exponentiate each coefficient to express it as an odds ratio.
# (For the intercept row the exponentiated value is the baseline odds,
# not a ratio between two feature levels.)
summary_table["Odds_ratio"] = np.exp(summary_table["Coefficient"])
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Reason_1,2.564218,12.990492
1,Reason_2,0.935786,2.549217
2,Reason_3,3.035246,20.806087
3,Reason_4,0.85895,2.36068
4,Month,0.033393,1.033957
5,Transportation Expense,0.757386,2.132693
6,Age,-0.247549,0.780712
7,Body Mass Index,0.237925,1.268614
8,Education,-0.085214,0.918316
9,Children,0.38016,1.462519


In [21]:
# Rank the rows from the largest odds ratio to the smallest.
# sort_values returns a new frame; summary_table itself is left unsorted.
summary_table.sort_values("Odds_ratio", ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
2,Reason_3,3.035246,20.806087
0,Reason_1,2.564218,12.990492
1,Reason_2,0.935786,2.549217
3,Reason_4,0.85895,2.36068
5,Transportation Expense,0.757386,2.132693
9,Children,0.38016,1.462519
7,Body Mass Index,0.237925,1.268614
4,Month,0.033393,1.033957
8,Education,-0.085214,0.918316
6,Age,-0.247549,0.780712


**Note**

The `Reason_*` columns are 0/1 flags for the grouped reason of absence:

Reason_1: Various diseases

Reason_2: Poisoning

Reason_3: Pregnancy or giving birth

Reason_4: Light diseases

In [24]:
import pickle

# Persist the fitted classifier and the fitted scaler for later reuse.
# NOTE: pickle stores instances by class reference, so whatever loads
# 'data/scaler' must be able to resolve CustomScaler under the same module path.
for target_path, fitted_object in (('data/model', log), ('data/scaler', scaler)):
    with open(target_path, 'wb') as file:
        pickle.dump(fitted_object, file)