In [196]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [197]:
# Load the preprocessed dataset from the CSV file in the working directory.
df = pd.read_csv("df_preprocessed.csv")

In [198]:
# Preview the first rows to confirm the file loaded with the expected columns.
df.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


In [199]:
# Check column dtypes and non-null counts before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Reason_1                   700 non-null    int64  
 1   Reason_2                   700 non-null    int64  
 2   Reason_3                   700 non-null    int64  
 3   Reason_4                   700 non-null    int64  
 4   Month Value                700 non-null    int64  
 5   Day of the Week            700 non-null    int64  
 6   Transportation Expense     700 non-null    int64  
 7   Distance to Work           700 non-null    int64  
 8   Age                        700 non-null    int64  
 9   Daily Work Load Average    700 non-null    float64
 10  Body Mass Index            700 non-null    int64  
 11  Education                  700 non-null    int64  
 12  Children                   700 non-null    int64  
 13  Pets                       700 non-null    int64  

## Application of model
#### Create logistic regression for analysis

In [200]:
# Select the feature columns: every column except the last one, which at this
# point is the raw target 'Absenteeism Time in Hours'.
feat_X = df.iloc[:,:-1]

In [201]:
# List the feature column names.
feat_X.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [202]:
# The target column still has to be preprocessed: inspect its distinct values first.
df['Absenteeism Time in Hours'].unique()

array([  4,   0,   2,   8,  40,   1,   7,   3,  32,   5,  16,  24,  64,
        56,  80, 120, 112, 104,  48], dtype=int64)

In [203]:
# Median absence duration; used below as the cut-off for the binary target.
df['Absenteeism Time in Hours'].median()

3.0

##### The median of the target column is 3 hours. Absences of 3 hours or less are treated as normal (class 0); absences of more than 3 hours are treated as excessive (class 1), because they badly affect the company's project completion hours.

In [204]:
# Binary target: 1 when the absence is strictly longer than the median, else 0.
absence_hours = df['Absenteeism Time in Hours']
target = np.where(absence_hours > absence_hours.median(), 1, 0)

In [205]:
# Spot-check the first ten labels.
target[:10]

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1])

In [206]:
# Attach the binary target to the frame as a new column (mutates df in place).
df['Excessive_absenteeism']=target

In [207]:
# The raw hours column is replaced by the binary target, so remove it.
# NOTE: re-running this cell without re-loading df raises KeyError.
df = df.drop(columns=['Absenteeism Time in Hours'])

In [208]:
# Confirm the raw hours column is gone and 'Excessive_absenteeism' is present.
df.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive_absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


In [209]:
# Select the target column as a Series.
target_y = df['Excessive_absenteeism']

In [210]:
# Share of rows labelled 1 (class balance check).
target_y.sum() / len(target)

0.45571428571428574

#### About 46% of the rows are class 1 and the remaining 54% are class 0, so the data is quite balanced and we can proceed with it as is.

### Data standardization

In [211]:
# Rows and columns of the feature frame.
feat_X.shape

(700, 14)

In [212]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
# code to drop dummies from standardization process
class CustomScaler(BaseEstimator,TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the rest through.

    Wraps a ``StandardScaler`` so that dummy / categorical columns can be kept
    out of the standardization while still being returned alongside the scaled
    columns, in the original column order.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize. Every other column of the input
        frame is returned unchanged.
    """

    def __init__(self,columns):
        # The wrapped StandardScaler performs the actual standardization.
        self.scaler = StandardScaler()
        # Only these columns are fitted and scaled; all others pass through.
        self.columns = columns

    def fit(self, X, y=None):
        """Fit the wrapped scaler on the selected columns of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every name in ``self.columns``.
        y : ignored
            Forwarded to ``StandardScaler.fit``.

        Returns
        -------
        self
        """
        selected = X[self.columns]
        self.scaler.fit(selected, y)

        # Per-column statistics kept for inspection; transform() does not use them.
        # axis=0 is explicit because np.mean/np.var on a DataFrame is
        # pandas-version dependent and may collapse the frame to one scalar.
        self.mean_ = selected.mean(axis=0)
        # ddof=0 matches np.var's population variance.
        self.var_ = selected.var(axis=0, ddof=0)

        return self

    def transform(self, X, y=None):
        """Return ``X`` with the selected columns standardized.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every name in ``self.columns``.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Same columns, column order and index as ``X``.
        """
        # Remember the incoming column order so it can be restored at the end.
        init_col_order = X.columns

        # Reuse X's index: concat(axis=1) aligns on index labels, so a fresh
        # RangeIndex here would misalign rows (and introduce NaNs) whenever X
        # is a shuffled or filtered subset.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )

        # Everything that was not selected for scaling passes through untouched.
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]

        # Recombine and restore the original column order.
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [213]:

# Model inputs: drop the three excluded features and the target column.
unscaled_X = df.drop(columns=['Month Value', 'Day of the Week', 'Distance to Work', 'Excessive_absenteeism'])

# Dummy / categorical columns that must not be standardized.
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education']
columns_to_scale = [col for col in unscaled_X.columns if col not in columns_to_omit]

columns_to_scale

['Transportation Expense',
 'Age',
 'Daily Work Load Average',
 'Body Mass Index',
 'Children',
 'Pets']

In [214]:
# Scaler that standardizes only the columns listed in columns_to_scale.
absenteeism_scaler=CustomScaler(columns_to_scale)

In [215]:
# Fit on the full input frame, then standardize it (fit returns the scaler itself).
scaled_inputs = absenteeism_scaler.fit(unscaled_X).transform(unscaled_X)

In [216]:
# Inspect the result: dummy columns unchanged, numeric columns standardized.
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,1.005844,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
1,0,0,0,0,-1.574681,2.130803,-0.806331,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,-0.654143,0.248310,-0.806331,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.854936,0.405184,-0.806331,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,1.005844,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.654143,0.562059,-0.853789,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,0.040034,-1.320435,-0.853789,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,1.624567,-1.320435,-0.853789,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,0.190942,-0.692937,-0.853789,-0.408580,1,-0.919030,-0.589690


In [217]:
## Split the data into train & test and shuffle

In [218]:
from sklearn.model_selection import train_test_split
# 80/20 train/test split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    target,
    train_size=0.8,
    random_state=42,
)
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [219]:
# Fit a logistic regression with default settings, then report training accuracy.
reg = LogisticRegression().fit(x_train, y_train)
reg.score(x_train, y_train)

0.7678571428571429

In [220]:
# Predicted class labels for the training rows.
model_outputs = reg.predict(x_train)
model_outputs

array([1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,

In [221]:
# Pair each input column with its fitted coefficient.
# unscaled_X has the same column order as the scaled inputs the model was
# trained on, because the scaler restores the original order.
feature_names = unscaled_X.columns.values
summary = pd.DataFrame({'Feature_names': feature_names})
summary['coefficients'] = reg.coef_[0]
summary

Unnamed: 0,Feature_names,coefficients
0,Reason_1,2.903424
1,Reason_2,0.747626
2,Reason_3,3.086181
3,Reason_4,0.927601
4,Transportation Expense,0.65693
5,Age,-0.255136
6,Daily Work Load Average,-0.039143
7,Body Mass Index,0.252322
8,Education,-0.260745
9,Children,0.403163


In [222]:
# Put the intercept in the first row of the summary table.
# Any intercept row from an earlier run of this cell is removed first, so
# re-running the cell does not stack duplicate intercept rows.
coefficient_rows = summary[summary['Feature_names'] != 'intercept']
intercept_row = pd.DataFrame({
    'Feature_names': ['intercept'],
    'coefficients': [reg.intercept_[0]],
})
summary = pd.concat([intercept_row, coefficient_rows], ignore_index=True)
summary

Unnamed: 0,Feature_names,coefficients
0,intercept,-1.673128
1,Reason_1,2.903424
2,Reason_2,0.747626
3,Reason_3,3.086181
4,Reason_4,0.927601
5,Transportation Expense,0.65693
6,Age,-0.255136
7,Daily Work Load Average,-0.039143
8,Body Mass Index,0.252322
9,Education,-0.260745


In [223]:
# Odds ratio = exp(coefficient); show the largest first.
summary = summary.assign(odds_ratio=np.exp(summary['coefficients']))
summary.sort_values(by='odds_ratio', ascending=False)

Unnamed: 0,Feature_names,coefficients,odds_ratio
3,Reason_3,3.086181,21.893313
1,Reason_1,2.903424,18.236479
4,Reason_4,0.927601,2.528437
2,Reason_2,0.747626,2.111981
5,Transportation Expense,0.65693,1.928862
10,Children,0.403163,1.496551
8,Body Mass Index,0.252322,1.287011
7,Daily Work Load Average,-0.039143,0.961613
6,Age,-0.255136,0.774811
9,Education,-0.260745,0.770477


# reading of coeff and odds ratio
1. If a coefficient is at or near 0 (equivalently, its odds ratio is at or near 1), the corresponding feature is not important.
Example: Daily Work Load Average has coeff = -0.039143 and odds ratio = 0.961613 in the table above,
so this feature is not important.

Similarly, Distance to Work and Day of the Week are not important features (they are excluded from the model inputs above).
    
    
### very important check point

If we scale every column of the data, we also scale the dummy variables, which is not correct.

While scaling the data, we should skip the dummies, but we still need to keep them as features when splitting into train and test sets and fitting the algorithm.

## Testing the model

In [224]:
# Accuracy on the held-out test split.
reg.score(x_test,y_test)


0.7642857142857142

In [225]:
# Class-membership probabilities for the test rows, one column per class.
predicted_proba = reg.predict_proba(x_test)
# Second column only; presumably the probability of class 1 (excessive
# absenteeism), given the 0/1 target. Confirm against reg.classes_.
predicted_proba[:,1]

array([0.17062412, 0.13886366, 0.21341362, 0.40706332, 0.39879058,
       0.91623831, 0.30035403, 0.62899915, 0.27495477, 0.24095425,
       0.14068394, 0.29547213, 0.73731031, 0.5378163 , 0.29357257,
       0.52507556, 0.10987523, 0.77222839, 0.1435588 , 0.40402902,
       0.27942424, 0.23257114, 0.29027491, 0.29936081, 0.12718359,
       0.82711477, 0.41146609, 0.4069387 , 0.23409773, 0.37706173,
       0.12339832, 0.1277041 , 0.61899285, 0.53590814, 0.27779598,
       0.66143618, 0.29027491, 0.12975929, 0.84569879, 0.21056506,
       0.51781672, 0.23795334, 0.62886256, 0.13291399, 0.24354733,
       0.74983393, 0.78550888, 0.87670397, 0.29576627, 0.13415366,
       0.24095425, 0.28510115, 0.40243619, 0.93612163, 0.12935444,
       0.24354733, 0.97532738, 0.27779598, 0.87160623, 0.2389577 ,
       0.60098169, 0.12953609, 0.49321807, 0.63163376, 0.14068394,
       0.40056691, 0.66577459, 0.05721216, 0.28465291, 0.52414104,
       0.24095425, 0.24043104, 0.68131145, 0.28510115, 0.13429

In [None]:
# Save the model

In [226]:
import pickle
# Persist the fitted model and the fitted scaler as pickle files in the working directory.
# NOTE: unpickling 'scaler' requires the CustomScaler class to be importable/defined.
for filename, artifact in (('model', reg), ('scaler', absenteeism_scaler)):
    with open(filename, 'wb') as handle:
        pickle.dump(artifact, handle)