# Predict the absenteeism hours (below or about average) in the future

# Data precprocessing and save

In [37]:
# import libraries 
import numpy as np
import pandas as pd

In [23]:
# load the preprocessed CSV data
raw_data_preprocessed = pd.read_csv('original.csv')
raw_data_preprocessed;

In [9]:
# check point copy the raw data 
data_preprocessed = raw_data_preprocessed.copy()
data_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
ID                           700 non-null int64
Reason for Absence           700 non-null int64
Date                         700 non-null object
Transportation Expense       700 non-null int64
Distance to Work             700 non-null int64
Age                          700 non-null int64
Daily Work Load Average      700 non-null float64
Body Mass Index              700 non-null int64
Education                    700 non-null int64
Children                     700 non-null int64
Pets                         700 non-null int64
Absenteeism Time in Hours    700 non-null int64
dtypes: float64(1), int64(10), object(1)
memory usage: 65.7+ KB


In [10]:
data_preprocessed = data_preprocessed.drop(['ID'],axis = 1);

In [12]:
# encode the categorical data # group similar absence reasons together
reason_columns = pd.get_dummies(data_preprocessed['Reason for Absence'], drop_first = True)
reason_type_1 = reason_columns.loc[:,1:14].max(axis=1)
reason_type_2 = reason_columns.loc[:,15:17].max(axis=1)
reason_type_3 = reason_columns.loc[:,18:21].max(axis=1)
reason_type_4 = reason_columns.loc[:,22:].max(axis=1)

In [14]:
data_preprocessed = pd.concat([reason_type_1,reason_type_2,reason_type_3,reason_type_4,data_preprocessed], axis = 1)

In [18]:
# reason1: various diseases, reason2: pregnancy reason3: poisoning reason4: light diseases
column_names = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Reason for Absence', 'Date', 'Transportation Expense',
       'Distance to Work', 'Age', 'Daily Work Load Average','Body Mass Index', 'Education', 'Children', 'Pets','Absenteeism Time in Hours']
data_preprocessed.columns = column_names

In [19]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [30]:
data_preprocessed_drop_data_reason = data_preprocessed.drop(['Reason for Absence','Date'], axis =1);

In [31]:
data_preprocessed_drop_data_reason.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,289,36,33,239.554,30,1,2,1,2


In [151]:
data_preprocessed_drop = data_preprocessed_drop_data_reason.copy()
data_preprocessed_drop.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,289,36,33,239.554,30,1,2,1,2


In [150]:
# target value: 0 the number of absenteeism hours is less then the average value 
# target value: 1 the number of absenteesism hours is more thatn the average value 
targets = np.where(data_preprocessed_drop['Absenteeism Time in Hours'] >data_preprocessed_drop['Absenteeism Time in Hours'].median(), 1, 0)

In [163]:
data_preprocessed=data_preprocessed_drop.drop(['Absenteeism Time in Hours'],axis =1)
data_preprocessed['targets'] = targets

In [165]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,targets
0,0,0,0,1,289,36,33,239.554,30,1,2,1,1
1,0,0,0,0,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,179,51,38,239.554,31,1,0,0,0
3,1,0,0,0,279,5,39,239.554,24,1,2,0,1
4,0,0,0,1,289,36,33,239.554,30,1,2,1,0


In [166]:
data_preprocessed.to_csv('data_preprocessed', index = False)

# Load data and modeling 

LinearRegression

In [167]:
# import data set
dataset = pd.read_csv('data_preprocessed')

In [168]:
# creat check point 
dataset_original = dataset.copy()

In [180]:
x= dataset_original.iloc[:,:-1].values
y= dataset_original.iloc[:,-1].values
x.shape

(700, 12)

In [199]:
# data scaling with MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
x = mms.fit_transform(x)

In [200]:
# split training and testing data
from sklearn.model_selection import train_test_split 
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, shuffle=True, random_state = 42)

In [208]:
from sklearn.linear_model import LogisticRegression
classifier= LogisticRegression()
classifier.fit(x_train, y_train)
classifer.score(x_train, y_train)



0.75

In [230]:
# make predictions 
y_test = classifier.predict(x_test)

In [231]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[75,  0],
       [ 0, 65]])

In [232]:
from sklearn.metrics import f1_score
f1= f1_score(y_test, y_pred)
f1

1.0

In [None]:
# fine tune the model 

In [202]:
feature_name = dataset_original.columns[:-1].values
feature_name

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [203]:
coeff_table= pd.DataFrame(columns =['feature_name'], data = feature_name)
coeff_table['coefficient'] = classifer.coef_.T
coeff_table

Unnamed: 0,feature_name,coefficient
0,Reason_1,2.273317
1,Reason_2,0.325224
2,Reason_3,2.705091
3,Reason_4,0.502401
4,Transportation Expense,1.834768
5,Distance to Work,0.030258
6,Age,-0.598151
7,Daily Work Load Average,-0.09637
8,Body Mass Index,0.495346
9,Education,0.067064


In [204]:
coeff_table['ratio'] = np.exp(coeff_table['coefficient'] )
coeff_table

Unnamed: 0,feature_name,coefficient,ratio
0,Reason_1,2.273317,9.711558
1,Reason_2,0.325224,1.384341
2,Reason_3,2.705091,14.955681
3,Reason_4,0.502401,1.652685
4,Transportation Expense,1.834768,6.263678
5,Distance to Work,0.030258,1.030721
6,Age,-0.598151,0.549827
7,Daily Work Load Average,-0.09637,0.908128
8,Body Mass Index,0.495346,1.641065
9,Education,0.067064,1.069364


In [205]:
coeff_table.sort_values('ratio', ascending = False)

Unnamed: 0,feature_name,coefficient,ratio
2,Reason_3,2.705091,14.955681
0,Reason_1,2.273317,9.711558
4,Transportation Expense,1.834768,6.263678
10,Children,1.480736,4.396181
3,Reason_4,0.502401,1.652685
8,Body Mass Index,0.495346,1.641065
1,Reason_2,0.325224,1.384341
9,Education,0.067064,1.069364
5,Distance to Work,0.030258,1.030721
7,Daily Work Load Average,-0.09637,0.908128


the feature with coefficient close to zero are not important
'Reason_2','Education','Distance to Work','Daily Work Load Average' are not important features

# Backward Elimination 

In [220]:
dataset_finetune = dataset_original.drop(['Reason_2','Education','Distance to Work','Daily Work Load Average'], axis =1)

In [221]:
x= dataset_finetune.iloc[:,:-1].values
y= dataset_finetune.iloc[:,-1].values

In [223]:
# data scaling with MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
x = mms.fit_transform(x)

In [226]:
# split training and testing data
from sklearn.model_selection import train_test_split 
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, shuffle=True, random_state = 0)

In [229]:
from sklearn.linear_model import LogisticRegression
classifier_finetune= LogisticRegression()
classifier_finetune.fit(x_train, y_train)
classifier_finetune.score(x_train, y_train)



0.7535714285714286

In [233]:
# make predictions 

In [234]:
y_pred = classifier_finetune.predict(x_test)

In [239]:
classifier_finetune.score(x_test, y_test)

1.0

In [235]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[75,  0],
       [ 0, 65]])

In [236]:
from sklearn.metrics import f1_score
f1= f1_score(y_test, y_pred)
f1

1.0

In [241]:
feature_name = dataset_finetune.columns[:-1].values
feature_name

array(['Reason_1', 'Reason_3', 'Reason_4', 'Transportation Expense',
       'Age', 'Body Mass Index', 'Children', 'Pets'], dtype=object)

In [246]:
coeff_table= pd.DataFrame(columns =['feature_name'], data = feature_name)
coeff_table['coefficient'] = classifier_finetune.coef_.T
coeff_table['ratio'] = np.exp(coeff_table['coefficient'])
coeff_table

Unnamed: 0,feature_name,coefficient,ratio
0,Reason_1,2.241061,9.403303
1,Reason_3,2.675908,14.52553
2,Reason_4,0.478987,1.614438
3,Transportation Expense,1.832064,6.246766
4,Age,-0.602398,0.547497
5,Body Mass Index,0.480151,1.616318
6,Children,1.477654,4.382652
7,Pets,-1.152578,0.315822


# save model 

In [249]:
import pickle

In [250]:
with open('absenteeism_model','wb') as file:
    pickle.dump(classifier_finetune, file)