In [1]:
import pandas as pd # panel data 
import numpy as np

In [2]:
# Keep the parsed CSV untouched in `raw_csv_data`; every later edit is made on
# the independent copy `df`.
raw_csv_data = pd.read_csv('absenteeism_data.csv')
df = raw_csv_data.copy(deep=True)

# Column names, non-null counts and dtypes at a glance.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         700 non-null    int64  
 1   Reason for Absence         700 non-null    int64  
 2   Date                       700 non-null    object 
 3   Transportation Expense     700 non-null    int64  
 4   Distance to Work           700 non-null    int64  
 5   Age                        700 non-null    int64  
 6   Daily Work Load Average    700 non-null    float64
 7   Body Mass Index            700 non-null    int64  
 8   Education                  700 non-null    int64  
 9   Children                   700 non-null    int64  
 10  Pets                       700 non-null    int64  
 11  Absenteeism Time in Hours  700 non-null    int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 65.8+ KB


In [3]:
# Preview the first rows of the raw working copy.
df.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [4]:
# Drop the ID column so it is not carried forward as a model input.
df = df.drop(columns=['ID'])

# One-hot encode the reason codes. drop_first=True discards the column of the
# lowest code present, so rows with that code end up with every flag False.
reason_col = pd.get_dummies(df['Reason for Absence'], drop_first=True)
df = df.drop(columns=['Reason for Absence'])

# Collapse the individual codes into four group flags. `.loc` slices the dummy
# columns by label (the integer reason code) with both ends inclusive, and
# `max(axis=1)` flags a row when any code inside the group is set.
# Each Series is named here, so the labels travel with the data instead of
# being overwritten positionally afterwards (which would silently mislabel
# columns if the CSV column order ever changed).
reason_type_1 = reason_col.loc[:, 1:14].max(axis=1).rename('Reason 1')
reason_type_2 = reason_col.loc[:, 15:17].max(axis=1).rename('Reason 2')
reason_type_3 = reason_col.loc[:, 18:21].max(axis=1).rename('Reason 3')
reason_type_4 = reason_col.loc[:, 22:].max(axis=1).rename('Reason 4')

df = pd.concat([reason_type_1, reason_type_2, reason_type_3, reason_type_4, df], axis=1)

# Explicit column order, reason flags first. Selecting by name also acts as a
# schema check: a missing or renamed CSV column raises KeyError here.
column_names = ['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours']

df = df[column_names]

# Parse the date strings once; the explicit format pins day-first parsing
# (e.g. '14/07/2015').
dates = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# Derive the calendar features with the vectorised .dt accessor. Unlike an
# index-based loop, this does not depend on the frame having a 0..n-1 index,
# and it avoids a Python-level call per row.
df = df.drop(columns=['Date'])
df['Month'] = dates.dt.month
df['Day of Week'] = dates.dt.weekday  # Monday=0 ... Sunday=6

# Final column order: reason flags, calendar features, then remaining inputs,
# with the hours column last.
column_names = ['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Month', 'Day of Week',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education', 'Children',
       'Pets', 'Absenteeism Time in Hours']

df = df[column_names]

# Reduce Education to two levels: code 1 becomes 0, codes 2, 3 and 4 become 1.
# Any code missing from the mapping turns into NaN.
education_levels = {1: 0, 2: 1, 3: 1, 4: 1}
df = df.assign(Education=df['Education'].map(education_levels))

# Snapshot of the cleaned frame for the modelling steps.
df_preprocessed = df.copy()

In [5]:
# Preview the first rows of the preprocessed frame.
df_preprocessed.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Day of Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,False,False,False,True,7,1,289,36,33,239.554,30,0,2,1,4
1,False,False,False,False,7,1,118,13,50,239.554,31,0,1,0,0
2,False,False,False,True,7,2,179,51,38,239.554,31,0,0,0,2
3,True,False,False,False,7,3,279,5,39,239.554,24,0,2,0,4
4,False,False,False,True,7,3,289,36,33,239.554,30,0,2,1,2


In [6]:
# Median of the hours column; it serves as the cut-off for the binary target.
df_preprocessed['Absenteeism Time in Hours'].median() # rows are labelled by whether they exceed this value

3.0

In [7]:
# Binary target: 1 when a row's hours are strictly above the median, else 0
# (rows equal to the median therefore fall in class 0).
hours = df_preprocessed['Absenteeism Time in Hours']
targets = np.where(hours > hours.median(), 1, 0)

# Append the target as the last column and remove the raw hours it was derived
# from. Re-running this cell on its own raises KeyError, because the hours
# column no longer exists afterwards.
df_preprocessed = (
    df_preprocessed
    .assign(**{'Excessive Absenteeism': targets})
    .drop(columns=['Absenteeism Time in Hours'])
)
df_preprocessed.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Day of Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,False,False,False,True,7,1,289,36,33,239.554,30,0,2,1,1
1,False,False,False,False,7,1,118,13,50,239.554,31,0,1,0,0
2,False,False,False,True,7,2,179,51,38,239.554,31,0,0,0,0
3,True,False,False,False,7,3,279,5,39,239.554,24,0,2,0,1
4,False,False,False,True,7,3,289,36,33,239.554,30,0,2,1,0


In [8]:
# Feature matrix: every column except the last one, which holds the target.
target_position = -1
unscaled_inputs = df_preprocessed.iloc[:, :target_position]
unscaled_inputs.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Day of Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,False,False,False,True,7,1,289,36,33,239.554,30,0,2,1
1,False,False,False,False,7,1,118,13,50,239.554,31,0,1,0
2,False,False,False,True,7,2,179,51,38,239.554,31,0,0,0
3,True,False,False,False,7,3,279,5,39,239.554,24,0,2,0
4,False,False,False,True,7,3,289,36,33,239.554,30,0,2,1


In [9]:
from sklearn.preprocessing import StandardScaler

# Standardise every input column. fit() returns the scaler itself, so
# construction and fitting are chained.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed in a later cell, so statistics of the test rows influence the
# scaling. Consider splitting first and fitting on the training rows only.
# NOTE(review): the Reason flags and the binary Education column are
# standardised as well, which makes their coefficients harder to read.
scaler = StandardScaler().fit(unscaled_inputs)
scaled_inputs = scaler.transform(unscaled_inputs)

In [10]:
from sklearn.model_selection import train_test_split

# Shuffled 80/20 split; the fixed seed keeps the partition reproducible.
split_options = {'train_size': 0.8, 'shuffle': True, 'random_state': 42}
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, **split_options)

# Shapes of the four pieces, in the order they were unpacked.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((560, 14), (140, 14), (560,), (140,))

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Train a logistic regression with default settings on the training split;
# fit() returns the estimator, so it can be bound in one step.
reg = LogisticRegression().fit(x_train, y_train)

# Mean accuracy on the training rows (about 0.77 in the recorded run).
reg.score(x_train, y_train)

0.7732142857142857

In [12]:
# Fitted intercept (bias) term of `reg`.
reg.intercept_

array([-0.18068815])

In [13]:
# Fitted weights of `reg`, one per input column of the scaled feature matrix.
reg.coef_

array([[ 2.1382323 ,  0.34676801,  1.53774496,  1.41207715,  0.094655  ,
        -0.15177422,  0.79576676, -0.09763212, -0.28675028, -0.01096321,
         0.28376827, -0.1477405 ,  0.4221685 , -0.36553134]])

In [14]:
# Input column names, in the same order as the fitted weights.
features = unscaled_inputs.columns.values

# Row 0 holds the intercept; rows 1..n hold one weight per input column.
intercept_row = pd.DataFrame({'Features': ['Intercept'], 'Coef': [reg.intercept_[0]]})
weight_rows = pd.DataFrame(
    {'Features': features, 'Coef': reg.coef_.ravel()},
    index=range(1, len(features) + 1),
)
summary_table = pd.concat([intercept_row, weight_rows])

# exp(coef) is the multiplicative change in the odds for a one-unit increase of
# the (standardised) input; rank the rows from largest to smallest.
summary_table['Odds_ratio'] = np.exp(summary_table['Coef'])
summary_table = summary_table.sort_values(by='Odds_ratio', ascending=False)

summary_table

Unnamed: 0,Features,Coef,Odds_ratio
1,Reason 1,2.138232,8.484426
3,Reason 3,1.537745,4.654083
4,Reason 4,1.412077,4.104472
7,Transportation Expense,0.795767,2.21614
13,Children,0.422168,1.525266
2,Reason 2,0.346768,1.414489
11,Body Mass Index,0.283768,1.328125
5,Month,0.094655,1.09928
10,Daily Work Load Average,-0.010963,0.989097
8,Distance to Work,-0.097632,0.906983


In [15]:
# Mean accuracy on the held-out test split.
reg.score(x_test, y_test)

0.7571428571428571