In [38]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import pickle

raw_data = pd.read_csv('Absenteeism_data.csv')
df_hr = raw_data.copy()

# pd.options.display.max_rows = None
# pd.options.display.max_columns = None

# A couple of takeaways here, the date format is not in the typical mm/dd/yyyy format. it's dd/mm/yyyy
# The 'Absenteeism Time in Hours' column will make a good 'target' feature for machine learning
# Will probably use a regression model after pre-processing


df_hr.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [2]:
def null_columns(df):
    return(pd.isnull(df).sum())

# This dataset has no missing values, a rarity.
print("The data set has", df_hr.shape[0], "rows and", df_hr.shape[1], "columns")
print(null_columns(df_hr))

The data set has 700 rows and 12 columns
ID                           0
Reason for Absence           0
Date                         0
Transportation Expense       0
Distance to Work             0
Age                          0
Daily Work Load Average      0
Body Mass Index              0
Education                    0
Children                     0
Pets                         0
Absenteeism Time in Hours    0
dtype: int64


In [3]:
# looking for anomalies:
# Will explore 'Reason for Absence' oclumn. Looks to be the primary independent variable
df_hr.describe()

Unnamed: 0,ID,Reason for Absence,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,17.951429,19.411429,222.347143,29.892857,36.417143,271.801774,26.737143,1.282857,1.021429,0.687143,6.761429
std,11.028144,8.356292,66.31296,14.804446,6.379083,40.021804,4.254701,0.66809,1.112215,1.166095,12.670082
min,1.0,0.0,118.0,5.0,27.0,205.917,19.0,1.0,0.0,0.0,0.0
25%,9.0,13.0,179.0,16.0,31.0,241.476,24.0,1.0,0.0,0.0,2.0
50%,18.0,23.0,225.0,26.0,37.0,264.249,25.0,1.0,1.0,0.0,3.0
75%,28.0,27.0,260.0,50.0,40.0,294.217,31.0,1.0,2.0,1.0,8.0
max,36.0,28.0,388.0,52.0,58.0,378.884,38.0,4.0,4.0,8.0,120.0


# Pre-processing / Data Cleaning

In [4]:
# Dropping ID column (unnecessary)
df_hr = df_hr.drop(['ID'], axis = 1)

In [5]:
# Exploring 'Reason for abscence' column:
# This feature appears to be categorical nominal ; that is, it's used to categorize using numbers instead of names.

print('Min value: ', df_hr['Reason for Absence'].min())
print('Max value: ', df_hr['Reason for Absence'].max())
print('Length: ', len(df_hr['Reason for Absence'].unique()))

# There appears to be a missing value (or, a particular coded absence reason hasn't been used)
# Will need to locate it
# The missing reason number is 20
print('Missing number: ', [i for i in range(0,29) if i not in sorted(df_hr['Reason for Absence'].unique())])

Min value:  0
Max value:  28
Length:  28
Missing number:  [20]


In [6]:
# I'm going to one-hot encode this column into dummy variables.
# This will allow deeper analysis for each individual reason.
# This also assumes that an individual can only be absent for one reason, and not multiple


# Dropping reason 0 to avoid multicolinearity. 
# This is important because two or more variables that are dependant on other variables
# can be predicted with substantial accuracy. 
absent_reasons = pd.get_dummies(df_hr['Reason for Absence'], drop_first=True)
absent_reasons

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,18,19,21,22,23,24,25,26,27,28
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
696,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
697,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
698,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


## Bucketing/Concatenating

In [7]:
# Separating individual reasons into larger 'buckets' to reduce cardinality (see image in repo)

# The first 14 reasons are all related to various diseases
# Reasons 15-17 are related to pregnancy
# Reasons 18-21 are related to poisoning or otherwise unique circumstances
# The rest are more 'lightweight' (typically non-life threatening) reasons for absence
# such as a dental appointments, therapy, or medical consultations
reason_type_1 = absent_reasons.iloc[:,:14].max(axis=1)
reason_type_2 = absent_reasons.iloc[:,14:17].max(axis=1)
reason_type_3 = absent_reasons.iloc[:, 17:21].max(axis=1)
reason_type_4 = absent_reasons.iloc[:, 21:].max(axis=1)

In [8]:
# dropping reason for absence column because the four absence reasons replace the need for it
df_hr = df_hr.drop(['Reason for Absence'], axis=1)

# Concatenating reason buckets back into main dataframe
df_hr = pd.concat([df_hr, reason_type_1, reason_type_2, reason_type_3, reason_type_4], axis=1)

df_hr.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,0,1,2,3
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1


In [9]:
# Renaming reason column names for readability
df_hr.columns.values

array(['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 0, 1, 2, 3],
      dtype=object)

In [10]:
# Renaming reason column names for readability

column_names = ['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours','Reason_1','Reason_2','Reason_3','Reason_4']

df_hr.columns = column_names
df_hr.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason_1,Reason_2,Reason_3,Reason_4
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1


## Re-order Columns

In [11]:
# Moving reasons to the front of the table, just like the original column was
column_names_reordered = ['Reason_1','Reason_2','Reason_3','Reason_4','Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours']
df_hr = df_hr[column_names_reordered]

df_hr.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [12]:
df_reason_mod = df_hr.copy()

df_reason_mod.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,23/07/2015,289,36,33,239.554,30,1,2,1,2


## Date

In [13]:
df_reason_mod['Date']

0      07/07/2015
1      14/07/2015
2      15/07/2015
3      16/07/2015
4      23/07/2015
          ...    
695    23/05/2018
696    23/05/2018
697    24/05/2018
698    24/05/2018
699    31/05/2018
Name: Date, Length: 700, dtype: object

In [14]:
# The date column is isn't in datetime format. It's also not mm/dd/yyyy syntax. It needs to be cleaned
df_reason_mod.dtypes

Reason_1                       uint8
Reason_2                       uint8
Reason_3                       uint8
Reason_4                       uint8
Date                          object
Transportation Expense         int64
Distance to Work               int64
Age                            int64
Daily Work Load Average      float64
Body Mass Index                int64
Education                      int64
Children                       int64
Pets                           int64
Absenteeism Time in Hours      int64
dtype: object

In [15]:
# Changing date column
df_reason_mod['Date'] = pd.to_datetime(df_reason_mod['Date'], format='%d/%m/%Y')
df_reason_mod['Date']

0     2015-07-07
1     2015-07-14
2     2015-07-15
3     2015-07-16
4     2015-07-23
         ...    
695   2018-05-23
696   2018-05-23
697   2018-05-24
698   2018-05-24
699   2018-05-31
Name: Date, Length: 700, dtype: datetime64[ns]

In [16]:
# Encoding values from date column. Separating out month and dayto a separate column to improve analysis
df_reason_mod['Month'] = df_reason_mod['Date'].dt.month

# Now for weekdays
df_reason_mod['Weekday'] = df_reason_mod['Date'].dt.weekday

# Dropping date column. It's no longer needed
df_reason_mod = df_reason_mod.drop(['Date'], axis=1)

df_reason_mod.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month,Weekday
0,0,0,0,1,289,36,33,239.554,30,1,2,1,4,7,1
1,0,0,0,0,118,13,50,239.554,31,1,1,0,0,7,1
2,0,0,0,1,179,51,38,239.554,31,1,0,0,2,7,2
3,1,0,0,0,279,5,39,239.554,24,1,2,0,4,7,3
4,0,0,0,1,289,36,33,239.554,30,1,2,1,2,7,3


## The other features

The rest of the features are either integers or floats. They are also pretty intuitive:

- Transportation expense (in dollars)
- Age
- Daily work load average (in minutes)
- Body Mass Index - a reasonable indicator of health with regards to weight
- Pets - Categorical numeric
- Children - Categorical numeric

The above features will not be cleaned, but the next features will be examined more closely:

- Education

This is categorical nominal, and deserves closer examination

The education values correspond to education levels:

- 1 = High school
- 2 = Graduate
- 3 = Postgraduate
- 4 = Masters or Doctorate


In [17]:
# 83% of all people in the this data set have a high school education only
df_reason_date_mod = df_reason_mod.copy()

df_reason_date_mod['Education'].value_counts() / df_reason_date_mod.shape[0]

1    0.832857
3    0.104286
2    0.057143
4    0.005714
Name: Education, dtype: float64

In [18]:
# It'd make sense to bin the education levels by high school and beyond.
# Separating by level of graduate degree is now less important
# Mapping high school to 0 and beyond to 1, making this column binary

df_reason_date_mod['Education'] = df_reason_date_mod['Education'].map({1:0, 2:1, 3:1, 4:1})

# Verifying binning process
df_reason_date_mod['Education'].value_counts()

0    583
1    117
Name: Education, dtype: int64

# Machine Learning

In [19]:
# Renaming processed data for machine learning
df_preprocessed = df_reason_date_mod.copy()

df_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month,Weekday
0,0,0,0,1,289,36,33,239.554,30,0,2,1,4,7,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,0,7,1
2,0,0,0,1,179,51,38,239.554,31,0,0,0,2,7,2
3,1,0,0,0,279,5,39,239.554,24,0,2,0,4,7,3
4,0,0,0,1,289,36,33,239.554,30,0,2,1,2,7,3


In [20]:
# the median absenteeism in hours is 3 hours, so we'll make everything below 3 hours acceptable
# Everything above will be considered excessive. We will bin this feature into those two categories

print('Median time absent: ', df_preprocessed['Absenteeism Time in Hours'].median())

Median time absent:  3.0


In [21]:
# Using the median as the cutoff keeps the dataset balanced
targets = np.where(df_preprocessed['Absenteeism Time in Hours'] > 
                   df_preprocessed['Absenteeism Time in Hours'].median(), 1, 0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [22]:
# Adding a new feature based off of excessive absence
df_preprocessed['Excessive Absenteeism'] = targets

df_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month,Weekday,Excessive Absenteeism
0,0,0,0,1,289,36,33,239.554,30,0,2,1,4,7,1,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,0,7,1,0
2,0,0,0,1,179,51,38,239.554,31,0,0,0,2,7,2,0
3,1,0,0,0,279,5,39,239.554,24,0,2,0,4,7,3,1
4,0,0,0,1,289,36,33,239.554,30,0,2,1,2,7,3,0


In [23]:
# Dropping the column for absenteeism time in hours. no longer necessary
# Also came back and dropped day of week, daily work load average, and distance to work because their weights
# were insignificant in the machine learning phase
data_with_targets = df_preprocessed.drop(['Absenteeism Time in Hours', 'Weekday',
                                          'Daily Work Load Average','Distance to Work'], axis=1)

data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month,Excessive Absenteeism
0,0,0,0,1,289,33,30,0,2,1,7,1
1,0,0,0,0,118,50,31,0,1,0,7,0
2,0,0,0,1,179,38,31,0,0,0,7,0
3,1,0,0,0,279,39,24,0,2,0,7,1
4,0,0,0,1,289,33,30,0,2,1,7,0


In [24]:
# selecting inputs for a logistic regression

data_with_targets.shape

(700, 12)

In [25]:
# identifying all columns except the target feature
unscaled_inputs = data_with_targets.iloc[:,:-1]
unscaled_inputs.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month
0,0,0,0,1,289,33,30,0,2,1,7
1,0,0,0,0,118,50,31,0,1,0,7
2,0,0,0,1,179,38,31,0,0,0,7
3,1,0,0,0,279,39,24,0,2,0,7
4,0,0,0,1,289,33,30,0,2,1,7


## Standardize data

In [26]:
# creating a custom scaler to standardize certain columns, not the dummy columns.
# typically would standardize before creating dummies, but this is a good way to do so after standardization

class CustomScaler(BaseEstimator, TransformerMixin):
    
    def __init__(self,columns, copy=True, with_mean=True, with_std=True):
        self.scaler = StandardScaler(copy, with_mean, with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [27]:
# identifying all columns to scaled (excluding dummies)
columns_to_scale = ['Transportation Expense', 'Age', 'Body Mass Index','Children', 'Pets', 'Month']

In [28]:
# Instantiating scaler
data_scaler = CustomScaler(columns_to_scale)

# fitting scaler on  data
data_scaler.fit(unscaled_inputs)

# scaling inputs using the fitted scaler
scaled_inputs = data_scaler.transform(unscaled_inputs)

# checking scaled inputs. notice how the dummy variables haven't changed
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month
0,0,0,0,1,1.005844,-0.536062,0.767431,0,0.880469,0.268487,0.182726
1,0,0,0,0,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690,0.182726
2,0,0,0,1,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690,0.182726
3,1,0,0,0,0.854936,0.405184,-0.643782,0,0.880469,-0.589690,0.182726
4,0,0,0,1,1.005844,-0.536062,0.767431,0,0.880469,0.268487,0.182726
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690,-0.388293
696,1,0,0,0,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663,-0.388293
697,1,0,0,0,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690,-0.388293
698,0,0,0,1,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690,-0.388293


## Splitting into training and testing data

In [29]:
# splitting data into train and test data
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size =0.8, random_state=42)

# checking shapes
print(x_train.shape,y_train.shape)
print(x_test.shape, y_test.shape)

(560, 11) (560,)
(140, 11) (140,)


In [30]:
# instantiating model
model = LogisticRegression()

# fitting model
model.fit(x_train, y_train)

# checking accuracy. Achieved 78% accuracy
model.score(x_train, y_train)

0.7857142857142857

## Investigating accuracy

In [31]:
print('Intercept: ', model.intercept_)
print('Coefficients: ', model.coef_)

Intercept:  [-1.64027221]
Coefficients:  [[ 2.83557273  0.74464493  3.30180982  0.72379378  0.56778767 -0.245746
   0.22313725 -0.37524439  0.38169934 -0.41697904  0.10454447]]


In [32]:
# creating a summary table to interpret coefficients
feature_names = unscaled_inputs.columns.values

summary_table = pd.DataFrame(columns=['Feature name'], data=feature_names)

# adding coefficients
summary_table['Coefficient'] = np.transpose(model.coef_)

summary_table.index = summary_table.index+1

# adding intercept to summary table
summary_table.loc[0] = ['Intercept', model.intercept_[0]]

summary_table = summary_table.sort_index()

In [33]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

# sorting the table by odds ratio to see which features have the highest importance
# The higher the odds ratio, the more likely an absence is the result of the feature listed
# as you can see, the highest reasons for absence are all related to illness

# reason 1 - disease
# reason 2 - pregnancy related
# reason 3 - poison
# reason 4 - typical appointments (doctor, dentist)
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Reason_3,3.30181,27.161752
1,Reason_1,2.835573,17.040157
2,Reason_2,0.744645,2.105694
4,Reason_4,0.723794,2.062242
5,Transportation Expense,0.567788,1.764359
9,Children,0.381699,1.464772
7,Body Mass Index,0.223137,1.249992
11,Month,0.104544,1.110205
6,Age,-0.245746,0.782121
8,Education,-0.375244,0.687121


- Reason_3, poison will clearly be the reason that causes the highest amount of absenteeism
    - 26 times more likely to be excessively absent when poisoned. This checks out
- Reason_1, this is the normal, no drama absenteeism case. You get sick, you skip work. Nothing out of the ordinary
    - 17 times more likely to be excessively absent when sick. This checks out
- Pregnancy and routine appointments both are a significant dropoff for absenteeism.
  You're not as highly likely to be excessively absent as you would be with being really sick or being poisoned.
  
  
- One thought on the negative coefficients. Take pets for example. The more pets you have, the less likely you are to take time off of work because they liklihood that someone else is helping take care of them is high

## Done with training - Testing ML Model

In [34]:
# accuracy on test data
model.score(x_test,y_test)

0.8

In [35]:
# obtaining probability
predicted_proba = model.predict_proba(x_test)


array([[0.82453125, 0.17546875],
       [0.86388229, 0.13611771],
       [0.26701688, 0.73298312],
       [0.63628572, 0.36371428],
       [0.63142719, 0.36857281],
       [0.09509862, 0.90490138],
       [0.76416552, 0.23583448],
       [0.42791206, 0.57208794],
       [0.70001231, 0.29998769],
       [0.74302451, 0.25697549],
       [0.88049846, 0.11950154],
       [0.74418477, 0.25581523],
       [0.24383266, 0.75616734],
       [0.61641329, 0.38358671],
       [0.78921523, 0.21078477],
       [0.49373167, 0.50626833],
       [0.87703347, 0.12296653],
       [0.25816677, 0.74183323],
       [0.88360364, 0.11639636],
       [0.64520806, 0.35479194],
       [0.74451016, 0.25548984],
       [0.74868229, 0.25131771],
       [0.73530685, 0.26469315],
       [0.7498256 , 0.2501744 ],
       [0.85731248, 0.14268752],
       [0.17172731, 0.82827269],
       [0.65875214, 0.34124786],
       [0.61378897, 0.38621103],
       [0.7704742 , 0.2295258 ],
       [0.66542988, 0.33457012],
       [0.

In [37]:
# probability of excessive absenteeism. If it's below 0.5 => 0
# If it's above .5 => 1
predicted_proba[:,1]

array([0.17546875, 0.13611771, 0.73298312, 0.36371428, 0.36857281,
       0.90490138, 0.23583448, 0.57208794, 0.29998769, 0.25697549,
       0.11950154, 0.25581523, 0.75616734, 0.38358671, 0.21078477,
       0.50626833, 0.12296653, 0.74183323, 0.11639636, 0.35479194,
       0.25548984, 0.25131771, 0.26469315, 0.2501744 , 0.14268752,
       0.82827269, 0.34124786, 0.38621103, 0.2295258 , 0.33457012,
       0.11591995, 0.15018346, 0.93084579, 0.89691073, 0.21579306,
       0.64640423, 0.26469315, 0.13910921, 0.81634983, 0.19030963,
       0.52118022, 0.2457433 , 0.55741514, 0.11039582, 0.20909575,
       0.68816993, 0.78876738, 0.88573659, 0.25323859, 0.11336147,
       0.25697549, 0.25892466, 0.36857281, 0.93250398, 0.14641358,
       0.20909575, 0.97349577, 0.21579306, 0.85677819, 0.2211999 ,
       0.53752441, 0.11901414, 0.45431287, 0.57208794, 0.11950154,
       0.40045484, 0.698695  , 0.07467918, 0.2058622 , 0.54224681,
       0.25697549, 0.2295258 , 0.71289936, 0.25892466, 0.13553

## Saving model for production

In [39]:
# Pickling model
with open('model', 'wb') as file:
    pickle.dump(model, file)
    
# Will also need to save scaler. Pickling scaler
with open('scaler', 'wb') as file:
    pickle.dump(data_scaler, file)