### Importar as bibliotecas importantes

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [2]:
# Load Data

raw_csv_data = pd.read_csv('Data/Absenteeism-data.csv')

raw_csv_data

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...
695,17,10,23/05/2018,179,22,40,237.656,22,2,2,0,8
696,28,6,23/05/2018,225,26,28,237.656,24,1,1,2,3
697,18,10,24/05/2018,330,16,28,237.656,25,2,0,0,8
698,25,23,24/05/2018,235,16,32,237.656,25,3,0,0,2


In [3]:
# Get information from dataset
raw_csv_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         700 non-null    int64  
 1   Reason for Absence         700 non-null    int64  
 2   Date                       700 non-null    object 
 3   Transportation Expense     700 non-null    int64  
 4   Distance to Work           700 non-null    int64  
 5   Age                        700 non-null    int64  
 6   Daily Work Load Average    700 non-null    float64
 7   Body Mass Index            700 non-null    int64  
 8   Education                  700 non-null    int64  
 9   Children                   700 non-null    int64  
 10  Pets                       700 non-null    int64  
 11  Absenteeism Time in Hours  700 non-null    int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 65.8+ KB


In [4]:
#raw_csv_data.describe(include='all')
data_raw = raw_csv_data

In [5]:
#Drop ID - not relevant to regression
data_wo_id = data_raw.drop(['ID'],axis=1)

In [6]:
#data_wo_id.isnull().sum() # verify missing values
#data_no_mv = data_wo_id.dropna(axis=0) # no missing values to drop
data_no_mv = data_wo_id

In [7]:
# Classify Reasons for Absence into 4 categories
data_no_mv['Reason_1'] = np.where((data_no_mv['Reason for Absence'] >= 1) & (data_no_mv['Reason for Absence'] <= 14), 1, 0)
data_no_mv['Reason_2'] = np.where((data_no_mv['Reason for Absence'] >= 15) & (data_no_mv['Reason for Absence'] <= 17), 1, 0)
data_no_mv['Reason_3'] = np.where((data_no_mv['Reason for Absence'] >= 18) & (data_no_mv['Reason for Absence'] <= 21), 1, 0)
data_no_mv['Reason_4'] = np.where((data_no_mv['Reason for Absence'] >= 22) & (data_no_mv['Reason for Absence'] <= 28), 1, 0)

data_w_reasons = data_no_mv.drop('Reason for Absence', axis=1)

In [8]:
# Create month and day of the week
data_w_reasons['Date'] = pd.to_datetime(data_w_reasons['Date'], dayfirst=True)

data_w_reasons['Month Value'] = data_w_reasons['Date'].dt.strftime('%m')
data_w_reasons['Day of the Week'] = data_w_reasons['Date'].dt.day_of_week

data_w_dates = data_w_reasons.drop('Date', axis=1)

In [9]:
#data_w_dates['Education'].unique()

In [10]:
# Classify if employee is educated or not
data_w_educ = data_w_dates
data_w_educ['Education'] = data_w_educ['Education'].map({1:0, 2:1, 3:1, 4:1})

In [11]:
# Update column order
column_names_upd = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value', 'Day of the Week',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education', 'Children',
       'Pets', 'Absenteeism Time in Hours']

df_processed = data_w_educ[column_names_upd]
df_processed

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,07,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,07,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,07,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,07,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,07,3,289,36,33,239.554,30,0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,05,2,179,22,40,237.656,22,1,2,0,8
696,1,0,0,0,05,2,225,26,28,237.656,24,0,1,2,3
697,1,0,0,0,05,3,330,16,28,237.656,25,1,0,0,8
698,0,0,0,1,05,3,235,16,32,237.656,25,1,0,0,2


In [12]:
# Save preprocess data
df_processed.to_csv('Data/Absenteeism-preprocessed.csv', index=False)