# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Data loading

In [2]:
absenteeism_at_work = pd.read_csv("../../data/absenteeism-at-work/data.csv", delimiter=";")
absenteeism_at_work

Unnamed: 0,ID,Reason for absence,Month of absence,Day of the week,Seasons,Transportation expense,Distance from Residence to Work,Service time,Age,Work load Average/day,...,Disciplinary failure,Education,Son,Social drinker,Social smoker,Pet,Weight,Height,Body mass index,Absenteeism time in hours
0,11,26,7,3,1,289,36,13,33,239.554,...,0,1,2,1,0,1,90,172,30,4
1,36,0,7,3,1,118,13,18,50,239.554,...,1,1,1,1,0,0,98,178,31,0
2,3,23,7,4,1,179,51,18,38,239.554,...,0,1,0,1,0,0,89,170,31,2
3,7,7,7,5,1,279,5,14,39,239.554,...,0,1,2,1,1,0,68,168,24,4
4,11,23,7,5,1,289,36,13,33,239.554,...,0,1,2,1,0,1,90,172,30,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
735,11,14,7,3,1,289,36,13,33,264.604,...,0,1,2,1,0,1,90,172,30,8
736,1,11,7,3,1,235,11,14,37,264.604,...,0,3,1,0,0,1,88,172,29,4
737,4,0,0,3,1,118,14,13,40,271.219,...,0,1,1,1,0,8,98,170,34,0
738,8,0,0,4,2,231,35,14,39,271.219,...,0,1,2,1,0,2,100,170,35,0


In [3]:
pd.DataFrame({
    "Column Name": absenteeism_at_work.columns,
    "Min value": absenteeism_at_work.min(),
    "Max value": absenteeism_at_work.max()
})

Unnamed: 0,Column Name,Min value,Max value
ID,ID,1.0,36.0
Reason for absence,Reason for absence,0.0,28.0
Month of absence,Month of absence,0.0,12.0
Day of the week,Day of the week,2.0,6.0
Seasons,Seasons,1.0,4.0
Transportation expense,Transportation expense,118.0,388.0
Distance from Residence to Work,Distance from Residence to Work,5.0,52.0
Service time,Service time,1.0,29.0
Age,Age,27.0,58.0
Work load Average/day,Work load Average/day,205.917,378.884


In [4]:
absenteeism_at_work.isnull().values.any()

False

# Data preprocessing

## Remove missing values

### Reason for absence
There are values marked as 0 which signals an unkown values. They could be removed. However, a category Unkown could also be useful to make prediction a bit hard and the model more robust.

### Month of absence
Some months are encoded as 0, even though the range should be from 1 to 12

In [5]:
absenteeism_at_work.drop(absenteeism_at_work[absenteeism_at_work["Month of absence"] == 0].index, inplace=True)

## Map all booleans to their data type

In [6]:
boolean_columns = ["Disciplinary failure", "Son", "Social drinker", "Social smoker", "Pet"]

In [7]:
absenteeism_at_work[boolean_columns] = absenteeism_at_work[boolean_columns].astype(bool)

## Map categorical values to string representations

In [8]:
season_mapping = {
    1: "Summer",
    2: "Autumn",
    3: "Winter",
    4: "Spring"
}
weekday_mapping = {
    1: "Sunday",
    2: "Monday",
    3: "Tuesday",
    4: "Wednesday",
    5: "Thursday",
    6: "Friday",
    7: "Saturday"
}
month_mapping = {
    1: "January",
    2: "February",
    3: "March",
    4: "April",
    5: "May",
    6: "June",
    7: "July",
    8: "August",
    9: "September",
    10: "October",
    11: "November",
    12: "December"
}
education_mapping = {
    1: "High scool",
    2: "Graduate",
    3: "Postgraduate",
    4: "Master and doctor"
}
absence_reason_mapping = {
    0: "Unknown",
    1: "Infectious and parasitic",
    2: "Neoplasms",
    3: "Blood and blood-forming organ",
    4: "Endocrine, nutritional and metabolic",
    5: "Mental and behavioural disorders",
    6: "Nervous system",
    7: "Eye and adnexa",
    8: "Ear and mastoid process",
    9: "Circulatory system",
    10: "Respiratory system",
    11: "Digestive system",
    12: "Skin and subcutaneous tissue",
    13: "Musculoskeletal system",
    14: "Genitourinary system",
    15: "Pregnancy, childbirth and puerperium",
    16: "Perinatal period conditions",
    17: "Congenital malformations, deformations",
    18: "Abnormal clinical symptoms",
    19: "Injury, poisoning",
    20: "Morbidity and mortality ",
    21: "Health service encounters",
    22: "Patient follow-up",
    23: "Medical consultation",
    24: "Blood donation",
    25: "Laboratory examination",
    26: "Unjustified absence",
    27: "Physiotherapy",
    28: "Dental consultation"
}

In [9]:
absenteeism_at_work["Day of the week"] = absenteeism_at_work["Day of the week"].replace(weekday_mapping).astype("category")
absenteeism_at_work["Month of absence"] = absenteeism_at_work["Month of absence"].replace(month_mapping).astype("category")
absenteeism_at_work["Seasons"] = absenteeism_at_work["Seasons"].replace(season_mapping).astype("category")
absenteeism_at_work["Education"] = absenteeism_at_work["Education"].replace(education_mapping).astype("category")
absenteeism_at_work["Reason for absence"] = absenteeism_at_work["Reason for absence"].replace(absence_reason_mapping).astype("category")

## Round workload to integer

In [10]:
absenteeism_at_work["Work load Average/day"] = absenteeism_at_work["Work load Average/day"].round(0).astype(int)

## Set id as index

In [11]:
absenteeism_at_work.set_index("ID", inplace=True)

## Show preprocessed data

In [12]:
absenteeism_at_work

Unnamed: 0_level_0,Reason for absence,Month of absence,Day of the week,Seasons,Transportation expense,Distance from Residence to Work,Service time,Age,Work load Average/day,Hit target,Disciplinary failure,Education,Son,Social drinker,Social smoker,Pet,Weight,Height,Body mass index,Absenteeism time in hours
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
11,Unjustified absence,July,Tuesday,Summer,289,36,13,33,240,97,False,High scool,True,True,False,True,90,172,30,4
36,Unknown,July,Tuesday,Summer,118,13,18,50,240,97,True,High scool,True,True,False,False,98,178,31,0
3,Medical consultation,July,Wednesday,Summer,179,51,18,38,240,97,False,High scool,False,True,False,False,89,170,31,2
7,Eye and adnexa,July,Thursday,Summer,279,5,14,39,240,97,False,High scool,True,True,True,False,68,168,24,4
11,Medical consultation,July,Thursday,Summer,289,36,13,33,240,97,False,High scool,True,True,False,True,90,172,30,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,Patient follow-up,July,Wednesday,Summer,361,52,3,28,265,93,False,High scool,True,True,False,True,80,172,27,8
28,Patient follow-up,July,Wednesday,Summer,225,26,9,28,265,93,False,High scool,True,False,False,True,69,169,24,8
13,Musculoskeletal system,July,Monday,Summer,369,17,12,31,265,93,False,High scool,True,True,False,False,70,169,25,80
11,Genitourinary system,July,Tuesday,Summer,289,36,13,33,265,93,False,High scool,True,True,False,True,90,172,30,8


# Experiments

## Global parameters

In [13]:
seed = 1183743
test_size = 0.2
shuffle_train_test = True
scaler = StandardScaler()
scale_features = ["Transportation expense", "Distance from Residence to Work", "Service time", "Age", "Work load Average/day", "Hit target", "Weight", "Height", "Body mass index", "Absenteeism time in hours"]
category_encoder = OneHotEncoder()
category_features = ["Day of the week", "Month of absence", "Seasons", "Education"]

## Data

### Split features and target value

In [14]:
X = absenteeism_at_work.drop(["Reason for absence"], axis=1)
y = absenteeism_at_work["Reason for absence"]


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed, shuffle=shuffle_train_test)

### Additional preprocessing for training and evaluation

In [16]:
column_transformer = make_column_transformer(
    (scaler, scale_features),
    (category_encoder, category_features),
    remainder="passthrough"
)

X_train_transformed = column_transformer.fit_transform(X_train)
X_train = pd.DataFrame(X_train_transformed, columns=column_transformer.get_feature_names_out())
X_test_transformed = column_transformer.transform(X_test)
X_test = pd.DataFrame(X_test_transformed, columns=column_transformer.get_feature_names_out())

## Random forest classifier

### Parameters

In [17]:
n_trees = 100

### Training

In [18]:
rf_classifier = RandomForestClassifier(n_estimators=n_trees, random_state=seed)
rf_classifier.fit(X_train, y_train)

### Prediction

In [19]:
y_pred = rf_classifier.predict(X_test)

### Evaluation

In [20]:
report = classification_report(y_test, y_pred)
print(report)

                                        precision    recall  f1-score   support

            Abnormal clinical symptoms       0.00      0.00      0.00         4
         Blood and blood-forming organ       0.00      0.00      0.00         0
                        Blood donation       0.00      0.00      0.00         0
                    Circulatory system       0.00      0.00      0.00         0
Congenital malformations, deformations       0.00      0.00      0.00         0
                   Dental consultation       0.67      0.75      0.71        24
                      Digestive system       0.00      0.00      0.00         4
               Ear and mastoid process       0.00      0.00      0.00         1
                        Eye and adnexa       0.00      0.00      0.00         6
                  Genitourinary system       0.40      0.40      0.40         5
             Health service encounters       0.00      0.00      0.00         2
              Infectious and parasitic 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
present_reasons = list(filter(lambda reason: y_test.str.contains(reason).any(), absence_reason_mapping.values()))
cm = confusion_matrix(y_test, y_pred)
accuracies = np.divide(cm.diagonal(), cm.sum(axis=1))
pd.Series(accuracies[~np.isnan(accuracies)], index=present_reasons)

  accuracies = np.divide(cm.diagonal(), cm.sum(axis=1))


Unknown                         0.000000
Infectious and parasitic        0.750000
Nervous system                  0.000000
Eye and adnexa                  0.000000
Ear and mastoid process         0.000000
Respiratory system              0.400000
Digestive system                0.000000
Skin and subcutaneous tissue    0.000000
Musculoskeletal system          0.181818
Genitourinary system            0.300000
Abnormal clinical symptoms      0.567568
Injury, poisoning               0.272727
Health service encounters       0.250000
Patient follow-up               0.400000
Medical consultation            0.615385
Laboratory examination          0.250000
Unjustified absence             0.000000
Physiotherapy                   0.250000
Dental consultation             1.000000
dtype: float64

## K-neares neighbors

### Parameters

In [22]:
n_neighbors = 3

### Training

In [23]:
knn_classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
knn_classifier.fit(X_train, y_train)

### Prediction

In [24]:
y_pred = knn_classifier.predict(X_test)

### Evaluation

In [25]:
report = classification_report(y_test, y_pred)
print(report)

                                      precision    recall  f1-score   support

          Abnormal clinical symptoms       0.11      0.50      0.17         4
       Blood and blood-forming organ       0.00      0.00      0.00         0
                      Blood donation       0.00      0.00      0.00         0
                  Circulatory system       0.00      0.00      0.00         0
                 Dental consultation       0.30      0.67      0.41        24
                    Digestive system       0.00      0.00      0.00         4
             Ear and mastoid process       0.00      0.00      0.00         1
Endocrine, nutritional and metabolic       0.00      0.00      0.00         0
                      Eye and adnexa       0.00      0.00      0.00         6
                Genitourinary system       0.25      0.40      0.31         5
           Health service encounters       0.00      0.00      0.00         2
            Infectious and parasitic       0.00      0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## SVM Classifier

### Parameters

### Training

In [26]:
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)

### Prediction

In [27]:
y_pred = svm_classifier.predict(X_test)

### Evaluation

In [28]:
report = classification_report(y_test, y_pred)
print(report)

  _warn_prf(average, modifier, msg_start, len(result))


                              precision    recall  f1-score   support

  Abnormal clinical symptoms       0.00      0.00      0.00         4
         Dental consultation       0.43      0.67      0.52        24
            Digestive system       0.00      0.00      0.00         4
     Ear and mastoid process       0.00      0.00      0.00         1
              Eye and adnexa       0.00      0.00      0.00         6
        Genitourinary system       0.00      0.00      0.00         5
   Health service encounters       0.00      0.00      0.00         2
    Infectious and parasitic       0.00      0.00      0.00         5
           Injury, poisoning       0.20      0.09      0.13        11
      Laboratory examination       0.00      0.00      0.00        10
        Medical consultation       0.27      0.78      0.40        37
      Musculoskeletal system       0.20      0.09      0.13        11
              Nervous system       0.00      0.00      0.00         4
           Patient 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
