In [1]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# functions

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

def cross_validation(X, y, clf_list = (), cv = 5, labels = ('bad', 'good')):
    """Cross-validate each classifier and print its accuracy and confusion matrix.

    For every ``(estimator, display_name)`` pair in ``clf_list`` this runs
    ``cross_val_score`` and ``cross_val_predict`` with the same ``cv`` setting,
    then prints the mean score with a +/- two-standard-deviation band and a
    labelled confusion matrix built from the out-of-fold predictions.

    Parameters
    ----------
    X : feature matrix passed unchanged to scikit-learn.
    y : target labels passed unchanged to scikit-learn.
    clf_list : iterable of ``(estimator, name)`` pairs; empty by default,
        in which case nothing is printed.
    cv : cross-validation setting forwarded to scikit-learn (default 5).
    labels : class labels, in the order used for the confusion-matrix rows
        and columns. Defaults to ``('bad', 'good')``.

    Returns
    -------
    None. Results are printed only.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having already put confusion_matrix into the global namespace.
    from sklearn.metrics import confusion_matrix

    # https://stackoverflow.com/a/50326049
    # Label lists do not change between classifiers, so build them once.
    unique_label = list(labels)
    row_names = ['true:{:}'.format(x) for x in unique_label]
    col_names = ['pred:{:}'.format(x) for x in unique_label]

    for clf, name_clf in clf_list:
        # Note: scores and predictions come from two independent CV runs.
        scores = cross_val_score(clf, X, y, cv=cv)
        y_pred = cross_val_predict(clf, X, y, cv=cv)

        cmtx = pd.DataFrame(
            confusion_matrix(y, y_pred, labels=unique_label),
            index=row_names,
            columns=col_names
        )
        print('--------------------')
        print("Accuracy of "+name_clf+" classifier on cross-validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
        print('---')
        print(cmtx)
        print('---')

In [3]:
from scipy.io import arff
# Read the ARFF file; loadarff returns the records together with the attribute metadata.
labor_arff_path = './data/labor.arff'
data, meta = arff.loadarff(labor_arff_path)

In [4]:
# Class balance: the last field of every record holds the label as bytes.
# Anything that is not exactly 'good' is counted as bad.
statuses = [row[-1].decode('utf-8') for row in data]
good = statuses.count('good')
bad = len(statuses) - good

print('{} - {} (bad - good)'.format(bad, good))

20 - 37 (bad - good)


In [5]:
# Wrap the loaded records in a DataFrame.
labor_dataframe = pd.DataFrame(data)

# Columns treated as nominal in this notebook; the last entry is the target.
nominal_cols = [
    'cost-of-living-adjustment',
    'pension',
    'education-allowance',
    'vacation',
    'longterm-disability-assistance',
    'contribution-to-dental-plan',
    'bereavement-assistance',
    'contribution-to-health-plan',
    'class',
]

# Everything that is not nominal is kept as the numeric feature set.
numeric_labor_dataframe = labor_dataframe.drop(columns=nominal_cols)

In [6]:
# Standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# NOTE(review): the scaler is fitted on every row, so if these features are
# later cross-validated the scaling statistics include the held-out folds.
scaled_values = scaler.fit_transform(numeric_labor_dataframe.values)
scaler_numeric_df = pd.DataFrame(
    scaled_values,
    index=numeric_labor_dataframe.index,
    columns=numeric_labor_dataframe.columns,
)

In [7]:
# Replace missing values
from sklearn.impute import SimpleImputer
# SimpleImputer with default settings fills missing entries with the column mean.
imp_mean = SimpleImputer()

imputed_values = imp_mean.fit_transform(scaler_numeric_df.values)
processed_numeric_df = pd.DataFrame(
    imputed_values,
    index=scaler_numeric_df.index,
    columns=scaler_numeric_df.columns,
)

In [8]:
# Classification
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

# Fixed seed so the stochastic estimator below gives the same scores on every run.
RANDOM_STATE = 42

dummycl = DummyClassifier(strategy="most_frequent")  # baseline: always predicts the majority class
gmb = GaussianNB()
# Tree construction is randomized; without a seed the reported accuracy and
# confusion matrix change between runs.
dectree = DecisionTreeClassifier(random_state=RANDOM_STATE)
logreg = LogisticRegression()
svc = SVC()

y = [x.decode('utf-8') for x in labor_dataframe['class']] # byte => string conversion

# (estimator, display name) pairs consumed by cross_validation
clf = (
    (dummycl, 'Dummy'),
    (gmb, 'GaussianNB'),
    (dectree, 'Decision tree'),
    (logreg, 'Logistic Regresion'),
    (svc, 'SVC')
)

# Evaluate on the standardized, mean-imputed numeric features only
cross_validation(processed_numeric_df, y, clf)

--------------------
Accuracy of Dummy classifier on cross-validation: 0.65 (+/- 0.03)
---
           pred:bad  pred:good
true:bad          0         20
true:good         0         37
---
--------------------
Accuracy of GaussianNB classifier on cross-validation: 0.86 (+/- 0.15)
---
           pred:bad  pred:good
true:bad         14          6
true:good         2         35
---
--------------------
Accuracy of Decision tree classifier on cross-validation: 0.77 (+/- 0.22)
---
           pred:bad  pred:good
true:bad         15          5
true:good         8         29
---
--------------------
Accuracy of Logistic Regresion classifier on cross-validation: 0.89 (+/- 0.14)
---
           pred:bad  pred:good
true:bad         16          4
true:good         2         35
---
--------------------
Accuracy of SVC classifier on cross-validation: 0.89 (+/- 0.18)
---
           pred:bad  pred:good
true:bad         16          4
true:good         2         35
---


In [9]:
# Prepare categorical dataframe
# Select the nominal feature columns without the target (class is our y).
# Filtering the column list, instead of dropping in place on a slice of
# labor_dataframe, avoids SettingWithCopyWarning and keeps the cell re-runnable
# even if 'class' is no longer present in nominal_cols.
categorical_feature_cols = [col for col in nominal_cols if col != 'class']
categorical_labor_df = labor_dataframe[categorical_feature_cols].apply(
    lambda col: col.str.decode('utf-8')  # byte => string
)

# Replace missing values ('?' marks a missing entry) with the most frequent category
imp_most_frequent = SimpleImputer(strategy='most_frequent', missing_values='?')
processed_categorical_labor_df = pd.DataFrame(
    imp_most_frequent.fit_transform(categorical_labor_df.values),
    index=categorical_labor_df.index,
    columns=categorical_labor_df.columns,
)

# Discretization: one-hot encode every categorical column
categorical_labor_onehot_df = pd.get_dummies(processed_categorical_labor_df)

In [11]:
# Classification with categorical data: same classifiers, one-hot nominal features only
cross_validation(
    X=categorical_labor_onehot_df,
    y=y,
    clf_list=clf,
)

--------------------
Accuracy of Dummy classifier on cross-validation: 0.65 (+/- 0.03)
---
           pred:bad  pred:good
true:bad          0         20
true:good         0         37
---
--------------------
Accuracy of GaussianNB classifier on cross-validation: 0.90 (+/- 0.13)
---
           pred:bad  pred:good
true:bad         16          4
true:good         2         35
---
--------------------
Accuracy of Decision tree classifier on cross-validation: 0.84 (+/- 0.25)
---
           pred:bad  pred:good
true:bad         15          5
true:good         5         32
---
--------------------
Accuracy of Logistic Regresion classifier on cross-validation: 0.91 (+/- 0.20)
---
           pred:bad  pred:good
true:bad         15          5
true:good         0         37
---
--------------------
Accuracy of SVC classifier on cross-validation: 0.91 (+/- 0.20)
---
           pred:bad  pred:good
true:bad         15          5
true:good         0         37
---


In [10]:
# Use all available attributes to train classifiers
numerical_cols = [
    'duration',
    'wage-increase-first-year',
    'wage-increase-second-year',
    'wage-increase-third-year',
    'working-hours',
    'shift-differential',
    'statutory-holidays',
    'standby-pay'
]

# Nominal feature columns without the target. A new list is built on purpose:
# list.remove() mutates nominal_cols in place and returns None, which made the
# one-hot step receive columns=None and broke re-running this and earlier cells.
nominal_feature_cols = [col for col in nominal_cols if col != 'class']

# Remove y column up front; drop() returns a new frame, labor_dataframe is untouched
full_labor_df = labor_dataframe.drop(columns=['class'])

# Preprocess nominal cols
full_labor_df[nominal_feature_cols] = full_labor_df[nominal_feature_cols].apply(
    lambda col: col.str.decode('utf-8')  # byte => string
)
full_labor_df[nominal_feature_cols] = imp_most_frequent.fit_transform(full_labor_df[nominal_feature_cols])

# Preprocess numerical cols: standardize first, then mean-impute the remaining gaps
full_labor_df[numerical_cols] = scaler.fit_transform(full_labor_df[numerical_cols])
full_labor_df[numerical_cols] = imp_mean.fit_transform(full_labor_df[numerical_cols])

# One-hot nominal values; numeric columns are passed through unchanged
full_labor_onehot_df = pd.get_dummies(full_labor_df, columns=nominal_feature_cols)

In [12]:
# Evaluate the same classifiers on the combined numeric + one-hot nominal features
cross_validation(
    X=full_labor_onehot_df,
    y=y,
    clf_list=clf,
)

--------------------
Accuracy of Dummy classifier on cross-validation: 0.65 (+/- 0.03)
---
           pred:bad  pred:good
true:bad          0         20
true:good         0         37
---
--------------------
Accuracy of GaussianNB classifier on cross-validation: 0.90 (+/- 0.13)
---
           pred:bad  pred:good
true:bad         16          4
true:good         2         35
---
--------------------
Accuracy of Decision tree classifier on cross-validation: 0.79 (+/- 0.28)
---
           pred:bad  pred:good
true:bad         16          4
true:good         5         32
---
--------------------
Accuracy of Logistic Regresion classifier on cross-validation: 0.95 (+/- 0.15)
---
           pred:bad  pred:good
true:bad         18          2
true:good         1         36
---
--------------------
Accuracy of SVC classifier on cross-validation: 0.96 (+/- 0.09)
---
           pred:bad  pred:good
true:bad         18          2
true:good         0         37
---
