In [1]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from scipy.io import arff
data, meta = arff.loadarff('./data/labor.arff')

In [3]:
good = 0
bad = 0

for row in data:
    status = row[-1].decode('utf-8')
    
    if 'good' == status:
        good += 1
    else:
        bad += 1

print('{} - {} (bad - good)'.format(bad, good))

20 - 37 (bad - good)


In [4]:
labor_dataframe = pd.DataFrame(data)
nominal_cols = [
    'cost-of-living-adjustment', 
    'pension', 
    'education-allowance', 
    'vacation', 
    'longterm-disability-assistance', 
    'contribution-to-dental-plan', 
    'bereavement-assistance', 
    'contribution-to-health-plan',
    'class'
]
numeric_labor_dataframe = labor_dataframe.drop(nominal_cols, axis=1) # remove nominal values

In [5]:
# Standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

scaler_numeric_df = pd.DataFrame(scaler.fit_transform(numeric_labor_dataframe.values), index=numeric_labor_dataframe.index, columns=numeric_labor_dataframe.columns)

In [6]:
# Replace missing values
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer()

processed_numeric_df = pd.DataFrame(imp_mean.fit_transform(scaler_numeric_df.values), index=scaler_numeric_df.index, columns=scaler_numeric_df.columns)

In [25]:
# Classification
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

dummycl = DummyClassifier(strategy="most_frequent")
gmb = GaussianNB()
dectree = DecisionTreeClassifier()
logreg = LogisticRegression()
svc = SVC()

lst_classif = [dummycl, gmb, dectree, logreg, svc]
lst_classif_names = ['Dummy', 'GaussianNB', 'Decision tree', 'Logistic Regression', 'SVC']

y = [x.decode('utf-8') for x in labor_dataframe['class']] # byte => string conversion


# cross-validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

for clf, name_clf in zip(lst_classif, lst_classif_names):
    scores = cross_val_score(clf, processed_numeric_df, y, cv=5)
    y_pred = cross_val_predict(clf, processed_numeric_df, y, cv=5)
    
    # https://stackoverflow.com/a/50326049
    unique_label = ['bad', 'good']
    cmtx = pd.DataFrame(
        confusion_matrix(y, y_pred, labels=unique_label), 
        index=['true:{:}'.format(x) for x in unique_label], 
        columns=['pred:{:}'.format(x) for x in unique_label]
    )
    print('--------------------')
    print("Accuracy of "+name_clf+" classifier on cross-validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    print('---')
    print(cmtx)
    print('---')

--------------------
Accuracy of Dummy classifier on cross-validation: 0.65 (+/- 0.03)
---
           pred:bad  pred:good
true:bad          0         20
true:good         0         37
---
--------------------
Accuracy of GaussianNB classifier on cross-validation: 0.86 (+/- 0.15)
---
           pred:bad  pred:good
true:bad         14          6
true:good         2         35
---
--------------------
Accuracy of Decision tree classifier on cross-validation: 0.77 (+/- 0.22)
---
           pred:bad  pred:good
true:bad         15          5
true:good         8         29
---
--------------------
Accuracy of Logistic Regression classifier on cross-validation: 0.89 (+/- 0.14)
---
           pred:bad  pred:good
true:bad         16          4
true:good         2         35
---
--------------------
Accuracy of SVC classifier on cross-validation: 0.89 (+/- 0.18)
---
           pred:bad  pred:good
true:bad         16          4
true:good         2         35
---
