In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
sys.path.append('..')
from utils_cd import (
        split_dataset,
        standard_deviation,
        plot_comparison_results,
        impute_values,
        plot_results,
        plot_param_improv,
        train_test_split,
        classifier_statistics
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer, StandardScaler
from imblearn.over_sampling import SMOTE

In [2]:
green_data = pd.read_csv('./green.csv')
hinselmann_data = pd.read_csv('./hinselmann.csv')
schiller_data = pd.read_csv('./schiller.csv')

data = [[green_data,'green_data'], [hinselmann_data,'hinselmann_data'], [schiller_data,'schiller_data']]

In [3]:
def balance_dataset(dataset, classe):
    y = dataset[classe]
    X = dataset.drop(columns=[classe])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    sm = SMOTE(random_state=2)

    X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())
    
    

    return X_train, X_test, y_train, y_test, X_train_res, y_train_res

In [4]:
X_train_green, X_test_green, y_train_green, y_test_green, X_train_res_green, y_train_res_green = balance_dataset(data[0][0], 'consensus')
X_train_hinselmann, X_test_hinselmann, y_train_hinselmann, y_test_hinselmann, X_train_res_hinselmann, y_train_res_hinselmann = balance_dataset(data[1][0], 'consensus')
X_train_schiller, X_test_schiller, y_train_schiller, y_test_schiller, X_train_res_schiller, y_train_res_schiller = balance_dataset(data[2][0], 'consensus')
naive_cls = GaussianNB()

In [5]:
res_green = classifier_statistics(naive_cls, X_train_green, X_test_green, y_train_green, y_test_green)
res_hinselmann = classifier_statistics(naive_cls, X_train_hinselmann, X_test_hinselmann, y_train_hinselmann, y_test_hinselmann)
res_schiller = classifier_statistics(naive_cls, X_train_schiller, X_test_schiller, y_train_schiller, y_test_schiller)

In [6]:
unbalenced_accuracy = (res_green['accuracy']+res_hinselmann['accuracy']+res_schiller['accuracy'])/3
unbalenced_sensibility = (res_green['sensibility']+res_hinselmann['sensibility']+res_schiller['sensibility'])/3
unbalenced_specificity = (res_green['specificity']+res_hinselmann['specificity']+res_schiller['specificity'])/3

print('unbalenced_accuracy: {}'.format(unbalenced_accuracy))
print('unbalenced_sensibility: {}'.format(unbalenced_sensibility))
print('unbalenced_specificity: {}'.format(unbalenced_specificity))

unbalenced_accuracy: 0.8055555555555557
unbalenced_sensibility: 0.8959435626102291
unbalenced_specificity: 0.4497354497354497


In [7]:
res_green_bal = classifier_statistics(naive_cls, X_train_res_green, X_test_green, y_train_res_green, y_test_green)
res_hinselmann_bal = classifier_statistics(naive_cls, X_train_res_hinselmann, X_test_hinselmann, y_train_res_hinselmann, y_test_hinselmann)
res_schiller_bal = classifier_statistics(naive_cls, X_train_res_schiller, X_test_schiller, y_train_res_schiller, y_test_schiller)

In [8]:
balenced_accuracy = (res_green_bal['accuracy']+res_hinselmann['accuracy']+res_schiller['accuracy'])/3
balenced_sensibility = (res_green_bal['sensibility']+res_hinselmann['sensibility']+res_schiller['sensibility'])/3
balenced_specificity = (res_green_bal['specificity']+res_hinselmann['specificity']+res_schiller['specificity'])/3

print('balenced_accuracy: {}'.format(balenced_accuracy))
print('balenced_sensibility: {}'.format(balenced_sensibility))
print('balenced_specificity: {}'.format(balenced_specificity))

balenced_accuracy: 0.8166666666666668
balenced_sensibility: 0.8959435626102291
balenced_specificity: 0.48677248677248675


In [9]:
green_data['hinselmann']=0
green_data['schiller']=0
hinselmann_data['hinselmann']=1
hinselmann_data['schiller']=0
schiller_data['hinselmann']=0
schiller_data['schiller']=1

In [11]:
super_table = green_data.append(hinselmann_data)
super_table = super_table.append(schiller_data)

In [12]:
X_train_super, X_test_super, y_train_super, y_test_super, X_train_res_super, y_train_res_super = balance_dataset(super_table, 'consensus')
res_super_unbal = classifier_statistics(naive_cls, X_train_super, X_test_super, y_train_super, y_test_super)
res_super_unbal


{'predicted': array([1., 0., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1.,
        1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1.,
        1., 0., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.,
        1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 1.,
        1., 0.]),
 'accuracy': 0.8735632183908046,
 'confusion_matrix': array([[18,  8],
        [ 3, 58]]),
 'sensibility': 0.9508196721311475,
 'specificity': 0.6923076923076923}

In [13]:
res_super_bal = classifier_statistics(naive_cls, X_train_res_super, X_test_super, y_train_res_super, y_test_super)
res_super_bal

{'predicted': array([1., 0., 1., 1., 1., 0., 0., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1.,
        1., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.,
        1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 1.,
        1., 0.]),
 'accuracy': 0.8160919540229885,
 'confusion_matrix': array([[15, 11],
        [ 5, 56]]),
 'sensibility': 0.9180327868852459,
 'specificity': 0.5769230769230769}

In [16]:
#sns.set()
#sns.set_style("whitegrid")
#sns.despine()

#ax = sns.countplot(x="consensus", data=super_table)
#plt.savefig('super_table_class.pdf')
#plt.clf()

array([2.78843305e-01, 9.21887670e-03, 4.39031287e-02, 1.82529393e-01,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.76137022e-02, 0.00000000e+00, 4.01256632e-03,
       2.28058563e-01, 4.67868891e-02, 1.80808675e-01, 1.28709868e+02,
       3.70323133e+01, 9.16775549e+01, 1.65742181e+02, 1.10334502e+02,
       3.37901696e+01, 7.65443323e+01, 1.44124671e+02, 1.33973995e+02,
       3.40922668e+01, 9.98817286e+01, 1.68066262e+02, 1.20141080e+02,
       4.54902515e+01, 7.46508287e+01, 1.65631332e+02, 1.14063763e+02,
       4.49073966e+01, 6.91563663e+01, 1.58971160e+02, 1.26105160e+02,
       4.42520977e+01, 8.18530627e+01, 1.70357258e+02, 4.11659004e+00,
       2.79693406e+00, 5.04715362e+01, 1.59045238e+01, 1.36289556e+02,
       3.53225270e+01, 4.03329497e+00, 2.52718287e+00, 3.06437089e+01,
       1.97073788e+01, 1.27400802e+02, 4.48176908e+01, 8.84278570e-01,
       3.15334234e-01, 7.11271866e-01, 3.92034774e-01, 5.05173646e-01,
      