In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import mutual_info_classif, f_classif, SelectKBest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc

In [2]:
CLASS = 'consensus'

In [3]:
clf1 = BernoulliNB()
clf2 = DecisionTreeClassifier()
clf3 = KNeighborsClassifier()
clf4 = RandomForestClassifier(n_estimators=10)

base_clfs = [BernoulliNB(), DecisionTreeClassifier(), KNeighborsClassifier(), RandomForestClassifier(n_estimators=10)]


In [4]:
green_data = pd.read_csv('../green.csv')
hinselmann_data = pd.read_csv('../hinselmann.csv')
schiller_data = pd.read_csv('../schiller.csv')

data = [[green_data,'green_data'], [hinselmann_data,'hinselmann_data'], [schiller_data,'schiller_data']]


In [5]:
def classifier_statistics(clf, X_train, X_test, y_train, y_test):
    res = {}
    
    clf.fit(X_train, y_train)
    
    predicted = clf.predict(X_test)
    conf_matrix = confusion_matrix(y_test, predicted, labels=[0.0, 1.0])
    acc_score = accuracy_score(y_test, predicted)
    
    res['predicted'] = predicted
    res['accuracy'] = acc_score
    res['confusion_matrix'] = conf_matrix
    fpr, tpr, _ = roc_curve(y_test, predicted)
    roc_auc = auc(fpr, tpr)
    res['auc'] = roc_auc
    
    res['clf'] = clf
    
    return res


In [9]:
def split_dataset(dataset, y_name, missing_values=None):
    if missing_values:
        for value in missing_values:
            dataset = dataset[~dataset.eq(value).any(1)]
    
    X = dataset.iloc[:, dataset.columns != y_name]
    y = dataset[y_name]
    
    return X, y

In [10]:
res = []
for d in data:
    results_base = {}
    X, y = split_dataset(d[0], CLASS)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    for clf in base_clfs:
        clf_name = type(clf).__name__
        stats = classifier_statistics(clf, X_train, X_test, y_train, y_test)
        results_base[clf_name] = stats
    res.append(results_base)
    measures_base = {}
    i = 0
    for clf in results_base:
        clf_res = results_base[clf]
        measures_base[i] = {'Classifier': clf, 'Measure': 'auc', 'Value': clf_res['auc']}
        i += 1

In [15]:
aux = 0
for d in res:
    aux += d['BernoulliNB']['auc']

In [17]:
aux/3

0.7654320987654323