In [34]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import mutual_info_classif, f_classif, SelectKBest
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [35]:
# Name of the label column; passed to split_dataset() as the column to separate from the features.
CLASS = 'consensus'

In [36]:
def balance_dataset(dataset, classe):
    """Split ``dataset`` into train/test parts and oversample the training part with SMOTE.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table holding the feature columns together with the label column.
    classe : str
        Name of the label column inside ``dataset``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, X_train_res, y_train_res)``.
        The first four items are the output of ``train_test_split`` with
        ``test_size=0.3`` and ``random_state=0``. The last two are the
        training features and labels after ``SMOTE(random_state=2)``
        resampling. Only the training split is resampled; the test split
        is returned exactly as produced by the split.
    """
    y = dataset[classe]
    X = dataset.drop(columns=[classe])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    sm = SMOTE(random_state=2)

    # ``fit_resample`` is the imbalanced-learn resampling entry point
    # (``fit_sample`` no longer exists in current releases).
    # The labels are handed over as a plain 1-D NumPy array so that
    # ``y_train_res`` comes back as an array as well.
    X_train_res, y_train_res = sm.fit_resample(X_train, y_train.to_numpy())

    return X_train, X_test, y_train, y_test, X_train_res, y_train_res

In [37]:
def getKBest(X, y, score_func=f_classif, k=10):
    """Return ``X`` restricted to its ``k`` best-scoring feature columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; columns are selected positionally with ``.iloc``.
    y : array-like
        Target values used to score the features.
    score_func : callable, default ``f_classif``
        Scoring function forwarded to ``SelectKBest``.
    k : int, default 10
        Number of columns to keep; forwarded to ``SelectKBest``.

    Returns
    -------
    pandas.DataFrame
        The selected columns of ``X``, in their original column order.
    """
    # Forward the caller's ``k`` so the argument actually controls the selection size.
    k_best = SelectKBest(score_func=score_func, k=k).fit(X, y)

    # Integer positions of the retained columns.
    idxs = k_best.get_support(indices=True)
    return X.iloc[:, idxs]

In [38]:
def split_dataset(dataset, y_name, missing_values=None):
    """Separate a table into features ``X`` and target ``y``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing the feature columns and the target column.
    y_name : str
        Name of the target column.
    missing_values : iterable, optional
        Marker values that denote missing data. Every row in which any
        cell equals one of these markers is dropped before the split.
        ``None`` or an empty collection leaves the rows untouched.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds every column except ``y_name`` and
        ``y`` is the ``y_name`` column, both taken from the filtered rows.
    """
    if missing_values:
        for value in missing_values:
            # ``axis`` is passed by keyword: the positional form ``any(1)``
            # is rejected by pandas 2.x.
            has_marker = dataset.eq(value).any(axis=1)
            dataset = dataset[~has_marker]

    X = dataset.iloc[:, dataset.columns != y_name]
    y = dataset[y_name]

    return X, y

In [39]:
# Individually named classifier instances.
# NOTE(review): the tree and forest models are built without a random_state,
# so their scores presumably vary from run to run -- confirm whether that is intended.
clf1, clf2, clf3, clf4 = (
    BernoulliNB(),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    RandomForestClassifier(n_estimators=10),
)

# Separate, freshly constructed instances of the same four models,
# collected in a list so they can be evaluated in a loop.
base_clfs = [
    BernoulliNB(),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    RandomForestClassifier(n_estimators=10),
]


In [55]:
# Load the three source tables.
green_data = pd.read_csv('../green.csv')
hinselmann_data = pd.read_csv('../hinselmann.csv')
schiller_data = pd.read_csv('../schiller.csv')

# Each table paired with a display name.
data = [[green_data,'green_data'], [hinselmann_data,'hinselmann_data'], [schiller_data,'schiller_data']]

# Indicator columns recording which file a row came from:
# green rows carry 0/0, hinselmann rows 1/0, schiller rows 0/1.
green_data['hinselmann']=0
green_data['schiller']=0
hinselmann_data['hinselmann']=1
hinselmann_data['schiller']=0
schiller_data['hinselmann']=0
schiller_data['schiller']=1

# Stack the three tables row-wise in green, hinselmann, schiller order.
# pd.concat is used because DataFrame.append no longer exists in pandas 2.x.
# The index labels of each table are kept as read (they are not renumbered).
super_table = pd.concat([green_data, hinselmann_data, schiller_data])

X, y = split_dataset(super_table, CLASS)

In [65]:
# Score every base classifier with 10-fold cross-validation and keep the
# mean test accuracy and mean test ROC AUC, keyed by the classifier's class name.
scoring = ['accuracy', 'roc_auc']
results = {}
for clf in base_clfs:
    clf_name = type(clf).__name__
    stats = cross_validate(clf, X, y, scoring=scoring, cv=10, return_train_score=False)
    results[clf_name] = {
        'accuracy': np.mean(stats['test_accuracy']),
        'roc': np.mean(stats['test_roc_auc']),
    }


In [66]:
# Display the mean cross-validated accuracy and ROC AUC stored for each classifier.
results

{'BernoulliNB': {'accuracy': 0.898390804597701, 'roc': 0.9770214904143476},
 'DecisionTreeClassifier': {'accuracy': 0.8679474548440066,
  'roc': 0.8311958874458876},
 'KNeighborsClassifier': {'accuracy': 0.7519293924466338,
  'roc': 0.614612322201608},
 'RandomForestClassifier': {'accuracy': 0.8919622331691297,
  'roc': 0.9490994124922695}}