In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import mutual_info_classif, f_classif, SelectKBest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA

In [2]:
# Module-level classifier instance; the evaluation cells below construct
# their own GaussianNB() and do not read this name.
clf = GaussianNB()

# Load the three consensus tables from the parent directory.
green_data = pd.read_csv('../green_consensus.csv')
hinselmann_data = pd.read_csv('../hinselmann_consensus.csv')
schiller_data = pd.read_csv('../schiller_consensus.csv')

# Names of the five per-expert columns that are removed from the feature set.
experts = ['experts::{}'.format(idx) for idx in range(5)]

# `col`: the three tables stacked row-wise, WITHOUT source-indicator columns.
# It must be built before the indicator columns are added to the frames below.
col = pd.concat([green_data, hinselmann_data, schiller_data]).drop(columns=experts)

# Tag every source frame (in place) with indicator columns telling which
# file each row came from: 1 in the frame's own column, 0 in the other two.
source_frames = {
    'green': green_data,
    'hinselmann': hinselmann_data,
    'schiller': schiller_data,
}
for source_name, source_frame in source_frames.items():
    for indicator in source_frames:
        source_frame[indicator] = 1 if indicator == source_name else 0

# `col_conc`: same stacking as `col`, but including the indicator columns.
# NOTE(review): no visible cell reads `col_conc`; X and y below come from `col`.
col_conc = pd.concat([green_data, hinselmann_data, schiller_data]).drop(columns=experts)

# Features are every remaining column except the label; 'consensus' is the label.
y = col['consensus']
X = col.drop(columns=['consensus'])

# `res` collects one metrics record per experiment, keyed by the counter `i`.
res, i = {}, 0

In [3]:
# Baseline: Gaussian naive Bayes on the untransformed features, scored with
# 10-fold cross-validation. Both metrics come from a single cross_validate
# pass, so each fold's model is fitted once instead of once per metric; the
# integer `cv=10` split is the same one cross_val_score would use.
cv_scores = cross_validate(GaussianNB(), X, y, scoring=('accuracy', 'roc_auc'), cv=10)
accuracy = np.mean(cv_scores['test_accuracy'])
roc_auc = np.mean(cv_scores['test_roc_auc'])

# Record the fold-averaged metrics under the current experiment index.
res[i] = {'accuracy': accuracy, 'roc_auc': roc_auc, 'transformation': 'baseline'}
i += 1

In [4]:
# Show the baseline mean accuracy and mean ROC AUC, one value per line.
print(accuracy, roc_auc, sep='\n')

0.7318418171866449
0.6828061224489795


In [5]:
def getKBest(X, y, score_func=f_classif, k=10):
    """Return ``X`` restricted to its ``k`` best-scoring feature columns.

    A ``SelectKBest`` selector is fitted on ``(X, y)`` and the columns it
    keeps are returned in their original order.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like
        Feature matrix, one column per feature.
    y : array-like
        Target labels handed to the selector.
    score_func : callable, default ``f_classif``
        Scoring function forwarded to ``SelectKBest``.
    k : int, default 10
        Number of columns to keep. Earlier versions declared a default of 20
        but always selected 10 because the argument was ignored; the default
        is now 10 so that calls relying on it keep selecting the same columns,
        while an explicitly passed ``k`` is honoured.

    Returns
    -------
    pandas.DataFrame or numpy.ndarray
        The selected columns; a DataFrame when ``X`` is one, otherwise an array.
    """
    selector = SelectKBest(score_func=score_func, k=k).fit(X, y)

    # Integer positions of the columns the selector kept.
    kept = selector.get_support(indices=True)

    # Positional indexing keeps the DataFrame column labels; plain arrays
    # (anything without .iloc) are sliced directly.
    if hasattr(X, 'iloc'):
        return X.iloc[:, kept]
    return np.asarray(X)[:, kept]

In [6]:
# Reduce the feature matrix to the columns picked by getKBest (default
# scoring function and default k).
# NOTE(review): the selection is fitted on all rows and labels before any
# cross-validation split, so scores computed on X_best may be optimistic.
X_best = getKBest(X=X, y=y)

In [7]:
# Gaussian naive Bayes on the selected feature subset, scored with 10-fold
# cross-validation. Both metrics come from a single cross_validate pass, so
# each fold's model is fitted once instead of once per metric; the integer
# `cv=10` split is the same one cross_val_score would use.
# NOTE(review): X_best was chosen using every label before splitting, so
# these numbers may be optimistically biased — confirm whether that is intended.
cv_scores = cross_validate(GaussianNB(), X_best, y, scoring=('accuracy', 'roc_auc'), cv=10)
accuracy = np.mean(cv_scores['test_accuracy'])
roc_auc = np.mean(cv_scores['test_roc_auc'])

# Record the fold-averaged metrics under the current experiment index.
res[i] = {'accuracy': accuracy, 'roc_auc': roc_auc, 'transformation': 'Feature Selection'}
i += 1

In [8]:
# Show the feature-selection mean accuracy and mean ROC AUC, one per line.
print(accuracy, roc_auc, sep='\n')

0.787383689107827
0.7764115646258505


In [9]:
# Project the features onto the principal components that together explain
# 95% of the variance. `y` is passed along but PCA is unsupervised.
# NOTE(review): no scaling step is applied to X in the visible cells before
# this projection — confirm that is intended.
X_pca = PCA(n_components=0.95).fit_transform(X, y)

In [10]:
# Gaussian naive Bayes on the PCA-projected features, scored with 10-fold
# cross-validation. Both metrics come from a single cross_validate pass, so
# each fold's model is fitted once instead of once per metric; the integer
# `cv=10` split is the same one cross_val_score would use.
cv_scores = cross_validate(GaussianNB(), X_pca, y, scoring=('accuracy', 'roc_auc'), cv=10)
accuracy = np.mean(cv_scores['test_accuracy'])
roc_auc = np.mean(cv_scores['test_roc_auc'])

# Record the fold-averaged metrics under the current experiment index.
res[i] = {'accuracy': accuracy, 'roc_auc': roc_auc, 'transformation': 'PCA'}
i += 1

In [11]:
# Show the PCA mean accuracy and mean ROC AUC, one value per line.
print(accuracy, roc_auc, sep='\n')

0.7207307060755338
0.5860884353741496


In [12]:
# Randomly under-sample with a fixed seed and a float sampling strategy of 0.7
# (see the imbalanced-learn documentation for how a float is interpreted).
# `sampling_strategy` and `fit_resample` are the supported spellings of the
# deprecated-then-removed `ratio` argument and `fit_sample` method, so this
# cell keeps working on current imbalanced-learn releases.
balancer = RandomUnderSampler(random_state=42, sampling_strategy=0.7)

# NOTE: this rebinds X and y to the resampled data; every later cell (and any
# earlier cell re-run afterwards) sees the under-sampled set.
X, y = balancer.fit_resample(X, y)



In [13]:
# Gaussian naive Bayes on the current X and y, scored with 10-fold
# cross-validation. Both metrics come from a single cross_validate pass, so
# each fold's model is fitted once instead of once per metric; the integer
# `cv=10` split is the same one cross_val_score would use.
# NOTE(review): if X and y were resampled before this cell, the held-out
# folds are drawn from the resampled data too — confirm that is intended.
cv_scores = cross_validate(GaussianNB(), X, y, scoring=('accuracy', 'roc_auc'), cv=10)
accuracy = np.mean(cv_scores['test_accuracy'])
roc_auc = np.mean(cv_scores['test_roc_auc'])

# Record the fold-averaged metrics under the current experiment index.
res[i] = {'accuracy': accuracy, 'roc_auc': roc_auc, 'transformation': 'Balancing'}
i += 1

In [14]:
# Show the balancing mean accuracy and mean ROC AUC, one value per line.
print(accuracy, roc_auc, sep='\n')

0.6462280701754386
0.6168154761904762


In [16]:
# Turn the collected results into a table (one row per experiment index,
# one column per recorded field) and write it to '<filename>.csv' in the
# working directory, leaving the experiment index out of the file.
filename = 'nb_results'
measures = pd.DataFrame.from_dict(res, orient="index")
output_path = '{}.csv'.format(filename)
measures.to_csv(output_path, index=False)