In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import mutual_info_classif, f_classif, SelectKBest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA

In [2]:
# Load the three consensus tables and stack them row-wise into one frame.
green_data = pd.read_csv('../green_consensus.csv')
hinselmann_data = pd.read_csv('../hinselmann_consensus.csv')
schiller_data = pd.read_csv('../schiller_consensus.csv')

col = pd.concat([green_data, hinselmann_data, schiller_data])

# Drop every per-expert label column by prefix instead of a hard-coded
# 'experts::0'..'experts::4' list: any additional 'experts::N' column would
# otherwise remain in X and act as a leaked label when predicting 'consensus'.
# NOTE(review): the public colposcopy dataset documents six experts
# (experts::0..experts::5) - confirm against these CSVs.
experts = [name for name in col.columns if str(name).startswith('experts::')]
col = col.drop(columns=experts)

# Features are all remaining columns; the target is the 'consensus' column.
X, y = col.drop(columns=['consensus']), col['consensus']

# res collects one record per evaluated configuration, keyed by the running
# counter i that the evaluation cells below increment.
res = {}
i = 0

In [3]:
# Candidate neighbourhood sizes for k-NN: the odd integers 1, 3, ..., 21.
ks = 1 + 2 * np.arange(11)

In [4]:
# Baseline: evaluate k-NN on the untransformed features for every k in ks.
for k in ks:
    clf = KNeighborsClassifier(n_neighbors=k)
    # A single cross_validate call (cv=10) scores both metrics on the same
    # fitted folds, instead of running the whole cross-validation twice.
    scores = cross_validate(clf, X, y, scoring=('accuracy', 'roc_auc'), cv=10)
    accuracy = np.mean(scores['test_accuracy'])
    roc_auc = np.mean(scores['test_roc_auc'])
    res[i] = {
        'accuracy': accuracy,
        'roc_auc': roc_auc,
        'k': k,
        'transformation': 'baseline',
    }
    i += 1

In [5]:
def normalize(X):
    """Rescale ``X`` with a default-configured scikit-learn ``Normalizer``.

    Parameters
    ----------
    X : array-like
        Data accepted by ``Normalizer.fit`` / ``Normalizer.transform``.

    Returns
    -------
    The result of fitting a fresh ``Normalizer`` on ``X`` and transforming
    that same ``X`` with it.
    """
    return Normalizer().fit(X).transform(X)

# Normalized copy of the full feature table, used by the next evaluation cell.
X_norm = normalize(X)

In [6]:
# Evaluate k-NN on the normalized features (X_norm) for every k in ks.
for k in ks:
    clf = KNeighborsClassifier(n_neighbors=k)
    # A single cross_validate call (cv=10) scores both metrics on the same
    # fitted folds, instead of running the whole cross-validation twice.
    scores = cross_validate(clf, X_norm, y, scoring=('accuracy', 'roc_auc'), cv=10)
    accuracy = np.mean(scores['test_accuracy'])
    roc_auc = np.mean(scores['test_roc_auc'])
    res[i] = {
        'accuracy': accuracy,
        'roc_auc': roc_auc,
        'k': k,
        'transformation': 'normalization',
    }
    i += 1

In [7]:
def getKBest(X, y, score_func=f_classif, k=20):
    """Keep the ``k`` columns of ``X`` that rank highest under ``score_func``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; the selected columns are taken positionally via ``.iloc``.
    y : array-like
        Target values passed to ``SelectKBest.fit``.
    score_func : callable, default ``f_classif``
        Scoring function forwarded to ``SelectKBest``.
    k : int, default 20
        Number of columns to keep. This argument is now forwarded to
        ``SelectKBest``; previously it was ignored and 10 was always used.

    Returns
    -------
    pandas.DataFrame
        ``X`` restricted to the selected columns.
    """
    selector = SelectKBest(score_func=score_func, k=k).fit(X, y)

    idxs = selector.get_support(indices=True)
    return X.iloc[:, idxs]

# k=10 is passed explicitly so this cell keeps selecting the same 10 features
# it always did (the function used to hard-code 10 regardless of ``k``).
# NOTE(review): the selector is fitted on the full data set before
# cross-validation, so the CV scores below may be optimistic - confirm whether
# selection should happen inside each fold instead.
X_best = getKBest(X, y, k=10)
X_best = normalize(X_best)

In [8]:
# Evaluate k-NN on the selected-then-normalized features (X_best) for every k.
for k in ks:
    clf = KNeighborsClassifier(n_neighbors=k)
    # A single cross_validate call (cv=10) scores both metrics on the same
    # fitted folds, instead of running the whole cross-validation twice.
    scores = cross_validate(clf, X_best, y, scoring=('accuracy', 'roc_auc'), cv=10)
    accuracy = np.mean(scores['test_accuracy'])
    roc_auc = np.mean(scores['test_roc_auc'])
    res[i] = {
        'accuracy': accuracy,
        'roc_auc': roc_auc,
        'k': k,
        'transformation': 'normalization and feature selection',
    }
    i += 1

In [9]:
# Oversample with SMOTE (fixed seed) and rebind X, y to the resampled data.
# NOTE(review): resampling happens before cross-validation, so synthetic
# samples can end up in validation folds and the 'balance' scores may be
# optimistic - confirm whether resampling should be done per training fold.
balancer = SMOTE(random_state=42)
if hasattr(balancer, 'fit_resample'):
    # Current imbalanced-learn API; `fit_sample` was renamed to this and
    # later removed.
    X, y = balancer.fit_resample(X, y)
else:
    # Fallback for old imbalanced-learn releases that only expose fit_sample.
    X, y = balancer.fit_sample(X, y)

In [10]:
# Evaluate k-NN for every k in ks on the current X, y (the SMOTE-resampled
# data when the cells are run in order).
for k in ks:
    clf = KNeighborsClassifier(n_neighbors=k)
    # A single cross_validate call (cv=10) scores both metrics on the same
    # fitted folds, instead of running the whole cross-validation twice.
    scores = cross_validate(clf, X, y, scoring=('accuracy', 'roc_auc'), cv=10)
    accuracy = np.mean(scores['test_accuracy'])
    roc_auc = np.mean(scores['test_roc_auc'])
    res[i] = {
        'accuracy': accuracy,
        'roc_auc': roc_auc,
        'k': k,
        'transformation': 'balance',
    }
    i += 1

In [12]:
# Turn the collected records into a table (one row per key of res) and write
# it to '<filename>.csv' without the index column.
filename = 'knn_results'
measures = pd.DataFrame.from_dict(res, orient="index")
output_path = '{}.csv'.format(filename)
measures.to_csv(output_path, index=False)