In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import mutual_info_classif, f_classif, SelectKBest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA

In [2]:
# Read the three consensus CSV files that sit one directory above the notebook.
green_data = pd.read_csv('../green_consensus.csv')
hinselmann_data = pd.read_csv('../hinselmann_consensus.csv')
schiller_data = pd.read_csv('../schiller_consensus.csv')

# Column names 'experts::0' ... 'experts::4', which are removed below.
experts = list(map('experts::{}'.format, range(5)))

# Stack the three tables row-wise and drop the 'experts::N' columns.
col = pd.concat([green_data, hinselmann_data, schiller_data]).drop(columns=experts)

# 'consensus' is the target; every remaining column is a feature.
y = col['consensus']
X = col.drop(columns=['consensus'])

# Result accumulator (one dict per experiment, keyed by a running integer)
# and the counter that supplies the next key.
res, i = {}, 0

In [3]:
# Candidate values for the tree's ``min_samples_split``: even integers 2, 4, ..., 24.
min_samples_split = np.arange(start=2, stop=25, step=2)

In [4]:
# Sweep over ``min_samples_split`` on the data as loaded (before any resampling).
# Both metrics come from a single ``cross_validate`` pass, so each candidate is
# fitted once per fold instead of twice, and accuracy / ROC AUC are measured on
# the very same fitted trees.
for samples in min_samples_split:
    # Fixed seed: the tree picks randomly between equally good splits, so an
    # unseeded run would not reproduce the recorded numbers.
    clf = DecisionTreeClassifier(min_samples_split=samples, random_state=42)
    scores = cross_validate(clf, X, y, scoring=('accuracy', 'roc_auc'), cv=10)
    accuracy = np.mean(scores['test_accuracy'])
    roc_auc = np.mean(scores['test_roc_auc'])
    res[i] = {'accuracy': accuracy, 'roc_auc': roc_auc, 'min_samples_split': samples, 'transformation': 'baseline'}
    i += 1

In [5]:
# Candidate values for the tree's ``max_features``: even integers 10, 12, ..., 60.
max_features = np.arange(10, 62, 2)

# Sweep over ``max_features`` on the data as loaded (before any resampling).
# Both metrics come from a single ``cross_validate`` pass, so each candidate is
# fitted once per fold instead of twice, and accuracy / ROC AUC are measured on
# the very same fitted trees.
# NOTE(review): ``min_samples_split`` is pinned to 23 here, which is not one of
# the even values in the ``min_samples_split`` grid — confirm how it was chosen.
for samples in max_features:  # ``samples`` holds a max_features value in this loop
    # Fixed seed: with ``max_features`` below the feature count the tree draws a
    # random feature subset at every split, so an unseeded run is not reproducible.
    clf = DecisionTreeClassifier(min_samples_split=23, max_features=samples, random_state=42)
    scores = cross_validate(clf, X, y, scoring=('accuracy', 'roc_auc'), cv=10)
    accuracy = np.mean(scores['test_accuracy'])
    roc_auc = np.mean(scores['test_roc_auc'])
    res[i] = {'accuracy': accuracy, 'roc_auc': roc_auc, 'max_features': samples, 'transformation': 'baseline'}
    i += 1

In [6]:
# Display the results dictionary collected so far (one entry per evaluated setting).
res

{0: {'accuracy': 0.7444809341361065,
  'roc_auc': 0.6918452380952381,
  'min_samples_split': 2,
  'transformation': 'baseline'},
 1: {'accuracy': 0.7626117496807152,
  'roc_auc': 0.6507142857142857,
  'min_samples_split': 4,
  'transformation': 'baseline'},
 2: {'accuracy': 0.7272395548257617,
  'roc_auc': 0.6985289115646258,
  'min_samples_split': 6,
  'transformation': 'baseline'},
 3: {'accuracy': 0.7377257799671593,
  'roc_auc': 0.6949574829931973,
  'min_samples_split': 8,
  'transformation': 'baseline'},
 4: {'accuracy': 0.712962962962963,
  'roc_auc': 0.6787499999999999,
  'min_samples_split': 10,
  'transformation': 'baseline'},
 5: {'accuracy': 0.747824302134647,
  'roc_auc': 0.7055017006802721,
  'min_samples_split': 12,
  'transformation': 'baseline'},
 6: {'accuracy': 0.7407954752782339,
  'roc_auc': 0.7186649659863945,
  'min_samples_split': 14,
  'transformation': 'baseline'},
 7: {'accuracy': 0.7476920270023719,
  'roc_auc': 0.7430697278911564,
  'min_samples_split': 16,

In [7]:
# Oversample with SMOTE (seeded for reproducibility). ``X`` and ``y`` are rebound
# to the resampled data, so every later cell works on the resampled set.
# ``fit_resample`` replaces ``fit_sample``, which was deprecated in
# imbalanced-learn 0.4 and removed in 0.8.
# NOTE(review): resampling before cross-validation lets synthetic points built
# from a validation fold's neighbours reach the training folds, so the scores
# computed afterwards are likely optimistic — consider an imblearn ``Pipeline``
# so that SMOTE is fitted on each training fold only.
balancer = SMOTE(random_state=42)
X, y = balancer.fit_resample(X, y)

In [8]:
# Sweep over ``min_samples_split`` on the current ``X`` / ``y``, recorded under
# the 'balancing' transformation label.
# Both metrics come from a single ``cross_validate`` pass, so each candidate is
# fitted once per fold instead of twice, and accuracy / ROC AUC are measured on
# the very same fitted trees.
for samples in min_samples_split:
    # Fixed seed: the tree picks randomly between equally good splits, so an
    # unseeded run would not reproduce the recorded numbers.
    clf = DecisionTreeClassifier(min_samples_split=samples, random_state=42)
    scores = cross_validate(clf, X, y, scoring=('accuracy', 'roc_auc'), cv=10)
    accuracy = np.mean(scores['test_accuracy'])
    roc_auc = np.mean(scores['test_roc_auc'])
    res[i] = {'accuracy': accuracy, 'roc_auc': roc_auc, 'min_samples_split': samples, 'transformation': 'balancing'}
    i += 1

In [9]:
# Candidate values for the tree's ``max_features``: even integers 10, 12, ..., 60.
max_features = np.arange(10, 62, 2)

# Sweep over ``max_features`` on the current ``X`` / ``y``, recorded under the
# 'balancing' transformation label.
# Both metrics come from a single ``cross_validate`` pass, so each candidate is
# fitted once per fold instead of twice, and accuracy / ROC AUC are measured on
# the very same fitted trees.
# NOTE(review): ``min_samples_split`` is pinned to 23 here, which is not one of
# the even values in the ``min_samples_split`` grid — confirm how it was chosen.
for samples in max_features:  # ``samples`` holds a max_features value in this loop
    # Fixed seed: with ``max_features`` below the feature count the tree draws a
    # random feature subset at every split, so an unseeded run is not reproducible.
    clf = DecisionTreeClassifier(min_samples_split=23, max_features=samples, random_state=42)
    scores = cross_validate(clf, X, y, scoring=('accuracy', 'roc_auc'), cv=10)
    accuracy = np.mean(scores['test_accuracy'])
    roc_auc = np.mean(scores['test_roc_auc'])
    res[i] = {'accuracy': accuracy, 'roc_auc': roc_auc, 'max_features': samples, 'transformation': 'balancing'}
    i += 1

In [10]:
# Turn the results dictionary into a table (one row per entry of ``res``) and
# write it to '<filename>.csv'. The integer keys become the frame's index and
# are not written to the file (``index=False``).
filename = 'dt_results'
measures = pd.DataFrame.from_dict(res, orient="index")
output_path = '{}.csv'.format(filename)
measures.to_csv(output_path, index=False)