testing supertable with KBest, balanced and CV

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import mutual_info_classif, f_classif, SelectKBest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [6]:
CLASS = 'consensus'

In [7]:
def balance_dataset(dataset, classe):
    y = dataset[classe]
    X = dataset.drop(columns=[classe])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    sm = SMOTE(random_state=2)

    X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())
    
    

    return X_train, X_test, y_train, y_test, X_train_res, y_train_res

In [8]:
def getKBest(X, y, score_func=f_classif, k=10):
    k_best = SelectKBest(score_func=score_func, k=10).fit(X, y)

    idxs = k_best.get_support(indices=True)
    X = X.iloc[:,idxs]
    return X

In [9]:
def split_dataset(dataset, y_name, missing_values=None):
    if missing_values:
        for value in missing_values:
            dataset = dataset[~dataset.eq(value).any(1)]
    
    X = dataset.iloc[:, dataset.columns != y_name]
    y = dataset[y_name]
    
    return X, y

In [10]:
def classifier_statistics(clf, X_train, X_test, y_train, y_test):
    res = {}
    
    clf.fit(X_train, y_train)
    
    predicted = clf.predict(X_test)
    conf_matrix = confusion_matrix(y_test, predicted, labels=[0.0, 1.0])
    acc_score = accuracy_score(y_test, predicted)
    
    res['predicted'] = predicted
    res['accuracy'] = acc_score
    res['confusion_matrix'] = conf_matrix
    fpr, tpr, _ = roc_curve(y_test, predicted)
    roc_auc = auc(fpr, tpr)
    res['auc'] = roc_auc
    
    res['clf'] = clf
    
    return res


In [11]:
def normalize(X_train, X_test):
    normalizer = Normalizer().fit(X_train)

    X_train_norm = normalizer.transform(X_train)
    X_test_norm = normalizer.transform(X_test)
    
    return X_train_norm, X_test_norm


In [12]:
clf1 = BernoulliNB()
clf2 = DecisionTreeClassifier()
clf3 = KNeighborsClassifier()
clf4 = RandomForestClassifier(n_estimators=10)

base_clfs = [BernoulliNB(), DecisionTreeClassifier(), KNeighborsClassifier(), RandomForestClassifier(n_estimators=10)]


In [13]:
green_data = pd.read_csv('../green.csv')
hinselmann_data = pd.read_csv('../hinselmann.csv')
schiller_data = pd.read_csv('../schiller.csv')

data = [[green_data,'green_data'], [hinselmann_data,'hinselmann_data'], [schiller_data,'schiller_data']]

green_data['hinselmann']=0
green_data['schiller']=0
hinselmann_data['hinselmann']=1
hinselmann_data['schiller']=0
schiller_data['hinselmann']=0
schiller_data['schiller']=1

super_table = green_data.append(hinselmann_data)
super_table = super_table.append(schiller_data)

X, y = split_dataset(super_table, CLASS)

In [14]:
"""results = {}
scoring = ['accuracy', 'roc_auc']
for clf in base_clfs:
    clf_name = type(clf).__name__
    stats = cross_validate(clf, X, y, scoring=scoring, cv=10, return_train_score=False)
    results[clf_name] = {}
    results[clf_name]['accuracy'] = np.mean(stats['test_accuracy'])
    results[clf_name]['roc'] = np.mean(stats['test_roc_auc'])
"""

"results = {}\nscoring = ['accuracy', 'roc_auc']\nfor clf in base_clfs:\n    clf_name = type(clf).__name__\n    stats = cross_validate(clf, X, y, scoring=scoring, cv=10, return_train_score=False)\n    results[clf_name] = {}\n    results[clf_name]['accuracy'] = np.mean(stats['test_accuracy'])\n    results[clf_name]['roc'] = np.mean(stats['test_roc_auc'])\n"

let's test unbalanced super dataset

In [15]:
results_unbal = {}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
for clf in base_clfs:
    clf_name = type(clf).__name__
    stats = classifier_statistics(clf, X_train, X_test, y_train, y_test)
    results_unbal[clf_name] = stats

measures_unbal = {}
i = 0
for clf in results_unbal:
    clf_res = results_unbal[clf]
    measures_unbal[i] = {'Classifier': clf, 'Measure': 'auc', 'Value': clf_res['auc']}
    i += 1

In [16]:
measures_unbal

{0: {'Classifier': 'BernoulliNB',
  'Measure': 'auc',
  'Value': 0.780264817150063},
 1: {'Classifier': 'DecisionTreeClassifier',
  'Measure': 'auc',
  'Value': 0.8215636822194199},
 2: {'Classifier': 'KNeighborsClassifier',
  'Measure': 'auc',
  'Value': 0.6018284993694829},
 3: {'Classifier': 'RandomForestClassifier',
  'Measure': 'auc',
  'Value': 0.8792559899117276}}

now let's explore balanced dataset

In [17]:
X_train_super, X_test_super, y_train_super, y_test_super, X_train_res_super, y_train_res_super = balance_dataset(super_table, 'consensus')
results_bal = {}

for clf in base_clfs:
    clf_name = type(clf).__name__
    stats = classifier_statistics(clf, X_train_res_super, X_test_super, y_train_res_super, y_test_super)
    results_bal[clf_name] = stats

measures_bal = {}
i = 0
for clf in results_bal:
    clf_res = results_bal[clf]
    measures_bal[i] = {'Classifier': clf, 'Measure': 'auc', 'Value': clf_res['auc']}
    i += 1


In [18]:
measures_bal

{0: {'Classifier': 'BernoulliNB',
  'Measure': 'auc',
  'Value': 0.917717528373266},
 1: {'Classifier': 'DecisionTreeClassifier',
  'Measure': 'auc',
  'Value': 0.8215636822194199},
 2: {'Classifier': 'KNeighborsClassifier',
  'Measure': 'auc',
  'Value': 0.599936948297604},
 3: {'Classifier': 'RandomForestClassifier',
  'Measure': 'auc',
  'Value': 0.8325977301387137}}

explorar 3 datasets juntos + SMOTE + normalização

In [19]:
X_train_super, X_test_super, y_train_super, y_test_super, X_train_res_super, y_train_res_super = balance_dataset(super_table, 'consensus')
X_train_norm, X_test_norm = normalize(X_train_res_super, X_test_super)

results_nor = {}

for clf in base_clfs:
    clf_name = type(clf).__name__
    stats = classifier_statistics(clf, X_train_norm, X_test_norm, y_train_res_super, y_test_super)
    results_nor[clf_name] = stats

measures_nor = {}
i = 0
for clf in results_nor:
    clf_res = results_nor[clf]
    measures_nor[i] = {'Classifier': clf, 'Measure': 'auc', 'Value': clf_res['auc']}
    i += 1


In [20]:
measures_nor

{0: {'Classifier': 'BernoulliNB',
  'Measure': 'auc',
  'Value': 0.917717528373266},
 1: {'Classifier': 'DecisionTreeClassifier',
  'Measure': 'auc',
  'Value': 0.7503152585119799},
 2: {'Classifier': 'KNeighborsClassifier',
  'Measure': 'auc',
  'Value': 0.6273644388398487},
 3: {'Classifier': 'RandomForestClassifier',
  'Measure': 'auc',
  'Value': 0.8080075662042876}}

In [21]:
clf1 = BernoulliNB()
clf2 = DecisionTreeClassifier()
clf3 = KNeighborsClassifier()
clf4 = RandomForestClassifier(n_estimators=10)
eclf1 = VotingClassifier(estimators=[('nb', clf1), ('bt', clf2), ('knn', clf3), ('rf', clf4)], voting='hard')

base_clfs = [BernoulliNB(), DecisionTreeClassifier(), KNeighborsClassifier(), RandomForestClassifier(n_estimators=10), eclf1]


In [22]:
X = getKBest(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

sm = SMOTE(random_state=2)

X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())

results_kbest = {}

for clf in base_clfs:
    clf_name = type(clf).__name__
    stats = classifier_statistics(clf, X_train_res, X_test, y_train_res, y_test)
    results_kbest[clf_name] = stats

measures_kbest = {}
i = 0
for clf in results_kbest:
    clf_res = results_kbest[clf]
    measures_kbest[i] = {'Classifier': clf, 'Measure': 'auc', 'Value': clf_res['auc']}
    i += 1



In [23]:
measures_kbest

{0: {'Classifier': 'BernoulliNB',
  'Measure': 'auc',
  'Value': 0.9508196721311475},
 1: {'Classifier': 'DecisionTreeClassifier',
  'Measure': 'auc',
  'Value': 0.8407944514501892},
 2: {'Classifier': 'KNeighborsClassifier',
  'Measure': 'auc',
  'Value': 0.6793820933165196},
 3: {'Classifier': 'RandomForestClassifier',
  'Measure': 'auc',
  'Value': 0.9451450189155108},
 4: {'Classifier': 'VotingClassifier',
  'Measure': 'auc',
  'Value': 0.882093316519546}}

In [24]:
results_overfit = {}
i = 0
n_estimators = 10
min_samples_split = 2
while min_samples_split < 100:
    clf = RandomForestClassifier(n_estimators=n_estimators, min_samples_split = min_samples_split, random_state=2)
    stats_train = classifier_statistics(clf, X_train_res, X_train_res, y_train_res, y_train_res)
    stats_test = classifier_statistics(clf, X_train_res, X_test, y_train_res, y_test)
    results_overfit[min_samples_split] = {'train': stats_train['auc'], 'test': stats_test['auc']}
    i += 1
    #n_estimators += 1
    min_samples_split += 2
results_overfit = pd.DataFrame.from_dict(results_overfit, "index")
results_overfit.to_csv('../plot_data/{}.csv'.format('overfiting_test_9'))

    

In [25]:
results_overfit_knn = {}
n = 3
while n <= 100:
    clf = KNeighborsClassifier(n_neighbors=n)
    stats_train = classifier_statistics(clf, X_train_res, X_train_res, y_train_res, y_train_res)
    stats_test = classifier_statistics(clf, X_train_res, X_test, y_train_res, y_test)
    results_overfit_knn[n] = {'train': stats_train['auc'], 'test': stats_test['auc']}
    n+=2
results_overfit = pd.DataFrame.from_dict(results_overfit_knn, "index")
results_overfit.to_csv('../plot_data/{}.csv'.format('overfiting_test_8'))

In [32]:
param_RF = dict(n_estimators=list(range(3, 40)), max_depth=list(range(4, 30)), min_samples_split=list(range(10, 100)))
clf_RF = RandomForestClassifier(random_state=42)

rand_RF = RandomizedSearchCV(clf_RF, param_RF, cv=10, scoring='roc_auc', 
                             n_iter=10, random_state=5, return_train_score=False)

rand_RF.fit(X_train_res, y_train_res)
r = pd.DataFrame(rand_RF.cv_results_)[['mean_test_score', 'std_test_score', 'params']]
r.to_csv('../plot_data/{}.csv'.format('RandomizedSearchCV'))


In [56]:
results_depth = {}
i = 0
min_samples_split = 2
max_depth = 1
while min_samples_split < 70:
    clf = DecisionTreeClassifier(min_samples_split=min_samples_split, max_depth=max_depth, random_state=2)
    stats_train = classifier_statistics(clf, X_train_res, X_train_res, y_train_res, y_train_res)
    stats_test = classifier_statistics(clf, X_train_res, X_test, y_train_res, y_test)
    results_depth[stats_train['clf'].tree_.node_count] = {'train': stats_train['auc'], 'test': stats_test['auc'] }
    min_samples_split += 1
    if max_depth and max_depth < 3:
        max_depth += 1
    else:
        max_depth = None
results_overfit = pd.DataFrame.from_dict(results_depth, "index")
results_overfit.to_csv('../plot_data/{}.csv'.format('tree_size'))

    