In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys
sys.path.append('..')
from utils_cd import (
        split_dataset,
        standard_deviation,
        plot_comparison_results,
        impute_values,
        plot_results,
        plot_param_improv,
        plot_results_from_csv,
        aps_classifier_statistics,
        aps_score,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer, StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA
from sklearn.base import clone
from sklearn import tree
import graphviz

pd.options.display.max_columns = None
sns.set(style='darkgrid')

CLASS = 'class'

def getData():
    train = pd.read_csv('./train.csv')
    test = pd.read_csv('./test.csv')

    X_train, y_train = split_dataset(train, CLASS)
    X_test, y_test = split_dataset(test, CLASS)
    y_train = y_train.map({'pos': 1, 'neg': 0})
    y_test = y_test.map({'pos': 1, 'neg': 0})
    
    return X_train, X_test, y_train, y_test

def getDataWithThresh():
    train = pd.read_csv('./train.csv')
    test = pd.read_csv('./test.csv')

    train = train.dropna(thresh=150)
    train = train.dropna(axis=1, thresh=train.shape[0]*0.9)
    test = test[train.columns]

    X_train, y_train = split_dataset(train, CLASS)
    X_test, y_test = split_dataset(test, CLASS)
    y_train = y_train.map({'pos': 1, 'neg': 0})
    y_test = y_test.map({'pos': 1, 'neg': 0})
    
    return X_train, X_test, y_train, y_test

data = {}
clf = RandomForestClassifier(random_state=42)
min_samples = np.concatenate([np.arange(5, 51, 5), np.arange(100, 1001, 100)])
n_estimators = np.concatenate([np.arange(10, 71, 10), np.arange(100, 1001, 100)])
results = {}
res_i = 0

In [12]:
X_train, X_test, y_train, y_test = getData()
X_train, X_test = X_train.fillna(X_train.median()), X_test.fillna(X_train.median())
def balance(X_train, y_train):
    balancer = RandomUnderSampler(ratio=0.3, random_state=42)
    X_train_bal, y_train_bal = balancer.fit_sample(X_train, y_train)
    return X_train_bal, y_train_bal

X_train, y_train = balance(X_train, y_train)




In [13]:
for samples in min_samples:
    res = aps_classifier_statistics(RandomForestClassifier(min_samples_split=samples, n_estimators=500), X_train, X_test, y_train, y_test)
    res_train = aps_classifier_statistics(RandomForestClassifier(min_samples_split=samples), X_train, X_train, y_train, y_train)
    results[res_i] = {'Test': res['auc'], 'Min Samples to Split': samples}
    res_i += 1
    results[res_i] = {'Train': res_train['auc'], 'Min Samples to Split': samples}
    res_i += 1
    print(res)
    print(res_train)



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9709649122807018, 'confusion_matrix': array([[10893,   310],
       [   21,   176]]), 'sensibility': 0.8934010152284264, 'specificity': 0.9723288404891547, 'auc': 0.9328649278587906, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 13600}
{'predicted': array([0, 0, 0, ..., 1, 1, 1]), 'accuracy': 0.9926301243666513, 'confusion_matrix': array([[1664,    6],
       [  10,  491]]), 'sensibility': 0.9800399201596807, 'specificity': 0.9964071856287425, 'auc': 0.9882235528942116, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, crit



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9697368421052631, 'confusion_matrix': array([[10878,   325],
       [   20,   177]]), 'sensibility': 0.8984771573604061, 'specificity': 0.9709899134160492, 'auc': 0.9347335353882277, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 13250}
{'predicted': array([0, 0, 0, ..., 1, 1, 1]), 'accuracy': 0.988945186549977, 'confusion_matrix': array([[1656,   14],
       [  10,  491]]), 'sensibility': 0.9800399201596807, 'specificity': 0.9916167664670659, 'auc': 0.9858283433133732, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, crit



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9685964912280701, 'confusion_matrix': array([[10867,   336],
       [   22,   175]]), 'sensibility': 0.8883248730964467, 'specificity': 0.9700080335624386, 'auc': 0.9291664533294426, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=15,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 14360}
{'predicted': array([0, 0, 0, ..., 1, 1, 1]), 'accuracy': 0.9861814831874712, 'confusion_matrix': array([[1654,   16],
       [  14,  487]]), 'sensibility': 0.9720558882235529, 'specificity': 0.9904191616766467, 'auc': 0.9812375249500999, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, cri



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9678947368421053, 'confusion_matrix': array([[10858,   345],
       [   21,   176]]), 'sensibility': 0.8934010152284264, 'specificity': 0.9692046773185754, 'auc': 0.9313028462735009, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=20,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 13950}
{'predicted': array([0, 0, 0, ..., 1, 1, 1]), 'accuracy': 0.9806540764624597, 'confusion_matrix': array([[1645,   25],
       [  17,  484]]), 'sensibility': 0.9660678642714571, 'specificity': 0.9850299401197605, 'auc': 0.9755489021956087, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, cri



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9677192982456141, 'confusion_matrix': array([[10855,   348],
       [   20,   177]]), 'sensibility': 0.8984771573604061, 'specificity': 0.9689368919039543, 'auc': 0.9337070246321801, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=25,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 13480}
{'predicted': array([0, 0, 0, ..., 1, 1, 1]), 'accuracy': 0.976508521418701, 'confusion_matrix': array([[1637,   33],
       [  18,  483]]), 'sensibility': 0.9640718562874252, 'specificity': 0.9802395209580839, 'auc': 0.9721556886227545, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, crit



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9674561403508772, 'confusion_matrix': array([[10853,   350],
       [   21,   176]]), 'sensibility': 0.8934010152284264, 'specificity': 0.9687583682942069, 'auc': 0.9310796917613167, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=30,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 14000}
{'predicted': array([0, 0, 0, ..., 1, 1, 1]), 'accuracy': 0.9723629663749425, 'confusion_matrix': array([[1632,   38],
       [  22,  479]]), 'sensibility': 0.9560878243512974, 'specificity': 0.9772455089820359, 'auc': 0.9666666666666666, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, cri



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9667543859649123, 'confusion_matrix': array([[10845,   358],
       [   21,   176]]), 'sensibility': 0.8934010152284264, 'specificity': 0.9680442738552174, 'auc': 0.9307226445418219, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=35,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 14080}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9677567941040995, 'confusion_matrix': array([[1621,   49],
       [  21,  480]]), 'sensibility': 0.9580838323353293, 'specificity': 0.9706586826347305, 'auc': 0.9643712574850299, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, cri



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.966140350877193, 'confusion_matrix': array([[10838,   365],
       [   21,   176]]), 'sensibility': 0.8934010152284264, 'specificity': 0.9674194412211015, 'auc': 0.9304102282247639, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=40,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 14150}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.966835559649931, 'confusion_matrix': array([[1624,   46],
       [  26,  475]]), 'sensibility': 0.9481037924151696, 'specificity': 0.9724550898203593, 'auc': 0.9602794411177644, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, crite



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9655263157894737, 'confusion_matrix': array([[10832,   371],
       [   22,   175]]), 'sensibility': 0.8883248730964467, 'specificity': 0.9668838703918593, 'auc': 0.927604371744153, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=45,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 14710}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9640718562874252, 'confusion_matrix': array([[1616,   54],
       [  24,  477]]), 'sensibility': 0.9520958083832335, 'specificity': 0.9676646706586827, 'auc': 0.9598802395209581, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, crit



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.965, 'confusion_matrix': array([[10825,   378],
       [   21,   176]]), 'sensibility': 0.8934010152284264, 'specificity': 0.9662590377577435, 'auc': 0.9298300264930849, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=50,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 14280}
{'predicted': array([0, 0, 0, ..., 1, 1, 1]), 'accuracy': 0.9645324735145094, 'confusion_matrix': array([[1624,   46],
       [  31,  470]]), 'sensibility': 0.93812375249501, 'specificity': 0.9724550898203593, 'auc': 0.9552894211576846, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',




{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9599122807017544, 'confusion_matrix': array([[10765,   438],
       [   19,   178]]), 'sensibility': 0.9035532994923858, 'specificity': 0.9609033294653218, 'auc': 0.9322283144788538, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 13880}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9594656840165822, 'confusion_matrix': array([[1607,   63],
       [  25,  476]]), 'sensibility': 0.9500998003992016, 'specificity': 0.9622754491017964, 'auc': 0.956187624750499, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, cri



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9574561403508772, 'confusion_matrix': array([[10734,   469],
       [   16,   181]]), 'sensibility': 0.9187817258883249, 'specificity': 0.9581362135142373, 'auc': 0.9384589697012811, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=200,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 12690}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9608475356978351, 'confusion_matrix': array([[1610,   60],
       [  25,  476]]), 'sensibility': 0.9500998003992016, 'specificity': 0.9640718562874252, 'auc': 0.9570858283433133, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, cr



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.957280701754386, 'confusion_matrix': array([[10732,   471],
       [   16,   181]]), 'sensibility': 0.9187817258883249, 'specificity': 0.9579576899044898, 'auc': 0.9383697078964073, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=300,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 12710}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9613081529249194, 'confusion_matrix': array([[1605,   65],
       [  19,  482]]), 'sensibility': 0.9620758483033932, 'specificity': 0.9610778443113772, 'auc': 0.9615768463073853, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, cri



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9510526315789474, 'confusion_matrix': array([[10659,   544],
       [   14,   183]]), 'sensibility': 0.9289340101522843, 'specificity': 0.9514415781487102, 'auc': 0.9401877941504972, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=400,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 12440}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9553201289728236, 'confusion_matrix': array([[1591,   79],
       [  18,  483]]), 'sensibility': 0.9640718562874252, 'specificity': 0.9526946107784431, 'auc': 0.958383233532934, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, cri



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.951140350877193, 'confusion_matrix': array([[10659,   544],
       [   13,   184]]), 'sensibility': 0.934010152284264, 'specificity': 0.9514415781487102, 'auc': 0.9427258652164873, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=500,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 11940}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9530170428374021, 'confusion_matrix': array([[1584,   86],
       [  16,  485]]), 'sensibility': 0.9680638722554891, 'specificity': 0.948502994011976, 'auc': 0.9582834331337325, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, crite



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9520175438596491, 'confusion_matrix': array([[10670,   533],
       [   14,   183]]), 'sensibility': 0.9289340101522843, 'specificity': 0.9524234580023208, 'auc': 0.9406787340773024, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=600,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 12330}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9562413634269922, 'confusion_matrix': array([[1593,   77],
       [  18,  483]]), 'sensibility': 0.9640718562874252, 'specificity': 0.9538922155688623, 'auc': 0.9589820359281437, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, cr



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9515789473684211, 'confusion_matrix': array([[10665,   538],
       [   14,   183]]), 'sensibility': 0.9289340101522843, 'specificity': 0.9519771489779524, 'auc': 0.9404555795651184, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=700,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 12380}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9557807461999078, 'confusion_matrix': array([[1593,   77],
       [  19,  482]]), 'sensibility': 0.9620758483033932, 'specificity': 0.9538922155688623, 'auc': 0.9579840319361278, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, cr



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9507894736842105, 'confusion_matrix': array([[10655,   548],
       [   13,   184]]), 'sensibility': 0.934010152284264, 'specificity': 0.9510845309292154, 'auc': 0.9425473416067398, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=800,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 11980}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9539382772915707, 'confusion_matrix': array([[1592,   78],
       [  22,  479]]), 'sensibility': 0.9560878243512974, 'specificity': 0.9532934131736527, 'auc': 0.954690618762475, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, crit



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9516666666666667, 'confusion_matrix': array([[10666,   537],
       [   14,   183]]), 'sensibility': 0.9289340101522843, 'specificity': 0.9520664107828261, 'auc': 0.9405002104675552, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=900,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 12370}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9585444495624136, 'confusion_matrix': array([[1597,   73],
       [  17,  484]]), 'sensibility': 0.9660678642714571, 'specificity': 0.9562874251497006, 'auc': 0.9611776447105789, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, cr



In [14]:
measures = pd.DataFrame.from_dict(results, "index")
measures = measures.set_index('Min Samples to Split')
filename = 'rf_min_samples_n_estimators'
df = pd.DataFrame()
for col in measures:
    if col != 'Min Samples to Split':
        values = measures[col].dropna()
        if df.empty:
            df = measures[[col]].copy()
        else:
            df = pd.concat([df, values], axis=1, join='inner')
        
df.to_csv('plot_data/{}.csv'.format(filename))

In [9]:
for n in n_estimators:
    res = aps_classifier_statistics(RandomForestClassifier(min_samples_split=900, n_estimators=n), X_train, X_test, y_train, y_test)
    res_train = aps_classifier_statistics(RandomForestClassifier(min_samples_split=900, n_estimators=n), X_train, X_train, y_train, y_train)
    results[res_i] = {'Test': res['auc'], 'Nmr Estimators': n}
    res_i += 1
    results[res_i] = {'Train': res_train['auc'], 'Nmr Estimators': n}
    res_i += 1
    print(res)
    print(res_train)

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.953859649122807, 'confusion_matrix': array([[10692,   511],
       [   15,   182]]), 'sensibility': 0.9238578680203046, 'specificity': 0.954387217709542, 'auc': 0.9391225428649232, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=900,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 12610}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9511745739290649, 'confusion_matrix': array([[1585,   85],
       [  21,  480]]), 'sensibility': 0.9580838323353293, 'specificity': 0.9491017964071856, 'auc': 0.9535928143712574, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, crite

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9520175438596491, 'confusion_matrix': array([[10671,   532],
       [   15,   182]]), 'sensibility': 0.9238578680203046, 'specificity': 0.9525127198071945, 'auc': 0.9381852939137495, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=900,
            min_weight_fraction_leaf=0.0, n_estimators=70, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 12820}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9548595117457392, 'confusion_matrix': array([[1586,   84],
       [  14,  487]]), 'sensibility': 0.9720558882235529, 'specificity': 0.9497005988023952, 'auc': 0.9608782435129741, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, cri

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.951140350877193, 'confusion_matrix': array([[10659,   544],
       [   13,   184]]), 'sensibility': 0.934010152284264, 'specificity': 0.9514415781487102, 'auc': 0.9427258652164873, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=900,
            min_weight_fraction_leaf=0.0, n_estimators=600, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 11940}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9562413634269922, 'confusion_matrix': array([[1589,   81],
       [  14,  487]]), 'sensibility': 0.9720558882235529, 'specificity': 0.951497005988024, 'auc': 0.9617764471057886, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, crite

In [10]:
measures = pd.DataFrame.from_dict(results, "index")
measures = measures.set_index('Nmr Estimators')
filename = 'rf_n_estimators'
df = pd.DataFrame()
for col in measures:
    if col != 'Nmr Estimators':
        values = measures[col].dropna()
        if df.empty:
            df = measures[[col]].copy()
        else:
            df = pd.concat([df, values], axis=1, join='inner')
        
df.to_csv('plot_data/{}.csv'.format(filename))