In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys
sys.path.append('..')
from utils_cd import (
        split_dataset,
        standard_deviation,
        plot_comparison_results,
        impute_values,
        plot_results,
        plot_param_improv,
        plot_results_from_csv,
        aps_classifier_statistics,
        aps_score,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer, StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA
from sklearn.base import clone
from sklearn import tree
import graphviz

pd.options.display.max_columns = None
sns.set(style='darkgrid')

CLASS = 'class'

def getData():
    train = pd.read_csv('./train.csv')
    test = pd.read_csv('./test.csv')

    X_train, y_train = split_dataset(train, CLASS)
    X_test, y_test = split_dataset(test, CLASS)
    y_train = y_train.map({'pos': 1, 'neg': 0})
    y_test = y_test.map({'pos': 1, 'neg': 0})
    
    return X_train, X_test, y_train, y_test

def getDataWithThresh():
    train = pd.read_csv('./train.csv')
    test = pd.read_csv('./test.csv')

    train = train.dropna(thresh=150)
    train = train.dropna(axis=1, thresh=train.shape[0]*0.9)
    test = test[train.columns]

    X_train, y_train = split_dataset(train, CLASS)
    X_test, y_test = split_dataset(test, CLASS)
    y_train = y_train.map({'pos': 1, 'neg': 0})
    y_test = y_test.map({'pos': 1, 'neg': 0})
    
    return X_train, X_test, y_train, y_test

data = {}
clf = RandomForestClassifier(random_state=42)
min_samples = np.concatenate([np.arange(5, 51, 5), np.arange(100, 1001, 100)])
num_features = np.arange(10, 170, 3)
results = {}
res_i = 0

In [2]:
X_train, X_test, y_train, y_test = getData()
X_train, X_test = X_train.fillna(X_train.median()), X_test.fillna(X_train.median())
def balance(X_train, y_train):
    balancer = RandomUnderSampler(ratio=0.3, random_state=42)
    X_train_bal, y_train_bal = balancer.fit_sample(X_train, y_train)
    return X_train_bal, y_train_bal

X_train, y_train = balance(X_train, y_train)




In [3]:
for samples in min_samples:
    res = aps_classifier_statistics(RandomForestClassifier(min_samples_split=samples), X_train, X_test, y_train, y_test)
    res_train = aps_classifier_statistics(RandomForestClassifier(min_samples_split=samples), X_train, X_train, y_train, y_train)
    results[res_i] = {'Test': res['auc'], 'Min Samples to Split': samples}
    res_i += 1
    results[res_i] = {'Train': res_train['auc'], 'Min Samples to Split': samples}
    res_i += 1
    print(res)
    print(res_train)



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.969561403508772, 'confusion_matrix': array([[10881,   322],
       [   25,   172]]), 'sensibility': 0.8730964467005076, 'specificity': 0.9712576988306704, 'auc': 0.9221770727655889, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 15720}
{'predicted': array([0, 0, 0, ..., 1, 1, 1]), 'accuracy': 0.9935513588208199, 'confusion_matrix': array([[1663,    7],
       [   7,  494]]), 'sensibility': 0.9860279441117764, 'specificity': 0.9958083832335329, 'auc': 0.9909181636726546, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criter



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.968421052631579, 'confusion_matrix': array([[10865,   338],
       [   22,   175]]), 'sensibility': 0.8883248730964467, 'specificity': 0.9698295099526912, 'auc': 0.9290771915245689, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 14380}
{'predicted': array([0, 0, 0, ..., 1, 1, 1]), 'accuracy': 0.9847996315062183, 'confusion_matrix': array([[1653,   17],
       [  16,  485]]), 'sensibility': 0.9680638722554891, 'specificity': 0.9898203592814371, 'auc': 0.978942115768463, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criter



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9674561403508772, 'confusion_matrix': array([[10854,   349],
       [   22,   175]]), 'sensibility': 0.8883248730964467, 'specificity': 0.9688476300990806, 'auc': 0.9285862515977635, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=15,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 14490}
{'predicted': array([0, 0, 0, ..., 1, 1, 1]), 'accuracy': 0.976508521418701, 'confusion_matrix': array([[1642,   28],
       [  23,  478]]), 'sensibility': 0.9540918163672655, 'specificity': 0.9832335329341317, 'auc': 0.9686626746506986, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, crite



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9678070175438597, 'confusion_matrix': array([[10860,   343],
       [   24,   173]]), 'sensibility': 0.8781725888324873, 'specificity': 0.9693832009283228, 'auc': 0.9237778948804051, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=20,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 15430}
{'predicted': array([0, 0, 0, ..., 1, 1, 1]), 'accuracy': 0.9806540764624597, 'confusion_matrix': array([[1646,   24],
       [  18,  483]]), 'sensibility': 0.9640718562874252, 'specificity': 0.98562874251497, 'auc': 0.9748502994011975, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criter



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9656140350877193, 'confusion_matrix': array([[10831,   372],
       [   20,   177]]), 'sensibility': 0.8984771573604061, 'specificity': 0.9667946085869856, 'auc': 0.9326358829736959, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=25,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 13720}
{'predicted': array([0, 0, 0, ..., 1, 1, 1]), 'accuracy': 0.9737448180561953, 'confusion_matrix': array([[1638,   32],
       [  25,  476]]), 'sensibility': 0.9500998003992016, 'specificity': 0.9808383233532935, 'auc': 0.9654690618762476, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, crit



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9680701754385965, 'confusion_matrix': array([[10858,   345],
       [   19,   178]]), 'sensibility': 0.9035532994923858, 'specificity': 0.9692046773185754, 'auc': 0.9363789884054806, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=30,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 12950}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9728235836020267, 'confusion_matrix': array([[1638,   32],
       [  27,  474]]), 'sensibility': 0.9461077844311377, 'specificity': 0.9808383233532935, 'auc': 0.9634730538922156, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, crit



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9647368421052631, 'confusion_matrix': array([[10824,   379],
       [   23,   174]]), 'sensibility': 0.883248730964467, 'specificity': 0.9661697759528698, 'auc': 0.9247092534586684, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=35,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 15290}
{'predicted': array([0, 0, 0, ..., 1, 1, 1]), 'accuracy': 0.9714417319207739, 'confusion_matrix': array([[1630,   40],
       [  22,  479]]), 'sensibility': 0.9560878243512974, 'specificity': 0.9760479041916168, 'auc': 0.9660678642714572, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, crite



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9619298245614035, 'confusion_matrix': array([[10789,   414],
       [   20,   177]]), 'sensibility': 0.8984771573604061, 'specificity': 0.9630456127822905, 'auc': 0.9307613850713483, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=40,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 14140}
{'predicted': array([0, 0, 0, ..., 1, 1, 1]), 'accuracy': 0.9705204974666053, 'confusion_matrix': array([[1629,   41],
       [  23,  478]]), 'sensibility': 0.9540918163672655, 'specificity': 0.9754491017964072, 'auc': 0.9647704590818362, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, crit



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9642982456140351, 'confusion_matrix': array([[10819,   384],
       [   23,   174]]), 'sensibility': 0.883248730964467, 'specificity': 0.9657234669285013, 'auc': 0.9244860989464841, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=45,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 15340}
{'predicted': array([0, 0, 0, ..., 1, 1, 1]), 'accuracy': 0.9645324735145094, 'confusion_matrix': array([[1621,   49],
       [  28,  473]]), 'sensibility': 0.9441117764471058, 'specificity': 0.9706586826347305, 'auc': 0.9573852295409181, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, crite



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.962280701754386, 'confusion_matrix': array([[10794,   409],
       [   21,   176]]), 'sensibility': 0.8934010152284264, 'specificity': 0.9634919218066589, 'auc': 0.9284464685175426, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=50,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 14590}
{'predicted': array([0, 0, 0, ..., 1, 1, 1]), 'accuracy': 0.9672961768770152, 'confusion_matrix': array([[1627,   43],
       [  28,  473]]), 'sensibility': 0.9441117764471058, 'specificity': 0.974251497005988, 'auc': 0.9591816367265468, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criter



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9609649122807018, 'confusion_matrix': array([[10779,   424],
       [   21,   176]]), 'sensibility': 0.8934010152284264, 'specificity': 0.9621529947335535, 'auc': 0.9277770049809899, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 14740}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9626900046061723, 'confusion_matrix': array([[1609,   61],
       [  20,  481]]), 'sensibility': 0.9600798403193613, 'specificity': 0.9634730538922156, 'auc': 0.9617764471057885, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, cri



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9587719298245614, 'confusion_matrix': array([[10752,   451],
       [   19,   178]]), 'sensibility': 0.9035532994923858, 'specificity': 0.9597429260019638, 'auc': 0.9316481127471747, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=200,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 14010}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9580838323353293, 'confusion_matrix': array([[1605,   65],
       [  26,  475]]), 'sensibility': 0.9481037924151696, 'specificity': 0.9610778443113772, 'auc': 0.9545908183632734, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, cri



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9564035087719298, 'confusion_matrix': array([[10724,   479],
       [   18,   179]]), 'sensibility': 0.9086294416243654, 'specificity': 0.9572435954655003, 'auc': 0.9329365185449329, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=300,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 13790}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9557807461999078, 'confusion_matrix': array([[1595,   75],
       [  21,  480]]), 'sensibility': 0.9580838323353293, 'specificity': 0.9550898203592815, 'auc': 0.9565868263473054, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, cri



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9510526315789474, 'confusion_matrix': array([[10658,   545],
       [   13,   184]]), 'sensibility': 0.934010152284264, 'specificity': 0.9513523163438364, 'auc': 0.9426812343140503, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=400,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 11950}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9567019806540764, 'confusion_matrix': array([[1590,   80],
       [  14,  487]]), 'sensibility': 0.9720558882235529, 'specificity': 0.9520958083832335, 'auc': 0.9620758483033932, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, crit



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9482456140350877, 'confusion_matrix': array([[10629,   574],
       [   16,   181]]), 'sensibility': 0.9187817258883249, 'specificity': 0.9487637240024993, 'auc': 0.9337727249454121, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=500,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 13740}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9553201289728236, 'confusion_matrix': array([[1587,   83],
       [  14,  487]]), 'sensibility': 0.9720558882235529, 'specificity': 0.9502994011976048, 'auc': 0.961177644710579, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, crit



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.951140350877193, 'confusion_matrix': array([[10660,   543],
       [   14,   183]]), 'sensibility': 0.9289340101522843, 'specificity': 0.9515308399535839, 'auc': 0.9402324250529341, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=600,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 12430}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9567019806540764, 'confusion_matrix': array([[1590,   80],
       [  14,  487]]), 'sensibility': 0.9720558882235529, 'specificity': 0.9520958083832335, 'auc': 0.9620758483033932, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, crit



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9521052631578948, 'confusion_matrix': array([[10674,   529],
       [   17,   180]]), 'sensibility': 0.9137055837563451, 'specificity': 0.9527805052218156, 'auc': 0.9332430444890804, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=700,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 13790}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.954398894518655, 'confusion_matrix': array([[1589,   81],
       [  18,  483]]), 'sensibility': 0.9640718562874252, 'specificity': 0.951497005988024, 'auc': 0.9577844311377246, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, crite



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9514035087719298, 'confusion_matrix': array([[10663,   540],
       [   14,   183]]), 'sensibility': 0.9289340101522843, 'specificity': 0.9517986253682049, 'auc': 0.9403663177602445, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=800,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 12400}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9567019806540764, 'confusion_matrix': array([[1591,   79],
       [  15,  486]]), 'sensibility': 0.9700598802395209, 'specificity': 0.9526946107784431, 'auc': 0.961377245508982, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, crit



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9535964912280702, 'confusion_matrix': array([[10687,   516],
       [   13,   184]]), 'sensibility': 0.934010152284264, 'specificity': 0.9539409086851736, 'auc': 0.9439755304847188, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=900,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'score': 11660}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9557807461999078, 'confusion_matrix': array([[1589,   81],
       [  15,  486]]), 'sensibility': 0.9700598802395209, 'specificity': 0.951497005988024, 'auc': 0.9607784431137725, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, crite

In [4]:
measures = pd.DataFrame.from_dict(results, "index")
measures = measures.set_index('Min Samples to Split')
filename = 'rf_min_samples'
df = pd.DataFrame()
for col in measures:
    if col != 'Min Samples to Split':
        values = measures[col].dropna()
        if df.empty:
            df = measures[[col]].copy()
        else:
            df = pd.concat([df, values], axis=1, join='inner')
        
df.to_csv('plot_data/{}.csv'.format(filename))

In [None]:
X_train, X_test, y_train, y_test = getData()
X_train, X_test = X_train.fillna(X_train.median()), X_test.fillna(X_train.median())
def balance(X_train, y_train):
    balancer = RandomUnderSampler(ratio=0.3, random_state=42)
    X_train_bal, y_train_bal = balancer.fit_sample(X_train, y_train)
    return X_train_bal, y_train_bal

X_train, y_train = balance(X_train, y_train)

def getBaseline():
    X_train, X_test, y_train, y_test = getData()
    X_train, X_test = X_train.fillna(X_train.median()), X_test.fillna(X_train.median())
    return X_train, X_test, y_train, y_test

def getDataFinal():
    X_train, X_test, y_train, y_test = getData()
    X_train, X_test = X_train.fillna(X_train.median()), X_test.fillna(X_train.median())
    def balance(X_train, y_train):
        balancer = RandomUnderSampler(ratio=0.3, random_state=42)
        X_train_bal, y_train_bal = balancer.fit_sample(X_train, y_train)
        return X_train_bal, y_train_bal

    X_train, y_train = balance(X_train, y_train)
    return X_train, X_test, y_train, y_test