In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys
sys.path.append('..')
from utils_cd import (
        split_dataset,
        standard_deviation,
        plot_comparison_results,
        impute_values,
        plot_results,
        plot_param_improv,
        plot_results_from_csv,
        aps_classifier_statistics,
        aps_score,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer, StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA
from sklearn.base import clone

pd.options.display.max_columns = None
sns.set(style='darkgrid')

CLASS = 'class'

def getData():
    train = pd.read_csv('./train.csv')
    test = pd.read_csv('./test.csv')

    X_train, y_train = split_dataset(train, CLASS)
    X_test, y_test = split_dataset(test, CLASS)
    y_train = y_train.map({'pos': 1, 'neg': 0})
    y_test = y_test.map({'pos': 1, 'neg': 0})
    
    return X_train, X_test, y_train, y_test

def getDataWithThresh():
    train = pd.read_csv('./train.csv')
    test = pd.read_csv('./test.csv')

    train = train.dropna(thresh=150)
    train = train.dropna(axis=1, thresh=train.shape[0]*0.9)
    test = test[train.columns]

    X_train, y_train = split_dataset(train, CLASS)
    X_test, y_test = split_dataset(test, CLASS)
    y_train = y_train.map({'pos': 1, 'neg': 0})
    y_test = y_test.map({'pos': 1, 'neg': 0})
    
    return X_train, X_test, y_train, y_test

data = {}
clf = DecisionTreeClassifier(random_state=42)
min_samples = np.concatenate([np.arange(5, 51, 5), np.arange(100, 1001, 100)])
num_features = np.arange(10, 170, 3)
results = {}
res_i = 0

In [2]:
X_train, X_test, y_train, y_test = getDataWithThresh()
X_train, X_test = X_train.fillna(X_train.median()), X_test.fillna(X_train.median())
def balance(X_train, y_train):
    balancer = RandomUnderSampler(ratio=0.3, random_state=42)
    X_train_bal, y_train_bal = balancer.fit_sample(X_train, y_train)
    return X_train_bal, y_train_bal

X_train, y_train = balance(X_train, y_train)




In [3]:
for samples in min_samples:
    res = aps_classifier_statistics(DecisionTreeClassifier(min_samples_split=samples), X_train, X_test, y_train, y_test)
    res_test = aps_classifier_statistics(DecisionTreeClassifier(min_samples_split=samples), X_train, X_train, y_train, y_train)
    results[res_i] = {'Train': res['auc'], 'Min Samples to Split': samples}
    res_i += 1
    results[res_i] = {'Test': res_test['auc'], 'Min Samples to Split': samples}
    res_i += 1
    print(res)

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9388596491228071, 'confusion_matrix': array([[10525,   678],
       [   19,   178]]), 'sensibility': 0.9035532994923858, 'specificity': 0.9394804962956351, 'auc': 0.9215168978940105, 'clf': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), 'score': 16280}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9494736842105264, 'confusion_matrix': array([[10645,   558],
       [   18,   179]]), 'sensibility': 0.9086294416243654, 'specificity': 0.9501919128804784, 'auc': 0.9294106772524219, 'clf': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
     

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9414912280701755, 'confusion_matrix': array([[10549,   654],
       [   13,   184]]), 'sensibility': 0.934010152284264, 'specificity': 0.9416227796126038, 'auc': 0.9378164659484339, 'clf': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=500,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), 'score': 13040}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9420175438596491, 'confusion_matrix': array([[10558,   645],
       [   16,   181]]), 'sensibility': 0.9187817258883249, 'specificity': 0.9424261358564671, 'auc': 0.930603930872396, 'clf': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
     