In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys
sys.path.append('..')
from utils_cd import (
        split_dataset,
        standard_deviation,
        plot_comparison_results,
        impute_values,
        plot_results,
        plot_param_improv,
        plot_results_from_csv,
        aps_classifier_statistics,
        aps_score,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer, StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA
from sklearn.base import clone

pd.options.display.max_columns = None
sns.set(style='darkgrid')

CLASS = 'class'

def getData():
    train = pd.read_csv('./train.csv')
    test = pd.read_csv('./test.csv')

    X_train, y_train = split_dataset(train, CLASS)
    X_test, y_test = split_dataset(test, CLASS)
    y_train = y_train.map({'pos': 1, 'neg': 0})
    y_test = y_test.map({'pos': 1, 'neg': 0})
    
    return X_train, X_test, y_train, y_test

def getDataWithThresh():
    train = pd.read_csv('./train.csv')
    test = pd.read_csv('./test.csv')

    train = train.dropna(thresh=150)
    train = train.dropna(axis=1, thresh=train.shape[0]*0.9)
    test = test[train.columns]

    X_train, y_train = split_dataset(train, CLASS)
    X_test, y_test = split_dataset(test, CLASS)
    y_train = y_train.map({'pos': 1, 'neg': 0})
    y_test = y_test.map({'pos': 1, 'neg': 0})
    
    return X_train, X_test, y_train, y_test

data = {}
clf = DecisionTreeClassifier(random_state=42)
min_samples = np.concatenate([np.arange(5, 51, 5), np.arange(100, 1001, 100)])
num_features = np.arange(10, 170, 3)
results = {}
res_i = 0

In [2]:
X_train, X_test, y_train, y_test = getData()
X_train, X_test = X_train.fillna(X_train.median()), X_test.fillna(X_train.median())
def balance(X_train, y_train):
    balancer = RandomUnderSampler(ratio=0.3, random_state=42)
    X_train_bal, y_train_bal = balancer.fit_sample(X_train, y_train)
    return X_train_bal, y_train_bal

X_train, y_train = balance(X_train, y_train)




In [3]:
for samples in min_samples:
    res = aps_classifier_statistics(DecisionTreeClassifier(min_samples_split=samples), X_train, X_test, y_train, y_test)
    res_train = aps_classifier_statistics(DecisionTreeClassifier(min_samples_split=samples), X_train, X_train, y_train, y_train)
    results[res_i] = {'Test': res['auc'], 'Min Samples to Split': samples}
    res_i += 1
    results[res_i] = {'Train': res_train['auc'], 'Min Samples to Split': samples}
    res_i += 1
    print(res)
    print(res_train)

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9642982456140351, 'confusion_matrix': array([[10827,   376],
       [   31,   166]]), 'sensibility': 0.8426395939086294, 'specificity': 0.9664375613674908, 'auc': 0.9045385776380601, 'clf': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), 'score': 19260}
{'predicted': array([0, 0, 0, ..., 1, 1, 1]), 'accuracy': 0.9967756794104099, 'confusion_matrix': array([[1670,    0],
       [   7,  494]]), 'sensibility': 0.9860279441117764, 'specificity': 1.0, 'auc': 0.9930139720558881, 'clf': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9625438596491228, 'confusion_matrix': array([[10804,   399],
       [   28,   169]]), 'sensibility': 0.8578680203045685, 'specificity': 0.9643845398553959, 'auc': 0.9111262800799821, 'clf': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=40,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), 'score': 17990}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.966835559649931, 'confusion_matrix': array([[1626,   44],
       [  28,  473]]), 'sensibility': 0.9441117764471058, 'specificity': 0.9736526946107784, 'auc': 0.958882235528942, 'clf': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
          

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9490350877192982, 'confusion_matrix': array([[10632,   571],
       [   10,   187]]), 'sensibility': 0.949238578680203, 'specificity': 0.9490315094171204, 'auc': 0.9491350440486618, 'clf': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=500,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), 'score': 10710}
{'predicted': array([0, 1, 0, ..., 1, 1, 1]), 'accuracy': 0.9557807461999078, 'confusion_matrix': array([[1585,   85],
       [  11,  490]]), 'sensibility': 0.9780439121756487, 'specificity': 0.9491017964071856, 'auc': 0.9635728542914171, 'clf': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
        

In [4]:
measures = pd.DataFrame.from_dict(results, "index")
measures = measures.set_index('Min Samples to Split')
filename = 'dt_min_samples'
df = pd.DataFrame()
for col in measures:
    if col != 'Min Samples to Split':
        values = measures[col].dropna()
        if df.empty:
            df = measures[[col]].copy()
        else:
            df = pd.concat([df, values], axis=1, join='inner')
        
df.to_csv('plot_data/{}.csv'.format(filename))