In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys
sys.path.append('..')
from utils_cd import (
        split_dataset,
        standard_deviation,
        plot_comparison_results,
        impute_values,
        plot_results,
        plot_param_improv,
        plot_results_from_csv,
        aps_classifier_statistics,
        aps_score,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer, StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA
from sklearn.base import clone

# Notebook-wide display settings: show every column when rendering a frame
# and use seaborn's dark grid theme for all plots.
pd.set_option('display.max_columns', None)
sns.set(style='darkgrid')

# Name of the label column handed to split_dataset.
CLASS = 'class'

def getData():
    """Load the train and test CSVs and return features plus 0/1 labels.

    Reads ``./train.csv`` and ``./test.csv``, separates each into features
    and the ``CLASS`` column via ``split_dataset``, and maps the labels
    ``'pos'`` -> 1 and ``'neg'`` -> 0.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    label_codes = {'pos': 1, 'neg': 0}

    train_df = pd.read_csv('./train.csv')
    test_df = pd.read_csv('./test.csv')

    train_features, train_labels = split_dataset(train_df, CLASS)
    test_features, test_labels = split_dataset(test_df, CLASS)

    # Any label other than 'pos'/'neg' becomes NaN under .map.
    train_labels = train_labels.map(label_codes)
    test_labels = test_labels.map(label_codes)

    return train_features, test_features, train_labels, test_labels

def getDataWithThresh():
    """Load the CSVs, prune sparse training rows/columns, and split them.

    Same output contract as ``getData``, but the training frame is first
    thinned out: rows with fewer than 150 non-missing values are dropped,
    then columns that are non-missing in fewer than 90% of the remaining
    rows are dropped. The test frame keeps all of its rows and is only
    restricted to the columns that survived in the training frame.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` with labels mapped
        ``'pos'`` -> 1 and ``'neg'`` -> 0.
    """
    label_codes = {'pos': 1, 'neg': 0}

    train_raw = pd.read_csv('./train.csv')
    test_raw = pd.read_csv('./test.csv')

    # Row filter first: the column threshold below is relative to the
    # number of rows that remain after it.
    train_rows = train_raw.dropna(thresh=150)
    min_filled = train_rows.shape[0]*0.9
    train_pruned = train_rows.dropna(axis=1, thresh=min_filled)
    test_pruned = test_raw[train_pruned.columns]

    train_features, train_labels = split_dataset(train_pruned, CLASS)
    test_features, test_labels = split_dataset(test_pruned, CLASS)

    return (
        train_features,
        test_features,
        train_labels.map(label_codes),
        test_labels.map(label_codes),
    )

# NOTE(review): `data` is not referenced by any of the cells visible here —
# confirm it is used further down, otherwise it can be dropped.
data = {}

In [2]:
# Neighbourhood sizes to sweep: 1, 7, 13, ..., 67.
ks = np.arange(1, 70, 6)

# Score accumulator: running integer index -> {<strategy name>: score, 'k': k}.
results, res_i = {}, 0

X_train, X_test, y_train, y_test = getData()

# Impute missing values in both splits with the training-set column means
# (computed before the training frame itself is filled).
train_means = X_train.mean()
X_train, X_test = X_train.fillna(train_means), X_test.fillna(train_means)

In [3]:
# Baseline: fit KNN on the imputed, unbalanced training set for every k and
# record its 'score' entry under the 'Baseline' column.
for k in ks:
    baseline_knn = KNeighborsClassifier(n_neighbors=k)
    res = aps_classifier_statistics(baseline_knn, X_train, X_test, y_train, y_test)
    results[res_i] = {'Baseline': res['score'], 'k': k}
    res_i += 1
    # Full statistics dict (predictions, confusion matrix, fitted clf, ...).
    print(res)

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9858771929824561, 'confusion_matrix': array([[11140,    63],
       [   98,    99]]), 'sensibility': 0.5025380710659898, 'specificity': 0.9943765062929573, 'auc': 0.7484572886794736, 'clf': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform'), 'score': 49630}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9864912280701754, 'confusion_matrix': array([[11170,    33],
       [  121,    76]]), 'sensibility': 0.38578680203045684, 'specificity': 0.9970543604391681, 'auc': 0.6914205812348125, 'clf': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=7, p=2,
           weights='uniform'), 'score': 60830}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9860526315789474, 'confusion_matrix': array([[11181,    22],
       [  

In [4]:
# Resampling strategies to compare, each paired with the column name its
# scores are stored under.
# NOTE(review): `ratio=` and `fit_sample` are the older imbalanced-learn API;
# later releases renamed them to `sampling_strategy=` / `fit_resample` —
# switch once the installed version is confirmed to support the new names.
# NOTE(review): a float ratio of 0.3 appears to mean minority/majority = 0.3
# (roughly 23/77), not an exact 70-30 split — confirm against the installed
# imbalanced-learn docs before relying on the labels.
balancers = [
    (RandomUnderSampler(random_state=42), 'Undersample 50-50'),
    (RandomUnderSampler(ratio=0.3, random_state=42), 'Undersample 70-30'),
    (SMOTE(ratio=1.0, random_state=42), 'SMOTE 50-50'),
    (SMOTE(ratio=0.3, random_state=42), 'SMOTE 70-30'),
]
filename = 'knn_balancing'

# `measures` aliases `results` on purpose: the balanced runs are appended to
# the same dict that already holds the baseline rows, continuing the index.
measures = results
i = res_i

# Resampling does not depend on k and every balancer has a fixed
# random_state, so resample once per balancer instead of once per (k,
# balancer) pair. This holds all resampled training sets in memory at once.
resampled = []
for balancer, name in balancers:
    X_train_bal, y_train_bal = balancer.fit_sample(X_train, y_train)
    resampled.append((name, X_train_bal, y_train_bal))

# Same (k outer, balancer inner) order as before so row indices are unchanged.
for k in ks:
    for name, X_train_bal, y_train_bal in resampled:
        res = aps_classifier_statistics(KNeighborsClassifier(n_neighbors=k), X_train_bal, X_test, y_train_bal, y_test)
        measures[i] = {name: res['score'], 'k': k}
        i += 1

# One row per run; each row has 'k' plus exactly one filled score column.
measures = pd.DataFrame.from_dict(measures, "index")
measures.to_csv('plot_data/{}.csv'.format(filename))





In [44]:
# Reshape the long, sparse `measures` table (one row per run, with a single
# score column filled per row) into one row per k and one column per
# strategy.
#
# The scores have to be aligned on k, not on the run index: no two
# strategies share a run index, so joining on it yields an empty table.
# Grouping on 'k' works whether k is a column or the index level name, and
# first() takes the first non-null entry of every column within each k.
# dropna() then keeps only the k values scored by every strategy, which is
# the equivalent of an inner join across the columns.
df = measures.groupby('k').first().dropna()

df.to_csv('plot_data/{}.csv'.format(filename))

k
1     49630.0
7     60830.0
13    68720.0
19    71210.0
25    71670.0
31    74170.0
37    76180.0
43    79190.0
49    81680.0
55    82670.0
61    81640.0
67    83630.0
Name: Baseline, dtype: float64
k
1     16090.0
7     17570.0
13    15560.0
19    16470.0
25    17420.0
31    17370.0
37    18280.0
43    19160.0
49    21350.0
55    21300.0
61    21740.0
67    21700.0
Name: Undersample 50-50, dtype: float64
k
1     18440.0
7     21100.0
13    22110.0
19    23500.0
25    24440.0
31    25470.0
37    24430.0
43    25350.0
49    24390.0
55    24370.0
61    25330.0
67    25370.0
Name: Undersample 70-30, dtype: float64
k
1     31690.0
7     19200.0
13    15890.0
19    14310.0
25    14210.0
31    13460.0
37    13480.0
43    13580.0
49    13200.0
55    13190.0
61    13250.0
67    13710.0
Name: SMOTE 50-50, dtype: float64
k
1     33990.0
7     22170.0
13    20180.0
19    16250.0
25    16860.0
31    16370.0
37    16480.0
43    16000.0
49    16950.0
55    16480.0
61    16940.0
67    16460.0
Name: