In [38]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys
sys.path.append('..')
from utils_cd import (
        split_dataset,
        standard_deviation,
        plot_comparison_results,
        impute_values,
        plot_results,
        plot_param_improv,
        plot_results_from_csv,
        aps_classifier_statistics,
        aps_score,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer, StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA
from sklearn.base import clone

# Notebook-wide display settings: show every column of wide frames and use
# seaborn's dark grid theme for all figures.
pd.set_option('display.max_columns', None)
sns.set(style='darkgrid')

# Name of the label column passed to split_dataset.
CLASS = 'class'

def _split_and_encode(train, test):
    """Split both frames into features/labels and encode the labels as 0/1.

    Relies on split_dataset (imported from utils_cd; implementation not
    visible here) returning a ``(features, labels)`` pair for the CLASS column.
    Labels are mapped 'pos' -> 1 and 'neg' -> 0.
    """
    label_codes = {'pos': 1, 'neg': 0}
    X_train, y_train = split_dataset(train, CLASS)
    X_test, y_test = split_dataset(test, CLASS)
    return X_train, X_test, y_train.map(label_codes), y_test.map(label_codes)

def getData(train_path='./train.csv', test_path='./test.csv'):
    """Load the train/test CSV files and return the raw split.

    Parameters
    ----------
    train_path, test_path : str
        Locations of the CSV files; the defaults are the paths this notebook
        has always used.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` with labels encoded as 0/1.
        Missing values are NOT handled here; callers impute afterwards.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    return _split_and_encode(train, test)

def getDataWithThresh(row_thresh=150, col_frac=0.9,
                      train_path='./train.csv', test_path='./test.csv'):
    """Load the data, dropping sparse training rows and sparse columns first.

    Parameters
    ----------
    row_thresh : int
        Minimum number of non-null values a training row needs to be kept.
    col_frac : float
        Minimum fraction of non-null values (over the remaining training
        rows) a column needs to be kept.
    train_path, test_path : str
        Locations of the CSV files.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` with labels encoded as 0/1.
        Test rows are never dropped; the test frame is only restricted to the
        columns that survived on the training frame.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    train = train.dropna(thresh=row_thresh)
    # dropna documents `thresh` as an int. Rounding the fractional bound up
    # keeps exactly the same columns as comparing counts against the float.
    min_non_null = int(np.ceil(train.shape[0] * col_frac))
    train = train.dropna(axis=1, thresh=min_non_null)
    test = test[train.columns]

    return _split_and_encode(train, test)

# NOTE(review): `data` is not referenced by any cell visible in this part of
# the notebook -- confirm it is still needed before relying on it.
data = {}

In [39]:
# Shared experiment bookkeeping for the cells below.
results = {}              # row id -> {<experiment label>: metric, 'k': n_neighbors}
res_i = 0                 # next free row id in `results`
ks = np.arange(1, 70, 6)  # candidate n_neighbors values: 1, 7, ..., 67

# Load the raw split and fill missing values in both sets with the
# training-set column means.
X_train, X_test, y_train, y_test = getData()
train_means = X_train.mean()
X_train, X_test = X_train.fillna(train_means), X_test.fillna(train_means)

In [40]:
# Baseline: KNN on the mean-imputed data, evaluated once per candidate k.
# The AUC of every run is recorded under the 'Baseline' label.
for k in ks:
    knn = KNeighborsClassifier(n_neighbors=k)
    res = aps_classifier_statistics(knn, X_train, X_test, y_train, y_test)
    results.update({res_i: {'Baseline': res['auc'], 'k': k}})
    res_i += 1
    print(res)

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9858771929824561, 'confusion_matrix': array([[11140,    63],
       [   98,    99]]), 'sensibility': 0.5025380710659898, 'specificity': 0.9943765062929573, 'auc': 0.7484572886794736, 'clf': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform'), 'score': 49630}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9864912280701754, 'confusion_matrix': array([[11170,    33],
       [  121,    76]]), 'sensibility': 0.38578680203045684, 'specificity': 0.9970543604391681, 'auc': 0.6914205812348125, 'clf': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=7, p=2,
           weights='uniform'), 'score': 60830}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9860526315789474, 'confusion_matrix': array([[11181,    22],
       [  

In [41]:
# Compare class-balancing strategies for KNN (AUC per k) next to the baseline.
#
# `sampling_strategy` replaces the deprecated `ratio` argument and
# `fit_resample` replaces the deprecated `fit_sample` method of
# imbalanced-learn. As a float, `sampling_strategy` is the minority/majority
# ratio after resampling.
# NOTE(review): a ratio of 0.3 yields roughly a 77/23 class split rather than
# 70/30. The labels are left untouched because they become CSV column names.
balancers = [
    (RandomUnderSampler(random_state=42), 'Undersample 50-50'),
    (RandomUnderSampler(sampling_strategy=0.3, random_state=42), 'Undersample 70-30'),
    (SMOTE(sampling_strategy=1.0, random_state=42), 'SMOTE 50-50'),
    (SMOTE(sampling_strategy=0.3, random_state=42), 'SMOTE 70-30'),
]
filename = 'knn_balancing'

# Work on a copy of the baseline rows. Aliasing `results` here would leak the
# balancing rows into `results`, where later cells (which keep counting from
# `res_i`) would overwrite some of them and mix the rest into their own tables.
balancing_rows = dict(results)
i = res_i

for balancer, name in balancers:
    # Resampling does not depend on k, so do it once per strategy.
    X_train_bal, y_train_bal = balancer.fit_resample(X_train, y_train)
    for k in ks:
        res = aps_classifier_statistics(KNeighborsClassifier(n_neighbors=k), X_train_bal, X_test, y_train_bal, y_test)
        balancing_rows[i] = {name: res['auc'], 'k': k}
        i += 1

# Long format: one row per (strategy, k); the following cell pivots it.
measures = pd.DataFrame.from_dict(balancing_rows, "index")
measures.to_csv('plot_data/{}.csv'.format(filename))





In [44]:
# Pivot the long-format balancing results (one row per run) into one row per k
# and one column per strategy, then persist the table for plotting.
#
# groupby('k').first() takes, for every k, the first non-null value of each
# column; dropna() then keeps only the k values that have a measurement for
# every column, which is what the inner join was meant to achieve. `measures`
# itself is left untouched so the cell can be re-run safely.
filename = 'knn_balancing'
df = measures.groupby('k').first().dropna()

df.to_csv('plot_data/{}.csv'.format(filename))

In [5]:
# Start again from the raw split; missing values in both sets are filled with
# the training-set column means.
X_train, X_test, y_train, y_test = getData()
train_means = X_train.mean()
X_train, X_test = X_train.fillna(train_means), X_test.fillna(train_means)

def balance(X_train, y_train):
    """Oversample the training data with SMOTE to a 1:1 class ratio.

    Uses `sampling_strategy` / `fit_resample`, the current imbalanced-learn
    names for the deprecated `ratio` / `fit_sample`.

    Parameters
    ----------
    X_train : feature matrix of the training set.
    y_train : labels of the training set.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as returned by SMOTE.fit_resample.
        The container type (array vs. DataFrame) depends on the installed
        imbalanced-learn version.
    """
    balancer = SMOTE(sampling_strategy=1.0, random_state=42)
    return balancer.fit_resample(X_train, y_train)

# Balance the training set only; the test set keeps its original distribution.
X_train, y_train = balance(X_train, y_train)

# KNN on the SMOTE-balanced data; the 'score' of every run is recorded.
for k in ks:
    knn = KNeighborsClassifier(n_neighbors=k)
    res = aps_classifier_statistics(knn, X_train, X_test, y_train, y_test)
    results.update({res_i: {'SMOTE 50-50': res['score'], 'k': k}})
    res_i += 1
    print(res)



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9799122807017544, 'confusion_matrix': array([[11034,   169],
       [   60,   137]]), 'sensibility': 0.6954314720812182, 'specificity': 0.9849147549763456, 'auc': 0.8401731135287819, 'clf': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform'), 'score': 31690}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9648245614035088, 'confusion_matrix': array([[10833,   370],
       [   31,   166]]), 'sensibility': 0.8426395939086294, 'specificity': 0.966973132196733, 'auc': 0.9048063630526813, 'clf': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=7, p=2,
           weights='uniform'), 'score': 19200}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9594736842105264, 'confusion_matrix': array([[10764,   439],
       [   2

In [6]:
# Reload the raw split and mean-impute it (training-set means for both sets)
# before looking at attribute correlations.
X_train, X_test, y_train, y_test = getData()
train_means = X_train.mean()
X_train, X_test = X_train.fillna(train_means), X_test.fillna(train_means)

def attribute_corr(df, top):
    """Return the `top` most strongly correlated attribute pairs of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated pairwise with ``df.corr()``.
    top : int
        Number of pairs to return.

    Returns
    -------
    pandas.Series
        Absolute correlations sorted in descending order and indexed by
        ``(column, column)`` pairs. Each unordered pair appears once and
        self-correlations are excluded.
    """
    columns = df.columns
    # Diagonal and lower-triangle labels of the correlation matrix:
    # self-pairs plus the mirrored duplicate of every pair.
    redundant = {
        (columns[row], columns[col])
        for row in range(0, df.shape[1])
        for col in range(0, row + 1)
    }
    ranked = (
        df.corr()
        .abs()
        .unstack()
        .drop(labels=redundant)
        .sort_values(ascending=False)
    )
    return ranked[0:top]

# The 7 most correlated attribute pairs as a flat frame; reset_index turns the
# pair index into the columns 'level_0' and 'level_1', with the value in 0.
correlated_features = attribute_corr(X_train, 7).reset_index()
print(correlated_features)

  level_0 level_1         0
0  bv_000  cq_000  1.000000
1  bu_000  cq_000  1.000000
2  bu_000  bv_000  1.000000
3  ah_000  bg_000  0.999052
4  aa_000  bt_000  0.998433
5  bb_000  bv_000  0.997017
6  bb_000  cq_000  0.997017


In [7]:
# Group the correlated pairs into sets of mutually correlated attributes.
equal_attrs = {}    # group id -> set of attribute names
all_attrs = set([])
sets = 0            # next group id to hand out
for attr1, attr2 in zip(correlated_features['level_0'], correlated_features['level_1']):
    pair = {attr1, attr2}
    # Ids of the existing groups that already contain one of the two attributes.
    touched = [group_id for group_id, members in equal_attrs.items() if members & pair]

    if not touched:
        equal_attrs[sets] = pair
        sets += 1
        continue

    target = touched[0]
    equal_attrs[target] |= pair
    # A pair can bridge groups that were separate so far; fold them into one
    # instead of leaving two overlapping groups behind.
    for group_id in touched[1:]:
        equal_attrs[target] |= equal_attrs.pop(group_id)

# Renumber the groups 0..n-1 (ids can have gaps after merging).
equal_attrs = dict(enumerate(equal_attrs.values()))
sets = len(equal_attrs)

# NOTE(review): this used to filter `all_attrs` against `best_attrs`, but
# `all_attrs` is never filled and `best_attrs` is not defined in the visible
# cells, so the result was always an empty list. Kept as an explicit empty list.
excluded_columns = []
print('Attribute sets {}'.format(equal_attrs))


Attribute sets {0: {'bu_000', 'cq_000', 'bv_000', 'bb_000'}, 1: {'bg_000', 'ah_000'}, 2: {'bt_000', 'aa_000'}}


In [8]:
X_train, X_test, y_train, y_test = getData()
X_train, X_test = X_train.fillna(X_train.mean()), X_test.fillna(X_train.mean())

# Hard-coded copy of the attribute groups printed by the grouping cell.
sets = {0: {'bu_000', 'bb_000', 'cq_000', 'bv_000'}, 1: {'bg_000', 'ah_000'}, 2: {'aa_000', 'bt_000'}}

# concat(axis=1) below aligns on the index and the PCA frames get a fresh
# RangeIndex, so both sets need a clean 0..n-1 index first.
X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)

for group_i in sets:
    group = sets[group_i]
    # Sort before dropping the first element: the iteration order of a set of
    # strings changes between interpreter sessions, which made the retained
    # attribute (and therefore the results) differ from run to run.
    # NOTE(review): the first attribute is kept as-is and only the remaining
    # ones go through PCA; for two-attribute groups PCA therefore sees a
    # single column. Confirm this is intended.
    group_list = sorted(group)
    group_list.pop(0)
    pca = PCA(n_components=1).fit(X_train[group_list])
    new = pca.transform(X_train[group_list])
    new_test = pca.transform(X_test[group_list])

    # The component column is named after the group id ('0', '1', '2').
    new = pd.DataFrame(data=new, columns=['{}'.format(group_i)])
    new_test = pd.DataFrame(data=new_test, columns=['{}'.format(group_i)])
    X_train = pd.concat([X_train, new], axis=1)
    X_test = pd.concat([X_test, new_test], axis=1)
    X_train, X_test = X_train.drop(columns=group_list), X_test.drop(columns=group_list)

# NOTE(review): 'cd_000' is dropped without explanation -- presumably an
# uninformative column; TODO confirm.
X_train, X_test = X_train.drop(columns=['cd_000']), X_test.drop(columns=['cd_000'])
X_train, y_train = balance(X_train, y_train)



In [9]:
# KNN on the PCA-reduced, SMOTE-balanced data; the 'score' of every run is
# recorded under the 'PCA' label.
for k in ks:
    knn = KNeighborsClassifier(n_neighbors=k)
    res = aps_classifier_statistics(knn, X_train, X_test, y_train, y_test)
    results.update({res_i: {'PCA': res['score'], 'k': k}})
    res_i += 1
    print(res)

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9799122807017544, 'confusion_matrix': array([[11034,   169],
       [   60,   137]]), 'sensibility': 0.6954314720812182, 'specificity': 0.9849147549763456, 'auc': 0.8401731135287819, 'clf': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform'), 'score': 31690}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9647368421052631, 'confusion_matrix': array([[10832,   371],
       [   31,   166]]), 'sensibility': 0.8426395939086294, 'specificity': 0.9668838703918593, 'auc': 0.9047617321502445, 'clf': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=7, p=2,
           weights='uniform'), 'score': 19210}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9592982456140351, 'confusion_matrix': array([[10763,   440],
       [   

In [20]:
def normalize(X_train, X_test):
    """Apply scikit-learn's Normalizer to the training and test features.

    The Normalizer is fitted on `X_train` and then used to transform both
    sets.

    Returns
    -------
    tuple
        ``(X_train_norm, X_test_norm)``.
    """
    scaler = Normalizer()
    scaler.fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test)

# Pipeline for the normalization experiment:
# raw split -> mean imputation (training-set means) -> SMOTE -> Normalizer.
X_train, X_test, y_train, y_test = getData()
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)
X_train, y_train = balance(X_train, y_train)
X_train, X_test = normalize(X_train, X_test)



In [21]:
# KNN on the balanced and normalized data; the 'score' of every run is
# recorded under the 'Normalization' label.
for k in ks:
    knn = KNeighborsClassifier(n_neighbors=k)
    res = aps_classifier_statistics(knn, X_train, X_test, y_train, y_test)
    results.update({res_i: {'Normalization': res['score'], 'k': k}})
    res_i += 1
    print(res)

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9773684210526316, 'confusion_matrix': array([[11006,   197],
       [   61,   136]]), 'sensibility': 0.6903553299492385, 'specificity': 0.9824154244398822, 'auc': 0.8363853771945603, 'clf': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform'), 'score': 32470}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9604385964912281, 'confusion_matrix': array([[10782,   421],
       [   30,   167]]), 'sensibility': 0.8477157360406091, 'specificity': 0.9624207801481746, 'auc': 0.9050682580943917, 'clf': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=7, p=2,
           weights='uniform'), 'score': 19210}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9516666666666667, 'confusion_matrix': array([[10674,   529],
       [   

In [32]:
df = pd.DataFrame()
measures = pd.DataFrame.from_dict(results, "index")
# Copy the outer dict and every row: a plain `results_backup = results` only
# creates a second name for the same dict, so later changes to `results`
# would silently change the "backup" as well.
results_backup = {row_id: dict(row) for row_id, row in results.items()}

In [36]:
# Pivot all collected runs into one row per k and one column per experiment,
# then persist the table for plotting.
#
# NOTE(review): the 'Baseline' rows store res['auc'] while the later
# experiments store res['score'], so the columns of this table are not on the
# same scale -- confirm which metric the comparison should use.
filename = 'knn_comparison'
measures = pd.DataFrame.from_dict(results, "index").set_index('k')

# first() takes, for every k, the first non-null value of each column;
# dropna() keeps only the k values measured in every experiment, which is
# what the inner join was meant to achieve.
df = measures.groupby(level='k').first().dropna()

df.to_csv('plot_data/{}.csv'.format(filename))