In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys
sys.path.append('..')
from utils_cd import (
        split_dataset,
        standard_deviation,
        plot_comparison_results,
        impute_values,
        plot_results,
        plot_param_improv,
        plot_results_from_csv,
        aps_classifier_statistics
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer, StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA

pd.options.display.max_columns = None
sns.set(style='darkgrid')

CLASS = 'class'
train = pd.read_csv('./aps_failure_training_set.csv',
                        skiprows=20,keep_default_na=False, na_values='na')
test = pd.read_csv('./aps_failure_test_set.csv',
                        skiprows=20,keep_default_na=False, na_values='na')

X_train, y_train = split_dataset(train, CLASS)
X_test, y_test = split_dataset(test, CLASS)
y_train = y_train.map({'pos': 1, 'neg': 0})
y_test = y_test.map({'pos': 1, 'neg': 0})

X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)

def normalize(X_train, X_test):
    normalizer = Normalizer().fit(X_train)

    X_train_norm = normalizer.transform(X_train)
    X_test_norm = normalizer.transform(X_test)
    
    return X_train_norm, X_test_norm

aps = pd.concat([X_train, X_test])

clf = KNeighborsClassifier()
ks = np.arange(11, 102, 10)
results = {}
res_i = 0

In [16]:
X_train, X_test = X_train.fillna(X_train.mean()), X_test.fillna(X_train.mean())

In [27]:
for k in ks:
    res = aps_classifier_statistics(KNeighborsClassifier(n_neighbors=k), X_train, X_test, y_train, y_test)
    results[res_i] = {'Price': res['score'], 'Transformation': 'Baseline', 'k': k}
    res_i += 1
    print(res)

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.982625, 'confusion_matrix': array([[15595,    30],
       [  248,   127]]), 'sensibility': 0.33866666666666667, 'specificity': 0.99808, 'score': 124300}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9815625, 'confusion_matrix': array([[15600,    25],
       [  270,   105]]), 'sensibility': 0.28, 'specificity': 0.9984, 'score': 135250}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9805, 'confusion_matrix': array([[15602,    23],
       [  289,    86]]), 'sensibility': 0.22933333333333333, 'specificity': 0.998528, 'score': 144730}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.979625, 'confusion_matrix': array([[15599,    26],
       [  300,    75]]), 'sensibility': 0.2, 'specificity': 0.998336, 'score': 150260}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9793125, 'confusion_matrix': array([[15596,    29],
       [  302,    73]]), 'sensibility': 0.19466666666666665, 'speci

In [4]:
"""balancers = [(RandomUnderSampler(random_state=42), 'Undersample 50-50'), (RandomUnderSampler(ratio=0.3, random_state=42), 'Undersample 70-30'), (SMOTE(ratio=1.0, random_state=42), 'SMOTE 50-50'), (SMOTE(ratio=0.3, random_state=42), 'SMOTE 70-30')]
filename = 'knn_balancing'
measures = {}
i = 0

for k in ks:

    for balancer, name in balancers:
        X_train_bal, y_train_bal = balancer.fit_sample(X_train, y_train)
        res = aps_classifier_statistics(KNeighborsClassifier(n_neighbors=k), X_train_bal, X_test, y_train_bal, y_test)
        measures[i] = {'Price': res['score'], 'Technique': name, 'k': k}
        i += 1
        
plt.figure(figsize=(14,6))
measures = pd.DataFrame.from_dict(measures, "index")
measures.to_csv('plot_data/{}.csv'.format(filename))
    
g = sns.FacetGrid(measures, hue="Technique", size=8)
g = g.map(plt.scatter, "k", "Price").add_legend()
g = g.map(plt.plot, "k", "Price")
g.axes[0,0].set_ylim(ymin=0)

plt.savefig('images/{}.pdf'.format(filename))
plt.clf()"""

#results[res_i] = {'Price': measures[0]['Price'], 'Transformation': 'Undersampling 70-30'}
#res_i += 1"""

'balancers = [(RandomUnderSampler(random_state=42), \'Undersample 50-50\'), (RandomUnderSampler(ratio=0.3, random_state=42), \'Undersample 70-30\'), (SMOTE(ratio=1.0, random_state=42), \'SMOTE 50-50\'), (SMOTE(ratio=0.3, random_state=42), \'SMOTE 70-30\')]\nfilename = \'knn_balancing\'\nmeasures = {}\ni = 0\n\nfor k in ks:\n\n    for balancer, name in balancers:\n        X_train_bal, y_train_bal = balancer.fit_sample(X_train, y_train)\n        res = aps_classifier_statistics(KNeighborsClassifier(n_neighbors=k), X_train_bal, X_test, y_train_bal, y_test)\n        measures[i] = {\'Price\': res[\'score\'], \'Technique\': name, \'k\': k}\n        i += 1\n        \nplt.figure(figsize=(14,6))\nmeasures = pd.DataFrame.from_dict(measures, "index")\nmeasures.to_csv(\'plot_data/{}.csv\'.format(filename))\n    \ng = sns.FacetGrid(measures, hue="Technique", size=8)\ng = g.map(plt.scatter, "k", "Price").add_legend()\ng = g.map(plt.plot, "k", "Price")\ng.axes[0,0].set_ylim(ymin=0)\n\nplt.savefig(\'im

In [17]:
def balance(X_train, y_train):
    balancer = SMOTE(ratio=1.0, random_state=42)
    X_train_bal, y_train_bal = balancer.fit_sample(X_train, y_train)
    return X_train_bal, y_train_bal

X_train_bal, y_train_bal = balance(X_train, y_train)



In [6]:
for k in ks:
    res = aps_classifier_statistics(KNeighborsClassifier(n_neighbors=k), X_train_bal, X_test, y_train_bal, y_test)
    results[res_i] = {'Price': res['score'], 'Transformation': 'SMOTE 50-50', 'k':k}
    res_i += 1
    print(res)

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9665625, 'confusion_matrix': array([[15127,   498],
       [   37,   338]]), 'sensibility': 0.9013333333333333, 'specificity': 0.968128, 'score': 23480}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9606875, 'confusion_matrix': array([[15021,   604],
       [   25,   350]]), 'sensibility': 0.9333333333333333, 'specificity': 0.961344, 'score': 18540}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.956125, 'confusion_matrix': array([[14941,   684],
       [   18,   357]]), 'sensibility': 0.952, 'specificity': 0.956224, 'score': 15840}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9533125, 'confusion_matrix': array([[14895,   730],
       [   17,   358]]), 'sensibility': 0.9546666666666667, 'specificity': 0.95328, 'score': 15800}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9508125, 'confusion_matrix': array([[14854,   771],
       [   16,   359]]), 'sensibility': 0.957333333

In [8]:
print(results)

{0: {'Price': 116890, 'Transformation': 'Baseline', 'k': 11}, 1: {'Price': 128840, 'Transformation': 'Baseline', 'k': 21}, 2: {'Price': 136830, 'Transformation': 'Baseline', 'k': 31}, 3: {'Price': 142330, 'Transformation': 'Baseline', 'k': 41}, 4: {'Price': 146840, 'Transformation': 'Baseline', 'k': 51}, 5: {'Price': 154330, 'Transformation': 'Baseline', 'k': 61}, 6: {'Price': 157300, 'Transformation': 'Baseline', 'k': 71}, 7: {'Price': 159260, 'Transformation': 'Baseline', 'k': 81}, 8: {'Price': 161760, 'Transformation': 'Baseline', 'k': 91}, 9: {'Price': 162730, 'Transformation': 'Baseline', 'k': 101}, 10: {'Price': 23480, 'Transformation': 'SMOTE 50-50', 'k': 11}, 11: {'Price': 18540, 'Transformation': 'SMOTE 50-50', 'k': 21}, 12: {'Price': 15840, 'Transformation': 'SMOTE 50-50', 'k': 31}, 13: {'Price': 15800, 'Transformation': 'SMOTE 50-50', 'k': 41}, 14: {'Price': 15710, 'Transformation': 'SMOTE 50-50', 'k': 51}, 15: {'Price': 16040, 'Transformation': 'SMOTE 50-50', 'k': 61}, 16: 

Drop cd_000. THe results didn't get better, stayed the same

In [20]:
X_train_pca, X_test_pca = X_train.drop(columns=['cd_000']), X_test.drop(columns=['cd_000'])
sets = {0: {'bb_000', 'bv_000', 'bu_000', 'cq_000'}, 1: {'ah_000', 'bg_000'}, 2: {'bt_000', 'aa_000'}, 3: {'cf_000', 'co_000', 'ad_000'}}

for group_i in sets:
    group = sets[group_i]
    group_list = list(group)
    pca = PCA(n_components=1).fit(X_train_pca[group_list])
    new = pca.transform(X_train_pca[group_list])
    new_test = pca.transform(X_test_pca[group_list])
    
    new = pd.DataFrame(data=new)
    new_test = pd.DataFrame(data=new_test)
    X_train_pca = pd.concat([X_train_pca, new], axis=1)
    X_test_pca = pd.concat([X_test_pca, new_test], axis=1)
    X_train_pca, X_test_pca = X_train_pca.drop(columns=group_list), X_test_pca.drop(columns=group_list)

X_train_bal, y_train_bal = balance(X_train_pca, y_train)



In [21]:
for k in ks:
    res = aps_classifier_statistics(KNeighborsClassifier(n_neighbors=k), X_train_bal, X_test_pca, y_train_bal, y_test)
    results[res_i] = {'Price': res['score'], 'Transformation': 'PCA', 'k':k}
    res_i += 1
    print(res)

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9665625, 'confusion_matrix': array([[15127,   498],
       [   37,   338]]), 'sensibility': 0.9013333333333333, 'specificity': 0.968128, 'score': 23480}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9606875, 'confusion_matrix': array([[15021,   604],
       [   25,   350]]), 'sensibility': 0.9333333333333333, 'specificity': 0.961344, 'score': 18540}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9561875, 'confusion_matrix': array([[14942,   683],
       [   18,   357]]), 'sensibility': 0.952, 'specificity': 0.956288, 'score': 15830}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.953375, 'confusion_matrix': array([[14896,   729],
       [   17,   358]]), 'sensibility': 0.9546666666666667, 'specificity': 0.953344, 'score': 15790}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9508125, 'confusion_matrix': array([[14854,   771],
       [   16,   359]]), 'sensibility': 0.95733333

In [22]:
print(results)

{0: {'Price': 23480, 'Transformation': 'PCA', 'k': 11}, 1: {'Price': 18540, 'Transformation': 'PCA', 'k': 21}, 2: {'Price': 15830, 'Transformation': 'PCA', 'k': 31}, 3: {'Price': 15790, 'Transformation': 'PCA', 'k': 41}, 4: {'Price': 15710, 'Transformation': 'PCA', 'k': 51}, 5: {'Price': 16040, 'Transformation': 'PCA', 'k': 61}, 6: {'Price': 15230, 'Transformation': 'PCA', 'k': 71}, 7: {'Price': 15350, 'Transformation': 'PCA', 'k': 81}, 8: {'Price': 15430, 'Transformation': 'PCA', 'k': 91}, 9: {'Price': 15530, 'Transformation': 'PCA', 'k': 101}, 10: {'Price': 23480, 'Transformation': 'SMOTE 50-50', 'k': 11}, 11: {'Price': 18540, 'Transformation': 'SMOTE 50-50', 'k': 21}, 12: {'Price': 15840, 'Transformation': 'SMOTE 50-50', 'k': 31}, 13: {'Price': 15800, 'Transformation': 'SMOTE 50-50', 'k': 41}, 14: {'Price': 15710, 'Transformation': 'SMOTE 50-50', 'k': 51}, 15: {'Price': 16040, 'Transformation': 'SMOTE 50-50', 'k': 61}, 16: {'Price': 15230, 'Transformation': 'SMOTE 50-50', 'k': 71}, 

In [23]:
X_train, X_test = normalize(X_train, X_test)
X_train_bal, y_train_bal = balance(X_train, y_train)



In [24]:
for k in ks:
    res = aps_classifier_statistics(KNeighborsClassifier(n_neighbors=k), X_train_bal, X_test, y_train_bal, y_test)
    results[res_i] = {'Price': res['score'], 'Transformation': 'Normalization', 'k':k}
    res_i += 1
    print(res)

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.961375, 'confusion_matrix': array([[15042,   583],
       [   35,   340]]), 'sensibility': 0.9066666666666666, 'specificity': 0.962688, 'score': 23330}
{'predicted': array([1, 0, 1, ..., 0, 0, 0]), 'accuracy': 0.9529375, 'confusion_matrix': array([[14896,   729],
       [   24,   351]]), 'sensibility': 0.936, 'specificity': 0.953344, 'score': 19290}
{'predicted': array([0, 0, 1, ..., 0, 0, 0]), 'accuracy': 0.9455625, 'confusion_matrix': array([[14772,   853],
       [   18,   357]]), 'sensibility': 0.952, 'specificity': 0.945408, 'score': 17530}
{'predicted': array([0, 0, 1, ..., 0, 0, 0]), 'accuracy': 0.940125, 'confusion_matrix': array([[14685,   940],
       [   18,   357]]), 'sensibility': 0.952, 'specificity': 0.93984, 'score': 18400}
{'predicted': array([0, 0, 1, ..., 0, 0, 0]), 'accuracy': 0.9360625, 'confusion_matrix': array([[14615,  1010],
       [   13,   362]]), 'sensibility': 0.9653333333333334, 'specificity': 0.9

In [25]:
print(results)

{0: {'Price': 23480, 'Transformation': 'PCA', 'k': 11}, 1: {'Price': 18540, 'Transformation': 'PCA', 'k': 21}, 2: {'Price': 15830, 'Transformation': 'PCA', 'k': 31}, 3: {'Price': 15790, 'Transformation': 'PCA', 'k': 41}, 4: {'Price': 15710, 'Transformation': 'PCA', 'k': 51}, 5: {'Price': 16040, 'Transformation': 'PCA', 'k': 61}, 6: {'Price': 15230, 'Transformation': 'PCA', 'k': 71}, 7: {'Price': 15350, 'Transformation': 'PCA', 'k': 81}, 8: {'Price': 15430, 'Transformation': 'PCA', 'k': 91}, 9: {'Price': 15530, 'Transformation': 'PCA', 'k': 101}, 10: {'Price': 23330, 'Transformation': 'Normalization', 'k': 11}, 11: {'Price': 19290, 'Transformation': 'Normalization', 'k': 21}, 12: {'Price': 17530, 'Transformation': 'Normalization', 'k': 31}, 13: {'Price': 18400, 'Transformation': 'Normalization', 'k': 41}, 14: {'Price': 16600, 'Transformation': 'Normalization', 'k': 51}, 15: {'Price': 16600, 'Transformation': 'Normalization', 'k': 61}, 16: {'Price': 18070, 'Transformation': 'Normalizatio

In [26]:
plt.figure(figsize=(16,6))
filename = 'knn_transformations'
measures = pd.DataFrame.from_dict(results, "index")
measures.to_csv('plot_data/{}.csv'.format(filename))
    
g = sns.FacetGrid(measures, hue="Transformation", size=8)
g = g.map(plt.scatter, "k", "Price").add_legend()
g = g.map(plt.plot, "k", "Price")
g.axes[0,0].set_ylim(ymin=0)

plt.savefig('images/{}.pdf'.format(filename))
plt.clf()



<Figure size 1152x432 with 0 Axes>

<Figure size 692.725x576 with 0 Axes>