In [82]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys
sys.path.append('..')
from utils_cd import (
        split_dataset,
        standard_deviation,
        plot_comparison_results,
        impute_values,
        plot_results,
        plot_param_improv,
        plot_results_from_csv,
        aps_classifier_statistics
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer, StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA

# Show every column when a wide frame is displayed, and use seaborn's dark grid style.
pd.set_option('display.max_columns', None)
sns.set(style='darkgrid')

# Name of the label column passed to split_dataset.
CLASS = 'class'

# Both CSV files are parsed identically: the first 20 lines are skipped and the
# literal token 'na' is the only value treated as missing.
train, test = [
    pd.read_csv(path, skiprows=20, keep_default_na=False, na_values='na')
    for path in ('./aps_failure_training_set.csv', './aps_failure_test_set.csv')
]

# Training rows are kept only if they hold at least 150 non-missing values;
# the test frame is passed through unfiltered.
# NOTE(review): split_dataset is a project helper; presumably it returns
# (features, labels) for the given label column - confirm in utils_cd.
X_train, y_train = split_dataset(train.dropna(thresh=150), CLASS)
X_test, y_test = split_dataset(test, CLASS)

# Encode the string labels as integers: 'pos' -> 1, 'neg' -> 0.
y_train, y_test = [labels.map({'pos': 1, 'neg': 0}) for labels in (y_train, y_test)]

# Give both feature frames a fresh 0..n-1 index.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

def normalize(X_train, X_test):
    """Run scikit-learn's ``Normalizer`` over both data splits.

    The normalizer is fitted on ``X_train`` and the same fitted object is
    then used to transform ``X_train`` and ``X_test``.

    Args:
        X_train: training features.
        X_test: test features.

    Returns:
        tuple: ``(X_train_norm, X_test_norm)`` - the transformed splits, in
        that order.
    """
    fitted = Normalizer().fit(X_train)
    return fitted.transform(X_train), fitted.transform(X_test)

# Train and test features stacked into one frame (train rows first).
aps = pd.concat([X_train, X_test])

# k-NN with default parameters, reused by later cells.
clf = KNeighborsClassifier()
# Neighbour counts to sweep: 10, 20, ..., 100.
ks = 10 * np.arange(1, 11)
# `results` collects one record per evaluation, keyed by the running counter `res_i`.
results, res_i = {}, 0

In [83]:
# Fill missing values in both splits with the per-column means of the
# training data (computed once, before either frame is reassigned).
train_means = X_train.mean()
X_train, X_test = X_train.fillna(train_means), X_test.fillna(train_means)

In [85]:
# Evaluate k-NN on the imputed data for every neighbour count in `ks`,
# storing each score in `results` under the 'Baseline' label.
for k in ks:
    res = aps_classifier_statistics(
        KNeighborsClassifier(n_neighbors=k), X_train, X_test, y_train, y_test
    )
    results[res_i] = dict(Price=res['score'], Transformation='Baseline')
    res_i += 1
    print(res)

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9794375, 'confusion_matrix': array([[15597,    28],
       [  301,    74]]), 'sensibility': 0.19733333333333333, 'specificity': 0.998208, 'score': 150780}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.978625, 'confusion_matrix': array([[15615,    10],
       [  332,    43]]), 'sensibility': 0.11466666666666667, 'specificity': 0.99936, 'score': 166100}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9786875, 'confusion_matrix': array([[15616,     9],
       [  332,    43]]), 'sensibility': 0.11466666666666667, 'specificity': 0.999424, 'score': 166090}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9781875, 'confusion_matrix': array([[15616,     9],
       [  340,    35]]), 'sensibility': 0.09333333333333334, 'specificity': 0.999424, 'score': 170090}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.978, 'confusion_matrix': array([[15611,    14],
       [  338,    37]]), 'sensibil

In [None]:
# Dump every record collected so far (running key -> {'Price', 'Transformation'}).
print(results)

In [None]:
# Compare class-balancing techniques for k-NN across the neighbour counts in `ks`.
# NOTE(review): `ratio=` (imblearn) and `size=` (seaborn FacetGrid) are the
# keyword names of the library versions this notebook was written against;
# newer releases call them `sampling_strategy` and `height` - confirm against
# the installed versions.
balancers = [
    (RandomUnderSampler(random_state=42), 'Undersample 50-50'),
    (RandomUnderSampler(ratio=0.3, random_state=42), 'Undersample 70-30'),
    (SMOTE(ratio=1.0, random_state=42), 'SMOTE 50-50'),
    (SMOTE(ratio=0.3, random_state=42), 'SMOTE 70-30'),
]
filename = 'knn_balancing'
# Column holding the swept hyper-parameter; used as the x axis of the plot.
param = 'k'
measures = {}
i = 0

for balancer, name in balancers:
    # Resampling does not depend on k, so it is done once per technique.
    # `fit_resample` replaces `fit_sample` in newer imbalanced-learn releases.
    resample = getattr(balancer, 'fit_resample', None) or balancer.fit_sample
    X_train_bal, y_train_bal = resample(X_train, y_train)

    for k in ks:
        # Build a classifier for this k; the shared default `clf` would ignore it.
        res = aps_classifier_statistics(KNeighborsClassifier(n_neighbors=k),
                                        X_train_bal, X_test, y_train_bal, y_test)
        measures[i] = {param: k, 'Price': res['score'], 'Technique': name}
        i += 1

measures = pd.DataFrame.from_dict(measures, "index")
measures.to_csv('plot_data/{}.csv'.format(filename))

# FacetGrid creates its own figure, so no separate plt.figure() call is needed.
g = sns.FacetGrid(measures, hue="Technique", size=8)
g = g.map(plt.scatter, param, "Price").add_legend()
g = g.map(plt.plot, param, "Price")
g.axes[0,0].set_ylim(ymin=0)

plt.savefig('images/{}.pdf'.format(filename))
plt.clf()

#results[res_i] = {'Price': measures[0]['Price'], 'Transformation': 'Undersampling 70-30'}
#res_i += 1



In [69]:
def balance(X_train, y_train):
    """Undersample the training data with ``RandomUnderSampler``.

    The sampler is created with its default sampling target and a fixed
    ``random_state=42`` so repeated calls select the same rows.

    Args:
        X_train: training features.
        y_train: training labels aligned with ``X_train``.

    Returns:
        tuple: ``(X_train_bal, y_train_bal)`` - the resampled features and
        labels.
    """
    balancer = RandomUnderSampler(random_state=42)
    # `fit_sample` was replaced by `fit_resample` in newer imbalanced-learn
    # releases; prefer the new name and fall back for old installations.
    resample = getattr(balancer, 'fit_resample', None) or balancer.fit_sample
    X_train_bal, y_train_bal = resample(X_train, y_train)
    return X_train_bal, y_train_bal

# Undersampled copy of the current training data, used by the evaluation cells below.
X_train_bal, y_train_bal = balance(X_train, y_train)

In [70]:
# Score the shared `clf` trained on the undersampled data and record it
# under the 'Undersampling 50-50' label.
res = aps_classifier_statistics(
    clf, X_train_bal, X_test, y_train_bal, y_test
)
results[res_i] = dict(Price=res['score'], Transformation='Undersampling 50-50')
res_i += 1
print(res)

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9108125, 'confusion_matrix': array([[14214,  1411],
       [   16,   359]]), 'sensibility': 0.9573333333333334, 'specificity': 0.909696, 'score': 22110}


Drop cd_000. The results didn't get better; they stayed the same.

In [79]:
# Remove the cd_000 column from both splits, then rebuild the balanced
# training set from the reduced features.
# NOTE: this cell raises KeyError if re-run, because the column is already gone.
X_train, X_test = X_train.drop('cd_000', axis=1), X_test.drop('cd_000', axis=1)
X_train_bal, y_train_bal = balance(X_train, y_train)

In [73]:
# Score the shared `clf` on the balanced data with cd_000 removed.
res = aps_classifier_statistics(clf, X_train_bal, X_test, y_train_bal, y_test)
# Label the record after the step it measures; 'Baseline' is already used
# for the plain k sweep and would make the two indistinguishable.
results[res_i] = {'Price': res['score'], 'Transformation': 'Drop cd_000'}
res_i += 1
print(res)

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9108125, 'confusion_matrix': array([[14214,  1411],
       [   16,   359]]), 'sensibility': 0.9573333333333334, 'specificity': 0.909696, 'score': 22110}


In [74]:
#X_train, X_test = normalize(X_train, X_test)
#X_train_bal, y_train_bal = balance(X_train, y_train)

In [75]:
# Score the shared `clf` on the current balanced data and record the result.
# NOTE(review): this record is labelled 'Baseline' like the k-sweep records,
# and the normalize() calls in the preceding cell are commented out - confirm
# which transformation this evaluation is meant to represent.
res = aps_classifier_statistics(
    clf, X_train_bal, X_test, y_train_bal, y_test
)
results[res_i] = dict(Price=res['score'], Transformation='Baseline')
res_i += 1
print(res)

{'predicted': array([0, 0, 1, ..., 0, 0, 0]), 'accuracy': 0.8745, 'confusion_matrix': array([[13627,  1998],
       [   10,   365]]), 'sensibility': 0.9733333333333334, 'specificity': 0.872128, 'score': 24980}


In [80]:
# Feature groups that are each replaced by their first principal component.
# NOTE(review): presumably these are groups of strongly correlated columns - confirm.
sets = {0: {'bb_000', 'bv_000', 'bu_000', 'cq_000'}, 1: {'ah_000', 'bg_000'}, 2: {'bt_000', 'aa_000'}, 3: {'cf_000', 'co_000', 'ad_000'}}

for group_i in sets:
    group = sets[group_i]
    # Sets have no stable iteration order for strings; sort so the column
    # order fed to PCA is the same on every run.
    group_list = sorted(group)
    # PCA is fitted on the training split only and then applied to both splits.
    # random_state fixes the seed for the case where a randomized SVD solver is selected.
    pca = PCA(n_components=1, random_state=42).fit(X_train[group_list])

    # Give each component a unique string name. An unnamed frame would call
    # every component column `0`, producing duplicate, integer-typed labels
    # next to the string feature names.
    component = 'pca_group_{}'.format(group_i)
    # Reuse each frame's own index so concat(axis=1) aligns row for row.
    new = pd.DataFrame(pca.transform(X_train[group_list]),
                       columns=[component], index=X_train.index)
    new_test = pd.DataFrame(pca.transform(X_test[group_list]),
                            columns=[component], index=X_test.index)

    # Swap the original group columns for the single component column.
    X_train = pd.concat([X_train.drop(columns=group_list), new], axis=1)
    X_test = pd.concat([X_test.drop(columns=group_list), new_test], axis=1)

X_train_bal, y_train_bal = balance(X_train, y_train)

In [81]:
# Score a 100-tree random forest on the balanced, PCA-compressed data.
# The seed is fixed (42, as for the samplers in this notebook) so the score
# is reproducible between runs.
res = aps_classifier_statistics(RandomForestClassifier(n_estimators=100, random_state=42),
                                X_train_bal, X_test, y_train_bal, y_test)
# Label the record after the step it measures instead of reusing 'Baseline'.
results[res_i] = {'Price': res['score'], 'Transformation': 'PCA groups (Random Forest)'}
res_i += 1
print(res)

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9373125, 'confusion_matrix': array([[14630,   995],
       [    8,   367]]), 'sensibility': 0.9786666666666667, 'specificity': 0.93632, 'score': 13950}
