In [234]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys
sys.path.append('..')
from utils_cd import (
        split_dataset,
        standard_deviation,
        plot_comparison_results,
        impute_values,
        plot_results,
        plot_param_improv,
        plot_results_from_csv,
        aps_classifier_statistics
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer, StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA

pd.options.display.max_columns = None
sns.set(style='darkgrid')

CLASS = 'class'
train = pd.read_csv('./aps_failure_training_set.csv',
                        skiprows=20,keep_default_na=False, na_values='na')
test = pd.read_csv('./aps_failure_test_set.csv',
                        skiprows=20,keep_default_na=False, na_values='na')

train = train.dropna(thresh=150)

X_train, y_train = split_dataset(train, CLASS)
X_test, y_test = split_dataset(test, CLASS)
y_train = y_train.map({'pos': 1, 'neg': 0})
y_test = y_test.map({'pos': 1, 'neg': 0})

aps = pd.concat([X_train, X_test])

clf = GaussianNB()
results = {}
res_i = 0

Fill with 0

In [235]:
X_train, X_test = X_train.fillna(0), X_test.fillna(0)

In [236]:
res = aps_classifier_statistics(clf, X_train, X_test, y_train, y_test)
results[res_i] = {'Price': res['score'], 'Transformation': 'Baseline'}
res_i += 1
print(res)

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9605, 'confusion_matrix': array([[15013,   612],
       [   20,   355]]), 'sensibility': 0.9466666666666667, 'specificity': 0.960832, 'score': 16120}


In [231]:
X_train, X_test = X_train.drop(columns=['cd_000']), X_test.drop(columns=['cd_000'])

Drop column with no stddev

In [232]:
res = aps_classifier_statistics(clf, X_train, X_test, y_train, y_test)
results[res_i] = {'Price': res['score'], 'Transformation': 'Drop cd_000'}
res_i += 1
print(res)

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.96125, 'confusion_matrix': array([[15026,   599],
       [   21,   354]]), 'sensibility': 0.944, 'specificity': 0.961664, 'score': 16490}


In [237]:
sets = {0: {'bb_000', 'bv_000', 'bu_000', 'cq_000'}, 1: {'ah_000', 'bg_000'}, 2: {'bt_000', 'aa_000'}, 3: {'cf_000', 'co_000', 'ad_000'}}

X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)

for group_i in sets:
    group = sets[group_i]
    group_list = list(group)
    pca = PCA(n_components=1).fit(X_train[group_list])
    new = pca.transform(X_train[group_list])
    new_test = pca.transform(X_test[group_list])
    
    new = pd.DataFrame(data=new)
    new_test = pd.DataFrame(data=new_test)
    X_train = pd.concat([X_train, new], axis=1)
    X_test = pd.concat([X_test, new_test], axis=1)
    #X_train, X_test = X_train.drop(columns=group_list), X_test.drop(columns=group_list)
    
#X_train, X_test = X_train.drop(columns=excluded_columns), X_test.drop(columns=excluded_columns)

In [181]:
X_train.shape[1]

175

In [238]:
res = aps_classifier_statistics(clf, X_train, X_test, y_train, y_test)
results[res_i] = {'Price': res['score'], 'Transformation': 'PCA Attributes'}
res_i += 1
print(res)

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.959, 'confusion_matrix': array([[14989,   636],
       [   20,   355]]), 'sensibility': 0.9466666666666667, 'specificity': 0.959296, 'score': 16360}


Heuristic line removal

In [239]:
balancers = [(RandomUnderSampler(random_state=42), 'Undersample 50-50'), (RandomUnderSampler(ratio=0.3, random_state=42), 'Undersample 70-30'), (SMOTE(ratio=1.0, random_state=42), 'SMOTE 50-50'), (SMOTE(ratio=0.3, random_state=42), 'SMOTE 70-30')]
filename = 'nb_balancing'
measures = {}
i = 0

for balancer, name in balancers:
    X_train_bal, y_train_bal = balancer.fit_sample(X_train, y_train)
    res = aps_classifier_statistics(clf, X_train_bal, X_test, y_train_bal, y_test)
    print(res)
    
    measures[i] = {'Price': res['score'], 'Technique': name}
    i += 1

results[res_i] = {'Price': measures[1]['Price'], 'Transformation': 'Undersampling 70-30'}
res_i += 1
"""
plt.figure(figsize=(12,6))
measures = pd.DataFrame.from_dict(measures, "index")
measures.to_csv('plot_data/{}.csv'.format(filename))
    
ax = sns.barplot(x='Technique', y='Price', data=measures)

for p in ax.patches:
    ax.text(p.get_x() + p.get_width()/2., p.get_height(), '{}'.format(int(p.get_height())), 
        fontsize=8, color='black', ha='center', va='bottom')

plt.savefig('images/{}.pdf'.format(filename))
plt.clf()
"""

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.957125, 'confusion_matrix': array([[14959,   666],
       [   20,   355]]), 'sensibility': 0.9466666666666667, 'specificity': 0.957376, 'score': 16660}
{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.96275, 'confusion_matrix': array([[15060,   565],
       [   31,   344]]), 'sensibility': 0.9173333333333333, 'specificity': 0.96384, 'score': 21150}




{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.957375, 'confusion_matrix': array([[14962,   663],
       [   19,   356]]), 'sensibility': 0.9493333333333334, 'specificity': 0.957568, 'score': 16130}




{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9573125, 'confusion_matrix': array([[14961,   664],
       [   19,   356]]), 'sensibility': 0.9493333333333334, 'specificity': 0.957504, 'score': 16140}


'\nplt.figure(figsize=(12,6))\nmeasures = pd.DataFrame.from_dict(measures, "index")\nmeasures.to_csv(\'plot_data/{}.csv\'.format(filename))\n    \nax = sns.barplot(x=\'Technique\', y=\'Price\', data=measures)\n\nfor p in ax.patches:\n    ax.text(p.get_x() + p.get_width()/2., p.get_height(), \'{}\'.format(int(p.get_height())), \n        fontsize=8, color=\'black\', ha=\'center\', va=\'bottom\')\n\nplt.savefig(\'images/{}.pdf\'.format(filename))\nplt.clf()\n'

In [205]:
final_results = pd.DataFrame.from_dict(results, "index")
#filename = 'nb_final_results'

plt.figure(figsize=(12,6))
#final_results.to_csv('plot_data/{}.csv'.format(filename))
    
ax = sns.barplot(x='Transformation', y='Price', data=final_results)

for p in ax.patches:
    ax.text(p.get_x() + p.get_width()/2., p.get_height(), '{}'.format(int(p.get_height())), 
        fontsize=8, color='black', ha='center', va='bottom')

#plt.savefig('images/{}.pdf'.format(filename))
plt.clf()

<Figure size 864x432 with 0 Axes>

In [224]:
print(final_results)

   Price       Transformation
0  16120             Baseline
1  16120          Drop cd_000
2  16400       PCA Attributes
3  21200  Undersampling 70-30


In [240]:
def bin_transform(X_train, X_test):
    bin_columns = ['ag_00', 'ay_00', 'az_00', 'ba_00', 'cn_00', 'cs_00', 'ee_00']

    for column in bin_columns:
        remove_cols = []
        first_column = '{}0'.format(column)
        remove_cols.append(first_column)
        new_col = X_train[first_column]
        new_col_test = X_test[first_column]
        #print(X_train[first_column].mode()[0])
        #print(X_train[first_column].value_counts()[0])

        for i in range(1, 10):
            col_name = '{}{}'.format(column, i)
            remove_cols.append(col_name)
            
        pca = PCA(0.95).fit(X_train[remove_cols])
        new = pca.transform(X_train[remove_cols])
        new_test = pca.transform(X_test[remove_cols])

        new = pd.DataFrame(data=new)
        new_test = pd.DataFrame(data=new_test)
        X_train = pd.concat([X_train, new], axis=1)
        X_test = pd.concat([X_test, new_test], axis=1)
        #X_train, X_test = X_train.drop(columns=group_list), X_test.drop(columns=group_list)

        
    return X_train, X_test

X_train, X_test = bin_transform(X_train, X_test)
X_train, y_train = RandomUnderSampler(ratio=0.3, random_state=42).fit_sample(X_train, y_train)



In [241]:
res = aps_classifier_statistics(clf, X_train, X_test, y_train, y_test)
results[res_i] = {'Price': res['score'], 'Transformation': 'PCA Attributes'}
res_i += 1
print(res)

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9624375, 'confusion_matrix': array([[15053,   572],
       [   29,   346]]), 'sensibility': 0.9226666666666666, 'specificity': 0.963392, 'score': 20220}
