In [39]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys
sys.path.append('..')
from utils_cd import (
        split_dataset,
        standard_deviation,
        plot_comparison_results,
        impute_values,
        plot_results,
        plot_param_improv,
        plot_results_from_csv,
        aps_classifier_statistics
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer, StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA

pd.options.display.max_columns = None
sns.set(style='darkgrid')

CLASS = 'class'
train = pd.read_csv('./aps_failure_training_set.csv',
                        skiprows=20,keep_default_na=False, na_values='na')
test = pd.read_csv('./aps_failure_test_set.csv',
                        skiprows=20,keep_default_na=False, na_values='na')

X_train, y_train = split_dataset(train, CLASS)
X_test, y_test = split_dataset(test, CLASS)
y_train = y_train.map({'pos': 1, 'neg': 0})
y_test = y_test.map({'pos': 1, 'neg': 0})

X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)

def normalize(X_train, X_test):
    normalizer = Normalizer().fit(X_train)

    X_train_norm = normalizer.transform(X_train)
    X_test_norm = normalizer.transform(X_test)
    
    return X_train_norm, X_test_norm

aps = pd.concat([X_train, X_test])

clf = DecisionTreeClassifier(random_state=42)
results = {}
res_i = 0

In [40]:
X_train, X_test = X_train.fillna(X_train.median()), X_test.fillna(X_train.median())

In [42]:
res = aps_classifier_statistics(clf, X_train, X_test, y_train, y_test)
results[res_i] = {'Price': res['score'], 'Transformation': 'Baseline'}
res_i += 1
print(res)

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9889375, 'confusion_matrix': array([[15561,    64],
       [  113,   262]]), 'sensibility': 0.6986666666666667, 'specificity': 0.995904, 'score': 57140}


In [34]:
X_train, X_test = X_train.drop(columns=['cd_000']), X_test.drop(columns=['cd_000'])

ValueError: labels ['cd_000'] not contained in axis

In [35]:
res = aps_classifier_statistics(clf, X_train, X_test, y_train, y_test)
results[res_i] = {'Price': res['score'], 'Transformation': 'Drop cd_000'}
res_i += 1
print(res)

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.989, 'confusion_matrix': array([[15556,    69],
       [  107,   268]]), 'sensibility': 0.7146666666666667, 'specificity': 0.995584, 'score': 54190}


In [37]:
sets = {0: {'bb_000', 'bv_000', 'bu_000', 'cq_000'}, 1: {'ah_000', 'bg_000'}, 2: {'bt_000', 'aa_000'}, 3: {'cf_000', 'co_000', 'ad_000'}}

X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)

for group_i in sets:
    group = sets[group_i]
    group_list = list(group)
    pca = PCA(n_components=1).fit(X_train[group_list])
    new = pca.transform(X_train[group_list])
    new_test = pca.transform(X_test[group_list])
    
    new = pd.DataFrame(data=new)
    new_test = pd.DataFrame(data=new_test)
    X_train = pd.concat([X_train, new], axis=1)
    X_test = pd.concat([X_test, new_test], axis=1)
    X_train, X_test = X_train.drop(columns=group_list), X_test.drop(columns=group_list)

KeyError: "['bv_000' 'bu_000' 'bb_000' 'cq_000'] not in index"

In [38]:
res = aps_classifier_statistics(clf, X_train, X_test, y_train, y_test)
results[res_i] = {'Price': res['score'], 'Transformation': 'Drop cd_000'}
res_i += 1
print(res)

{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9885, 'confusion_matrix': array([[15553,    72],
       [  112,   263]]), 'sensibility': 0.7013333333333334, 'specificity': 0.995392, 'score': 56720}


In [16]:
balancers = [(RandomUnderSampler(random_state=42), 'Undersample 50-50'), (RandomUnderSampler(ratio=0.3, random_state=42), 'Undersample 70-30'), (SMOTE(ratio=1.0, random_state=42), 'SMOTE 50-50'), (SMOTE(ratio=0.3, random_state=42), 'SMOTE 70-30')]
filename = 'nb_balancing'
measures = {}
i = 0

for balancer, name in balancers:
    X_train_bal, y_train_bal = balancer.fit_sample(X_train, y_train)
    res = aps_classifier_statistics(clf, X_train_bal, X_test, y_train_bal, y_test)
    print(res)
    
    measures[i] = {'Price': res['score'], 'Technique': name}
    i += 1



{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.929125, 'confusion_matrix': array([[14513,  1112],
       [   22,   353]]), 'sensibility': 0.9413333333333334, 'specificity': 0.928832, 'score': 22120}




{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9596875, 'confusion_matrix': array([[15018,   607],
       [   38,   337]]), 'sensibility': 0.8986666666666666, 'specificity': 0.961152, 'score': 25070}




{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9843125, 'confusion_matrix': array([[15478,   147],
       [  104,   271]]), 'sensibility': 0.7226666666666667, 'specificity': 0.990592, 'score': 53470}




{'predicted': array([0, 0, 0, ..., 0, 0, 0]), 'accuracy': 0.9869375, 'confusion_matrix': array([[15507,   118],
       [   91,   284]]), 'sensibility': 0.7573333333333333, 'specificity': 0.992448, 'score': 46680}
