In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys
sys.path.append('..')
from utils_cd import (
        split_dataset,
        standard_deviation,
        plot_comparison_results,
        impute_values,
        plot_results,
        plot_param_improv
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer, StandardScaler
from imblearn.over_sampling import SMOTE

# Show every column when a DataFrame is displayed, and use seaborn's dark grid for plots.
pd.options.display.max_columns = None
sns.set(style='darkgrid')

# Fixed seed so the randomized estimators below produce the same scores on every
# Restart & Run All instead of drifting from run to run.
SEED = 42

# Baseline classifiers handed to the comparison plots.
base_clfs = [
    BernoulliNB(),
    DecisionTreeClassifier(random_state=SEED),
    KNeighborsClassifier(),
    RandomForestClassifier(n_estimators=100, random_state=SEED),
]

def print_missing_percentage(df, name='Dataset'):
    """Print the percentage of rows of ``df`` that contain at least one missing value.

    Args:
        df: DataFrame to inspect.
        name: Label placed at the start of the printed line.

    A frame with zero rows is reported as 0.0 instead of raising
    ``ZeroDivisionError``.
    """
    total_rows = df.shape[0]
    if total_rows == 0:
        # No rows means no incomplete rows; also avoids dividing by zero.
        percentage = 0.0
    else:
        # dropna() keeps only fully populated rows, so the difference is the
        # number of rows with at least one missing value.
        incomplete_rows = total_rows - df.dropna().shape[0]
        percentage = incomplete_rows / total_rows * 100
    print('{} missing values percentage: {}'.format(name, percentage))

# Name of the target column in both CSV files.
CLASS = 'class'

# Both files are read the same way: the first 20 lines are skipped and the literal
# string 'na' is the only token parsed as a missing value.
train = pd.read_csv(
    './aps_failure_training_set.csv',
    skiprows=20,
    keep_default_na=False,
    na_values='na',
)
test = pd.read_csv(
    './aps_failure_test_set.csv',
    skiprows=20,
    keep_default_na=False,
    na_values='na',
)

print_missing_percentage(train, 'Train')
print_missing_percentage(test, 'Test')

# split_dataset comes from utils_cd; presumably it separates the feature columns
# from the CLASS column -- verify against that module.
# The 'pos' / 'neg' labels are then encoded as 1 / 0.
X_train, y_train = split_dataset(train, CLASS)
y_train = y_train.map({'pos': 1, 'neg': 0})

X_test, y_test = split_dataset(test, CLASS)
y_test = y_test.map({'pos': 1, 'neg': 0})

# Train and test features stacked row-wise, used for whole-dataset column analysis.
aps = pd.concat([X_train, X_test])

Train missing values percentage: 99.015
Test missing values percentage: 98.96875


In [2]:
def attribute_corr(df, top):
    """Return the ``top`` largest absolute pairwise correlations between columns of ``df``.

    Args:
        df: DataFrame whose columns are correlated with ``DataFrame.corr``.
        top: Number of leading entries to keep after sorting.

    Returns:
        A Series indexed by (column, column) pairs, sorted by absolute
        correlation in descending order. Each unordered pair of distinct
        columns appears once.
    """
    columns = df.columns

    # Labels to discard: the diagonal (a column with itself) and the mirrored half
    # of the symmetric matrix, i.e. every column paired with itself or an earlier one.
    redundant = {
        (later, earlier)
        for position, later in enumerate(columns)
        for earlier in columns[:position + 1]
    }

    flat_corr = df.corr().abs().unstack()
    ranked = flat_corr.drop(labels=redundant).sort_values(ascending=False)
    return ranked[0:top]

# Keep the 11 strongest pairs; reset_index turns the pair index into the
# 'level_0' / 'level_1' columns, with the correlation value in column 0.
correlated_features = attribute_corr(aps, 11).reset_index()
print(correlated_features)

   level_0 level_1    0
0   bb_000  bv_000  1.0
1   ah_000  bg_000  1.0
2   aa_000  bt_000  1.0
3   bv_000  cq_000  1.0
4   bb_000  cq_000  1.0
5   bu_000  cq_000  1.0
6   bu_000  bv_000  1.0
7   bb_000  bu_000  1.0
8   cf_000  co_000  1.0
9   ad_000  cf_000  1.0
10  ad_000  co_000  1.0


In [3]:
def _merge_correlated_pairs(pairs):
    """Group attributes linked by correlated pairs into disjoint groups.

    Args:
        pairs: Iterable of (attr1, attr2) tuples.

    Returns:
        A list of groups. Each group is a list of attribute names in the order
        they were first seen. A pair that links two groups that were separate
        so far merges them into a single group.
    """
    groups = []
    for attr1, attr2 in pairs:
        linked = [group for group in groups if attr1 in group or attr2 in group]
        if linked:
            target, bridged = linked[0], linked[1:]
            # The pair connects previously separate groups: fold them into the first.
            for other in bridged:
                for attr in other:
                    if attr not in target:
                        target.append(attr)
            groups = [group for group in groups
                      if not any(group is other for other in bridged)]
        else:
            target = []
            groups.append(target)
        for attr in (attr1, attr2):
            if attr not in target:
                target.append(attr)
    return groups


# Groups of attributes connected through the correlated pairs, in first-seen order.
attr_groups = _merge_correlated_pairs(
    zip(correlated_features['level_0'], correlated_features['level_1'])
)
equal_attrs = {group_id: set(group) for group_id, group in enumerate(attr_groups)}
sets = len(equal_attrs)

all_attrs = [attr for group in attr_groups for attr in group]

# From each group keep the attribute with the fewest missing values. min() returns
# the first minimum, so ties go to the attribute seen first in correlated_features;
# this keeps the choice identical across kernel restarts (set order is not).
missing_counts = aps[all_attrs].isna().sum()
best_attrs = [
    min(group, key=lambda attr: missing_counts[attr])
    for group in attr_groups
]

excluded_columns = [item for item in all_attrs if item not in best_attrs]
print('Attribute sets {}'.format(equal_attrs))
print('Best attributes to select {}'.format(best_attrs))
print('Excluded columns {}'.format(excluded_columns))

# Drop the redundant columns from the combined frame and from both splits.
aps = aps.drop(columns=excluded_columns)
X_train = X_train.drop(columns=excluded_columns)
X_test = X_test.drop(columns=excluded_columns)

Attribute sets {0: {'bb_000', 'cq_000', 'bu_000', 'bv_000'}, 1: {'ah_000', 'bg_000'}, 2: {'aa_000', 'bt_000'}, 3: {'cf_000', 'co_000', 'ad_000'}}
Best attributes to select ['bb_000', 'bg_000', 'aa_000', 'cf_000']
Excluded columns ['cq_000', 'bu_000', 'bv_000', 'ah_000', 'bt_000', 'co_000', 'ad_000']


In [4]:
# Columns of the combined frame for which the standard_deviation helper (from
# utils_cd) returns exactly 0.
no_std_dev = [col for col in aps if standard_deviation(aps[col]) == 0]

print('Attributes with 0 standard deviation {}'.format(no_std_dev))

# Remove those columns from the combined frame and from both splits.
aps = aps.drop(columns=no_std_dev)
X_train = X_train.drop(columns=no_std_dev)
X_test = X_test.drop(columns=no_std_dev)

Attributes with 0 standard deviation ['cd_000']


In [None]:
# Fill statistics are computed on the training split only and reused for the test split.
train_means = X_train.mean()
train_medians = X_train.median()

# Most frequent observed value per column (mode() ignores NaN by default; the first
# entry is taken when several values tie). Columns with no observed values are
# skipped and therefore left unfilled.
train_modes = {}
for col in X_train:
    col_modes = X_train[col].mode()
    if not col_modes.empty:
        train_modes[col] = col_modes.iloc[0]

# fillna returns new frames, so X_train / X_test themselves are never modified and
# this cell gives the same result every time it is run.
X_train_zero, X_test_zero = X_train.fillna(0), X_test.fillna(0)
X_train_mean, X_test_mean = X_train.fillna(train_means), X_test.fillna(train_means)
X_train_median, X_test_median = X_train.fillna(train_medians), X_test.fillna(train_medians)
X_train_mfrequent, X_test_mfrequent = X_train.fillna(train_modes), X_test.fillna(train_modes)

# Imputation technique name -> (train, test) frames to compare.
X_data = {
    'Zero replace': (X_train_zero, X_test_zero),
    'Mean': (X_train_mean, X_test_mean),
    'Median': (X_train_median, X_test_median),
    'Most Frequent': (X_train_mfrequent, X_test_mfrequent),
}
plot_comparison_results(base_clfs, X_data, y_train, y_test, technique='Technique', filename='missing_values', figsize=(20, 6))