In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys
sys.path.append('..')
from utils_cd import (
        split_dataset,
        standard_deviation,
        plot_comparison_results,
        impute_values,
        plot_results,
        plot_param_improv,
        plot_results_from_csv,
        aps_classifier_statistics,
        aps_score,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer, StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA
from sklearn.base import clone

pd.options.display.max_columns = None
sns.set(style='darkgrid')

CLASS = 'class'

def getData():
    train = pd.read_csv('./train.csv')
    test = pd.read_csv('./test.csv')

    X_train, y_train = split_dataset(train, CLASS)
    X_test, y_test = split_dataset(test, CLASS)
    y_train = y_train.map({'pos': 1, 'neg': 0})
    y_test = y_test.map({'pos': 1, 'neg': 0})
    
    return X_train, X_test, y_train, y_test

def getDataWithThresh():
    train = pd.read_csv('./train.csv')
    test = pd.read_csv('./test.csv')

    train = train.dropna(thresh=150)
    train = train.dropna(axis=1, thresh=train.shape[0]*0.9)
    test = test[train.columns]

    X_train, y_train = split_dataset(train, CLASS)
    X_test, y_test = split_dataset(test, CLASS)
    y_train = y_train.map({'pos': 1, 'neg': 0})
    y_test = y_test.map({'pos': 1, 'neg': 0})
    
    return X_train, X_test, y_train, y_test

def plot_comparisons(clf, data, filename='nb_comparison'):
    clf = clone(clf)
    sns.set(style='darkgrid')
    measures_dict = {}
    i = 0
    results = {}
    for technique in data:
        X_train, X_test, y_train = data[technique]
        print(technique)
        res = aps_classifier_statistics(clf, X_train, X_test, y_train, y_test)
        score = aps_score(res['confusion_matrix'])
        measures_dict[i] = {'Technique': technique, 'Price': score}
        i += 1
    
    measures = pd.DataFrame.from_dict(measures_dict, "index")
    measures.to_csv('plot_data/{}.csv'.format(filename), index=False)

    return results

def plot_overfit_comparisons(clf, data, filename='nb_comparison_overfit'):
    clf = clone(clf)
    sns.set(style='darkgrid')
    measures_dict = {}
    i = 0
    results = {}
    for technique in data:
        X_train, X_test, y_train = data[technique]
        print(technique)
        res_train = aps_classifier_statistics(clf, X_train, X_train, y_train, y_train)
        res_test = aps_classifier_statistics(clf, X_train, X_test, y_train, y_test)
        measures_dict[i] = {'Technique': technique, 'Test': 'Train', 'Accuracy': res_train['accuracy'], 'ROC AUC': res_train['auc']}
        i += 1
        measures_dict[i] = {'Technique': technique, 'Test': 'Test', 'Accuracy': res_test['accuracy'], 'ROC AUC': res_test['auc']}
        i += 1
                            
    measures = pd.DataFrame.from_dict(measures_dict, "index")
    measures.to_csv('plot_data/{}.csv'.format(filename), index=False)

    return results

data = {}

In [2]:
clf = GaussianNB()

In [3]:
X_train, X_test, y_train, y_test = getData()
X_train_median, X_test_median = X_train.fillna(X_train.median()), X_test.fillna(X_train.median())

data['Baseline'] = (X_train_median, X_test_median, y_train)

X_train, X_test, y_train, y_test = getDataWithThresh()
X_train_median, X_test_median = X_train.fillna(X_train.median()), X_test.fillna(X_train.median())

data['Column and Lines Drop'] =  (X_train_median, X_test_median, y_train)

In [4]:
plot_comparisons(clf, data, filename='nb_comparison')

Baseline
Column and Lines Drop


{}

In [4]:
def attribute_corr(df, top):
        
    def get_redundant_pairs(df):
        '''Get diagonal and lower triangular pairs of correlation matrix'''
        pairs_to_drop = set()
        cols = df.columns
        for i in range(0, df.shape[1]):
            for j in range(0, i+1):
                pairs_to_drop.add((cols[i], cols[j]))
        return pairs_to_drop

    def get_top_abs_correlations(df, n=5):
        au_corr = df.corr().abs().unstack()
        labels_to_drop = get_redundant_pairs(df)
        au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
        return au_corr[0:n]

    
    return get_top_abs_correlations(df, top)

correlated_features = attribute_corr(X_train, 5)
correlated_features = correlated_features.reset_index()
print(correlated_features)

  level_0 level_1    0
0  bb_000  bv_000  1.0
1  ah_000  bg_000  1.0
2  bu_000  cq_000  1.0
3  aa_000  bt_000  1.0
4  bb_000  cq_000  1.0


In [5]:
equal_attrs = {}
all_attrs = set([])
sets = 0
for index, corr in correlated_features.iterrows():
    attr1 = corr['level_0']
    attr2 = corr['level_1']
    
    present = False
    if attr1 in equal_attrs:
        equal_attrs[attr1].append(attr1)
    for attr in equal_attrs:
        if attr1 in equal_attrs[attr] or attr2 in equal_attrs[attr]:
            equal_attrs[attr].add(attr1)
            equal_attrs[attr].add(attr2)
            present = True
            
    if not present:
        equal_attrs[sets] = set([attr1, attr2])
        sets += 1
excluded_columns = [item for item in all_attrs if item not in best_attrs]
print('Attribute sets {}'.format(equal_attrs))


Attribute sets {0: {'bb_000', 'bv_000', 'cq_000'}, 1: {'ah_000', 'bg_000'}, 2: {'bb_000', 'bu_000', 'cq_000'}, 3: {'aa_000', 'bt_000'}}


In [6]:
no_std_dev = []
for col in X_train:
    std_dev = standard_deviation(X_train[col])
    if std_dev == 0:
        no_std_dev.append(col)
        
print('Attributes with 0 standard deviation {}'.format(no_std_dev))

Attributes with 0 standard deviation ['cd_000']


In [14]:
X_train, X_test, y_train, y_test = getDataWithThresh()
X_train, X_test = X_train.fillna(X_train.median()), X_test.fillna(X_train.median())
sets = {0: {'cc_000', 'bx_000'}, 1: {'al_000', 'am_0'}, 2: {0, 'an_000', 'ao_000'}}

X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)

for group_i in sets:
    group = sets[group_i]
    group_list = list(group)
    group_list.pop(0)
    pca = PCA(n_components=1).fit(X_train[group_list])
    new = pca.transform(X_train[group_list])
    new_test = pca.transform(X_test[group_list])
    
    new = pd.DataFrame(data=new, columns=['{}'.format(group_i)])
    new_test = pd.DataFrame(data=new_test, columns=['{}'.format(group_i)])
    X_train = pd.concat([X_train, new], axis=1)
    X_test = pd.concat([X_test, new_test], axis=1)
    X_train, X_test = X_train.drop(columns=group_list), X_test.drop(columns=group_list)
    
X_train, X_test = X_train.drop(columns=['cd_000']), X_test.drop(columns=['cd_000'])
#X_train, X_test = X_train.drop(columns=excluded_columns), X_test.drop(columns=excluded_columns)

In [15]:
data['PCA'] =  (X_train, X_test, y_train)
plot_comparisons(clf, data, filename='nb_comparison')

Baseline
Column and Lines Drop
PCA


{}

In [9]:
data

{'Baseline': (       aa_000  ab_000        ac_000  ad_000  ae_000  af_000  ag_000  ag_001  \
  0       11558     0.0  0.000000e+00   126.0     0.0     0.0     0.0     0.0   
  1          10     0.0  6.000000e+00     6.0     0.0     0.0     0.0     0.0   
  2        1790     0.0  2.130706e+09    18.0     0.0     0.0     0.0     0.0   
  3       28436     0.0  2.092000e+03  1974.0     0.0     0.0     0.0     0.0   
  4       37936     0.0  2.400000e+02   220.0     0.0     0.0     0.0     0.0   
  5       48410     0.0  2.130706e+09  1152.0     0.0     0.0     0.0     0.0   
  6          30     0.0  4.000000e+01    30.0     0.0     0.0     0.0     0.0   
  7         778     0.0  2.130706e+09    18.0     0.0     0.0     0.0     0.0   
  8        8870     0.0  1.340000e+02   102.0     0.0     0.0     0.0     0.0   
  9      120206     0.0  4.900000e+02   386.0     0.0     0.0     0.0     0.0   
  10       1034     0.0  2.130706e+09     0.0     0.0     0.0     0.0     0.0   
  11      42080 

In [10]:
balancer = RandomUnderSampler(random_state=42)
X_train_bal, y_train_bal = balancer.fit_sample(X_train, y_train)
data['Undersample 50-50'] =  (X_train_bal, X_test, y_train_bal)
plot_comparisons(clf, data, filename='nb_comparison')

Baseline
Column and Lines Drop
PCA
Undersample 50-50


{}

In [11]:
plot_overfit_comparisons(clf, data, filename='nb_comparison_overfit')

Baseline
Column and Lines Drop
PCA
Undersample 50-50


{}