In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys
sys.path.append('..')
from utils_cd import (
        split_dataset,
        standard_deviation,
        plot_comparison_results,
        impute_values,
        plot_results,
        plot_param_improv,
        plot_results_from_csv,
        aps_classifier_statistics,
        aps_score,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer, StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA
from sklearn.base import clone

pd.options.display.max_columns = None
sns.set(style='darkgrid')

CLASS = 'class'

def getData():
    train = pd.read_csv('./train.csv')
    test = pd.read_csv('./test.csv')

    X_train, y_train = split_dataset(train, CLASS)
    X_test, y_test = split_dataset(test, CLASS)
    y_train = y_train.map({'pos': 1, 'neg': 0})
    y_test = y_test.map({'pos': 1, 'neg': 0})
    
    return X_train, X_test, y_train, y_test

def getDataWithThresh():
    train = pd.read_csv('./train.csv')
    test = pd.read_csv('./test.csv')

    train = train.dropna(thresh=150)
    train = train.dropna(axis=1, thresh=train.shape[0]*0.9)
    test = test[train.columns]

    X_train, y_train = split_dataset(train, CLASS)
    X_test, y_test = split_dataset(test, CLASS)
    y_train = y_train.map({'pos': 1, 'neg': 0})
    y_test = y_test.map({'pos': 1, 'neg': 0})
    
    return X_train, X_test, y_train, y_test

data = {}

nb = GaussianNB()
knn = KNeighborsClassifier(n_neighbors=40)
dt = DecisionTreeClassifier(min_samples_split=400, random_state=42)
rf = RandomForestClassifier(min_samples_split=400, n_estimators=500, random_state=42, max_features=70)

results = {}
res_i = 0

In [2]:
# Naive Bayes 
# Best Results -- Thresh drop median and PCA and Undersample 50 - 50
X_train, X_test, y_train, y_test = getDataWithThresh()
X_train, X_test = X_train.fillna(X_train.median()), X_test.fillna(X_train.median())
sets = {0: {'cc_000', 'bx_000'}, 1: {'al_000', 'am_0'}, 2: {0, 'an_000', 'ao_000'}}

X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)

for group_i in sets:
    group = sets[group_i]
    group_list = list(group)
    group_list.pop(0)
    pca = PCA(n_components=1).fit(X_train[group_list])
    new = pca.transform(X_train[group_list])
    new_test = pca.transform(X_test[group_list])
    
    new = pd.DataFrame(data=new, columns=['{}'.format(group_i)])
    new_test = pd.DataFrame(data=new_test, columns=['{}'.format(group_i)])
    X_train = pd.concat([X_train, new], axis=1)
    X_test = pd.concat([X_test, new_test], axis=1)
    X_train, X_test = X_train.drop(columns=group_list), X_test.drop(columns=group_list)
    
X_train, X_test = X_train.drop(columns=['cd_000']), X_test.drop(columns=['cd_000'])

res = aps_classifier_statistics(nb, X_train, X_test, y_train, y_test)
results[res_i] = {'Price': res['score'], 'Classifier': 'Naive Bayes'}
res_i += 1


In [3]:
# Knn
# Best results -- SMOTE 50-50 k = 40 mean

def balance(X_train, y_train):
    balancer = SMOTE(ratio=1.0, random_state=42)
    return balancer.fit_sample(X_train, y_train)

X_train, X_test, y_train, y_test = getData()
X_train, X_test = X_train.fillna(X_train.mean()), X_test.fillna(X_train.mean())
X_train, y_train = balance(X_train, y_train)

res = aps_classifier_statistics(knn, X_train, X_test, y_train, y_test)
results[res_i] = {'Price': res['score'], 'Classifier': 'KNN'}
res_i += 1



In [4]:
results

{0: {'Price': 13290, 'Classifier': 'Naive Bayes'},
 1: {'Price': 13370, 'Classifier': 'KNN'}}

In [5]:
# Decision Tree
# Best results -- thresh and median undersampling 70-30 min_samples_split  = 400


def getDataFinal():
    X_train, X_test, y_train, y_test = getData()
    X_train, X_test = X_train.fillna(X_train.median()), X_test.fillna(X_train.median())
    def balance(X_train, y_train):
        balancer = RandomUnderSampler(ratio=0.3, random_state=42)
        X_train_bal, y_train_bal = balancer.fit_sample(X_train, y_train)
        return X_train_bal, y_train_bal

    X_train, y_train = balance(X_train, y_train)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = getDataFinal()

res = aps_classifier_statistics(dt, X_train, X_test, y_train, y_test)
results[res_i] = {'Price': res['score'], 'Classifier': 'Decision Tree'}
res_i += 1



In [6]:
results

{0: {'Price': 13290, 'Classifier': 'Naive Bayes'},
 1: {'Price': 13370, 'Classifier': 'KNN'},
 2: {'Price': 10680, 'Classifier': 'Decision Tree'}}

In [17]:
# Random Forest 
# Best results -- thresh and median undersampling 70-30 min_samples_split = 800


def getDataFinal():
    X_train, X_test, y_train, y_test = getData()
    X_train, X_test = X_train.fillna(X_train.median()), X_test.fillna(X_train.median())
    def balance(X_train, y_train):
        balancer = RandomUnderSampler(ratio=0.3, random_state=42)
        X_train_bal, y_train_bal = balancer.fit_sample(X_train, y_train)
        return X_train_bal, y_train_bal

    X_train, y_train = balance(X_train, y_train)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = getDataFinal()

res = aps_classifier_statistics(rf, X_train, X_test, y_train, y_test)
results[res_i] = {'Price': res['score'], 'Classifier': 'Random Forest'}
res_i += 1



In [18]:
results

{0: {'Price': 13290, 'Classifier': 'Naive Bayes'},
 1: {'Price': 13370, 'Classifier': 'KNN'},
 2: {'Price': 10680, 'Classifier': 'Decision Tree'},
 3: {'Price': 10720, 'Classifier': 'Random Forest'},
 4: {'Price': 10830, 'Classifier': 'Random Forest'},
 5: {'Price': 13270, 'Classifier': 'Random Forest'},
 6: {'Price': 10720, 'Classifier': 'Random Forest'}}

In [9]:
measures = pd.DataFrame.from_dict(results, "index")

In [10]:
measures.to_csv('plot_data/best_results.csv')