In [2]:
# Imports for data-preprocessing
import pandas as pd
import numpy as np
from __future__ import print_function

# Import for spliting the data set
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

# Import data
df = pd.read_csv('../data/master.csv')

In [3]:
# Delete unwanted variables
del df['workclass']
del df['education']
del df['education_num']
del df['marital_status']
del df['occupation']
del df['native_country'] 
del df['Unnamed: 0']

In [4]:
# Split the data into traning (80%) and test set (20%)
# We are using stratified cross validation here because the majority of the
#    individuals in the variable race are white

if 'income_binary' in df:
    y = df['income_binary'].values #get values we need 
    del df['income_binary']        #get rid of the class label
    X = df.values                  #use everything else to predict 
    
X = pd.get_dummies(df).values

scl = StandardScaler()
X = scl.fit_transform(X)

# Split the data into 20% Test and 80% Train
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.20, random_state=111)
sss.get_n_splits(X, y) #retreving the splits 



10

In [5]:
# Create a for loop that grabs the values for each fold for traing and test sets
for train_index, test_index in sss.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

TRAIN: [15163 16279 29246 ..., 21159 41049 17832] TEST: [43578  1917 17027 ..., 22163 39121  5217]
TRAIN: [12894 40784 33210 ..., 40386 25846 36294] TEST: [22078 13800    49 ..., 31283 31586  5940]
TRAIN: [16738 39693 30388 ...,   328 33912 39362] TEST: [24310 48705 25069 ..., 47258 14625 39292]
TRAIN: [ 7391 39777 43398 ...,  8978 24399 34458] TEST: [ 8836  1328 27156 ..., 47164 10476 15648]
TRAIN: [16863 33361 41054 ..., 26744 47828 11941] TEST: [18495 35842 20752 ..., 46535  4696 46808]
TRAIN: [ 5743 21257 30549 ...,  5927  7506 19162] TEST: [21737 30911  7484 ..., 19717 27662 19780]
TRAIN: [18797 40559 21393 ..., 47376 19268 42562] TEST: [13861 34766  4320 ..., 42723 30153 11994]
TRAIN: [36977 11147 24500 ..., 40130 15262 22626] TEST: [13974 35810 43678 ...,  7735 14376 40480]
TRAIN: [32752 37107 19197 ..., 26015 32870  7076] TEST: [36370 13966 29812 ..., 20543 23045 34324]
TRAIN: [22441 41471 48039 ..., 32784  4446 45905] TEST: [  934  1627 37667 ...,  4950 44845 21169]


In [6]:
def test_classifier(classifier):
    iteration = 1
    scores = []
    for train_indices, test_indices in sss.split(X,y): 
        X_train = X[train_indices]  # train indices for X
        y_train = y[train_indices]  # train indices for y

        X_test = X[test_indices]    # test indices for X
        y_test = y[test_indices]    # test indices for y

        # train the reusable KNN classifier on the training data
        classifier.fit(X_train,y_train)  # train object
        y_hat = classifier.predict(X_test) # get test set precitions

        scores.append(mt.accuracy_score(y_hat,y_test))

    return np.mean(scores)

In [7]:
from sklearn import metrics as mt
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

func_dict = {"Nearest Neighbors" : KNeighborsClassifier(),
         #"Gaussian Process" : GaussianProcessClassifier(),
         "Decision Tree" : DecisionTreeClassifier(),
         "Random Forest" : RandomForestClassifier(),
         "Neural Net" : MLPClassifier(),
         "AdaBoost" : AdaBoostClassifier(),
         "Naive Bayes" : GaussianNB(),
         "QDA" : QuadraticDiscriminantAnalysis()}



In [8]:
accuracy_dict = {}
for classifier in func_dict:
    acc = test_classifier(func_dict[classifier])
    accuracy_dict[classifier] = acc
    print(accuracy_dict)

{'Nearest Neighbors': 0.82561162862114856}
{'Nearest Neighbors': 0.82561162862114856, 'Decision Tree': 0.82593919541406502}
{'Nearest Neighbors': 0.82561162862114856, 'Decision Tree': 0.82593919541406502, 'Random Forest': 0.83788514689323379}
{'Nearest Neighbors': 0.82561162862114856, 'Decision Tree': 0.82593919541406502, 'Random Forest': 0.83788514689323379, 'Neural Net': 0.84820350087009933}
{'Nearest Neighbors': 0.82561162862114856, 'Decision Tree': 0.82593919541406502, 'Random Forest': 0.83788514689323379, 'Neural Net': 0.84820350087009933, 'AdaBoost': 0.85558399017299624}
{'Nearest Neighbors': 0.82561162862114856, 'Decision Tree': 0.82593919541406502, 'Random Forest': 0.83788514689323379, 'Neural Net': 0.84820350087009933, 'AdaBoost': 0.85558399017299624, 'Naive Bayes': 0.76587163476302589}




{'Nearest Neighbors': 0.82561162862114856, 'Decision Tree': 0.82593919541406502, 'Random Forest': 0.83788514689323379, 'Neural Net': 0.84820350087009933, 'AdaBoost': 0.85558399017299624, 'Naive Bayes': 0.76587163476302589, 'QDA': 0.71779097144027015}
