# Project_ADULT, Zachariah Gutierrez

## Dataset information

Name: Adult <br>
Data Type: Multivariate <br>
Default Task: Classification <br>
Attribute Types: categorical, integer <br>
No. Instances: 48842 <br>
No. Attributes: 14 <br>
Year: 1996 <br>
Task: Determine whether a person makes over 50K a year based on census data using SVM, KNN, and Random Forests

## Imports and functions

In [1]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=DeprecationWarning)
    from sklearn.ensemble import RandomForestClassifier

In [3]:
# heatmap for LinearSVC
def draw_heatmap_linear(acc, acc_desc, C_list):
    plt.figure(figsize = (2,4))
    ax = sns.heatmap(acc, annot=True, fmt='.3f', yticklabels=C_list, xticklabels=[])
    ax.collections[0].colorbar.set_label("accuracy")
    ax.set(ylabel='$C$')
    plt.title(acc_desc + ' w.r.t $C$')
    sns.set_style("whitegrid", {'axes.grid' : False})
    plt.show()

# heatmap for K-NearestNeighbors
def draw_heatmap_knn(acc, acc_desc, k_list):
    plt.figure(figsize = (2,4))
    ax = sns.heatmap(acc, annot=True, fmt='.3f', yticklabels=k_list, xticklabels=[])
    ax.collections[0].colorbar.set_label("accuracy")
    ax.set(ylabel='$k$')
    plt.title(acc_desc + ' w.r.t $k$')
    sns.set_style("whitegrid", {'axes.grid' : False})
    plt.show()
    
# heatmap for RandomForest
def draw_heatmap_rf(acc, acc_desc, p_list):
    plt.figure(figsize = (2,4))
    ax = sns.heatmap(acc, annot=True, fmt='.3f', yticklabels=p_list, xticklabels=[])
    ax.collections[0].colorbar.set_label("accuracy")
    ax.set(ylabel='$params$')
    plt.title(acc_desc + ' w.r.t $params$')
    sns.set_style("whitegrid", {'axes.grid' : False})
    plt.show()

## Load and transform the data

In [4]:
# import the data and set the column names
X_and_y = pd.read_csv('adult.data.txt', header=None)
X_and_y.columns = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship',
              'race','sex','capital-gain','capital-loss','hours-per-week','native-country','income']

print('Shape of original data is:', X_and_y.shape)

# shuffle the data then split X and y
X_and_y = X_and_y.sample(frac=1).reset_index(drop=True)
X = X_and_y.iloc[:,0:14]
y = X_and_y.iloc[:,-1]

Shape of original data is: (32561, 15)


In [5]:
# inspect the data
X_and_y.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,47,Self-emp-inc,127678,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,80,United-States,<=50K
1,24,Private,124963,HS-grad,9,Never-married,Machine-op-inspct,Own-child,White,Male,0,0,30,United-States,<=50K
2,67,State-gov,261203,7th-8th,4,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,35,United-States,<=50K
3,50,State-gov,24790,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,40,United-States,<=50K
4,26,Local-gov,206721,Bachelors,13,Never-married,Protective-serv,Own-child,White,Male,0,0,40,United-States,<=50K


In [6]:
# one-hot encode the data (dummy variables)
X = pd.get_dummies(X)
X.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,47,127678,10,0,0,80,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,24,124963,9,0,0,30,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,67,261203,4,0,0,35,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,50,24790,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,26,206721,13,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
y.head()

0     <=50K
1     <=50K
2     <=50K
3     <=50K
4     <=50K
Name: income, dtype: object

In [8]:
# Binarize y (maybe not neccessary)
y = y.to_frame().transpose()        

for i in y:
    if y[i][0] == ' <=50K':
        y[i][0] = 1
    elif y[i][0] == ' >50K':
        y[i][0] = 0
        
y = y.transpose()   # wouldn't work without transposing...
y.head()                

Unnamed: 0,income
0,1
1,1
2,1
3,1
4,1


In [9]:
# converts dataframes to np arrays
X = X.values    
y = y.values

In [10]:
# reshape y and change from object type
y = y.reshape(32561,)   # 32561 is original row count 
y = y.astype('int')     # y was an object, convert to int to avoid error

## Train the models (SVM, KNN, & RandomForest)

In [11]:
final_score_svm = []
final_score_knn = []
final_score_rf = []

### Adult 20/80

In [12]:
%%time

## Train every model on three random splits of 20/80

## Note: The split is non-deterministic. Repeating three times with the
## same split will give you somewhat different subsets. This is why three
## trials are needed to reach a consistent conclusion by taking the average.

print('Training three classifiers using three trials on Adult.data, split=20/80...')
print('(Accuracy scores are listed with respect to their individual hyperparameters.)')

# initialize the accuracy lists
avg_test_acc_svm = np.zeros((1,11))  # 11 for each value of C penalties
avg_test_acc_knn = np.zeros((1,26))  # 26 for each value of K neighbors
avg_test_acc_rf = np.zeros((1,12))   # 12 for each parameter combo in RF

for i in range(3):   # run three trials
    
    print('\nFor trial #', i+1)
    
    # split data into training/testing (20/80)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80)
    
    # train SVM classifier
    C_list = [10e-7, 10e-6, 10e-5, 10e-4, 10e-3, 10e-2, 10e-1, 10e0, 10e1, 10e2, 10e3]
    param_grid = {'C': C_list}
    clf = LinearSVC()
    SVM = GridSearchCV(clf, param_grid, cv=5, return_train_score = True)
    SVM.fit(X_train, y_train)
    train_acc_svm = SVM.cv_results_['mean_train_score']
    test_acc_svm = SVM.cv_results_['mean_test_score']
    train_acc_svm = np.asarray(train_acc_svm)
    test_acc_svm = np.asarray(test_acc_svm)
    print('The SVM training accuracy is',train_acc_svm)
    print('The SVM testing accuracy is',test_acc_svm)
    avg_test_acc_svm = np.vstack((avg_test_acc_svm, test_acc_svm))
    
    # train KNN classifier
    K_list = list(range(1,27))         
    param_grid = {'n_neighbors': K_list}
    clf = KNeighborsClassifier()
    KNN = GridSearchCV(clf, param_grid, cv=5, return_train_score = True)
    KNN.fit(X_train, y_train)
    train_acc_knn = KNN.cv_results_['mean_train_score']
    test_acc_knn = KNN.cv_results_['mean_test_score']
    train_acc_knn = np.asarray(train_acc_knn)
    test_acc_knn = np.asarray(test_acc_knn)
    print('\nThe KNN training accuracy is',train_acc_knn)
    print('The KNN testing accuracy is',test_acc_knn)
    avg_test_acc_knn = np.vstack((avg_test_acc_knn, test_acc_knn))
    
    # train RandomForest classifier
    param_grid = {'n_estimators': [100], 'max_features': ['auto', 'sqrt', 'log2'],
                  'max_depth': [4,5], 'criterion':['gini', 'entropy']}
    clf = RandomForestClassifier()
    RF = GridSearchCV(clf, param_grid, cv=5, return_train_score = True)
    RF.fit(X_train, y_train)
    RFdf = pd.DataFrame.from_dict(data=RF.cv_results_)
    train_acc_rf = RF.cv_results_['mean_train_score']
    test_acc_rf = RF.cv_results_['mean_test_score']
    train_acc_rf = np.asarray(train_acc_rf)
    test_acc_rf = np.asarray(test_acc_rf)
    print('\nThe RF training accuracy is',train_acc_rf)
    print('The RF testing accuracy is',test_acc_rf)
    avg_test_acc_rf = np.vstack((avg_test_acc_rf, test_acc_rf))
    
# Begin displaying the testing accuracies.
print('\nReporting the average testing accuracies for all models...')

# Go through and delete first row of zeros (or else the avg suffers)
avg_test_acc_svm = np.delete(avg_test_acc_svm, 0, axis=0)
avg_test_acc_knn = np.delete(avg_test_acc_knn, 0, axis=0)
avg_test_acc_rf = np.delete(avg_test_acc_rf, 0, axis=0)

# SVM
avg_test_acc_svm = np.mean(avg_test_acc_svm, axis=0)
print('\nFor C =', C_list,'...')
print('The average test accuracy for SVM is', avg_test_acc_svm)

# KNN
avg_test_acc_knn = np.mean(avg_test_acc_knn, axis=0)
print('\nFor K =', K_list,'...')
print('The average test accuracy for KNN is', avg_test_acc_knn)

# RF
params = RFdf['params']
avg_test_acc_rf = np.mean(avg_test_acc_rf, axis=0)
print('\nFor the different parameters of Random Forest...')
print(params)
print('The average test accuracy is', avg_test_acc_rf)
print()

Training three classifiers using three trials on Adult.data, split=20/80...
(Accuracy scores are listed with respect to their individual hyperparameters.)

For trial # 1
The SVM training accuracy is [0.79365036 0.68218941 0.78455247 0.68706561 0.68330287 0.68386089
 0.47789083 0.67341863 0.58217339 0.78493494 0.78589516]
The SVM testing accuracy is [0.79345823 0.67890049 0.78501229 0.68458231 0.685043   0.68289312
 0.47757985 0.67260442 0.58415233 0.78608722 0.78624079]


KeyboardInterrupt: 

In [13]:
## Plot the heatmaps

print('The avg test scores for 20/80:')
print('SVM:', np.mean(avg_test_acc_svm))
print('KNN:', np.mean(avg_test_acc_knn))
print('RF:', np.mean(avg_test_acc_rf))
avg2080 = np.mean([np.mean(avg_test_acc_svm), np.mean(avg_test_acc_knn),
                   np.mean(avg_test_acc_rf)])

draw_heatmap_linear(avg_test_acc_svm.reshape(11,1), 'test_acc', C_list)
draw_heatmap_knn(avg_test_acc_knn.reshape(26,1), 'test_acc', K_list)
draw_heatmap_rf(avg_test_acc_rf.reshape(12,1), 'test_acc', params)

The avg test scores for 20/80:
SVM: 0.3462070024570024
KNN: 0.0
RF: 0.0


ValueError: cannot reshape array of size 22 into shape (11,1)

In [None]:
## Find the best hyperparameters

# SVM - best C
C_prime = SVM.best_params_['C']
print('Best C for SVM on Adult is:', C_prime)

# KNN - best K
K_prime = KNN.best_params_['n_neighbors']
print('Best K for KNN on Adult is:', K_prime)

# RF - best parameters
R_prime = RF.best_params_
print('Best RandomForest parameters for Adult are', R_prime)

In [None]:
## Train the optimal classifiers using the best hyperparameters
## Display final test scores

# SVM
SVM_prime = LinearSVC(C=C_prime)
SVM_prime.fit(X_train, y_train)
test_score_svm = SVM_prime.score(X_test, y_test)
final_score_svm.append(test_score_svm)

# KNN
KNN_prime = KNeighborsClassifier(n_neighbors=K_prime)
KNN_prime.fit(X_train, y_train)
test_score_knn = KNN_prime.score(X_test, y_test)
final_score_knn.append(test_score_knn)

# RF
RF_prime = RandomForestClassifier(criterion=R_prime['criterion'], max_depth=R_prime['max_depth'], 
                                  max_features=R_prime['max_features'], n_estimators=R_prime['n_estimators'])
RF_prime.fit(X_train, y_train)
test_score_rf = RF_prime.score(X_test, y_test)
final_score_rf.append(test_score_rf)

print('For Adult.data, 20/80 split...')
print('Test score for SVM is:', test_score_svm)
print('Test score for KNN is:', test_score_knn)
print('Test score for RF is:', test_score_rf)

### Adults 50/50

In [None]:
%%time

## Train every model on three random splits of 50/50

## Note: The split is non-deterministic. Repeating three times with the
## same split will give you somewhat different subsets. This is why three
## trials are needed to reach a consistent conclusion by taking the average.

print('Training three classifiers using three trials on Adult.data, split=50/50...')
print('(Accuracy scores are listed with respect to their individual hyperparameters.)')

# initialize the accuracy lists
avg_test_acc_svm = np.zeros((1,11))  # 11 for each value of C penalties
avg_test_acc_knn = np.zeros((1,26))  # 26 for each value of K neighbors
avg_test_acc_rf = np.zeros((1,12))   # 12 for each parameter combo in RF

for i in range(3):   # run three trials
    
    print('\nFor trial #', i+1)
    
    # split data into training/testing (50/50)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)
    
    # train SVM classifier
    C_list = [10e-7, 10e-6, 10e-5, 10e-4, 10e-3, 10e-2, 10e-1, 10e0, 10e1, 10e2, 10e3]
    param_grid = {'C': C_list}
    clf = LinearSVC()
    SVM = GridSearchCV(clf, param_grid, cv=5, return_train_score = True)
    SVM.fit(X_train, y_train)
    train_acc_svm = SVM.cv_results_['mean_train_score']
    test_acc_svm = SVM.cv_results_['mean_test_score']
    train_acc_svm = np.asarray(train_acc_svm)
    test_acc_svm = np.asarray(test_acc_svm)
    print('The SVM training accuracy is',train_acc_svm)
    print('The SVM testing accuracy is',test_acc_svm)
    avg_test_acc_svm = np.vstack((avg_test_acc_svm, test_acc_svm))
    
    # train KNN classifier
    K_list = list(range(1,27))
    param_grid = {'n_neighbors': K_list}
    clf = KNeighborsClassifier()
    KNN = GridSearchCV(clf, param_grid, cv=5, return_train_score = True)
    KNN.fit(X_train, y_train)
    train_acc_knn = KNN.cv_results_['mean_train_score']
    test_acc_knn = KNN.cv_results_['mean_test_score']
    train_acc_knn = np.asarray(train_acc_knn)
    test_acc_knn = np.asarray(test_acc_knn)
    print('\nThe KNN training accuracy is',train_acc_knn)
    print('The KNN testing accuracy is',test_acc_knn)
    avg_test_acc_knn = np.vstack((avg_test_acc_knn, test_acc_knn))
    
    # train RandomForest classifier
    param_grid = {'n_estimators': [100], 'max_features': ['auto', 'sqrt', 'log2'],
                  'max_depth': [4,5], 'criterion':['gini', 'entropy']}
    clf = RandomForestClassifier()
    RF = GridSearchCV(clf, param_grid, cv=5, return_train_score = True)
    RF.fit(X_train, y_train)
    RFdf = pd.DataFrame.from_dict(data=RF.cv_results_)
    train_acc_rf = RF.cv_results_['mean_train_score']
    test_acc_rf = RF.cv_results_['mean_test_score']
    train_acc_rf = np.asarray(train_acc_rf)
    test_acc_rf = np.asarray(test_acc_rf)
    print('\nThe RF training accuracy is',train_acc_rf)
    print('The RF testing accuracy is',test_acc_rf)
    avg_test_acc_rf = np.vstack((avg_test_acc_rf, test_acc_rf))
    
# Begin displaying the testing accuracies.
print('\nReporting the average testing accuracies for all models...')

# Go through and delete first row of zeros (or else the avg suffers)
avg_test_acc_svm = np.delete(avg_test_acc_svm, 0, axis=0)
avg_test_acc_knn = np.delete(avg_test_acc_knn, 0, axis=0)
avg_test_acc_rf = np.delete(avg_test_acc_rf, 0, axis=0)

# SVM
avg_test_acc_svm = np.mean(avg_test_acc_svm, axis=0)
print('\nFor C =', C_list,'...')
print('The average test accuracy for SVM is', avg_test_acc_svm)

# KNN
avg_test_acc_knn = np.mean(avg_test_acc_knn, axis=0)
print('\nFor K =', K_list,'...')
print('The average test accuracy for KNN is', avg_test_acc_knn)

# RF
params = RFdf['params']
avg_test_acc_rf = np.mean(avg_test_acc_rf, axis=0)
print('\nFor the different parameters of Random Forest...')
print(params)
print('The average test accuracy is', avg_test_acc_rf)
print()

In [None]:
## Plot the heatmaps

print('The avg test scores for 50/50:')
print('SVM:', np.mean(avg_test_acc_svm))
print('KNN:', np.mean(avg_test_acc_knn))
print('RF:', np.mean(avg_test_acc_rf))
avg5050 = np.mean([np.mean(avg_test_acc_svm), np.mean(avg_test_acc_knn),
                   np.mean(avg_test_acc_rf)])

draw_heatmap_linear(avg_test_acc_svm.reshape(11,1), 'test_acc', C_list)
draw_heatmap_knn(avg_test_acc_knn.reshape(26,1), 'test_acc', K_list)
draw_heatmap_rf(avg_test_acc_rf.reshape(12,1), 'test_acc', params)

In [None]:
## Find the best hyperparameters

# SVM - best C
C_prime = SVM.best_params_['C']
print('Best C for SVM on Adult is:', C_prime)

# KNN - best K
K_prime = KNN.best_params_['n_neighbors']
print('Best K for KNN on Adult is:', K_prime)

# RF - best parameters
R_prime = RF.best_params_
print('Best RandomForest parameters for Adult are', R_prime)

In [None]:
## Train the optimal classifiers using the best hyperparameters
## Display final test scores

# SVM
SVM_prime = LinearSVC(C=C_prime)
SVM_prime.fit(X_train, y_train)
test_score_svm = SVM_prime.score(X_test, y_test)
final_score_svm.append(test_score_svm)

# KNN
KNN_prime = KNeighborsClassifier(n_neighbors=K_prime)
KNN_prime.fit(X_train, y_train)
test_score_knn = KNN_prime.score(X_test, y_test)
final_score_knn.append(test_score_knn)

# RF
RF_prime = RandomForestClassifier(criterion=R_prime['criterion'], max_depth=R_prime['max_depth'], 
                                  max_features=R_prime['max_features'], n_estimators=R_prime['n_estimators'])
RF_prime.fit(X_train, y_train)
test_score_rf = RF_prime.score(X_test, y_test)
final_score_rf.append(test_score_rf)

print('For Adult.data, 50/50 split...')
print('Test score for SVM is:', test_score_svm)
print('Test score for KNN is:', test_score_knn)
print('Test score for RF is:', test_score_rf)

### Adults 80/20

In [None]:
%%time

## Train every model on three random splits of 80/20

## Note: The split is non-deterministic. Repeating three times with the
## same split will give you somewhat different subsets. This is why three
## trials are needed to reach a consistent conclusion by taking the average.

print('Training three classifiers using three trials on Adult.data, split=80/20...')
print('(Accuracy scores are listed with respect to their individual hyperparameters.)')

# initialize the accuracy lists
avg_test_acc_svm = np.zeros((1,11))  # 11 for each value of C penalties
avg_test_acc_knn = np.zeros((1,26))  # 26 for each value of K neighbors
avg_test_acc_rf = np.zeros((1,12))   # 12 for each parameter combo in RF

for i in range(3):   # run three trials
    
    print('\nFor trial #', i+1)
    
    # split data into training/testing (80/20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    
    # train SVM classifier
    C_list = [10e-7, 10e-6, 10e-5, 10e-4, 10e-3, 10e-2, 10e-1, 10e0, 10e1, 10e2, 10e3]
    param_grid = {'C': C_list}
    clf = LinearSVC()
    SVM = GridSearchCV(clf, param_grid, cv=5, return_train_score = True)
    SVM.fit(X_train, y_train)
    train_acc_svm = SVM.cv_results_['mean_train_score']
    test_acc_svm = SVM.cv_results_['mean_test_score']
    train_acc_svm = np.asarray(train_acc_svm)
    test_acc_svm = np.asarray(test_acc_svm)
    print('The SVM training accuracy is',train_acc_svm)
    print('The SVM testing accuracy is',test_acc_svm)
    avg_test_acc_svm = np.vstack((avg_test_acc_svm, test_acc_svm))
    
    # train KNN classifier
    K_list = list(range(1,27)) 
    param_grid = {'n_neighbors': K_list}
    clf = KNeighborsClassifier()
    KNN = GridSearchCV(clf, param_grid, cv=5, return_train_score = True)
    KNN.fit(X_train, y_train)
    train_acc_knn = KNN.cv_results_['mean_train_score']
    test_acc_knn = KNN.cv_results_['mean_test_score']
    train_acc_knn = np.asarray(train_acc_knn)
    test_acc_knn = np.asarray(test_acc_knn)
    print('\nThe KNN training accuracy is',train_acc_knn)
    print('The KNN testing accuracy is',test_acc_knn)
    avg_test_acc_knn = np.vstack((avg_test_acc_knn, test_acc_knn))
    
    # train RandomForest classifier
    param_grid = {'n_estimators': [100], 'max_features': ['auto', 'sqrt', 'log2'],
                  'max_depth': [4,5], 'criterion':['gini', 'entropy']}
    clf = RandomForestClassifier()
    RF = GridSearchCV(clf, param_grid, cv=5, return_train_score = True)
    RF.fit(X_train, y_train)
    RFdf = pd.DataFrame.from_dict(data=RF.cv_results_)
    train_acc_rf = RF.cv_results_['mean_train_score']
    test_acc_rf = RF.cv_results_['mean_test_score']
    train_acc_rf = np.asarray(train_acc_rf)
    test_acc_rf = np.asarray(test_acc_rf)
    print('\nThe RF training accuracy is',train_acc_rf)
    print('The RF testing accuracy is',test_acc_rf)
    avg_test_acc_rf = np.vstack((avg_test_acc_rf, test_acc_rf))
    
# Begin displaying the testing accuracies.
print('\nReporting the average testing accuracies for all models...')

# Go through and delete first row of zeros (or else the avg suffers)
avg_test_acc_svm = np.delete(avg_test_acc_svm, 0, axis=0)
avg_test_acc_knn = np.delete(avg_test_acc_knn, 0, axis=0)
avg_test_acc_rf = np.delete(avg_test_acc_rf, 0, axis=0)

# SVM
avg_test_acc_svm = np.mean(avg_test_acc_svm, axis=0)
print('\nFor C =', C_list,'...')
print('The average test accuracy for SVM is', avg_test_acc_svm)

# KNN
avg_test_acc_knn = np.mean(avg_test_acc_knn, axis=0)
print('\nFor K =', K_list,'...')
print('The average test accuracy for KNN is', avg_test_acc_knn)

# RF
params = RFdf['params']
avg_test_acc_rf = np.mean(avg_test_acc_rf, axis=0)
print('\nFor the different parameters of Random Forest...')
print(params)
print('The average test accuracy is', avg_test_acc_rf)
print()

In [None]:
## Plot the heatmaps

print('The avg test scores for 80/20:')
print('SVM:', np.mean(avg_test_acc_svm))
print('KNN:', np.mean(avg_test_acc_knn))
print('RF:', np.mean(avg_test_acc_rf))
avg8020 = np.mean([np.mean(avg_test_acc_svm), np.mean(avg_test_acc_knn),
                   np.mean(avg_test_acc_rf)])

draw_heatmap_linear(avg_test_acc_svm.reshape(11,1), 'test_acc', C_list)
draw_heatmap_knn(avg_test_acc_knn.reshape(26,1), 'test_acc', K_list)
draw_heatmap_rf(avg_test_acc_rf.reshape(12,1), 'test_acc', params)

In [None]:
## Find the best hyperparameters

# SVM - best C
C_prime = SVM.best_params_['C']
print('Best C for SVM on Adult is:', C_prime)

# KNN - best K
K_prime = KNN.best_params_['n_neighbors']
print('Best K for KNN on Adult is:', K_prime)

# RF - best parameters
R_prime = RF.best_params_
print('Best RandomForest parameters for Adult are', R_prime)

In [None]:
## Train the optimal classifiers using the best hyperparameters
## Display final test scores

# SVM
SVM_prime = LinearSVC(C=C_prime)
SVM_prime.fit(X_train, y_train)
test_score_svm = SVM_prime.score(X_test, y_test)
final_score_svm.append(test_score_svm)

# KNN
KNN_prime = KNeighborsClassifier(n_neighbors=K_prime)
KNN_prime.fit(X_train, y_train)
test_score_knn = KNN_prime.score(X_test, y_test)
final_score_knn.append(test_score_knn)

# RF
RF_prime = RandomForestClassifier(criterion=R_prime['criterion'], max_depth=R_prime['max_depth'], 
                                  max_features=R_prime['max_features'], n_estimators=R_prime['n_estimators'])
RF_prime.fit(X_train, y_train)
test_score_rf = RF_prime.score(X_test, y_test)
final_score_rf.append(test_score_rf)

print('For Adult.data, 80/20 split...')
print('Test score for SVM is:', test_score_svm)
print('Test score for KNN is:', test_score_knn)
print('Test score for RF is:', test_score_rf)

## Display the final results

In [None]:
## Graph the individual split scores

print(avg2080, avg5050, avg8020)

objects = ('20/80', '50/50', '80/20')
y_pos = np.arange(len(objects))
performance = [avg2080, avg5050, avg8020]

plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
plt.ylabel('Accuracy score')
plt.title('Split Results for Adult.data')
plt.show()

In [None]:
## Calculate average test scores for all splits

print('For Adult.data, the final test scores are:')
print('SVM:', np.mean(final_score_svm))
print('KNN:', np.mean(final_score_knn))
print('RF:', np.mean(final_score_rf))

In [None]:
## Graph the final test scores

objects = ('SVM', 'KNN', 'RandomForests')
y_pos = np.arange(len(objects))
performance = [np.mean(final_score_svm), np.mean(final_score_knn), np.mean(final_score_rf)]

plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
plt.ylabel('Accuracy score')
plt.title('Final Results for Adult.data')
plt.show()

# End