In [1]:
import numpy as np
import pandas as pd
import scipy.io as sio
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler

In [2]:
data = pd.read_csv('adult.data', sep=",", header=None)
data.columns = ["age","workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country","income"]
data = data.replace(' ','', regex=True)
data_continuous = data.iloc[:,[0,2,4,10,11,12]]
data_categorical = data.iloc[:,[1,3,5,6,7,8,9,13,14]]
data_dummies = pd.get_dummies(data_categorical)
data_dummies = data_dummies.drop(['sex_Female', 'income_<=50K'], axis=1)

cols = ['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']
d_s=StandardScaler()
d_n=d_s.fit_transform(data_continuous)

In [3]:
# check if it came out correctly
print(data_dummies)

       workclass_?  workclass_Federal-gov  workclass_Local-gov  \
0                0                      0                    0   
1                0                      0                    0   
2                0                      0                    0   
3                0                      0                    0   
4                0                      0                    0   
5                0                      0                    0   
6                0                      0                    0   
7                0                      0                    0   
8                0                      0                    0   
9                0                      0                    0   
10               0                      0                    0   
11               0                      0                    0   
12               0                      0                    0   
13               0                      0                    0   
14        

In [4]:
# Preprocessing the data more

X_and_Y = np.hstack([d_n, data_dummies.values])
print(X_and_Y)
np.random.shuffle(X_and_Y)

# Let's use income >50k = 1 and income <= 50k = 0
# also sex_Male = 1 and sex_Female = 0
# So we omit second to last column
# cut out 64 and 65
X = X_and_Y[:, 0:-1]
Y = X_and_Y[:, -1]
print(X.shape, Y.shape)

[[ 0.03067056 -1.06361075  1.13473876 ...,  0.          0.          0.        ]
 [ 0.83710898 -1.008707    1.13473876 ...,  0.          0.          0.        ]
 [-0.04264203  0.2450785  -0.42005962 ...,  0.          0.          0.        ]
 ..., 
 [ 1.42360965 -0.35877741 -0.42005962 ...,  0.          0.          0.        ]
 [-1.21564337  0.11095988 -0.42005962 ...,  0.          0.          0.        ]
 [ 0.98373415  0.92989258 -0.42005962 ...,  0.          0.          1.        ]]
(32561, 107) (32561,)


In [5]:
X_train_val = X[0:5000,:]
X_test      = X[5000:65122,:]
Y_train_val = Y[0:5000]
Y_test      = Y[5000:65122]
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

(5000, 107) (27561, 107) (5000,) (27561,)


In [6]:
# SVM process

# linear
# polynomial degree 2 & 3 (this one is using 3)
# radial with width [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
# regularization parameter C = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3]

classifier = svm.SVC(kernel='linear')
C_list = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e-0, 1e1, 1e2, 1e3]   
params = {"C": C_list}

GSR = GridSearchCV(classifier, params, return_train_score=True, cv=5, n_jobs=4)
GSR.fit(X_train_val,Y_train_val)
GSR.cv_results_
                               
# May also want to try rbf kernel with sigma (radial width values)

{'mean_fit_time': array([   2.0580883 ,    2.91160626,    2.77601175,    2.58106384,
           2.71593165,    2.29113712,    2.33238316,    2.22649531,
           4.22599745,   22.1912158 ,  169.62264261]),
 'mean_score_time': array([ 0.28880515,  0.38318777,  0.38718395,  0.36161385,  0.45792336,
         0.32182565,  0.19583931,  0.20544505,  0.18773623,  0.19398918,
         0.16972737]),
 'mean_test_score': array([ 0.7656,  0.7656,  0.7656,  0.7656,  0.7764,  0.8424,  0.851 ,
         0.8496,  0.8482,  0.8486,  0.8486]),
 'mean_train_score': array([ 0.76560001,  0.76560001,  0.76560001,  0.76560001,  0.77700006,
         0.84340004,  0.85545056,  0.85960036,  0.85960029,  0.85985037,
         0.85945034]),
 'param_C': masked_array(data = [1e-07 1e-06 1e-05 0.0001 0.001 0.01 0.1 1.0 10.0 100.0 1000.0],
              mask = [False False False False False False False False False False False],
        fill_value = ?),
 'params': ({'C': 1e-07},
  {'C': 1e-06},
  {'C': 1e-05},
  {'C': 0

In [7]:
# find best C and best accuracy
test_acc = sum(GSR.best_estimator_.predict(X_test) == Y_test) / len(X_test)
print(GSR.best_params_)
print(test_acc)

{'C': 0.1}
0.850585972933


In [10]:
def convert_to_gamma(radial_width):
    converted = [0]*len(radial_width)
    for i in range(len(radial_width)):
        gamma = 1/(2*(radial_width[i]**2))
        converted[i] = gamma
    return converted

In [11]:
radial_width = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
gamma_list = convert_to_gamma(radial_width)
print(gamma_list)

[500000.0, 20000.0, 5000.0, 199.99999999999997, 49.99999999999999, 2.0, 0.5, 0.125]


In [12]:
# SVM process

# rbf
# polynomial degree 2 & 3 (this one is using 3)
# radial with width [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
# regularization parameter C = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3]

classifier = svm.SVC(kernel='rbf')
C_list = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e-0, 1e1, 1e2, 1e3]
params = {"C": C_list, "gamma": gamma_list}

GSR = GridSearchCV(classifier, params, return_train_score=True, cv=5, n_jobs=4)
GSR.fit(X_train_val,Y_train_val)
GSR.cv_results_
                               
# May also want to try rbf kernel with sigma (radial width values)

{'mean_fit_time': array([ 2.13540063,  2.80606585,  2.74499793,  2.355301  ,  3.07020168,
         2.03882284,  2.08474121,  1.90218077,  2.3446321 ,  2.20074697,
         2.11868954,  2.34547229,  3.01741643,  1.97998805,  1.89643869,
         1.76561022,  2.08947082,  2.11484399,  2.15376396,  2.07388506,
         3.44952211,  2.49628716,  1.95049963,  1.76340666,  2.38401809,
         2.73108149,  2.44434686,  2.1040659 ,  3.17129025,  2.73096504,
         2.75358052,  2.66493001,  4.09296961,  4.16821895,  3.90213299,
         4.42502732,  5.40140848,  2.71094375,  2.87726297,  2.8208569 ,
         5.52343535,  5.83892331,  5.67060618,  5.70213218,  7.70921469,
         5.22078795,  4.30549159,  2.83511596,  5.60458131,  5.72343073,
         5.67222409,  4.69956365,  6.3971271 ,  5.71423984,  4.48229017,
         1.94760013,  5.88159895,  4.70265551,  5.02112837,  4.99126468,
         7.49415789,  4.26055408,  3.62634029,  1.80030432,  6.88047156,
         6.78266463,  6.62332153, 

In [13]:
# find best C and best accuracy
test_acc = sum(GSR.best_estimator_.predict(X_test) == Y_test) / len(X_test)
print(GSR.best_params_)
print(test_acc)

{'C': 1.0, 'gamma': 0.125}
0.853162076848


In [10]:
# KNN
k_list = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26]
#k_list = [1,5,8]
params = {"n_neighbors": k_list}
knn = KNeighborsClassifier(n_neighbors=26)
kgs=GridSearchCV(knn, params)
knn.fit(X_train_val, Y_train_val)
knn.predict(X_test)

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [11]:
# compute accuracy
test_acc = np.sum(knn.predict(X_test)==Y_test) / len(X_test)
print(test_acc)

0.837705453358


In [8]:
# Random Forest
rf = RandomForestClassifier(n_estimators=1024, random_state=0)
rf.fit(X_train_val, Y_train_val)
rf.predict(X_test)

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [9]:
# compute accuracy
test_acc = np.sum(rf.predict(X_test)==Y_test) / len(X_test)
print(test_acc)

0.848372700555
