In [1]:
import numpy as np
import pandas as pd
import scipy.io as sio
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler

In [2]:
# This converter function will convert the string into corresponding value.
def converter1(x):
    if x == b'O':       
        return 1
    else:
        return 0

In [3]:
column1 = 0

data = np.loadtxt('letter-recognition.data', delimiter=',', 
               converters={column1:converter1})

In [4]:
print(data)
print(data.shape)

[[  0.   2.   8. ...,   8.   0.   8.]
 [  0.   5.  12. ...,   8.   4.  10.]
 [  0.   4.  11. ...,   7.   3.   9.]
 ..., 
 [  0.   6.   9. ...,  12.   2.   4.]
 [  0.   2.   3. ...,   9.   5.   8.]
 [  0.   4.   9. ...,   7.   2.   8.]]
(20000, 17)


In [5]:
# Divide data into training and testing sets
X_and_Y = data
np.random.shuffle(X_and_Y)

# Try to predict the last column (the cover type, which is 7 types)
X = X_and_Y[:,1:17]
Y = X_and_Y[:,0]
print(X.shape, Y.shape)

(20000, 16) (20000,)


In [6]:
X_train_val = X[0:5000,:]
X_test      = X[5000:19000,:]
Y_train_val = Y[0:5000]
Y_test      = Y[5000:19000]
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

(5000, 16) (14000, 16) (5000,) (14000,)


In [13]:
# SVM process

# linear
# polynomial degree 2 & 3 (this one is using 3)
# radial with width [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
# regularization parameter C = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3]

classifier = svm.SVC(kernel='linear')
C_list = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e-0, 1e1, 1e2, 1e3]   
params = {"C": C_list}

GSR = GridSearchCV(classifier, params, return_train_score=True, cv=5, n_jobs=4)
GSR.fit(X_train_val,Y_train_val)
GSR.cv_results_
                               
# May also want to try rbf kernel with sigma (radial width values)

{'mean_fit_time': array([  3.70383263e-02,   3.86276722e-02,   5.84407330e-02,
          4.34304237e-02,   4.43315029e-02,   4.99347687e-02,
          7.35552788e-02,   2.37370157e-01,   1.69752569e+00,
          1.88658644e+01,   1.78775804e+02]),
 'mean_score_time': array([ 0.00690942,  0.00650473,  0.00850596,  0.00970697,  0.00830665,
         0.01120834,  0.01091342,  0.02001472,  0.03062162,  0.04613509,
         0.04054713]),
 'mean_test_score': array([ 0.9642,  0.9642,  0.9642,  0.9642,  0.9642,  0.9642,  0.9642,
         0.9642,  0.9642,  0.9642,  0.9642]),
 'mean_train_score': array([ 0.96420001,  0.96420001,  0.96420001,  0.96420001,  0.96420001,
         0.96420001,  0.96420001,  0.96420001,  0.96420001,  0.96420001,
         0.96420001]),
 'param_C': masked_array(data = [1e-07 1e-06 1e-05 0.0001 0.001 0.01 0.1 1.0 10.0 100.0 1000.0],
              mask = [False False False False False False False False False False False],
        fill_value = ?),
 'params': ({'C': 1e-07},


In [14]:
# find best C and best accuracy
test_acc = sum(GSR.best_estimator_.predict(X_test) == Y_test) / len(X_test)
print(GSR.best_params_)
print(test_acc)

{'C': 1e-07}
0.961714285714


In [8]:
def convert_to_gamma(radial_width):
    converted = [0]*len(radial_width)
    for i in range(len(radial_width)):
        gamma = 1/(2*(radial_width[i]**2))
        converted[i] = gamma
    return converted

radial_width = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
gamma_list = convert_to_gamma(radial_width)

# SVM process

# rbf
# polynomial degree 2 & 3 (this one is using 3)
# radial with width [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
# regularization parameter C = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3]

classifier = svm.SVC(kernel='rbf')
C_list = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e-0, 1e1, 1e2, 1e3]
params = {"C": C_list, "gamma": gamma_list}

GSR = GridSearchCV(classifier, params, return_train_score=True, cv=5, n_jobs=4)
GSR.fit(X_train_val,Y_train_val)
GSR.cv_results_
                               
# May also want to try rbf kernel with sigma (radial width values)

{'mean_fit_time': array([ 0.11548467,  0.12578888,  0.13249388,  0.13169284,  0.14029865,
         0.30911932,  0.15651007,  0.08736134,  0.11848836,  0.12709227,
         0.13869748,  0.120085  ,  0.14170041,  0.31322155,  0.16141458,
         0.08626003,  0.11552515,  0.12816458,  0.11458039,  0.11518145,
         0.13759713,  0.29530883,  0.16082182,  0.11197891,  0.18863344,
         0.23086314,  0.25237913,  0.23786931,  0.20634589,  0.38186994,
         0.21735368,  0.12869086,  0.32763162,  0.35314975,  0.33663063,
         0.29990778,  0.31041937,  0.49244761,  0.19013438,  0.10787621,
         1.66827903,  1.73652754,  2.00246572,  2.07558403,  1.79296684,
         3.15312905,  1.95618291,  0.32533002,  2.42071085,  2.48335443,
         2.49396324,  2.50216875,  2.49536376,  4.59995232,  3.03744712,
         0.99160075,  2.68009424,  2.82960029,  2.84781299,  2.82769847,
         3.65718732,  5.27523098,  3.93901377,  1.3034214 ,  2.84270945,
         2.84961448,  2.85141554, 

In [9]:
# find best C and best accuracy
test_acc = sum(GSR.best_estimator_.predict(X_test) == Y_test) / len(X_test)
print(GSR.best_params_)
print(test_acc)

{'C': 10.0, 'gamma': 0.125}
0.9915


In [7]:
# KNN
k_list = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26]
#k_list = [1,5,8]
params = {"n_neighbors": k_list}
knn = KNeighborsClassifier(n_neighbors=26)
kgs=GridSearchCV(knn, params)
knn.fit(X_train_val, Y_train_val)
knn.predict(X_test)

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [8]:
# compute accuracy
test_acc = np.sum(knn.predict(X_test)==Y_test) / len(X_test)
print(test_acc)

0.981071428571


In [9]:
# Random Forest
rf = RandomForestClassifier(n_estimators=1024, random_state=0)
rf.fit(X_train_val, Y_train_val)
rf.predict(X_test)

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [10]:
# compute accuracy
test_acc = np.sum(rf.predict(X_test)==Y_test) / len(X_test)
print(test_acc)

0.986357142857
