In [1]:
import numpy as np
import pandas as pd
import scipy.io as sio
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler

In [2]:
# This converter function will convert the string into corresponding value.
#def converter1(x):
#    if x == b'A' or x == b'B' or x == b'C' or x == b'D' or x == b'E' or x == b'F' or x == b'G' or x == b'H' or x == b'I' or x == b'J' or x == b'K' or x == b'L' or x == b'M':       
#        return 1
#    else:
#        return 0
def converter1(x):
    if ord(x)-ord(b'M') > 0:
        return 0
    else:
        return 1

In [3]:
column1 = 0

data = np.loadtxt('letter-recognition.data', delimiter=',', 
               converters={column1:converter1})

In [4]:
print(data)
print(data.shape)

[[  0.   2.   8. ...,   8.   0.   8.]
 [  1.   5.  12. ...,   8.   4.  10.]
 [  1.   4.  11. ...,   7.   3.   9.]
 ..., 
 [  0.   6.   9. ...,  12.   2.   4.]
 [  0.   2.   3. ...,   9.   5.   8.]
 [  1.   4.   9. ...,   7.   2.   8.]]
(20000, 17)


In [5]:
# Divide data into training and testing sets
X_and_Y = data
np.random.shuffle(X_and_Y)

# Try to predict the last column (the cover type, which is 7 types)
X = X_and_Y[:,1:17]
Y = X_and_Y[:,0]
print(X.shape, Y.shape)

(20000, 16) (20000,)


In [6]:
X_train_val = X[0:5000,:]
X_test      = X[5000:19000,:]
Y_train_val = Y[0:5000]
Y_test      = Y[5000:19000]
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

(5000, 16) (14000, 16) (5000,) (14000,)


In [13]:
# SVM process

# linear
# polynomial degree 2 & 3 (this one is using 3)
# radial with width [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
# regularization parameter C = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3]

classifier = svm.SVC(kernel='linear')
C_list = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e-0, 1e1, 1e2, 1e3]   
params = {"C": C_list}

GSR = GridSearchCV(classifier, params, return_train_score=True, cv=5, n_jobs=4)
GSR.fit(X_train_val,Y_train_val)
GSR.cv_results_
                               
# May also want to try rbf kernel with sigma (radial width values)

{'mean_fit_time': array([  3.70383263e-02,   3.86276722e-02,   5.84407330e-02,
          4.34304237e-02,   4.43315029e-02,   4.99347687e-02,
          7.35552788e-02,   2.37370157e-01,   1.69752569e+00,
          1.88658644e+01,   1.78775804e+02]),
 'mean_score_time': array([ 0.00690942,  0.00650473,  0.00850596,  0.00970697,  0.00830665,
         0.01120834,  0.01091342,  0.02001472,  0.03062162,  0.04613509,
         0.04054713]),
 'mean_test_score': array([ 0.9642,  0.9642,  0.9642,  0.9642,  0.9642,  0.9642,  0.9642,
         0.9642,  0.9642,  0.9642,  0.9642]),
 'mean_train_score': array([ 0.96420001,  0.96420001,  0.96420001,  0.96420001,  0.96420001,
         0.96420001,  0.96420001,  0.96420001,  0.96420001,  0.96420001,
         0.96420001]),
 'param_C': masked_array(data = [1e-07 1e-06 1e-05 0.0001 0.001 0.01 0.1 1.0 10.0 100.0 1000.0],
              mask = [False False False False False False False False False False False],
        fill_value = ?),
 'params': ({'C': 1e-07},


In [14]:
# find best C and best accuracy
test_acc = sum(GSR.best_estimator_.predict(X_test) == Y_test) / len(X_test)
print(GSR.best_params_)
print(test_acc)

{'C': 1e-07}
0.961714285714


In [19]:
def convert_to_gamma(radial_width):
    converted = [0]*len(radial_width)
    for i in range(len(radial_width)):
        gamma = 1/(2*(radial_width[i]**2))
        converted[i] = gamma
    return converted

radial_width = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
gamma_list = convert_to_gamma(radial_width)

# SVM process

# rbf
# polynomial degree 2 & 3 (this one is using 3)
# radial with width [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
# regularization parameter C = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3]

classifier = svm.SVC(kernel='rbf')
C_list = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e-0, 1e1, 1e2, 1e3]
params = {"C": C_list, "gamma": gamma_list}

GSR = GridSearchCV(classifier, params, return_train_score=True, cv=5, n_jobs=4)
GSR.fit(X_train_val,Y_train_val)
GSR.cv_results_
                               
# May also want to try rbf kernel with sigma (radial width values)

{'mean_fit_time': array([ 2.53458781,  2.36296263,  2.4052002 ,  2.82642293,  2.73883524,
         4.46555672,  2.88594027,  1.4301106 ,  2.37988253,  2.35976839,
         2.35746675,  2.3574667 ,  2.20906124,  4.4520472 ,  2.90835624,
         1.42210541,  2.4045001 ,  2.43665571,  2.40139718,  3.0479497 ,
         2.52610955,  4.94483957,  3.02673993,  1.37977557,  2.32194128,
         2.26850352,  2.58953085,  3.00192246,  2.50938597,  4.82162018,
         2.85501838,  1.40259137,  2.28272862,  2.77700806,  2.82642608,
         2.46334481,  2.53743715,  4.37559299,  2.7790648 ,  1.40349231,
         2.66101775,  2.66433191,  2.68170414,  2.74223852,  2.76545534,
         5.13494759,  3.31536193,  1.63737001,  2.75363612,  2.43422813,
         1.7357461 ,  1.73084326,  1.64518175,  3.90600238,  2.45649662,
         1.1003716 ,  1.75699377,  1.77200265,  1.69588032,  1.73682814,
         1.74295955,  4.26616778,  3.06262794,  1.40759945,  1.81323447,
         1.8045908 ,  1.78188753, 

In [20]:
# find best C and best accuracy
test_acc = sum(GSR.best_estimator_.predict(X_test) == Y_test) / len(X_test)
print(GSR.best_params_)
print(test_acc)

{'C': 10.0, 'gamma': 0.125}
0.967071428571


In [7]:
# KNN
k_list = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26]
#k_list = [1,5,8]
params = {"n_neighbors": k_list}
knn = KNeighborsClassifier(n_neighbors=26)
kgs=GridSearchCV(knn, params)
knn.fit(X_train_val, Y_train_val)
knn.predict(X_test)

array([ 1.,  0.,  1., ...,  0.,  1.,  1.])

In [8]:
# compute accuracy
test_acc = np.sum(knn.predict(X_test)==Y_test) / len(X_test)
print(test_acc)

0.893857142857


In [9]:
# Random Forest
rf = RandomForestClassifier(n_estimators=1024, random_state=0)
rf.fit(X_train_val, Y_train_val)
rf.predict(X_test)

array([ 1.,  0.,  1., ...,  0.,  1.,  1.])

In [10]:
# compute accuracy
test_acc = np.sum(rf.predict(X_test)==Y_test) / len(X_test)
print(test_acc)

0.948428571429
