In [1]:
import numpy as np
import pandas as pd
import scipy.io as sio
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from collections import Counter

In [2]:
data = np.loadtxt('covtype.data', delimiter=',')
#data = np.loadtxt('covtype_7000.data', delimiter=',')

In [3]:
# Find the most common classes
count = Counter(data[:,54])
print(count.most_common(3))

[(2.0, 283301), (1.0, 211840), (3.0, 35754)]


In [4]:
for i in range(len(data[:,54])):
    if data[i,54] == 2:
        data[i,54] = 1
    else:
        data[i,54] = 0

In [5]:
print(data)
print(data.shape)

data_continuous = data[:,0:10]
d_s = StandardScaler()
d_n = d_s.fit_transform(data_continuous)

X_and_Y = np.hstack([d_n, data[:,10:56]])
print(X_and_Y)

[[  2.59600000e+03   5.10000000e+01   3.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  2.59000000e+03   5.60000000e+01   2.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  2.80400000e+03   1.39000000e+02   9.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   1.00000000e+00]
 ..., 
 [  2.38600000e+03   1.59000000e+02   1.70000000e+01 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  2.38400000e+03   1.70000000e+02   1.50000000e+01 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  2.38300000e+03   1.65000000e+02   1.30000000e+01 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]]
(581012, 55)
[[-1.29780509 -0.93515698 -1.48281978 ...,  0.          0.          0.        ]
 [-1.31923485 -0.89047967 -1.61636259 ...,  0.          0.          0.        ]
 [-0.5549068  -0.14883628 -0.68156292 ...,  0.          0.          1.        ]
 ..., 
 [-2.04784663  0.02987297  0.38677957 ...,  0.          0.  

In [6]:
# Divide data into training and testing sets

np.random.shuffle(X_and_Y)

# Try to predict the last column (the cover type, which is 7 types)
# NOTE: the largest number is "positive", while the other ones count as negative. This way it is a binary classification problem
X = X_and_Y[:,0:-1]
Y = X_and_Y[:,-1]
print(X.shape, Y.shape)

(581012, 54) (581012,)


In [7]:
X_train_val = X[0:5000,:]
X_test      = X[5000:30000,:]
#X_test      = X[5000:7000,:]
Y_train_val = Y[0:5000]
Y_test      = Y[5000:30000]
#Y_test      = Y[5000:7000]
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

(5000, 54) (25000, 54) (5000,) (25000,)


In [8]:
# SVM process

# linear
# polynomial degree 2 & 3 (this one is using 3)
# radial with width [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
# regularization parameter C = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3]

classifier = svm.SVC(kernel='linear')
C_list = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e-0, 1e1, 1e2, 1e3]   
params = {"C": C_list}

GSR = GridSearchCV(classifier, params, return_train_score=True, cv=5, n_jobs=4)
GSR.fit(X_train_val,Y_train_val)
GSR.cv_results_
                               
# May also want to try rbf kernel with sigma (radial width values)

{'mean_fit_time': array([   1.8838346 ,    1.79109101,    1.71864409,    1.69016204,
           1.69146276,    1.45164909,    1.297435  ,    1.76377645,
           4.45754828,   24.55549097,  210.90532379]),
 'mean_score_time': array([ 0.26519251,  0.25859628,  0.284338  ,  0.26285772,  0.27409019,
         0.2054461 ,  0.16651912,  0.15550966,  0.16091547,  0.15922394,
         0.14641438]),
 'mean_test_score': array([ 0.5048,  0.5048,  0.5048,  0.5048,  0.6262,  0.7056,  0.7312,
         0.7446,  0.7464,  0.747 ,  0.7472]),
 'mean_train_score': array([ 0.5048    ,  0.5048    ,  0.5048    ,  0.5048    ,  0.63285004,
         0.70980001,  0.73959973,  0.74989974,  0.75299984,  0.75284986,
         0.75314984]),
 'param_C': masked_array(data = [1e-07 1e-06 1e-05 0.0001 0.001 0.01 0.1 1.0 10.0 100.0 1000.0],
              mask = [False False False False False False False False False False False],
        fill_value = ?),
 'params': ({'C': 1e-07},
  {'C': 1e-06},
  {'C': 1e-05},
  {'C': 0

In [9]:
# find best C and best accuracy
test_acc = sum(GSR.best_estimator_.predict(X_test) == Y_test) / len(X_test)
print(GSR.best_params_)
print(test_acc)

{'C': 1000.0}
0.76632


In [10]:
def convert_to_gamma(radial_width):
    converted = [0]*len(radial_width)
    for i in range(len(radial_width)):
        gamma = 1/(2*(radial_width[i]**2))
        converted[i] = gamma
    return converted

radial_width = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
gamma_list = convert_to_gamma(radial_width)

# SVM process

# rbf
# polynomial degree 2 & 3 (this one is using 3)
# radial with width [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
# regularization parameter C = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3]

classifier = svm.SVC(kernel='rbf')
C_list = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e-0, 1e1, 1e2, 1e3]
params = {"C": C_list, "gamma": gamma_list}

GSR = GridSearchCV(classifier, params, return_train_score=True, cv=5, n_jobs=4)
GSR.fit(X_train_val,Y_train_val)
GSR.cv_results_
                               
# May also want to try rbf kernel with sigma (radial width values)

{'mean_fit_time': array([ 2.6389852 ,  2.71703582,  2.4765708 ,  2.44577107,  4.22528439,
         2.74856963,  3.23876948,  3.15837216,  3.47074661,  3.50964012,
         3.40196342,  3.53794236,  5.31584902,  3.06383729,  3.0193841 ,
         3.12097249,  3.44260011,  3.39284163,  3.47327776,  3.46449566,
         5.21394472,  3.16329665,  3.55353236,  3.5967783 ,  3.27437882,
         3.15645895,  2.65938148,  2.99612489,  4.69057117,  3.15892467,
         3.09785371,  2.81071296,  2.82571511,  2.50103469,  2.4555398 ,
         2.42424517,  4.10610266,  2.18996797,  3.04214325,  2.26646376,
         2.50110116,  2.74409657,  3.19473324,  2.71736555,  4.45013742,
         2.42022877,  2.36866794,  2.7874342 ,  3.05485301,  2.74407086,
         3.50891223,  4.12242446,  4.71106238,  3.3242352 ,  3.19047112,
         2.25265374,  2.90985289,  3.55724797,  3.08478279,  2.93212047,
         5.11942677,  3.27328444,  2.10441766,  1.59667606,  3.67543912,
         2.62660627,  2.6632822 , 

In [11]:
# find best C and best accuracy
test_acc = sum(GSR.best_estimator_.predict(X_test) == Y_test) / len(X_test)
print(GSR.best_params_)
print(test_acc)

{'C': 1.0, 'gamma': 0.5}
0.80668


In [8]:
# KNN
k_list = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26]
#k_list = [1,5,8]
params = {"n_neighbors": k_list}
knn = KNeighborsClassifier(n_neighbors=26)
kgs=GridSearchCV(knn, params)
knn.fit(X_train_val, Y_train_val)
knn.predict(X_test)

array([ 1.,  1.,  0., ...,  1.,  1.,  0.])

In [9]:
# compute accuracy
test_acc = np.sum(knn.predict(X_test)==Y_test) / len(X_test)
print(test_acc)

0.75692


In [10]:
# Random Forest
rf = RandomForestClassifier(n_estimators=1024, random_state=0, n_jobs=4)
rf.fit(X_train_val, Y_train_val)
rf.predict(X_test)

array([ 1.,  1.,  0., ...,  1.,  1.,  0.])

In [11]:
# compute accuracy
test_acc = np.sum(rf.predict(X_test)==Y_test) / len(X_test)
print(test_acc)

0.82124
