# SVM for Road Segments

In [1]:
import pandas as pd 
import numpy as np
import pylab as pl
from sklearn import datasets
from sklearn.multiclass import OneVsRestClassifier
from sklearn import svm
from sklearn.grid_search import GridSearchCV
from sklearn import preprocessing
import scipy.stats as sc
import math
import warnings
import os


# Suppress every Python warning for the rest of the session; note that this
# also hides deprecation notices emitted by the libraries imported above.
warnings.filterwarnings("ignore")

# Apply the 'fivethirtyeight' style sheet first, then override the default
# figure size, so the (16, 16) setting is the one that ends up in effect.
pl.style.use('fivethirtyeight')
pl.rcParams['figure.figsize'] = (16,16)

In [2]:
# Column layout of the feature matrices, in storage order.
columns = [
    'road_id', 'road_length', 'record_count',
    'mean_SndMean', 'mean_lux', 'mean_acel',
    'mean_isloud', 'mean_isdark', 'mean_bumps',
    'sum_isloud', 'sum_isdark', 'sum_bumps',
    'rn_mean_SndMean', 'rn_mean_lux', 'rn_mean_acel',
    'rn_mean_isloud', 'rn_mean_isdark', 'rn_mean_bumps',
    'rn_sum_isloud', 'rn_sum_isdark', 'rn_sum_bumps',
]

# Positions (within `columns`) of the features fed to the classifiers, in the
# order: per-segment sums, per-segment means, then the "rn_" means.
predictors = list(map(columns.index, (
    'sum_isloud', 'sum_isdark', 'sum_bumps',
    'mean_isloud', 'mean_isdark', 'mean_bumps',
    'rn_mean_isloud', 'rn_mean_isdark', 'rn_mean_bumps',
)))

# Load the pre-split train / test / validation arrays from disk.
# NOTE(review): the training-feature file is spelled 'X_train.npy' (capital X)
# while every other file name is lowercase -- confirm the name on
# case-sensitive filesystems.
x_train, y_train, x_test, y_test, x_valid, y_valid = (
    np.load(path)
    for path in (
        '../data/segments/X_train.npy',
        '../data/segments/y_train.npy',
        '../data/segments/x_test.npy',
        '../data/segments/y_test.npy',
        '../data/segments/x_valid.npy',
        '../data/segments/y_valid.npy',
    )
)


In [3]:
# Keep only the selected predictor columns in each of the three splits.
x_predictors_train, x_predictors_test, x_predictors_valid = (
    split[:, predictors] for split in (x_train, x_test, x_valid)
)


## Data Exploration

In [4]:
# Show the dimensions of the full training matrix (rows, feature columns).
x_train.shape

(816, 21)

In [5]:
# Append the class labels to the training features as one extra, final column
# so rows can be filtered by class. reshape(-1, 1) lets NumPy infer the row
# count, so this keeps working if the size of the training set changes.
foo = np.hstack([x_train, y_train.reshape(-1, 1)])

In [6]:
# Partition the combined matrix by class. The label is the last column of
# `foo`; indexing it as -1 avoids hard-coding the number of feature columns.
labels = foo[:, -1]
zeros = foo[labels == 0]
ones = foo[labels == 1]
twos = foo[labels == 2]
threes = foo[labels == 3]

In [7]:
# Number of training rows in each of the four classes (0, 1, 2, 3).
tuple(len(rows) for rows in (zeros, ones, twos, threes))

(319, 388, 55, 54)

In [8]:
# Per-column summary statistics (scipy.stats.describe) for the class-2 rows.
sc.describe(twos)

DescribeResult(nobs=55, minmax=(array([  1.31970000e+04,   2.47337757e+01,   6.00000000e+00,
         1.94063333e+02,   2.16666667e+00,   8.81779467e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         1.28923434e+00,   1.29091409e-02,   2.30098746e-02,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         2.00000000e+00]), array([  4.81977000e+05,   3.95543654e+02,   4.19000000e+02,
         5.12237742e+02,   2.61302051e+04,   9.15576408e+00,
         6.36363636e-01,   1.00000000e+00,   3.33333333e-01,
         1.08000000e+02,   4.19000000e+02,   6.00000000e+00,
         2.06069540e+01,   9.22459515e+02,   3.64722241e-01,
         1.30049301e-02,   3.99705143e-02,   2.77398822e-03,
         4.24189983e-01,   5.97037792e+00,   4.16098233e-02,
         2.00000000e+00])), mean=array([  2.59087564e+05,   1.22868800e+02,   4.3690909

### Kernels

In [None]:
# Linear kernel: one-vs-rest SVM with balanced class weights, fitted on the
# training predictors and scored on the test split.
bool_clf = OneVsRestClassifier(svm.SVC(kernel='linear', C=10.0, class_weight='balanced'))
bool_clf.fit(x_predictors_train, y_train)
linear_y_pred = bool_clf.predict(x_predictors_test)

# Accuracy: fraction of test labels that were predicted exactly.
right = 1.0 * (linear_y_pred == np.asarray(y_test)).sum() / len(y_test)

# print(...) with a single argument runs unchanged on Python 2 and Python 3.
print("we successfully(OS) predict the {} percent of data using a linear kernel".format(right * 100))
# Audible "done" notification through the `say` command (macOS); on systems
# without it the call only returns a non-zero status.
os.system('say "First Linear Done"');

In [None]:
# Compare the label values present in the training targets with the values the
# linear model actually predicts. `rbf_high_y_pred` is only assigned in a later
# cell, so it is not referenced here: doing so raises NameError when the
# notebook is run top to bottom on a fresh kernel.
pd.unique(y_train), pd.unique(linear_y_pred)

In [None]:
# RBF kernel with gamma = 'auto', i.e. 1 / n_features (1/9 with the nine
# predictor columns selected above). gamma is passed explicitly so the model
# matches this description regardless of the library's default.
bool_rbf_clf = OneVsRestClassifier(
    svm.SVC(kernel='rbf', C=1.0, gamma='auto', class_weight='balanced'))
bool_rbf_clf.fit(x_predictors_train, y_train)
rbf_y_pred = bool_rbf_clf.predict(x_predictors_test)

# Accuracy: fraction of test labels that were predicted exactly.
right = 1.0 * (rbf_y_pred == np.asarray(y_test)).sum() / len(y_test)

# print(...) with a single argument runs unchanged on Python 2 and Python 3.
print("we successfully(OS) predict the {} percent of data using a rbf kernel".format(right * 100))
# Audible "done" notification through the `say` command (macOS); on systems
# without it the call only returns a non-zero status.
os.system('say "First RBF Done"');

In [None]:
# RBF kernel with a fixed gamma of 0.5, fitted on the training predictors and
# scored on the test split.
bool_rbf_high_clf = OneVsRestClassifier(
    svm.SVC(kernel='rbf', C=1.0, gamma=0.5, class_weight='balanced'))
bool_rbf_high_clf.fit(x_predictors_train, y_train)
rbf_high_y_pred = bool_rbf_high_clf.predict(x_predictors_test)

# Accuracy: fraction of test labels that were predicted exactly.
right = 1.0 * (rbf_high_y_pred == np.asarray(y_test)).sum() / len(y_test)

# print(...) with a single argument runs unchanged on Python 2 and Python 3.
print("we successfully(OS) predict the {} percent of data using a rbf kernel".format(right * 100))
# Audible "done" notification through the `say` command (macOS); on systems
# without it the call only returns a non-zero status.
os.system('say "First RBF Done"');

In [None]:
# Polynomial kernel: one-vs-rest SVM with balanced class weights, fitted on
# the training predictors and scored on the test split.
bool_poly_clf = OneVsRestClassifier(svm.SVC(kernel='poly', C=1.0, class_weight='balanced'))
bool_poly_clf.fit(x_predictors_train, y_train)
poly_y_pred = bool_poly_clf.predict(x_predictors_test)

# Accuracy: fraction of test labels that were predicted exactly.
right = 1.0 * (poly_y_pred == np.asarray(y_test)).sum() / len(y_test)

# print(...) with a single argument runs unchanged on Python 2 and Python 3.
print("we successfully(OS) predict the {} percent of data using a poly kernel".format(right * 100))

## Tuning the C Parameter

In [None]:
# --- Grid search over C for a linear SVM (5-fold CV on the training split) ---
tuned_parameters = [{'kernel': ['linear'], 'C': [0.1, 1, 10, 100, 1000]}]

validate_clf = GridSearchCV(svm.SVC(C=1, class_weight='balanced'), tuned_parameters, cv=5,
                            scoring='recall_weighted')
validate_clf.fit(x_predictors_train, y_train)

# A bare expression in the middle of a cell is never displayed, so print it.
print("Grid search best parameters: {}".format(validate_clf.best_params_))

# --- Manual sweep of C, scored by accuracy on the validation split ---
# Candidate values are exp(-5) ... exp(50), evenly spaced in the exponent.
C = [math.exp(exponent) for exponent in np.linspace(-5, 50, 10)]
OS_validation = []
for c in C:
    # NOTE(review): unlike the other classifiers in this notebook, this model
    # is fitted without class_weight='balanced' -- confirm that is intended.
    clf = svm.SVC(kernel='linear', C=c)
    clf.fit(x_predictors_train, y_train)
    right = 1.0 * (clf.predict(x_predictors_valid) == np.asarray(y_valid)).sum() / len(y_valid)
    OS_validation.append(right)

# Among all C values that tie for the best validation accuracy, pick the
# middle one. Floor division keeps the index an integer on Python 3 as well.
temp = pd.DataFrame([C, OS_validation]).T
best_rows = temp.loc[temp.iloc[:, 1] == temp.iloc[:, 1].max()]
ind = len(best_rows) // 2
C_opt = best_rows.iloc[ind, 0]

# From here on `C` holds log10 of the candidate values (for a readable x-axis).
C = [math.log(value, 10) for value in C]
# pylab is imported as `pl`; the label gives legend() an entry to show.
pl.plot(C, OS_validation, 'b', label='Validation accuracy')
pl.legend(loc='upper right')
pl.ylabel('Accuracy')
pl.xlabel('log(C)')
pl.show()

print("The optimal C we found is:{}".format(C_opt))