# SVM for Road Segments

In [44]:
import pandas as pd 
import numpy as np
import pylab as pl
from sklearn import datasets
from sklearn.multiclass import OneVsRestClassifier
from sklearn import svm
from sklearn.grid_search import GridSearchCV
from sklearn import preprocessing
import scipy.stats as sc
import math
import warnings
import os


# Notebook-wide setup.
# NOTE(review): this silences every warning category, which also hides
# library deprecation notices -- confirm that is intended.
warnings.filterwarnings("ignore")

# Plot styling: fivethirtyeight theme with a large square default figure.
pl.style.use('fivethirtyeight')
pl.rcParams.update({'figure.figsize': (16, 16)})

In [73]:
# Feature names, in positional order. Presumably this matches the column
# order of the loaded feature arrays (21 names) -- TODO confirm against the
# code that wrote the .npy files.
columns = [
    'road_id', 'road_length', 'record_count',
    'mean_SndMean', 'mean_lux', 'mean_acel',
    'mean_isloud', 'mean_isdark', 'mean_bumps',
    'sum_isloud', 'sum_isdark', 'sum_bumps',
    'rn_mean_SndMean', 'rn_mean_lux', 'rn_mean_acel',
    'rn_mean_isloud', 'rn_mean_isdark', 'rn_mean_bumps',
    'rn_sum_isloud', 'rn_sum_isdark', 'rn_sum_bumps',
]
# Column positions of the three features used as model inputs.
predictors = list(map(columns.index, ('rn_mean_SndMean', 'rn_mean_isdark', 'rn_mean_bumps')))

# Load the features (x_*) and labels (y_*) of each pre-computed split.
# NOTE(review): the training feature file is spelled 'X_train.npy' (capital X)
# while every other file name is lower-case; on a case-sensitive filesystem
# only one spelling can exist -- confirm the name on disk.
x_train, y_train = (
    np.load(path)
    for path in ('../data/segments/X_train.npy', '../data/segments/y_train.npy')
)
x_test, y_test = (
    np.load(path)
    for path in ('../data/segments/x_test.npy', '../data/segments/y_test.npy')
)
x_valid, y_valid = (
    np.load(path)
    for path in ('../data/segments/x_valid.npy', '../data/segments/y_valid.npy')
)


In [8]:
# Keep only the selected predictor columns from each split.
x_predictors_train, x_predictors_test, x_predictors_valid = (
    split[:, predictors] for split in (x_train, x_test, x_valid)
)


## Data Exploration

In [85]:
# Dimensions of the training feature matrix, shown as the cell output.
np.shape(x_train)

(816, 21)

In [86]:
# Append the class labels to the feature matrix as one extra trailing column,
# so that rows can later be filtered by class.
# reshape(-1, 1) lets numpy infer the row count instead of hard-coding the
# 816 training rows, so the cell keeps working if the split size changes.
foo = np.hstack([x_train, y_train.reshape(-1, 1)])

In [97]:
# Split the combined matrix into one array per class label (0-3).
# The label is the last column of `foo`, so it is addressed with -1 rather
# than the literal index 21, which silently breaks if the feature count changes.
label_column = foo[:, -1]
zeros = foo[label_column == 0]
ones = foo[label_column == 1]
twos = foo[label_column == 2]
threes = foo[label_column == 3]

In [99]:
# Number of training rows in each class, in label order 0..3.
tuple(map(len, (zeros, ones, twos, threes)))

(319, 388, 55, 54)

In [100]:
# Column-wise summary statistics for the class-0 rows (axis 0 is scipy's default).
sc.describe(zeros, axis=0)

DescribeResult(nobs=319, minmax=(array([  2.30000000e+01,   7.26497627e+00,   6.00000000e+00,
         1.92886169e+02,   2.11764706e+00,   8.90901995e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         5.17459051e-01,   1.86858035e-02,   9.99411384e-03,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00]), array([  4.89796000e+05,   9.09470516e+02,   1.45300000e+03,
         5.24598726e+02,   2.87285484e+04,   9.90843739e+00,
         1.00000000e+00,   1.00000000e+00,   1.00000000e+00,
         5.26000000e+02,   1.26400000e+03,   2.47000000e+02,
         6.40295707e+01,   8.10789974e+02,   1.25873914e+00,
         6.08206343e-02,   1.26014653e-01,   4.38724563e-02,
         5.58701143e+00,   2.46657909e+01,   4.30784446e+00,
         0.00000000e+00])), mean=array([  2.41132270e+05,   9.43453842e+01,   8.999686

In [91]:
# Column-wise summary statistics for the class-1 rows (axis 0 is scipy's default).
sc.describe(ones, axis=0)

DescribeResult(nobs=388, minmax=(array([  4.54000000e+02,   1.33554997e+01,   6.00000000e+00,
         5.07242703e+02,   3.19178082e+00,   8.87706919e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         1.12794729e+00,   1.43987697e-02,   1.99577399e-02,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         1.00000000e+00]), array([  4.84460000e+05,   4.52610911e+02,   7.85000000e+02,
         5.25130000e+02,   2.59000000e+04,   9.94279386e+00,
         1.00000000e+00,   1.00000000e+00,   8.33333333e-01,
         1.15000000e+02,   6.41000000e+02,   2.06000000e+02,
         3.81335680e+01,   1.20450072e+03,   7.36990297e-01,
         2.51122387e-02,   7.48755209e-02,   1.79264517e-02,
         8.69289017e-01,   7.31475126e+00,   1.57555043e+00,
         1.00000000e+00])), mean=array([  2.35792198e+05,   1.06545025e+02,   4.561855

In [92]:
# Column-wise summary statistics for the class-2 rows (axis 0 is scipy's default).
sc.describe(twos, axis=0)

DescribeResult(nobs=55, minmax=(array([  1.31970000e+04,   2.47337757e+01,   6.00000000e+00,
         1.94063333e+02,   2.16666667e+00,   8.81779467e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         1.28923434e+00,   1.29091409e-02,   2.30098746e-02,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         2.00000000e+00]), array([  4.81977000e+05,   3.95543654e+02,   4.19000000e+02,
         5.12237742e+02,   2.61302051e+04,   9.15576408e+00,
         6.36363636e-01,   1.00000000e+00,   3.33333333e-01,
         1.08000000e+02,   4.19000000e+02,   6.00000000e+00,
         2.06069540e+01,   9.22459515e+02,   3.64722241e-01,
         1.30049301e-02,   3.99705143e-02,   2.77398822e-03,
         4.24189983e-01,   5.97037792e+00,   4.16098233e-02,
         2.00000000e+00])), mean=array([  2.59087564e+05,   1.22868800e+02,   4.3690909

In [9]:
# Linear kernel: one-vs-rest SVM trained on the selected predictor columns.
# class_weight='balanced' weights classes inversely to their frequency.
bool_clf = OneVsRestClassifier(svm.SVC(kernel='linear', C=1.0, class_weight='balanced'))
bool_clf.fit(x_predictors_train, y_train)
linear_y_pred = bool_clf.predict(x_predictors_test)

# Accuracy on the test split: fraction of predictions equal to the true label.
right = np.mean(linear_y_pred == np.asarray(y_test))

# A parenthesised single-argument print is valid in both Python 2 and Python 3;
# the bare print statement is a SyntaxError under Python 3.
print("we successfully(OS) predict the {} percent of data using a linear kernel".format(right * 100))
# Audible completion cue through the `say` command (a macOS utility);
# the trailing ';' keeps the return code out of the cell output.
os.system('say "First Linear Done"');

we successfully(OS) predict the 34.2298288509 percent of data using a linear kernel


In [101]:
# RBF kernel with gamma left unset, so SVC's default applies. Under the 'auto'
# default that is 1 / n_features, i.e. 1/3 for the three predictor columns
# (the previous note said 1/4, which does not match the feature count).
bool_rbf_clf = OneVsRestClassifier(svm.SVC(kernel='rbf', C=1.0, class_weight='balanced'))
bool_rbf_clf.fit(x_predictors_train, y_train)
rbf_y_pred = bool_rbf_clf.predict(x_predictors_test)

# Accuracy on the test split: fraction of predictions equal to the true label.
right = np.mean(rbf_y_pred == np.asarray(y_test))

# A parenthesised single-argument print is valid in both Python 2 and Python 3;
# the bare print statement is a SyntaxError under Python 3.
print("we successfully(OS) predict the {} percent of data using a rbf kernel".format(right * 100))
# Audible completion cue through the `say` command (a macOS utility);
# the trailing ';' keeps the return code out of the cell output.
os.system('say "First RBF Done"');

we successfully(OS) predict the 24.6943765281 percent of data using a rbf kernel


In [103]:
# RBF kernel with an explicit gamma = 0.5 (larger than the 1 / n_features default).
bool_rbf_high_clf = OneVsRestClassifier(svm.SVC(kernel='rbf', C=1.0, gamma = 0.5, class_weight='balanced'))
bool_rbf_high_clf.fit(x_predictors_train, y_train)
rbf_high_y_pred = bool_rbf_high_clf.predict(x_predictors_test)

# Accuracy on the test split: fraction of predictions equal to the true label.
right = np.mean(rbf_high_y_pred == np.asarray(y_test))

# A parenthesised single-argument print is valid in both Python 2 and Python 3;
# the bare print statement is a SyntaxError under Python 3.
print("we successfully(OS) predict the {} percent of data using a rbf kernel".format(right * 100))
# NOTE(review): the printed text and spoken cue are the same as for the
# default-gamma RBF run, so the two runs cannot be told apart from the output.
os.system('say "First RBF Done"');

we successfully(OS) predict the 27.3838630807 percent of data using a rbf kernel


In [104]:
# Polynomial kernel (SVC's default degree and gamma), one-vs-rest, balanced class weights.
bool_poly_clf = OneVsRestClassifier(svm.SVC(kernel='poly', C=1.0, class_weight='balanced'))
bool_poly_clf.fit(x_predictors_train, y_train)
poly_y_pred = bool_poly_clf.predict(x_predictors_test)

# Accuracy on the test split: fraction of predictions equal to the true label.
right = np.mean(poly_y_pred == np.asarray(y_test))

# A parenthesised single-argument print is valid in both Python 2 and Python 3;
# the bare print statement is a SyntaxError under Python 3.
print("we successfully(OS) predict the {} percent of data using a poly kernel".format(right * 100))

we successfully(OS) predict the 24.2053789731 percent of data using a poly kernel
