# SVM for Road Segments

In [1]:
import pandas as pd 
import numpy as np
import pylab as pl
from sklearn import datasets
from sklearn.multiclass import OneVsRestClassifier
from sklearn import svm
from sklearn.grid_search import GridSearchCV
from sklearn import preprocessing
import scipy.stats as sc
import math
import warnings
import os
import matplotlib.pyplot as plt


# Notebook-wide settings: suppress every warning, then apply the plotting
# defaults (fivethirtyeight style sheet, 16x16 figures).
warnings.filterwarnings("ignore")
pl.style.use('fivethirtyeight')
pl.rcParams.update({'figure.figsize': (16, 16)})

In [5]:
# Names of the feature columns, in positional order. The index lists built
# below are used to slice the feature matrices column-wise, so this order is
# presumably the column order of those matrices -- confirm against the data.
columns = [
    'road_id', 'road_length', 'record_count',
    'mean_SndMean', 'mean_lux', 'mean_acel',
    'mean_isloud', 'mean_isdark', 'mean_bumps',
    'sum_isloud', 'sum_isdark', 'sum_bumps',
    'rn_mean_SndMean', 'rn_mean_lux', 'rn_mean_acel',
    'rn_mean_isloud', 'rn_mean_isdark', 'rn_mean_bumps',
    'rn_sum_isloud', 'rn_sum_isdark', 'rn_sum_bumps',
]

# Column positions of the three alternative predictor sets (loud / dark / bumps
# expressed as sums, as means, and as "rn" means).
predictors_sum = list(map(columns.index, ('sum_isloud', 'sum_isdark', 'sum_bumps')))
predictors_mean = list(map(columns.index, ('mean_isloud', 'mean_isdark', 'mean_bumps')))
predictors_rn = list(map(columns.index, ('rn_mean_isloud', 'rn_mean_isdark', 'rn_mean_bumps')))

# Load the saved train / test / validation arrays (x_* = features, y_* = labels).
# NOTE(review): 'X_train.npy' is the only file name with a capital X; confirm it
# matches the file on disk, since that matters on case-sensitive file systems.
x_train, y_train = (np.load(path) for path in ('../data/segments/X_train.npy',
                                               '../data/segments/y_train.npy'))
x_test, y_test = (np.load(path) for path in ('../data/segments/x_test.npy',
                                             '../data/segments/y_test.npy'))
x_valid, y_valid = (np.load(path) for path in ('../data/segments/x_valid.npy',
                                               '../data/segments/y_valid.npy'))


In [6]:
# Cut each split down to the three alternative predictor sets. Grouped by split
# here; every slice is the same `array[:, index_list]` column selection.
x_predictors_sum_train, x_predictors_mean_train, x_predictors_rn_train = (
    x_train[:, cols] for cols in (predictors_sum, predictors_mean, predictors_rn))

x_predictors_sum_test, x_predictors_mean_test, x_predictors_rn_test = (
    x_test[:, cols] for cols in (predictors_sum, predictors_mean, predictors_rn))

x_predictors_sum_valid, x_predictors_mean_valid, x_predictors_rn_valid = (
    x_valid[:, cols] for cols in (predictors_sum, predictors_mean, predictors_rn))


## Data Exploration

In [None]:
# Dimensions of the training feature matrix.
np.shape(x_train)

In [None]:
# Append the training labels as one extra, final column so that features and
# class can be filtered together in the cells below.
# reshape(-1, 1) lets NumPy infer the row count; the previous hard-coded
# reshape(816, 1) raised as soon as the training split had a different size.
foo = np.hstack([x_train, y_train.reshape(-1, 1)])

In [None]:
# Split the combined feature+label matrix into one sub-array per class.
# The label is the column appended last when `foo` was built, so address it as
# the final column instead of the magic index 21 (which is only correct while
# the feature matrix has exactly 21 columns).
labels = foo[:, -1]
zeros = foo[labels == 0]
ones = foo[labels == 1]
twos = foo[labels == 2]
threes = foo[labels == 3]

In [None]:
# Number of training rows in each class (0, 1, 2, 3).
tuple(len(group) for group in (zeros, ones, twos, threes))

In [None]:
# Column-wise summary statistics for the class-2 rows.
sc.describe(twos, axis=0)

### Kernels

In [7]:
#Linear Kernel
bool_clf = OneVsRestClassifier(svm.SVC(kernel='linear', C=1.0, class_weight='balanced'))
bool_clf.fit(x_predictors_sum_train, y_train)
sumlinear_y_pred = bool_clf.predict(x_predictors_sum_test)
right=1.0*(sumlinear_y_pred==np.asarray(y_test)).sum()/len(y_test)

print "we successfully(OS) predict the {} percent of data using a linear kernel".format((right)*100)
os.system('say "First Linear Done"');

we successfully(OS) predict the 36.4303178484 percent of data using a linear kernel


In [8]:
#Linear Kernel
bool_clf = OneVsRestClassifier(svm.SVC(kernel='linear', C=1.0, class_weight='balanced'))
bool_clf.fit(x_predictors_mean_train, y_train)
linear_y_pred = bool_clf.predict(x_predictors_mean_test)
right=1.0*(linear_y_pred==np.asarray(y_test)).sum()/len(y_test)

print "we successfully(OS) predict the {} percent of data using a linear kernel".format((right)*100)
os.system('say "First Linear Done"');

we successfully(OS) predict the 27.3838630807 percent of data using a linear kernel


In [9]:
#Linear Kernel
bool_clf = OneVsRestClassifier(svm.SVC(kernel='linear', C=1.0, class_weight='balanced'))
bool_clf.fit(x_predictors_rn_train, y_train)
linear_y_pred = bool_clf.predict(x_predictors_rn_test)
right=1.0*(linear_y_pred==np.asarray(y_test)).sum()/len(y_test)

print "we successfully(OS) predict the {} percent of data using a linear kernel".format((right)*100)
os.system('say "First Linear Done"');

we successfully(OS) predict the 41.075794621 percent of data using a linear kernel


In [11]:
# Distinct classes in the training labels vs. distinct classes the most recent
# linear model actually predicted.
tuple(pd.unique(values) for values in (y_train, linear_y_pred))

(array([2, 0, 1, 3]), array([1]))

In [None]:
# RBF kernel with gamma = auto (1/4 in this case)
bool_rbf_clf = OneVsRestClassifier(svm.SVC(kernel='rbf', C=1.0, class_weight='balanced'))
bool_rbf_clf.fit(x_predictors_train, y_train)
rbf_y_pred = bool_rbf_clf.predict(x_predictors_test)

right=1.0*(rbf_y_pred==np.asarray(y_test)).sum()/len(y_test)

print "we successfully(OS) predict the {} percent of data using a rbf kernel".format((right)*100)
os.system('say "First RBF Done"');

In [None]:
# RBF kernel with gamma = 0.5
bool_rbf_high_clf = OneVsRestClassifier(svm.SVC(kernel='rbf', C=1.0, gamma = 0.5, class_weight='balanced'))
bool_rbf_high_clf.fit(x_predictors_train, y_train)
rbf_high_y_pred = bool_rbf_high_clf.predict(x_predictors_test)

right=1.0*(rbf_high_y_pred==np.asarray(y_test)).sum()/len(y_test)
print "we successfully(OS) predict the {} percent of data using a rbf kernel".format((right)*100)
os.system('say "First RBF Done"');

In [None]:
# Polynomial-kernel SVM (one-vs-rest, balanced class weights).
# The previous version fitted on `x_predictors_train` / `x_predictors_test`,
# which are not defined anywhere in this notebook (NameError on a fresh
# kernel). The rn_mean_* set is used instead, matching the preceding
# linear-kernel cell -- NOTE(review): confirm the intended predictors.
bool_poly_clf = OneVsRestClassifier(svm.SVC(kernel='poly', C=1.0, class_weight='balanced'))
bool_poly_clf.fit(x_predictors_rn_train, y_train)
poly_y_pred = bool_poly_clf.predict(x_predictors_rn_test)

# Out-of-sample accuracy: share of test labels predicted exactly.
hits = (poly_y_pred == np.asarray(y_test)).sum()
right = 1.0 * hits / len(y_test)

# Single-argument print(...) works on Python 2 and 3 alike.
print("we successfully(OS) predict the {} percent of data using a poly kernel".format(right * 100))

In [None]:
# Decision regions of a polynomial-kernel SVM in the plane of two predictors.
#
# The previous cell was pasted from the scikit-learn iris example and could not
# run: contourf() received the feature matrix and the classifier object instead
# of a mesh grid and predictions, and X, y, xx, yy, titles and i were never
# defined. A 2-D region plot needs a model that takes exactly two inputs, so a
# classifier with the same settings as the poly-kernel model is fitted on the
# two plotted columns only.
# NOTE(review): the choice of the two features is an assumption -- adjust as needed.
feat_x, feat_y = 'rn_mean_isloud', 'rn_mean_isdark'
plane_train = x_train[:, [columns.index(feat_x), columns.index(feat_y)]]

plane_clf = OneVsRestClassifier(svm.SVC(kernel='poly', C=1.0, class_weight='balanced'))
plane_clf.fit(plane_train, y_train)

# Mesh covering the observed range plus a 5% margin; a constant column gets a
# unit margin so the grid never collapses to a single point.
lower, upper = plane_train.min(axis=0), plane_train.max(axis=0)
span = upper - lower
margin = np.where(span > 0, 0.05 * span, 1.0)
xx, yy = np.meshgrid(np.linspace(lower[0] - margin[0], upper[0] + margin[0], 200),
                     np.linspace(lower[1] - margin[1], upper[1] + margin[1], 200))

# Predict the class at every mesh point and reshape back onto the grid.
Z = plane_clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

fig, ax = plt.subplots()
# Put the result into a color plot
ax.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
# Plot also the training points
ax.scatter(plane_train[:, 0], plane_train[:, 1], c=y_train, cmap=plt.cm.Paired)
ax.set_xlabel(feat_x)
ax.set_ylabel(feat_y)
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())
ax.set_title('SVC with polynomial kernel')

plt.show()


## Parameter Tuning

In [None]:
# The previous version referenced `x_predictors_train` / `x_predictors_valid`,
# which are not defined anywhere in this notebook; the rn_mean_* set is used
# instead, matching the most recent linear-kernel cell.
# NOTE(review): confirm the intended predictor set.

# --- 1) Grid search over C for a linear kernel (5-fold CV on the training split) ---
tuned_parameters = [{'kernel': ['linear'], 'C': [0.1, 1, 10, 100, 1000]}]

validate_clf = GridSearchCV(svm.SVC(C=1, class_weight='balanced'), tuned_parameters, cv=5,
                            scoring='recall_weighted')
validate_clf.fit(x_predictors_rn_train, y_train)

# A bare expression in the middle of a cell is never displayed, so print it.
print("Best grid-search parameters: {}".format(validate_clf.best_params_))

# --- 2) Manual sweep of C, scored on the held-out validation split ---
# Candidates are exp(-5) ... exp(50), evenly spaced in log space.
# NOTE(review): exp(50) is roughly 5e21; fits at the top of this range may be
# very slow -- consider narrowing the sweep.
C = [math.exp(exponent) for exponent in np.linspace(-5, 50, 10)]
OS_validation = []
for c in C:
    clf = svm.SVC(kernel='linear', C=c)
    clf.fit(x_predictors_rn_train, y_train)
    right = 1.0 * (clf.predict(x_predictors_rn_valid) == np.asarray(y_valid)).sum() / len(y_valid)
    OS_validation.append(right)

# Column 0 = C, column 1 = validation accuracy. When several C values tie for
# the best accuracy, take the middle one of the tied rows. `//` keeps the row
# index an integer on Python 3 as well (plain `/` only floored on Python 2).
temp = pd.DataFrame([C, OS_validation]).T
best_rows = temp.loc[temp.iloc[:, 1] == temp.iloc[:, 1].max()]
ind = len(best_rows) // 2
C_opt = best_rows.iloc[ind, 0]

# Plot accuracy against log10(C) for a readable axis. `C` itself is left
# untouched, and the module is addressed through its imported alias `pl`
# (`pylab` is not a bound name in this notebook).
log10_C = [math.log(value, 10) for value in C]
pl.plot(log10_C, OS_validation, 'b', label='validation accuracy')
pl.legend(loc='upper right')
pl.ylabel('Accuracy')
pl.xlabel('log(C)')
pl.show()

print("The optimal C we found is:{}".format(C_opt))