# Support Vector Machines

Use SVM to seperate the predict land use type

In [7]:
import pandas as pd 
import numpy as np
import pylab as pl
from sklearn import svm
from sklearn.grid_search import GridSearchCV
import math

pl.style.use('fivethirtyeight')
pl.rcParams['figure.figsize'] = (16,16)

In [2]:
columns = ['ID', 'GPS_DATETIMESTAMP', 'GPS_LAT', 'GPS_LON', 'GPS_Speed', 'GPS_Alt', 'GPS_Sats', 'GPS_Fix', 'GPS_Quality', 'AMB_Temp', 'AMB_Humd', 'AMB_Lux', 'AMB_Snd', 'AMB_SndMin', 'AMB_SndMax', 'AMB_SndMea', 'RDQ_AcX', 'RDQ_AcXMin', 'RDQ_AcXMax', 'RDQ_AcXMea', 'RDQ_AcY', 'RDQ_AcYMin', 'RDQ_AcYMax', 'RDQ_AcYMea', 'RDQ_AcZ', 'RDQ_AcZMin', 'RDQ_AcZMax', 'RDQ_AcZMea', 'sensor', 'timestamp', 'hourOfDay', 'minuteOfDay', 'minuteStretched', 'LU05_DESC', 'LUCODE', 'merge_key', 'day', 'sensor_num', 'sensor_day', 'keep_SndMean', 'is_loud', 'is_dark', 'acel', 'bumpflag', 'latlon_match']
predictors = [columns.index('is_loud'), columns.index('is_dark'), columns.index('bumpflag')]

x_train = np.load('../data/x_train.npy')
y_train = np.load('../data/y_train.npy')
x_test = np.load('../data/x_test.npy')
y_test = np.load('../data/y_test.npy')
x_valid = np.load('../data/x_valid.npy')
y_valid = np.load('../data/y_valid.npy')


In [3]:
x_predictors_train = x_train[:,predictors]
x_predictors_test = x_test[:,predictors]
x_predictors_valid = x_valid[:,predictors]


### Training over several types of kernels

In [4]:
clf = svm.SVC(kernel='linear',C=1.0) 
clf.fit(x_predictors_train, y_train)

right=1.0*(clf.predict(x_predictors_test)==np.asarray(y_test)).sum()/len(y_test)

print "we successfully(OS) predict the {} percent of data using a linear kernel".format((right)*100)

we successfully(OS) predict the 65.4633818315 percent of data using a linear kernel


In [5]:
rbf_clf = svm.SVC(kernel='rbf',C=1.0) 
rbf_clf.fit(x_predictors_train, y_train)

right=1.0*(rbf_clf.predict(x_predictors_test)==np.asarray(y_test)).sum()/len(y_test)

print "we successfully(OS) predict the {} percent of data using a rbf kernel".format((right)*100)

we successfully(OS) predict the 65.5061743489 percent of data using a rbf kernel


In [6]:
poly_clf = svm.SVC(kernel='poly',C=1.0, degree = 3) 
poly_clf.fit(x_predictors_train, y_train)

right=1.0*(poly_clf.predict(x_predictors_test)==np.asarray(y_test)).sum()/len(y_test)

print "we successfully(OS) predict the {} percent of data using a poly kernel".format((right)*100)

we successfully(OS) predict the 65.5061743489 percent of data using a poly kernel


### Tuning  parameters


#### Linear Kernal

In [8]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

In [11]:
validate_clf = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=5,
                       scoring='%s_weighted' % 'recall')
validate_clf.fit(x_predictors_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'kernel': ['rbf'], 'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}, {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='recall_weighted',
       verbose=0)

In [12]:
validate_clf.best_params_

{'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}

In [13]:
rbf_clf = svm.SVC(kernel='rbf',C=1.0) 
rbf_clf.fit(x_predictors_train, y_train)

right=1.0*(rbf_clf.predict(x_predictors_test)==np.asarray(y_test)).sum()/len(y_test)

print "we successfully(OS) predict the {} percent of data using a rbf kernel".format((right)*100)

we successfully(OS) predict the 65.4633818315 percent of data using a rbf kernel


In [None]:
C = np.linspace(-5,50,10)
C=[math.exp(i) for i in C]
OS_validation=[]
for c in C:
    clf = svm.SVC(kernel='linear',C=c) 
    clf.fit(x_predictors_train, y_train)
    right=1.0*(clf.predict(x_predictors_valid)==np.asarray(y_valid)).sum()/len(y_valid)
    OS_validation.append(right)
    
temp=pd.DataFrame([C,OS_validation]).T
ind=len(temp.loc[temp.iloc[:,1]==temp.iloc[:,1].max()])/2 
C_opt=temp.loc[temp.iloc[:,1]==temp.iloc[:,1].max()].iloc[ind,0]


C=[math.log(y,10) for y in C]# for a better graph
pylab.plot(C,OS_validation,'b',)
pylab.legend(loc='upper right')
pylab.ylabel('Accuracy')
pylab.xlabel('log(C)')
pylab.show()

print("The optimal C we found is:{}".format(C_opt)) 
