# Support Vector Machines

Use SVM to seperate the predict land use type

In [1]:
import pandas as pd 
import numpy as np
import pylab as pl
from sklearn import datasets
from sklearn.multiclass import OneVsRestClassifier
from sklearn import svm
from sklearn.grid_search import GridSearchCV
from sklearn import preprocessing
import math
import warnings
import os


warnings.filterwarnings("ignore")

pl.style.use('fivethirtyeight')
pl.rcParams['figure.figsize'] = (16,16)

In [2]:
columns = ['ID', 'GPS_DATETIMESTAMP', 'GPS_LAT', 'GPS_LON', 'GPS_Speed',\
           'GPS_Alt', 'GPS_Sats', 'GPS_Fix', 'GPS_Quality', 'AMB_Temp', \
           'AMB_Humd', 'AMB_Lux', 'AMB_Snd', 'AMB_SndMin', 'AMB_SndMax',\
           'AMB_SndMea', 'RDQ_AcX', 'RDQ_AcXMin', 'RDQ_AcXMax', 'RDQ_AcXMea',\
           'RDQ_AcY', 'RDQ_AcYMin', 'RDQ_AcYMax', 'RDQ_AcYMea', 'RDQ_AcZ',\
           'RDQ_AcZMin', 'RDQ_AcZMax', 'RDQ_AcZMea', 'sensor', 'timestamp',\
           'hourOfDay', 'minuteOfDay', 'minuteStretched', 'LU05_DESC',\
           'LUCODE', 'merge_key', 'day', 'sensor_num', 'sensor_day',\
           'keep_SndMean', 'is_loud', 'is_dark', 'acel', 'bumpflag', 'latlon_match']
predictors = [columns.index('is_loud'), columns.index('is_dark'), columns.index('bumpflag')]

x_train = np.load('../data/x_train.npy')
y_train = np.load('../data/y_train.npy')
x_test = np.load('../data/x_test.npy')
y_test = np.load('../data/y_test.npy')
x_valid = np.load('../data/x_valid.npy')
y_valid = np.load('../data/y_valid.npy')


In [3]:
x_predictors_train = x_train[:,predictors]
x_predictors_test = x_test[:,predictors]
x_predictors_valid = x_valid[:,predictors]


## Prediction using flags
    - is_loud
    - is_dark
    - is_bump

### Training over several types of kernels

In [4]:
clf = OneVsRestClassifier(svm.SVC(kernel='linear', C=1.0, class_weight='balanced'))
clf.fit(x_predictors_train, y_train)

right=1.0*(clf.predict(x_predictors_test)==np.asarray(y_test)).sum()/len(y_test)

print "we successfully(OS) predict the {} percent of data using a linear kernel".format((right)*100)

we successfully(OS) predict the 58.830541631 percent of data using a linear kernel


In [5]:
rbf_clf = OneVsRestClassifier(svm.SVC(kernel='rbf', C=1.0, class_weight='balanced'))
rbf_clf.fit(x_predictors_train, y_train)

right=1.0*(rbf_clf.predict(x_predictors_test)==np.asarray(y_test)).sum()/len(y_test)

print "we successfully(OS) predict the {} percent of data using a rbf kernel".format((right)*100)

we successfully(OS) predict the 58.530994009 percent of data using a rbf kernel


In [6]:
poly_clf = OneVsRestClassifier(svm.SVC(kernel='poly', C=1.0, class_weight='balanced'))
poly_clf.fit(x_predictors_train, y_train)

right=1.0*(poly_clf.predict(x_predictors_test)==np.asarray(y_test)).sum()/len(y_test)

print "we successfully(OS) predict the {} percent of data using a poly kernel".format((right)*100)

we successfully(OS) predict the 53.1116273383 percent of data using a poly kernel


In [7]:
os.system('say "Completed prediction for flags"');

## Prediction using standard normal variables
    - Sound
    - Light
    - Z axis acceleration

In [8]:
#transforming sound values to standard normal
std_scaler = preprocessing.StandardScaler()
std_norm_sound_train = std_scaler.fit_transform(x_train[:,columns.index('AMB_SndMea')])
std_norm_sound_valid = std_scaler.fit_transform(x_valid[:,columns.index('AMB_SndMea')])
std_norm_sound_test = std_scaler.fit_transform(x_test[:,columns.index('AMB_SndMea')])


In [9]:
#transforming light values to standard normal
std_norm_light_train = std_scaler.fit_transform(x_train[:,columns.index('AMB_Lux')])
std_norm_light_valid = std_scaler.fit_transform(x_valid[:,columns.index('AMB_Lux')])
std_norm_light_test = std_scaler.fit_transform(x_test[:,columns.index('AMB_Lux')])


In [10]:
#transforming z axis accelerometer values to standard normal
std_norm_bump_train = std_scaler.fit_transform(x_train[:,columns.index('RDQ_AcZMea')])
std_norm_bump_valid = std_scaler.fit_transform(x_valid[:,columns.index('RDQ_AcZMea')])
std_norm_bump_test = std_scaler.fit_transform(x_test[:,columns.index('RDQ_AcZMea')])


In [11]:
# Dropping sound flag (1st column) 
x_sound_predictors_train = x_predictors_train[:,1:]
x_sound_predictors_test = x_predictors_test[:,1:]
x_sound_predictors_valid = x_predictors_valid[:,1:]

# Adding sound std normal values
x_sound_predictors_train = np.insert(x_sound_predictors_train, 0,std_norm_sound_train, axis = 1)
x_sound_predictors_test = np.insert(x_sound_predictors_test, 0,std_norm_sound_test, axis = 1)
x_sound_predictors_valid = np.insert(x_sound_predictors_valid, 0,std_norm_sound_valid, axis = 1)
#TODO : Include accelerator values on all axes 

### Training over a linear kernel

In [12]:
# Test train and validation with std norm variables instead of flag

x_std_predictors_train = np.array([std_norm_sound_train, std_norm_bump_train, std_norm_light_train]).T
x_std_predictors_test = np.array([std_norm_sound_test, std_norm_bump_test, std_norm_light_test]).T
x_std_predictors_valid = np.array([std_norm_sound_valid, std_norm_bump_valid, std_norm_light_valid]).T


In [15]:
clf = OneVsRestClassifier(svm.SVC(kernel='linear', C=1.0, class_weight='balanced'))
clf.fit(x_std_predictors_train, y_train)

right=1.0*(clf.predict(x_std_predictors_test)==np.asarray(y_test)).sum()/len(y_test)

print "we successfully(OS) predict the {} percent of data using a linear kernel".format((right)*100)

os.system('say "Completed classification using standard normal variables"');

we successfully(OS) predict the 52.2129844724 percent of data using a linear kernel


## Tuning Parameters


In [14]:
'''
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

validate_clf = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=5,
                       scoring='%s_weighted' % 'recall')
validate_clf.fit(x_predictors_train, y_train)

validate_clf.best_params_

C = np.linspace(-5,50,10)
C=[math.exp(i) for i in C]
OS_validation=[]
for c in C:
    clf = svm.SVC(kernel='linear',C=c) 
    clf.fit(x_predictors_train, y_train)
    right=1.0*(clf.predict(x_predictors_valid)==np.asarray(y_valid)).sum()/len(y_valid)
    OS_validation.append(right)
    
temp=pd.DataFrame([C,OS_validation]).T
ind=len(temp.loc[temp.iloc[:,1]==temp.iloc[:,1].max()])/2 
C_opt=temp.loc[temp.iloc[:,1]==temp.iloc[:,1].max()].iloc[ind,0]


C=[math.log(y,10) for y in C]# for a better graph
pylab.plot(C,OS_validation,'b',)
pylab.legend(loc='upper right')
pylab.ylabel('Accuracy')
pylab.xlabel('log(C)')
pylab.show()

print("The optimal C we found is:{}".format(C_opt)) 
'''

'\ntuned_parameters = [{\'kernel\': [\'rbf\'], \'gamma\': [1e-3, 1e-4],\n                     \'C\': [1, 10, 100, 1000]},\n                    {\'kernel\': [\'linear\'], \'C\': [1, 10, 100, 1000]}]\n\nvalidate_clf = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=5,\n                       scoring=\'%s_weighted\' % \'recall\')\nvalidate_clf.fit(x_predictors_train, y_train)\n\nvalidate_clf.best_params_\n\nC = np.linspace(-5,50,10)\nC=[math.exp(i) for i in C]\nOS_validation=[]\nfor c in C:\n    clf = svm.SVC(kernel=\'linear\',C=c) \n    clf.fit(x_predictors_train, y_train)\n    right=1.0*(clf.predict(x_predictors_valid)==np.asarray(y_valid)).sum()/len(y_valid)\n    OS_validation.append(right)\n    \ntemp=pd.DataFrame([C,OS_validation]).T\nind=len(temp.loc[temp.iloc[:,1]==temp.iloc[:,1].max()])/2 \nC_opt=temp.loc[temp.iloc[:,1]==temp.iloc[:,1].max()].iloc[ind,0]\n\n\nC=[math.log(y,10) for y in C]# for a better graph\npylab.plot(C,OS_validation,\'b\',)\npylab.legend(loc=\'upper right\')\np