# Support Vector Machines

In [None]:
import math
import operator

import numpy as np
import pandas as pd
import pylab as pl
import scipy as sp
import sklearn
from sklearn import feature_selection
from sklearn import preprocessing
from sklearn import svm
from sklearn.svm import SVC

try:
    # Legacy module locations (scikit-learn < 0.20). Tried first so that cells
    # written against the old KFold / GridSearchCV API keep working there.
    from sklearn.cross_validation import KFold
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV
except ImportError:
    # The legacy modules were removed in scikit-learn 0.20; the same names
    # now live in sklearn.model_selection.
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import KFold
    from sklearn.model_selection import train_test_split

%matplotlib inline  


## Data

Use the same data as the KNN notebook. Read and explore the data.

In [None]:
# Read german.csv into a DataFrame. header=None tells pandas the file has no
# header row, so the columns get integer labels instead of names.
data = pd.read_csv(filepath_or_buffer='german.csv', header=None)
# Last expression of the cell: displays (number of rows, number of columns).
data.shape

In [None]:
# Display pandas' per-column summary statistics for the loaded data.
data.describe()

In [None]:
# Hold out part of the rows for testing (70% of the rows go to training).
# No random seed is fixed here, so the split differs from run to run.
train, test = train_test_split(data, train_size=0.7)

# Columns 0..23 are used as the feature matrix, column 24 as the label vector.
train_X, train_Y = np.array(train)[:, 0:24], np.array(train)[:, 24]
test_X, test_Y = np.array(test)[:, 0:24], np.array(test)[:, 24]


We can redefine the accuracy measure:

In [None]:
def getAccuracy(testSet, predictions):
    """Return the percentage of entries in testSet matched by predictions.

    Entries are compared position by position using ``==``. Only the first
    ``len(testSet)`` predictions are looked at.

    Args:
        testSet: indexable sequence of true labels.
        predictions: indexable sequence of predicted labels, at least as
            long as ``testSet``.

    Returns:
        A float in the range 0.0 to 100.0.

    Raises:
        ZeroDivisionError: if ``testSet`` is empty.
        IndexError: if ``predictions`` is shorter than ``testSet``.
    """
    total = len(testSet)
    # Index into predictions explicitly (rather than zip) so that a too-short
    # predictions sequence is reported instead of silently truncated.
    hits = sum(1 for idx in range(total) if testSet[idx] == predictions[idx])
    return (float(hits) / float(total)) * 100.0

## Linear SVM

Let's start with a linear kernel

In [None]:
# Build a support vector classifier with a linear kernel
# (every other setting is left at the library default)
clf = svm.SVC(kernel='linear')

# Train it on the raw (unscaled) training features
clf.fit(train_X, train_Y)

# Predict labels for the held-out test rows
predictions = clf.predict(test_X)

# Percentage of test rows classified correctly.
# The message is built by concatenation, exactly like the other cells in this
# notebook, so the output format is the same everywhere (a multi-argument
# print inserts extra spaces, and prints a tuple under Python 2).
accuracy = getAccuracy(test_Y, predictions)
print('Accuracy: ' + repr(accuracy) + '%')

What happens if we auto-scale (standardise) our data?

In [None]:
# Create the scaler and learn the per-feature statistics from the training
# features only (cast to float first).
scaler = preprocessing.StandardScaler()
scaler.fit(train_X.astype(float))

# Apply the transformation learned on the training set to both splits, so the
# test data is scaled with the training statistics rather than its own.
train_Xscaled = scaler.transform(train_X.astype(float))
test_Xscaled = scaler.transform(test_X.astype(float))

In [None]:
# Same linear-kernel classifier as before, now trained on the standardised
# features; fit() returns the estimator itself, so it can be chained.
clf = svm.SVC(kernel='linear').fit(train_Xscaled, train_Y)

# Score the standardised test features and report the percentage correct
predictions = clf.predict(test_Xscaled)
accuracy = getAccuracy(test_Y, predictions)

print('Accuracy: ' + repr(accuracy) + '%')

## RBF Kernel

Let's do the same steps with the rbf kernel.

We begin once more with the raw data.

In [None]:
# RBF kernel with C=1.0 and gamma = 1 / n_features.
# The old spelling gamma=0.0 meant "use 1 / n_features"; current scikit-learn
# rejects a gamma of 0 and expects gamma='auto' for the same setting.
clf = svm.SVC(kernel='rbf', C=1.0, gamma='auto')

# Train on the raw (unscaled) training features
clf.fit(train_X, train_Y)

# Predict the held-out test rows and report the percentage classified correctly
predictions = clf.predict(test_X)

accuracy = getAccuracy(test_Y, predictions)
print('Accuracy: ' + repr(accuracy) + '%')

And now with the scaled data:

In [None]:
# RBF kernel with C=1.0 and gamma = 1 / n_features, on the standardised data.
# gamma='auto' replaces the legacy gamma=0.0, which current scikit-learn
# rejects; both select 1 / n_features.
clf = svm.SVC(kernel='rbf', C=1.0, gamma='auto')
clf.fit(train_Xscaled, train_Y)

# Predict the standardised test rows and report the percentage classified correctly
predictions = clf.predict(test_Xscaled)

accuracy = getAccuracy(test_Y, predictions)
print('Accuracy: ' + repr(accuracy) + '%')

## Tuning the RBF kernel hyperparameters

What if we now optimise the hyperparameters using a coarse grid search? (This may take a while to run.)

In [None]:
# sklearn.cross_validation and sklearn.grid_search were removed in
# scikit-learn 0.20. Import the maintained equivalents here so this cell uses
# the current KFold / GridSearchCV API regardless of what was imported earlier.
from sklearn.model_selection import GridSearchCV, KFold

# Candidate values on a coarse log2 grid:
# gamma = 2^-15, 2^-13, ..., 2^3 and C = 2^-5, 2^-3, ..., 2^13
gamma_range = 2. ** np.arange(-15, 5, step=2)
C_range     = 2. ** np.arange(-5, 15, step=2)

param_grid = dict(gamma=gamma_range, C=C_range)

# Exhaustive search over every (C, gamma) pair, scored with 5-fold
# cross-validation on consecutive (unshuffled) blocks of the training set.
# SVC() uses its default kernel, which is RBF.
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=KFold(n_splits=5))

grid.fit(train_Xscaled, train_Y)

# Report the winning values as exponents of 2 so they match the heatmap axes
bestG = np.log2(grid.best_params_['gamma'])
bestC = np.log2(grid.best_params_['C'])
print("The best parameters are: gamma=", bestG,
      " and Cost=", bestC)

# cv_results_ replaces the removed grid_scores_ attribute
score_dict = grid.cv_results_

# Mean cross-validated score of every candidate. Candidates are enumerated
# with the parameter names sorted and the last name varying fastest, i.e. C is
# the slow axis and gamma the fast one, so the flat list reshapes to
# (len(C_range), len(gamma_range)) with rows = C and columns = gamma.
scores = np.array(score_dict['mean_test_score'])
scores = scores.reshape(len(C_range), len(gamma_range))


# Make a heatmap with the performance
pl.figure(figsize=(10, 6))
pl.subplots_adjust(left=0.15, right=0.95, bottom=0.15, top=0.95)
# origin='lower' puts row 0 (smallest C) at the bottom; the previous value
# 'higher' is not a valid matplotlib origin. The colormap is passed by name
# because pl.cm.get_cmap is no longer available in recent matplotlib.
pl.imshow(scores, interpolation='nearest', origin='lower', cmap='jet_r')
pl.xlabel('gamma (log2)')
pl.ylabel('Cost (log2)')
cbar = pl.colorbar()
pl.xticks(np.arange(len(gamma_range)), np.log2(gamma_range))
pl.yticks(np.arange(len(C_range)), np.log2(C_range))
pl.show()

Let's use the selected parameters

In [None]:
# Train an RBF classifier on the whole scaled training set using the (C, gamma)
# pair selected by the grid search; best_params_ holds exactly the searched
# parameters, so it can be unpacked straight into the constructor.
clf = svm.SVC(kernel='rbf', **grid.best_params_)
clf.fit(train_Xscaled, train_Y)

# Evaluate on the scaled test set and report the percentage classified correctly
predictions = clf.predict(test_Xscaled)
accuracy = getAccuracy(test_Y, predictions)

print('Accuracy: ' + repr(accuracy) + '%')

Which kernel gave the best accuracy? In which case?

You can repeat the parameter tuning with other kernels.
