In [80]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC, SVR
from sklearn.metrics import accuracy_score

## Load data

In [2]:
# Load the pre-split training and test arrays from the working directory.
x_train, y_train, x_test, y_test = (
    np.load(path)
    for path in ("x_train.npy", "y_train.npy", "x_test.npy", "y_test.npy")
)

In [3]:
# Shape is (n_samples, n_features): 550 samples, each with 300 features
print(x_train.shape)

(550, 300)


In [4]:
# Binary classification: the training labels take only the values 0 and 1
print(np.unique(y_train))

[0 1]


## Question 1
K-fold data partition: Implement the K-fold cross-validation function. Your function should take K as an argument and return a list of K elements (i.e. len(list) should equal K). Each element is itself a list with two parts: the first part contains the indices of all training folds (e.g. Fold 2 to Fold 5 in split 1), and the second part contains the indices of the validation fold (e.g. Fold 1 in split 1).

In [81]:
def cross_validation(x_train, y_train, k=5, seed=None):
    """Partition the sample indices of ``x_train`` into ``k`` shuffled folds.

    Parameters
    ----------
    x_train : sequence or ndarray
        Training samples. Only ``len(x_train)`` is used.
    y_train : sequence or ndarray
        Training labels. Not used by the split; kept so the call signature
        stays the same.
    k : int, default 5
        Number of folds. Must satisfy ``2 <= k <= len(x_train)``.
    seed : int or None, default None
        Seed for the shuffle. ``None`` draws fresh entropy on every call;
        pass an integer to get the same split again.

    Returns
    -------
    list
        ``k`` elements of the form ``[train_indices, validation_indices]``,
        both 1-D integer arrays. Every index appears in exactly one
        validation fold. When ``len(x_train)`` is not divisible by ``k``,
        fold sizes differ by at most one (``np.array_split`` semantics).

    Raises
    ------
    ValueError
        If ``k`` is smaller than 2 or larger than the number of samples.
    """
    n_samples = len(x_train)
    if not 2 <= k <= n_samples:
        raise ValueError(
            "k must be between 2 and the number of samples (%d), got %r"
            % (n_samples, k)
        )

    # A local generator keeps the shuffle from touching NumPy's global
    # random state, which a caller may have seeded for other purposes.
    rng = np.random.default_rng(seed)
    indices = rng.permutation(n_samples)

    # Keep the folds as a plain list: they can have unequal lengths, and
    # stacking unequal arrays with np.array() is an error on current NumPy.
    folds = np.array_split(indices, k)

    k_fold = []
    for i in range(k):
        # Training indices are every fold except the i-th, in fold order.
        train_idx = np.concatenate(folds[:i] + folds[i + 1:])
        k_fold.append([train_idx, folds[i]])
    return k_fold

In [86]:
kfold_data = cross_validation(x_train, y_train, k=10)
assert len(kfold_data) == 10 # should contain 10 folds of data
assert len(kfold_data[0]) == 2 # each element should contain the training folds and the validation fold
assert kfold_data[0][1].shape[0] == 55 # each validation fold should hold (number of training samples) / K samples

## example

In [56]:
from sklearn.model_selection import KFold

# Reference behaviour: scikit-learn's shuffled 5-fold split of 20 samples.
X = np.arange(20)
kf = KFold(n_splits=5, shuffle=True)

# Collect every [train, validation] index pair first, then report them.
kfold_data = [[train_index, val_index] for train_index, val_index in kf.split(X)]
for i, (train_index, val_index) in enumerate(kfold_data):
    print("Split: %s, Training index: %s, Validation index: %s" % (i+1, train_index, val_index))

Split: 1, Training index: [ 1  2  4  5  6  7  8  9 10 11 12 13 15 16 18 19], Validation index: [ 0  3 14 17]
Split: 2, Training index: [ 0  1  3  4  5  6  7  8  9 12 13 14 15 16 17 18], Validation index: [ 2 10 11 19]
Split: 3, Training index: [ 0  2  3  4  5  6  7  8 10 11 14 15 16 17 18 19], Validation index: [ 1  9 12 13]
Split: 4, Training index: [ 0  1  2  3  4  7  9 10 11 12 13 14 16 17 18 19], Validation index: [ 5  6  8 15]
Split: 5, Training index: [ 0  1  2  3  5  6  8  9 10 11 12 13 14 15 17 19], Validation index: [ 4  7 16 18]


In [57]:
assert len(kfold_data) == 5 # should contain 5 folds of data
assert len(kfold_data[0]) == 2 # each element should contain the indices of the training folds and of the validation fold
assert kfold_data[0][1].shape[0] == 4 # each validation fold should hold (number of samples) / K samples

## Question 2
Use sklearn.svm.SVC to train a classifier on the provided training set, and conduct a grid search over “C” and “gamma” to find the best parameters by cross-validation.

In [31]:
# Example RBF-kernel SVC with fixed hyperparameters; the grid-search cell
# below rebinds `clf` for every (C, gamma) candidate.
clf = SVC(C=1.0, kernel='rbf', gamma=0.01)

In [126]:
## your code
# Grid search over C and gamma for an RBF-kernel SVC, scored by the mean
# validation accuracy across 5 cross-validation folds.
kfold_data = cross_validation(x_train, y_train, k=5)

# Candidate values, spaced by powers of ten.
C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
gamma = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# average_scores[i][j] is the mean validation accuracy for C[i], gamma[j].
average_scores = np.zeros((len(C), len(gamma)))
# Start below any reachable accuracy so the first candidate is always
# recorded and best_model cannot stay None.
best_score = -np.inf
best_model = None
best_hyperparameter = [0, 0]

for i, c_value in enumerate(C):
    for j, gamma_value in enumerate(gamma):
        score = 0

        # The same fold partition is reused for every candidate so that
        # their scores are comparable.
        for training_idx, validation_idx in kfold_data:
            clf = SVC(C=c_value, kernel='rbf', gamma=gamma_value)
            clf.fit(x_train[training_idx], y_train[training_idx])
            score += clf.score(x_train[validation_idx], y_train[validation_idx])

        average_scores[i][j] = score / len(kfold_data)
        if average_scores[i][j] > best_score:
            best_score = average_scores[i][j]
            best_hyperparameter = [c_value, gamma_value]
            # Keep a fresh, unfitted estimator with the winning parameters
            # rather than `clf`, which was fitted on the last fold only.
            # It is meant to be fitted on the whole training set afterwards.
            best_model = SVC(C=c_value, kernel='rbf', gamma=gamma_value)

In [127]:
# Report the winning [C, gamma] pair and its mean cross-validation accuracy
print(best_hyperparameter, best_score)

[10, 0.001] 0.8854545454545455


In [128]:
# Refit the selected model on the whole training set, since the
# cross-validation fits only ever saw a subset of it.
best_model.fit(x_train, y_train)

y_pred = best_model.predict(x_test)
# accuracy_score takes (y_true, y_pred); accuracy is symmetric, so the
# value is unchanged, but the documented order keeps the call correct if
# the metric is ever swapped for an asymmetric one.
print("Accuracy score: ", accuracy_score(y_test, y_pred))

Accuracy score:  0.8958333333333334


## Question 3
Plot the grid search results of your SVM. The x and y axes represent the hyperparameters “gamma” and “C”, respectively, and the color represents the average score over the validation folds.
Your results should look like the reference image below ![image](https://miro.medium.com/max/1296/1*wGWTup9r4cVytB5MOnsjdQ.png) 

## Question 4
Train your SVM model with the best parameters you found in Question 2 on the whole training set, and evaluate its performance on the test set.

In [None]:
# Question 4: train on the whole training set with the best (C, gamma)
# found by the grid search, then report accuracy on the held-out test set.
# Building and fitting the model here keeps the cell from depending on an
# earlier cell having already fitted `best_model`.
best_C, best_gamma = best_hyperparameter
final_model = SVC(C=best_C, kernel='rbf', gamma=best_gamma)
final_model.fit(x_train, y_train)

y_pred = final_model.predict(x_test)
# accuracy_score takes (y_true, y_pred).
print("Accuracy score: ", accuracy_score(y_test, y_pred))