In [103]:
from sklearn import svm
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [104]:
# Read the label vectors and feature matrices from comma-separated text files.
csv_options = {"delimiter": ","}
y_train = np.loadtxt('y_train1-1.csv', **csv_options)
y_test = np.loadtxt('y_test1-1.csv', **csv_options)
X_train = np.loadtxt('X_train_binary.csv', **csv_options)
X_test = np.loadtxt('X_test_binary.csv', **csv_options)

In [105]:
# Tally how often each label value occurs in the training targets.
# A dict keeps first-seen order, so the printout follows the order in which
# the labels first appear in y_train.
cnt_class = {}
for y in y_train:
    cnt_class[y] = cnt_class.get(y, 0) + 1
# Take the denominator from the data rather than hard-coding it, so the
# frequencies stay correct if the training file changes size.
totalPoints = len(y_train)
for key in cnt_class:
    print(str(key) + " class frequency is " + '%.4f' % (cnt_class[key] / totalPoints))

1.0 class frequency is 0.5467
-1.0 class frequency is 0.4533


In [106]:
# Sample counts are the length of the leading axis of each feature matrix.
number_of_training = len(X_train)
number_of_test = len(X_test)
print("number of training data: ", number_of_training, sep="")
print("number of test data: ", number_of_test, sep="")


number of training data: 150
number of test data: 164


In [107]:
# Per-feature mean and standard deviation, estimated on the training set only.
mean_of_training = np.mean(X_train, axis=0)
standard_of_training = np.std(X_train, axis=0)
# A feature that is constant across the training set has zero standard
# deviation; dividing by it would fill that column with NaN/inf. Use a divisor
# of 1 for such columns so they are centred but left unscaled.
scale_of_training = np.where(standard_of_training == 0, 1.0, standard_of_training)
# Normalize both splits with the training statistics, so the test set is
# transformed exactly the way the model's inputs were.
norm_train = (X_train - mean_of_training) / scale_of_training
norm_test = (X_test - mean_of_training) / scale_of_training

In [108]:
# print("mean of original test data:")
# print(np.around(np.mean(X_test, axis=0),2))
# print("variance of original test data:")
# print(np.around(np.var(X_test, axis=0), 2))

In [116]:
# Per-feature mean and variance of the normalized test set, rounded to two
# decimals for display.
print("Mean of normalized features in the test data:")
print(norm_test.mean(axis=0).round(2))
print("Variance of normalized features in the test data:")
print(norm_test.var(axis=0).round(2))

Mean of normalized features in the test data:
[ 0.09  0.17 -0.06 -0.08 -0.04 -0.11 -0.1  -0.21  0.27  0.08  0.01  0.06
  0.01  0.    0.13  0.02  0.13  0.13  0.03  0.1   0.48  0.11  0.05 -0.12
  0.11  0.02 -0.1  -0.13 -0.18  0.01 -0.03  0.    0.2  -0.01 -0.08  0.17
  0.3   0.18  0.05 -0.02  0.08  0.22  0.04 -0.12 -0.03  0.1   0.12  0.1
 -0.07 -0.05 -0.13  0.04 -0.    0.01  0.23 -0.04  0.14  0.14  0.04 -0.01
 -0.06]
Variance of normalized features in the test data:
[ 1.93  7.28  0.79  0.74  0.86  0.98  1.07  2.88  2.97  1.48  1.09  1.14
  1.12  1.24  1.27  1.01  1.13  3.89  5.71  5.   54.28  1.44  1.02  0.97
  0.85  1.16  0.59  0.8   0.4   1.22  1.03  1.    1.03  1.07  0.82  4.91
 11.01  0.97  0.81  0.89  2.44  2.21  1.51  0.89  1.32  0.82  1.2   2.23
  1.22  0.93  1.19  1.31  1.39  0.86  1.94  1.03  1.07  1.21  1.74  1.87
  1.01]


In [119]:
# Candidate hyperparameters are powers of ten:
# C over 10^-2 .. 10^4 and gamma over 10^-4 .. 10^2.
C = [10**exponent for exponent in range(-2, 5)]
gamma = [10**exponent for exponent in range(-4, 3)]

params = [{"kernel": ["rbf"], "C": C, "gamma": gamma}]

# Exhaustive grid search, scored by accuracy under 5-fold cross-validation.
grid = GridSearchCV(estimator=svm.SVC(), param_grid=params, scoring='accuracy', cv=5)
grid.fit(norm_train, y_train)

# Pull the winning combination out of the fitted search object.
best_C, best_gamma = (grid.best_params_[name] for name in ("C", "gamma"))
print('the best C:', best_C)
print('the best gamma:', best_gamma)

the best C: 1
the best gamma: 0.01


In [111]:
# Refit an RBF-kernel SVM on the complete training set with the
# hyperparameters selected by the grid search.
model = svm.SVC(C=best_C, gamma=best_gamma, kernel='rbf')
model.fit(norm_train, y_train)

# Predict on both splits.
training_predictions = model.predict(norm_train)
test_predictions = model.predict(norm_test)

# Accuracy on each split; the reported error is its complement.
training_accurracy = accuracy_score(y_train, training_predictions)
test_accurracy = accuracy_score(y_test, test_predictions)

print('training error:', 1 - training_accurracy)
print('test error:', 1 - test_accurracy)

training error: 0.046666666666666634
test error: 0.20731707317073167


In [114]:
# Counts of bounded / free support vectors, keyed by the value of C used.
number_of_bounded = {}
number_of_free = {}

# Refit with C pushed far below and far above the cross-validated choice
# to see how the split between bounded and free support vectors changes.
C = [0.001, 10000]

for c in C:
    model_c = svm.SVC(kernel='rbf', C=c, gamma=best_gamma)
    model_c.fit(norm_train, y_train)
    # Per the scikit-learn docs, dual_coef_ stores the dual coefficients of the
    # support vectors multiplied by their targets; the absolute value recovers
    # the multipliers alpha.
    alphas = np.abs(model_c.dual_coef_)
    # Keep only strictly positive multipliers (boolean indexing also flattens
    # the array to 1-D).
    valid_alphas = alphas[alphas > 0]
    # A bounded support vector has alpha at the upper limit C. Compare with a
    # tight relative tolerance instead of exact float equality, and define the
    # free support vectors as the complement so the two counts always add up
    # to the number of support vectors.
    at_bound = np.isclose(valid_alphas, c, rtol=1e-9, atol=0.0)
    number_of_bounded_c = int(np.count_nonzero(at_bound))
    number_of_free_c = int(np.count_nonzero(~at_bound))
    number_of_bounded[c] = number_of_bounded_c
    number_of_free[c] = number_of_free_c

print('number of bounded SV:', number_of_bounded)
print('number of free SV:', number_of_free)

number of bounded SV: {0.001: 136, 10000: 0}
number of free SV: {0.001: 0, 10000: 72}
