# search for the best parameters

In [2]:
import numpy as np
import pandas as pd
import scipy.io as sio
from sklearn import metrics
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit

In [6]:
# Read the MATLAB file into a dict keyed by variable name, then list
# the names it contains so the arrays to use can be picked out.
mat = sio.loadmat(file_name='./data/ex6data3.mat')
print(mat.keys())

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y', 'yval', 'Xval'])


In [14]:
# Training set as one frame: the two columns of `X` become x1/x2 and the
# labels from `y` are attached as column 'y'.
data_train = pd.DataFrame(mat['X'], columns=['x1', 'x2']).assign(y=mat['y'])
data_train

Unnamed: 0,x1,x2,y
0,-0.158986,0.423977,1
1,-0.347926,0.470760,1
2,-0.504608,0.353801,1
3,-0.596774,0.114035,1
4,-0.518433,-0.172515,1
...,...,...,...
206,-0.399885,-0.621930,1
207,-0.124078,-0.126608,1
208,-0.316935,-0.228947,1
209,-0.294124,-0.134795,0


In [15]:
# Cross-validation set as one frame: the two columns of `Xval` become x1/x2
# and the labels from `yval` are attached as column 'y'.
# The keys are indexed directly rather than through `.get`, so a missing
# 'Xval'/'yval' raises KeyError here instead of handing None to pandas and
# quietly yielding an empty frame.
data_cv = pd.DataFrame(mat['Xval'], columns=['x1', 'x2'])
data_cv['y'] = mat['yval']
data_cv

Unnamed: 0,x1,x2,y
0,-0.353062,-0.673902,0
1,-0.227126,0.447320,1
2,0.092898,-0.753524,0
3,0.148243,-0.718473,0
4,-0.001512,0.162928,0
...,...,...,...
195,0.005203,-0.544449,1
196,0.176352,-0.572454,0
197,0.127651,-0.340938,0
198,0.248682,-0.497502,0


## 选择 C 和 gamma

In [17]:
# Values tried for both C and gamma.
candidate = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]

# Every (C, gamma) pairing of the candidates; the C value varies slowest
# because its `for` clause comes first.
combination = [
    (c_value, gamma_value)
    for c_value in candidate
    for gamma_value in candidate
]
len(combination)

81

In [18]:
# Score every (C, gamma) pair: fit an SVC on the training set and record its
# mean accuracy on the cross-validation set. Scores are appended in the same
# order as `combination`, so positions in the two lists line up.

# The feature/label selections do not change between iterations, so they are
# taken once here instead of being rebuilt for each of the pairs.
train_features, train_labels = data_train[['x1', 'x2']], data_train['y']
cv_features, cv_labels = data_cv[['x1', 'x2']], data_cv['y']

combination_score = []

for c, gamma in combination:
    svm_train = svm.SVC(C=c, gamma=gamma)
    svm_train.fit(train_features, train_labels)
    combination_score.append(svm_train.score(cv_features, cv_labels))

In [21]:
# Position of the highest cross-validation score (the first one, if several
# pairs tie), followed by the score itself.
best_index = np.asarray(combination_score).argmax()
combination_score[best_index]

0.965

In [22]:
# The (C, gamma) pair sitting at the best-scoring position.
combination[best_index]

(0.3, 100)

In [23]:
# Small check of how a two-clause comprehension orders its pairs: the first
# `for` is the outer loop, so each element of `a` is paired with all of `b`
# before moving on to the next element of `a`.
a = [1, 2, 3]
b = [4, 5, 6]
[
    (outer, inner)
    for outer in a
    for inner in b
]

[(1, 4), (1, 5), (1, 6), (2, 4), (2, 5), (2, 6), (3, 4), (3, 5), (3, 6)]

In [26]:
# Refit an SVC with the best (C, gamma) pair from the manual search and print
# per-class precision / recall / f1 on the cross-validation set.
# The pair is read from the search results rather than retyped by hand, so
# this cell cannot drift out of sync with them.
best_C, best_gamma = combination[best_index]
svm_train = svm.SVC(C=best_C, gamma=best_gamma)
svm_train.fit(data_train[['x1', 'x2']], data_train['y'])

# The standalone score() call was dropped: its result was never shown or
# stored, and the report below already includes the accuracy.
y_predict = svm_train.predict(data_cv[['x1', 'x2']])
print(metrics.classification_report(data_cv['y'], y_predict))

              precision    recall  f1-score   support

           0       0.97      0.96      0.97       113
           1       0.95      0.97      0.96        87

    accuracy                           0.96       200
   macro avg       0.96      0.97      0.96       200
weighted avg       0.97      0.96      0.97       200



## 网格搜索

In [33]:
# Stack the training rows first and the cross-validation rows after them.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of the two frames would repeat, making label-based lookups
# on `data` ambiguous.
data = pd.concat([data_train, data_cv], axis=0, ignore_index=True)

In [41]:
# One entry per row of `data`: -1 for the leading training rows, meaning they
# are never placed in the validation set, and 0 for every remaining row.
test_fold = np.where(np.arange(data.shape[0]) < data_train.shape[0], -1.0, 0.0)

In [42]:
# Splitter driven by `test_fold`, so the grid search evaluates on the fixed
# validation rows instead of on randomly drawn folds.
# (PredefinedSplit is imported with the other imports at the top of the notebook.)
cv = PredefinedSplit(test_fold=test_fold)

In [44]:
# Same C / gamma grid as the manual search, now driven by GridSearchCV using
# the predefined split `cv` and all available cores.
parameters = dict(C=candidate, gamma=candidate)
svc = svm.SVC()
clf = GridSearchCV(estimator=svc, param_grid=parameters, cv=cv, n_jobs=-1)

# NOTE(review): refit is left at its default (shown as refit=True in the
# output below), so the final estimator is presumably refit on all rows of
# `data` after the search - confirm that is intended.
clf.fit(data[['x1', 'x2']], data['y'])

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                         'gamma': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [45]:
# Parameter setting that scored best in the grid search.
clf.best_params_

{'C': 0.3, 'gamma': 100}