In [40]:
from sklearn import svm, cross_validation, grid_search
import numpy as np
import scipy as sp
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [41]:
from sklearn.datasets import load_svmlight_file,  load_svmlight_files

In [42]:
from sklearn.metrics import precision_recall_fscore_support, classification_report, accuracy_score

In [43]:
# Load the data: read the training and test splits together in a single call.
X_train, y_train, X_test, y_test = load_svmlight_files(
    ['dataset_svm/train.dat', 'dataset_svm/test.dat']
)

In [44]:
# Linear kernel (all other SVC hyper-parameters left at their defaults).
# fit() returns the estimator itself, so construction and training are chained.
linear_svc = svm.SVC(kernel='linear').fit(X_train, y_train)
y_prediction_linear = linear_svc.predict(X_test)

In [45]:
# Test-set evaluation of the linear-kernel predictions currently held in
# y_prediction_linear: per-class precision / recall / F1 table, then overall accuracy.
print(classification_report(y_test, y_prediction_linear))
print(accuracy_score(y_test, y_prediction_linear))

             precision    recall  f1-score   support

       -1.0       0.99      0.96      0.98       300
        1.0       0.96      0.99      0.98       300

avg / total       0.98      0.98      0.98       600

0.976666666667


In [46]:
# Linear kernel with a very large penalty (C=10000).
# NOTE: this rebinds linear_svc / y_prediction_linear from the previous linear run.
linear_svc = svm.SVC(C=10000, kernel='linear').fit(X_train, y_train)
y_prediction_linear = linear_svc.predict(X_test)

In [47]:
# Test-set evaluation of the linear-kernel predictions currently held in
# y_prediction_linear (the C=10000 model when cells are run in order):
# per-class precision / recall / F1 table, then overall accuracy.
print(classification_report(y_test, y_prediction_linear))
print(accuracy_score(y_test, y_prediction_linear))

             precision    recall  f1-score   support

       -1.0       0.99      0.96      0.97       300
        1.0       0.96      0.99      0.97       300

avg / total       0.97      0.97      0.97       600

0.973333333333


In [48]:
# RBF kernel (all other SVC hyper-parameters left at their defaults).
# fit() returns the estimator itself, so construction and training are chained.
rbf_svc = svm.SVC(kernel='rbf').fit(X_train, y_train)
y_prediction_rbf = rbf_svc.predict(X_test)

In [49]:
# Test-set evaluation of the RBF-kernel predictions:
# per-class precision / recall / F1 table, then overall accuracy.
print(classification_report(y_test, y_prediction_rbf))
print(accuracy_score(y_test, y_prediction_rbf))

             precision    recall  f1-score   support

       -1.0       0.99      0.38      0.55       300
        1.0       0.62      1.00      0.76       300

avg / total       0.80      0.69      0.66       600

0.69


In [50]:
# Polynomial kernel (all other SVC hyper-parameters left at their defaults).
# NOTE(review): despite its name, `rbf_poly` holds the *polynomial*-kernel model;
# the name is kept as-is because other cells may refer to it.
rbf_poly = svm.SVC(kernel='poly').fit(X_train, y_train)
y_prediction_poly = rbf_poly.predict(X_test)

In [51]:
# Test-set evaluation of the polynomial-kernel predictions:
# per-class precision / recall / F1 table, then overall accuracy.
print(classification_report(y_test, y_prediction_poly))
print(accuracy_score(y_test, y_prediction_poly))

             precision    recall  f1-score   support

       -1.0       0.98      0.18      0.30       300
        1.0       0.55      1.00      0.71       300

avg / total       0.77      0.59      0.51       600

0.588333333333


In [52]:
# Grid search over the kernel type and the penalty parameter C.
parameters = {
    'kernel': ('linear', 'rbf', 'poly'),
    'C': np.logspace(-4, 4, num=9),  # nine values: 1e-4, 1e-3, ..., 1e4
}
svr = svm.SVC()
clf = grid_search.GridSearchCV(estimator=svr, param_grid=parameters, n_jobs=-1)

# Kept as the cell's last expression so the fitted search object is displayed.
clf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'kernel': ('linear', 'rbf', 'poly'), 'C': array([  1.00000e-04,   1.00000e-03,   1.00000e-02,   1.00000e-01,
         1.00000e+00,   1.00000e+01,   1.00000e+02,   1.00000e+03,
         1.00000e+04])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [53]:
# Best parameters: display the estimator selected by the grid search.
clf.best_estimator_

SVC(C=10000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [54]:
# Display the cross-validation mean / std score for every parameter combination tried.
clf.grid_scores_

[mean: 0.61900, std: 0.08786, params: {'kernel': 'linear', 'C': 0.0001},
 mean: 0.61900, std: 0.08786, params: {'kernel': 'rbf', 'C': 0.0001},
 mean: 0.58050, std: 0.05714, params: {'kernel': 'poly', 'C': 0.0001},
 mean: 0.61900, std: 0.08786, params: {'kernel': 'linear', 'C': 0.001},
 mean: 0.61900, std: 0.08786, params: {'kernel': 'rbf', 'C': 0.001},
 mean: 0.55300, std: 0.03976, params: {'kernel': 'poly', 'C': 0.001},
 mean: 0.61900, std: 0.08786, params: {'kernel': 'linear', 'C': 0.01},
 mean: 0.61900, std: 0.08786, params: {'kernel': 'rbf', 'C': 0.01},
 mean: 0.55200, std: 0.03870, params: {'kernel': 'poly', 'C': 0.01},
 mean: 0.85000, std: 0.06874, params: {'kernel': 'linear', 'C': 0.10000000000000001},
 mean: 0.61900, std: 0.08786, params: {'kernel': 'rbf', 'C': 0.10000000000000001},
 mean: 0.55200, std: 0.03870, params: {'kernel': 'poly', 'C': 0.10000000000000001},
 mean: 0.95250, std: 0.01410, params: {'kernel': 'linear', 'C': 1.0},
 mean: 0.61900, std: 0.08786, params: {'kern

In [55]:
# Take the 5 best kernel/parameter combinations from the grid search (which was
# run on the training data) and check each of them against the test data.
from operator import itemgetter
n_top=5
# Each grid_scores_ entry exposes .parameters, .mean_validation_score and
# .cv_validation_scores; positional index 1 is the mean validation score,
# so this sorts best-first and keeps the first n_top entries.
top_scores = sorted(clf.grid_scores_, key=itemgetter(1), reverse=True)[:n_top]
for i, score in enumerate(top_scores):
    print("Model with rank: {0}".format(i + 1))
    print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
          score.mean_validation_score,
          np.std(score.cv_validation_scores)))
    print("Parameters: {0}".format(score.parameters))

    # Rebuild the model from *all* searched parameters rather than a hard-coded
    # subset, so the model evaluated here is exactly the one that was ranked,
    # even if the parameter grid later gains entries such as gamma or degree.
    svc_test = svm.SVC(**score.parameters)
    svc_test.fit(X_train, y_train)
    y_prediction = svc_test.predict(X_test)
    print(classification_report(y_test, y_prediction))
    # Two blank lines visually separate consecutive models in the output.
    print("")
    print("")

Model with rank: 1
Mean validation score: 0.953 (std: 0.012)
Parameters: {'kernel': 'rbf', 'C': 10000.0}
             precision    recall  f1-score   support

       -1.0       0.99      0.96      0.97       300
        1.0       0.96      0.99      0.98       300

avg / total       0.98      0.97      0.97       600



Model with rank: 2
Mean validation score: 0.953 (std: 0.014)
Parameters: {'kernel': 'linear', 'C': 1.0}
             precision    recall  f1-score   support

       -1.0       0.99      0.96      0.98       300
        1.0       0.96      0.99      0.98       300

avg / total       0.98      0.98      0.98       600



Model with rank: 3
Mean validation score: 0.950 (std: 0.012)
Parameters: {'kernel': 'linear', 'C': 10.0}
             precision    recall  f1-score   support

       -1.0       0.99      0.96      0.97       300
        1.0       0.96      0.99      0.97       300

avg / total       0.97      0.97      0.97       600



Model with rank: 4
Mean validation 

In [56]:
# Alternative loading using n_features: read each file separately and pass the
# training matrix's column count when loading the test file.
# NOTE: this rebinds X_train / y_train / X_test / y_test from the earlier load.
X_train, y_train = load_svmlight_file("dataset_svm/train.dat")
X_test, y_test = load_svmlight_file(
    "dataset_svm/test.dat",
    n_features=X_train.shape[1]
)
# Linear kernel, trained and evaluated on the re-loaded data.
linear_svc = svm.SVC(kernel='linear').fit(X_train, y_train)
y_prediction_linear = linear_svc.predict(X_test)

print(classification_report(y_test, y_prediction_linear))
print(accuracy_score(y_test, y_prediction_linear))

             precision    recall  f1-score   support

       -1.0       0.99      0.96      0.98       300
        1.0       0.96      0.99      0.98       300

avg / total       0.98      0.98      0.98       600

0.976666666667
