# SVM
+ 复杂度问题(Complexity)
+ 奇异值检测(Density estimation, novelty detection)
> One-class SVM is used for novelty detection, that is, given a set of samples, it will detect the soft boundary of that set so as to classify new points as belonging to that set or not. The class that implements this is called OneClassSVM.

In [1]:
%matplotlib inline

## 1 简单应用
### 1.1 了解数据（scikit-learn 自带）

In [2]:
from sklearn import svm, datasets
iris = datasets.load_iris()

type(iris)
# help(iris)
print(iris.keys())
print(type(iris.data),iris.data.shape)
print(iris.data[:2,:3])
print(type(iris.target),iris.target)
print(type(iris.target_names),iris.target_names)
print(type(iris.feature_names),iris.feature_names)

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])
<class 'numpy.ndarray'> (150, 4)
[[ 5.1  3.5  1.4]
 [ 4.9  3.   1.4]]
<class 'numpy.ndarray'> [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
<class 'numpy.ndarray'> ['setosa' 'versicolor' 'virginica']
<class 'list'> ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


### 1.2 分割数据

In [3]:
from sklearn.model_selection import train_test_split

# 分割训练集(0.6)-测试集(0.4)
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.4, random_state=0)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(90, 4) (90,)
(60, 4) (60,)


### 1.3 分类
+ 处理多分类问题时，有三类分类器：<font color=#0099ff size=3 face="黑体">SVC、NuSVC、LinearSVC</font>
  - SVC and NuSVC implement the “one-against-one” approach (Knerr et al., 1990) for multi- class classification.
  - LinearSVC implements “one-vs-the-rest” multi-class strategy, thus training n_class models. 

In [4]:
from sklearn import svm
import pandas as pd

clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
print(clf)
print(clf.score(X_test, y_test)) 
predict = clf.predict(X_test)  # 分类预测，分类结果
print(type(predict), predict.shape)
print(predict[:5])
print()

nu_clf = svm.NuSVC().fit(X_train, y_train)
print(nu_clf)
print(nu_clf.score(X_test, y_test)) 
print()

lin_clf = svm.LinearSVC().fit(X_train, y_train)
print(lin_clf)
print(lin_clf.score(X_test, y_test)) 
# pd.DataFrame(predict[:15]).plot(kind = 'bar')

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
0.966666666667
<class 'numpy.ndarray'> (60,)
[2 1 0 2 0]

NuSVC(cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
   max_iter=-1, nu=0.5, probability=False, random_state=None,
   shrinking=True, tol=0.001, verbose=False)
0.933333333333

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
0.916666666667


In [5]:
print(clf.support_vectors_)  # get support vectors
print(clf.support_ ) # get indices of support vectors
print(clf.n_support_)  # get number of support vectors for each class

[[ 4.5  2.3  1.3  0.3]
 [ 5.1  3.3  1.7  0.5]
 [ 6.   3.4  4.5  1.6]
 [ 6.1  3.   4.6  1.4]
 [ 6.9  3.1  4.9  1.5]
 [ 5.1  2.5  3.   1.1]
 [ 6.2  2.2  4.5  1.5]
 [ 6.3  2.5  4.9  1.5]
 [ 6.7  3.   5.   1.7]
 [ 5.9  3.2  4.8  1.8]
 [ 5.8  2.7  5.1  1.9]
 [ 6.4  2.7  5.3  1.9]
 [ 5.9  3.   5.1  1.8]
 [ 6.5  3.2  5.1  2. ]
 [ 7.2  3.   5.8  1.6]
 [ 6.   3.   4.8  1.8]
 [ 5.8  2.7  5.1  1.9]]
[31 51  0  6 14 17 19 72 73 81  2 10 16 29 38 59 77]
[2 8 7]


### 1.4 回归
+ There are three different implementations of Support Vector Regression: SVR, NuSVR and LinearSVR

In [6]:
from sklearn import svm

rlf = svm.SVR().fit(X_train, y_train)
print(rlf)
print(rlf.score(X_test, y_test)) 
predict = rlf.predict(X_test) # 回归预测，回归结果
print(type(predict), predict.shape)
print(predict[:5])
# pd.DataFrame(predict[:15]).plot(kind = 'bar')

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
0.930345125072
<class 'numpy.ndarray'> (60,)
[ 2.07032407  0.87624739  0.04936913  2.04760139  0.01072546]


## 2 复杂问题
### 2.1 样本不均衡问题
+ In problems where it is desired to give more importance to certain classes or certain individual samples keywords **class_weight** and **sample_weight** can be used.
+ the size of points is proportional to its weight.
> The sample weighting rescales the C parameter, which means that the classifier puts more emphasis on getting these points right. The effect might often be subtle. To emphasize the effect here, we particularly weight outliers, making the deformation of the decision boundary very visible.

In [7]:
clf_weights = svm.SVC()
clf_weights.fit(X, y, sample_weight=sample_weight_last_ten)

clf_no_weights = svm.SVC()
clf_no_weights.fit(X, y)

NameError: name 'X' is not defined

### 2.2 核函数

### 2.3 多个模型

In [8]:
C = 1.0  # SVM regularization parameter
models = (svm.SVC(kernel='linear', C=C),
          svm.LinearSVC(C=C),
          svm.SVC(kernel='rbf', gamma=0.7, C=C),
          svm.SVC(kernel='poly', degree=3, C=C))
models = (clf.fit(X_train, y_train) for clf in models)
print(type(models))
for clf in models:
    print(clf)

<class 'generator'>
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.7, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


### 2.4 交叉验证

### 2.5 参数寻优

In [9]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
iris = datasets.load_iris()
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(iris.data, iris.target)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'kernel': ('linear', 'rbf'), 'C': [1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [12]:
print(dir(clf))
sorted(clf.cv_results_.keys())

['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_cache', '_abc_negative_cache', '_abc_negative_cache_version', '_abc_registry', '_check_is_fitted', '_estimator_type', '_fit', '_get_param_names', 'best_estimator_', 'best_index_', 'best_params_', 'best_score_', 'cv', 'cv_results_', 'decision_function', 'error_score', 'estimator', 'fit', 'fit_params', 'get_params', 'grid_scores_', 'iid', 'inverse_transform', 'n_jobs', 'n_splits_', 'param_grid', 'pre_dispatch', 'predict', 'predict_log_proba', 'predict_proba', 'refit', 'return_train_score', 'score', 'scorer_', 'scoring', 'set_params', 'transform', 'verbose']


['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'mean_train_score',
 'param_C',
 'param_kernel',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split0_train_score',
 'split1_test_score',
 'split1_train_score',
 'split2_test_score',
 'split2_train_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score',
 'std_train_score']

### 2.6 交叉验证+参数寻优
> Parameter estimation using grid search with cross-validation
+ This examples shows how a classifier is optimized by cross-validation, which is done using the :class:`sklearn.model_selection.GridSearchCV` objecton a development set that comprises only half of the available labeled data.
+ The performance of the selected hyper-parameters and trained model is then measured on a dedicated evaluation set that was not used during the model selection step.
+ More details on tools available for model selection can be found in the sections on `cross_validation` and `grid_search`.

In [13]:
from __future__ import print_function

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

print(__doc__)

digits = datasets.load_digits()

# To apply an classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(), tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Note the problem is too easy: the hyperparameter plateau is too flat and the
# output model is the same for precision and recall with ties in quality.

Automatically created module for IPython interactive environment
# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}

Grid scores on development set:

0.986 (+/-0.016) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.959 (+/-0.029) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.988 (+/-0.017) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.982 (+/-0.026) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.988 (+/-0.017) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.982 (+/-0.025) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.988 (+/-0.017) for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
0.982 (+/-0.025) for {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
0.975 (+/-0.014) for {'C': 1, 'kernel': 'linear'}
0.975 (+/-0.014) for {'C': 10, 'kernel': 'linear'}
0.975 (+/-0.014) for {'C': 100, 'kernel': 'linear'}
0.975 (+/-0.014) for {'C': 1000, 'kernel': 'linear'}

Detailed classification report:

The model 