# Линейный SVM

In [2]:
# Common imports
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Seed NumPy's global random number generator so that values drawn from it
# are the same on every run of the notebook.
np.random.seed(42)

# To plot pretty figures
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
%matplotlib inline

# Notebook-wide plotting defaults: font sizes for axis and tick labels, and
# the default figure size given as (width, height).
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'figure.figsize': (10, 5),
})

В sklearn SVM классификатор можно построить тремя способами:

| Class | Временная сложность | Требуется масштабирование | Поддержка out-of-core | Поддержка kernel trick |
|---------------|------------------------------|---------------------------|-----------------------|------------------------|
| LinearSVC | O(m x n) | Да | Нет | Нет |
| SGDClassifier | O(m x n) | Да | Да | Нет |
| SVC | O($m^2$ x n) до O($m^3$ x n) | Да | Нет | Да |

* m - количество экземпляров
* n - количество фич
* LinearSVC использует оптимизированную для линейной задачи версию SVM (liblinear)
* SGDClassifier - SGD с hinge loss-функцией
* SVC использует libsvm, который поддерживает kernel trick

In [3]:
from sklearn import datasets

iris = datasets.load_iris()
# Features: columns 2 and 3 of the data matrix (petal length, petal width).
X = iris.data[:, [2, 3]]
# Binary target as floats: 1.0 where the class index is 2 (Iris-Virginica), else 0.0.
y = (iris.target == 2).astype(np.float64)

In [4]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC

In [5]:
# SVM penalty parameter; the later cells in this notebook reuse this value.
C = 1

# Standardise the features, then fit a linear SVM trained with hinge loss.
svm_clf = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("linear_svc", LinearSVC(loss="hinge", C=C, random_state=42)),
])

In [6]:
%%time
# F1 score of the pipeline currently bound to svm_clf on 4 random
# 80/20 train/test splits (fixed seed, so the splits repeat between runs).
cross_val_score(
    estimator=svm_clf,
    X=X,
    y=y,
    scoring="f1",
    cv=ShuffleSplit(n_splits=4, test_size=0.2, random_state=42),
)

CPU times: user 12.1 ms, sys: 1.73 ms, total: 13.8 ms
Wall time: 12.4 ms


array([1.        , 0.95238095, 0.94736842, 0.92857143])

In [7]:
# SGD regularisation term set to alpha = 1 / (m * C), where m is the number of rows in X.
alpha = 1 / (len(X) * C)

# Same scaler + classifier layout, with the classifier swapped for SGD on hinge loss.
# The step keeps the name "linear_svc" even though it holds an SGDClassifier.
svm_clf = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("linear_svc", SGDClassifier(alpha=alpha, loss="hinge", random_state=42)),
])

In [9]:
%%time
# F1 score of the pipeline currently bound to svm_clf on 4 random
# 80/20 train/test splits (fixed seed, so the splits repeat between runs).
cross_val_score(
    estimator=svm_clf,
    X=X,
    y=y,
    scoring="f1",
    cv=ShuffleSplit(n_splits=4, test_size=0.2, random_state=42),
)

CPU times: user 14.2 ms, sys: 2.65 ms, total: 16.9 ms
Wall time: 14.6 ms




array([1.        , 0.95238095, 1.        , 0.92857143])

In [8]:
# Same scaler + classifier layout, now using the libsvm-based SVC with a linear kernel.
# C must be passed by keyword: SVC's constructor parameters are keyword-only in
# current scikit-learn, so the positional form SVC(C, ...) raises TypeError.
svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("linear_svc", SVC(C=C, kernel='linear', random_state=42)),
])

In [9]:
%%time
# F1 score of the pipeline currently bound to svm_clf on 4 random
# 80/20 train/test splits (fixed seed, so the splits repeat between runs).
cross_val_score(
    estimator=svm_clf,
    X=X,
    y=y,
    scoring="f1",
    cv=ShuffleSplit(n_splits=4, test_size=0.2, random_state=42),
)

CPU times: user 13.2 ms, sys: 2.6 ms, total: 15.8 ms
Wall time: 14.1 ms


array([ 1.        ,  0.95238095,  0.94736842,  0.92857143])