## Important Points

* Uses the MNIST dataset (running RandomizedSearchCV with cv=3 and 5 iterations takes around 6 hours)
* Training accuracy: 99.90%
* Validation accuracy: 97%

In [1]:
import pandas as pd 
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

# Seed NumPy's global random number generator so that calls drawing from it
# (e.g. np.random.permutation) give the same values on every fresh run.
np.random.seed(42)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
from sklearn.datasets import fetch_openml
# Fetch version 1 of the 'mnist_784' dataset from OpenML; cache=True lets
# scikit-learn reuse a previously downloaded copy.
data = fetch_openml(name='mnist_784', version=1, cache=True)

# Feature matrix and label vector, as exposed by the returned bunch.
x, y = data.data, data.target

In [3]:
# NOTE(review): rand_ind is never applied below, so the split is the fixed
# "first 60000 rows / remaining rows" partition, not a shuffled one. The call
# still advances NumPy's global RNG, which later unseeded sampling draws from;
# confirm whether a shuffle was intended before removing this line.
rand_ind = np.random.permutation(x.shape[0])

# First 60000 samples are used for training, the rest for validation.
x_train, x_valid = x[:60000], x[60000:]
y_train, y_valid = y[:60000], y[60000:]

In [4]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature scaling statistics from the training split only, so
# nothing about the validation split influences the transform.
scaler = StandardScaler().fit(x_train)

# Apply the same fitted transform to both splits.
x_train, x_valid = scaler.transform(x_train), scaler.transform(x_valid)

In [5]:
def score(y_true, y_pred, train=False):
    """Print the accuracy of ``y_pred`` against ``y_true``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        train: When truthy the line is labelled "Train accuracy",
            otherwise "Val accuracy".

    Returns:
        None. The result is only printed.
    """
    template = "Train accuracy:{}" if train else "Val accuracy:{}"
    print(template.format(accuracy_score(y_true, y_pred)))

In [None]:
from sklearn.svm import LinearSVC
# Linear SVM trained with the hinge loss. For multi-class targets LinearSVC
# uses a one-vs-rest (OVR) scheme by default.
lin_clf = LinearSVC(random_state=42, loss='hinge')

In [None]:
# Train the linear classifier on the scaled training split.
lin_clf.fit(x_train, y_train)

# Report validation accuracy first ...
y_pred = lin_clf.predict(x_valid)
score(y_valid, y_pred, train=False)

# ... then training accuracy, to expose any gap between the two.
score(y_train, lin_clf.predict(x_train), train=True)

In [6]:
from sklearn.svm import SVC

# Kernel SVM with an RBF kernel; all other hyper-parameters stay at their defaults.
svc = SVC(kernel='rbf', random_state=42)

In [None]:
# Train the RBF-kernel SVM on the scaled training split.
svc.fit(x_train, y_train)

# Training accuracy is printed first, followed by validation accuracy.
score(y_train, svc.predict(x_train), train=True)
score(y_valid, svc.predict(x_valid), train=False)

In [7]:
%timeit
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

params = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10)}
rand_search = RandomizedSearchCV(SVC(), params, 
                                 n_iter = 5, verbose=1, cv=3, n_jobs=-1)
rand_search.fit(x_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 132.6min finished


RandomizedSearchCV(cv=3, estimator=SVC(), n_iter=5, n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f845a0bba60>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f845a0bb700>},
                   verbose=1)

In [8]:
# Display the estimator (with its sampled C and gamma) that the search selected.
rand_search.best_estimator_

SVC(C=6.630025243684105, gamma=0.0015915925174727911)

In [9]:
# Training-set accuracy of the estimator selected by the randomized search.
y_pred = rand_search.best_estimator_.predict(x_train)
score(y_true=y_train, y_pred=y_pred, train=True)

Train accuracy:0.9990666666666667


In [None]:
# Validation-set accuracy of the estimator selected by the randomized search.
y_pred = rand_search.best_estimator_.predict(x_valid)
score(y_true=y_valid, y_pred=y_pred, train=False)