In [1]:
# Import the necessary libs.

import pandas as pd
import numpy as np

# Read the MNIST table from the CSV file (path is relative to the working directory).

data_mnist = pd.read_csv(filepath_or_buffer="mnist.csv")

In [8]:
# Separate features from labels, then cut the practice, train, and test sets.

# Every column except the last is a feature; the last column is used as the label.
col_x = data_mnist.iloc[:, :-1].to_numpy()
col_y = data_mnist.iloc[:, -1].to_numpy()

# Row counts for the practice subset and for the training set.

size_prac, size_train = 1000, 60000

# Small leading slice, used for quick testing and evaluation.

prac_x, prac_y = col_x[:size_prac], col_y[:size_prac]

# Binary target: True where the label equals 5.
prac_5train = prac_y == 5

# Full-model data: the first size_train rows train, everything after them tests.

train_x, test_x = col_x[:size_train], col_x[size_train:]
train_y, test_y = col_y[:size_train], col_y[size_train:]

# Same binary "is it a 5" target for both splits.
train_5, test_5 = train_y == 5, test_y == 5

In [3]:
# Evaluate and score our model based on y and y_hat.

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

def print_eval(y, y_hat):
    """Score predictions against the true labels.

    Despite the name, nothing is printed; the scores are returned.

    Args:
        y: True labels.
        y_hat: Predicted labels.

    Returns:
        A ``(precision, recall, f1)`` tuple, in that order.
    """
    # Metrics are evaluated in the order they appear in the returned tuple.
    metrics = (precision_score, recall_score, f1_score)
    return tuple(metric(y, y_hat) for metric in metrics)

In [4]:
# Load the model we currently want to test.

from sklearn.svm import SVC as Model

# Import preprocessors and a pipeline.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Binarizer

# Import cross-validated prediction.

from sklearn.model_selection import cross_val_predict

In [5]:
# Check all SVCs and look for the best score.

import warnings
from itertools import product

model_kernels = ["linear", "poly", "rbf", "sigmoid"]
model_variance = [0.01, 0.1, 1.0, 10.0, 100.0]

# Silence warnings only while the search runs so they don't spam the output.
# A global warnings.filterwarnings("ignore") would stay active for every later
# cell and hide warnings from the final fit and the test evaluation.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # product() visits combinations in the same order as three nested loops:
    # kernel outermost, then C, then gamma. The same value list is reused for
    # both C and gamma.
    # NOTE(review): gamma appears to have no effect on the linear kernel, so
    # those iterations repeat the same fit; they are kept so the printed grid
    # stays unchanged. Confirm before pruning them.
    for model_k, model_c, model_g in product(
        model_kernels, model_variance, model_variance
    ):

        # Setup our pipeline we want to test (a fresh one per combination).
        model_pipe = Pipeline([
            ("Scaler", StandardScaler()),
            ("Binary", Binarizer()),
            ("Model", Model(kernel=model_k, C=model_c, gamma=model_g)),
        ])

        # Cross-validated predictions on the practice subset, then score them.
        y_hat = cross_val_predict(model_pipe, prac_x, prac_5train)
        score_eval = print_eval(prac_5train, y_hat)

        # An all-zero score means precision, recall, and F1 were all 0 for
        # this combination; skip those and report only the rest.
        if score_eval != (0.0, 0.0, 0.0):
            print(model_k, model_c, model_g)
            print(score_eval)
            print()

linear 0.01 0.01
(0.9, 0.29347826086956524, 0.4426229508196721)

linear 0.01 0.1
(0.9, 0.29347826086956524, 0.4426229508196721)

linear 0.01 1.0
(0.9, 0.29347826086956524, 0.4426229508196721)

linear 0.01 10.0
(0.9, 0.29347826086956524, 0.4426229508196721)

linear 0.01 100.0
(0.9, 0.29347826086956524, 0.4426229508196721)

linear 0.1 0.01
(0.76, 0.6195652173913043, 0.6826347305389221)

linear 0.1 0.1
(0.76, 0.6195652173913043, 0.6826347305389221)

linear 0.1 1.0
(0.76, 0.6195652173913043, 0.6826347305389221)

linear 0.1 10.0
(0.76, 0.6195652173913043, 0.6826347305389221)

linear 0.1 100.0
(0.76, 0.6195652173913043, 0.6826347305389221)

linear 1.0 0.01
(0.6702127659574468, 0.6847826086956522, 0.6774193548387099)

linear 1.0 0.1
(0.6702127659574468, 0.6847826086956522, 0.6774193548387099)

linear 1.0 1.0
(0.6702127659574468, 0.6847826086956522, 0.6774193548387099)

linear 1.0 10.0
(0.6702127659574468, 0.6847826086956522, 0.6774193548387099)

linear 1.0 100.0
(0.6702127659574468, 0.6847826

In [7]:
# Build the final pipeline using the kernel, C, and gamma chosen from the search.
model_pipe = Pipeline(
    steps=[
        ("Scaler", StandardScaler()),
        ("Binary", Binarizer()),
        ("Model", Model(C=10.0, gamma=0.01, kernel="rbf")),
    ]
)

# Train on the full training split; the fitted pipeline is the cell's output.
model_pipe.fit(train_x, train_5)

Pipeline(steps=[('Scaler', StandardScaler()), ('Binary', Binarizer()),
                ('Model', SVC(C=10.0, gamma=0.01))])

In [9]:
# Write the pipeline to model_pipe.pkl (path is relative to the working directory).
import joblib

joblib.dump(value=model_pipe, filename="model_pipe.pkl")

['model_pipe.pkl']

In [12]:
# Predict on the test split and score the predictions.
# The (precision, recall, f1) tuple is the cell's output.

y_hat = model_pipe.predict(test_x)
print_eval(y=test_5, y_hat=y_hat)

(0.9908256880733946, 0.968609865470852, 0.979591836734694)