In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Get data from https://www.openml.org/d/554
data = fetch_openml('mnist_784', version=1, parser="auto")

# Build a single frame: one column per entry in data["feature_names"] plus a
# final "target" column holding the labels.
# The label column is attached with .assign() instead of np.c_: stacking the
# numeric features and the labels into ONE NumPy array forces a common dtype,
# and when the labels are not numeric (OpenML is expected to serve them as
# strings/categories - confirm for your scikit-learn version) that common dtype
# is `object`, which turns every feature column into slow, memory-hungry
# Python objects.
# np.asarray() makes the labels a plain array so they are attached by position.
dfData = pd.DataFrame(data["data"], columns=data["feature_names"]).assign(
    target=np.asarray(data["target"])
)

In [2]:
# Separate the labels from the features, then put the features into the shape
# the model expects.
# X and y are both derived from dfData WITHOUT reassigning dfData, so this cell
# is safe to re-run: overwriting dfData with the target-less frame made a
# second run fail with a KeyError on dfData["target"].
y = dfData["target"]
X = dfData.drop(columns="target")  # drop() already returns a new frame, no .copy() needed

# Preprocessing pipeline with a single step: min-max scaling of every feature.
img_pipeline = Pipeline([("mm_scaler", MinMaxScaler())])

# NOTE(review): the scaler is fitted on the full dataset, i.e. before any
# train/test split, so statistics of the future test rows influence the
# preprocessing. Consider fitting the pipeline on the training rows only.
X_transf = img_pipeline.fit_transform(X)

In [3]:
# Split the scaled data into a training set and a test set (20% held out),
# stratified on the labels and reproducible through the fixed random_state.
stratSplit = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)

# n_splits=1, so the generator yields exactly one (train, test) pair of index
# arrays; take it directly instead of looping over a single iteration.
train_index, test_index = next(stratSplit.split(X_transf, y))

# X_transf is the array returned by the pipeline, so plain positional indexing applies.
X_train = X_transf[train_index]
X_test = X_transf[test_index]

# split() yields positional row numbers. y is a pandas Series, where y[...]
# looks values up by index LABEL; .iloc selects by position and therefore stays
# correct even if y's index is not the default 0..n-1 range.
y_train = y.iloc[train_index]
y_test = y.iloc[test_index]

In [4]:
# Train a logistic-regression classifier on the training split, then report its
# accuracy on the held-out test split. Re-run this cell after changing the
# model to see how the change affects the outcome.
# fit() returns the estimator itself, so construction and training are chained.
# NOTE: with max_iter=100 the solver stops at the iteration limit before
# converging (see the warning printed under this cell).
log_reg = LogisticRegression(max_iter=100, C=1e5).fit(X_train, y_train)
print(
    "Test set performance:",
    log_reg.score(X_test, y_test),
)

Test set performance: 0.9247857142857143


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
# Once the final model is chosen it can be useful to re-train it on the full
# dataset (as transformed by the pipeline).
# fit() returns the estimator, so the refit and the scoring are chained; the
# displayed value is the accuracy on the same rows the model was just fitted on.
log_reg.fit(X_transf, y).score(X_transf, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9347

## Saving your model

In [8]:
# Option 1 - joblib
import joblib

# Choose your own file name. The path is relative, so the file is written to
# the current working directory; the .pkl extension marks it as a pickle file.
joblibModelName = 'ultimateMNISTClassifierJoblibSave.pkl'

# Write the final trained model to that file; the cell output is the list of
# file names joblib created.
joblib.dump(log_reg, filename=joblibModelName)

['ultimateMNISTClassifierJoblibSave.pkl']

In [9]:
# Option 2 - pickle
import pickle

# Choose your own file name. The path is relative, so the file is written to
# the current working directory.
pickleModelName = 'ultimateMNISTClassifierPickleSave.pkl'

# pickle writes to an already-open file object, so open the target in binary
# write mode ("wb") and let the with-block close it afterwards.
with open(pickleModelName, mode="wb") as model_file:
    pickle.dump(log_reg, model_file)  # serialise the final model into the file

In [None]:
# If you ran a grid search, remember that the best model is available via its .best_estimator_ attribute

## Saving your pipeline

In [10]:
# The fitted preprocessing pipeline is saved the same way as the model, so the
# exact same transformation can be re-applied later.
import joblib

# Choose your own file name (relative path -> current working directory).
pipelineName = 'ultimateMNISTClassifierPipeline.pkl'

# The cell output is the list of file names joblib created.
joblib.dump(img_pipeline, filename=pipelineName)

['ultimateMNISTClassifierPipeline.pkl']

## Loading Pipeline

In [11]:
# Read the saved pipeline back from disk.
pipelineName = 'ultimateMNISTClassifierPipeline.pkl'
loaded_pipeline = joblib.load(filename=pipelineName)

# The pipeline was fitted before it was saved, so call .transform only
# (no fit) to reproduce the original scaling of X.
X_trans_loaded = loaded_pipeline.transform(X)

## Loading Model

In [12]:
# Option 1 - joblib
# Read the saved classifier back from disk.
modelName = 'ultimateMNISTClassifierJoblibSave.pkl'
loaded_log_clf = joblib.load(filename=modelName)

# Score the restored model on the data transformed by the restored pipeline.
loaded_log_clf.score(X_trans_loaded, y)

0.9347

In [13]:
# Option 2 - pickle
modelName = 'ultimateMNISTClassifierPickleSave.pkl'

# pickle reads from an open file object, so open the file in binary read mode.
# Only unpickle files you trust: loading a pickle can execute arbitrary code.
with open(modelName, mode='rb') as model_file:
    loaded_log_clf = pickle.load(model_file)

# Score the restored model on the data transformed by the restored pipeline.
loaded_log_clf.score(X_trans_loaded, y)

0.9347