In [1]:
import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import cv2
from scipy import ndimage

from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import PCA

In [2]:
# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
 X, y, test_size=1/7.0, random_state=0)

In [4]:
# Loại bỏ các điểm ảnh ở biên (giá trị 0)
def crop(data):
    data = data.reshape(28,28)
    r = data[~np.all(data == 0, axis=1)] # loại bỏ hàng toàn 0
    idx = np.argwhere(np.all(r[..., :] == 0, axis=0)) # vị trí cột toàn 0
    c = np.delete(r, idx, axis=1) # loại bỏ cột toàn 0
    res = cv2.resize(c, dsize=(28, 28)) # Trả lại về size 28x28
    res = res.flatten() # Chuyển lại thành array 1 chiều như ban đầu
    return res

In [5]:
def cropData(data):
    return np.apply_along_axis(crop, 1, data)

In [6]:
preprocess_pipeline = make_pipeline(FunctionTransformer(cropData),
                                    StandardScaler())

In [7]:
X_train_crop = preprocess_pipeline.fit_transform(X_train)
X_test_crop = preprocess_pipeline.fit_transform(X_test)

In [8]:
start = time.time()
logreg_crop_model = LogisticRegression(solver='lbfgs',max_iter= 5000).fit(X_train_crop, y_train)
end = time.time()

print('Time to fit model: {}s'.format(end-start))

Time to fit model: 669.5262906551361s


In [9]:
# save the model to disk
import pickle
filename = 'final_model.sav'
pickle.dump(logreg_crop_model, open(filename, 'wb'))

In [10]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))

preds_train_crop = loaded_model.predict(X_train_crop)
preds_test_crop = loaded_model.predict(X_test_crop)

# Evaluation
print('Train Error: {} %'.format(np.mean(preds_train_crop != y_train)*100))
print('Test Error: {} %\n'.format(np.mean(preds_test_crop != y_test)*100))

Train Error: 4.35 %
Test Error: 6.78 %

