In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import joblib

# catch data

In [2]:
data = pd.read_csv('./chinese_mnist.csv',encoding='utf-8')
data.shape[0], data.shape[1]

(15000, 5)

In [3]:
# property of data
print(data.head(20))
print(type(np.array(data)))
print(type(data))

    suite_id  sample_id  code      value character
0          1          1     1          0         零
1          1          1     2          1         一
2          1          1     3          2         二
3          1          1     4          3         三
4          1          1     5          4         四
5          1          1     6          5         五
6          1          1     7          6         六
7          1          1     8          7         七
8          1          1     9          8         八
9          1          1    10          9         九
10         1          1    11         10         十
11         1          1    12        100         百
12         1          1    13       1000         千
13         1          1    14      10000         万
14         1          1    15  100000000         亿
15         1          2     1          0         零
16         1          2     2          1         一
17         1          2     3          2         二
18         1          2     4  

In [4]:
image = []
start = time.time()
for i in range(100):
    for j in range(10):
        for k in range(15):
            path = './data/input_{}_{}_{}.jpg'.format(i+1,j+1,k+1)
            image.append(plt.imread(path))
end = time.time()
print('running time {}'.format(end-start))

running time 6.768014192581177


In [5]:
# property of image
image = np.array(image)
print(image.shape)
print(type(image))

(15000, 64, 64)
<class 'numpy.ndarray'>


# data preprocessing

In [6]:
y = data['code']
x = image.reshape([15000,-1])

In [7]:
# property
print(y.shape)
print(x.shape)

(15000,)
(15000, 4096)


In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 19)

# Feature engineering

In [9]:
n_components = 100
pca = PCA(n_components=n_components).fit(x_train)

In [10]:
x_train_pca = pca.transform(x_train)
x_test_pca = pca.transform(x_test)

In [11]:
# property
print(x_train_pca.shape)

(11250, 100)


# train model

In [12]:
model = SVC(C=3,kernel='rbf')
%time model.fit(x_train_pca,y_train)

Wall time: 42.9 s


SVC(C=3)

In [13]:
y_pre = model.predict(x_test_pca)

In [15]:
print(classification_report(y_test,y_pre))

              precision    recall  f1-score   support

           1       0.95      0.94      0.95       235
           2       0.86      0.99      0.92       253
           3       0.78      0.78      0.78       249
           4       0.81      0.74      0.78       255
           5       0.91      0.90      0.90       257
           6       0.92      0.88      0.90       260
           7       0.73      0.88      0.80       254
           8       0.84      0.84      0.84       221
           9       0.90      0.94      0.92       255
          10       0.87      0.79      0.83       277
          11       0.82      0.83      0.82       265
          12       0.85      0.81      0.83       241
          13       0.83      0.73      0.77       267
          14       0.82      0.79      0.80       247
          15       0.85      0.92      0.88       214

    accuracy                           0.85      3750
   macro avg       0.85      0.85      0.85      3750
weighted avg       0.85   

# Optimiser


In [19]:
param_grid = {'C':[1,3,5,7], 'kernel':['rbf']}
grid = GridSearchCV(SVC(),param_grid=param_grid,cv=3)

In [20]:
%time grid.fit(x_train_pca,y_train)

Wall time: 6min 30s


GridSearchCV(cv=3, estimator=SVC(),
             param_grid={'C': [1, 3, 5, 7], 'kernel': ['rbf']})

In [28]:
model_svc = grid.best_estimator_

In [27]:
# save model
joblib.dump(grid.best_estimator_, 'SVC_model.pkl')

['SVC_model.pkl']

In [29]:
y_pre_opt = model_svc.predict(x_test_pca)

In [31]:
print(classification_report(y_test,y_pre_opt))

              precision    recall  f1-score   support

           1       0.94      0.94      0.94       235
           2       0.89      0.99      0.94       253
           3       0.78      0.82      0.80       249
           4       0.86      0.75      0.80       255
           5       0.90      0.91      0.90       257
           6       0.92      0.88      0.90       260
           7       0.76      0.88      0.81       254
           8       0.84      0.84      0.84       221
           9       0.91      0.95      0.93       255
          10       0.87      0.80      0.83       277
          11       0.83      0.84      0.84       265
          12       0.85      0.80      0.83       241
          13       0.84      0.72      0.78       267
          14       0.80      0.79      0.80       247
          15       0.86      0.93      0.89       214

    accuracy                           0.85      3750
   macro avg       0.86      0.86      0.85      3750
weighted avg       0.86   

In [34]:
estimator = joblib.load('./SVC_model.pkl')

In [35]:
y_pre_opt = estimator.predict(x_test_pca)
print(classification_report(y_test,y_pre_opt))

              precision    recall  f1-score   support

           1       0.94      0.94      0.94       235
           2       0.89      0.99      0.94       253
           3       0.78      0.82      0.80       249
           4       0.86      0.75      0.80       255
           5       0.90      0.91      0.90       257
           6       0.92      0.88      0.90       260
           7       0.76      0.88      0.81       254
           8       0.84      0.84      0.84       221
           9       0.91      0.95      0.93       255
          10       0.87      0.80      0.83       277
          11       0.83      0.84      0.84       265
          12       0.85      0.80      0.83       241
          13       0.84      0.72      0.78       267
          14       0.80      0.79      0.80       247
          15       0.86      0.93      0.89       214

    accuracy                           0.85      3750
   macro avg       0.86      0.86      0.85      3750
weighted avg       0.86   