In [1]:
import numpy as np
import pandas as pd
import cv2, os, random
from matplotlib import pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix

In [2]:
data_dir = 'Chinese_MINST_Dataset'
csv_data = os.path.join(data_dir, 'chinese_mnist.csv')
imgs_dir = os.path.join(data_dir, 'data/data')
data = pd.read_csv(csv_data)
data.head()

Unnamed: 0,suite_id,sample_id,code,value,character
0,1,1,10,9,九
1,1,10,10,9,九
2,1,2,10,9,九
3,1,3,10,9,九
4,1,4,10,9,九


In [3]:
imgs_id = data.iloc[:,:3].values
imgs_label = data['value'].tolist()
imgs_name = []

#get image name
for i in range(len(imgs_id)):
    imgs_name.append(f'input_{imgs_id[i][0]}_{imgs_id[i][1]}_{imgs_id[i][2]}.jpg')

In [30]:

def get_train_test_set(imgs_id, imgs_name, train_size, test_size):
    imgs_lists = []

    # according to different value, divide original data into 15 lists
    for i in range(15):
        imgs_lists.append([])
    for i in range(len(imgs_id)):
        imgs_lists[imgs_id[i][2]-1].append(imgs_name[i])

    # Make sure that random sampling is done in a stratified way that each class has 333 pictures in training set
    train_class_size = train_size // 15
    test_class_size = test_size // 15

    # Read the same number of pictures from each list.
    imgs_train_set, imgs_test_set, train_labels, test_labels = [], [], [], []
    for i in range(15):
        train_sample = random.sample(imgs_lists[i], train_class_size)
        imgs_train_set.extend(train_sample)
        
        # Select the test set from the remaining samples.
        remain_sample = set(imgs_lists[i]) - set(train_sample)
        test_sample = random.sample(list(remain_sample), test_class_size)
        imgs_test_set.extend(test_sample)
    
        train_labels.extend([i+1]*train_class_size)
        test_labels.extend([i+1]*test_class_size)

    # Get train set and test set
    train_set = []
    test_set = []
    
    # read images and reshape
    for i in range(len(imgs_train_set)):
        img_name = os.path.join(imgs_dir, imgs_train_set[i])
        img = cv2.imread(img_name, cv2.IMREAD_GRAYSCALE)
        train_set.append(img.reshape(-1))
    for i in range(len(imgs_test_set)):
        img_name = os.path.join(imgs_dir, imgs_test_set[i])
        img = cv2.imread(img_name, cv2.IMREAD_GRAYSCALE)
        test_set.append(img.reshape(-1))

    return train_set, train_labels, test_set, test_labels

In [31]:
train_set, train_labels, test_set, test_labels = get_train_test_set(imgs_id, imgs_name, 5000, 1000)
larger_train_set, larger_train_labels, larger_test_set, larger_test_labels = get_train_test_set(imgs_id, imgs_name, 10000, 1000)

In [32]:
def display_proformance(labels, predict):
    f1 = f1_score(labels, predict, average='macro')
    accuracy = accuracy_score(labels, predict)
    precision = precision_score(labels, predict, average='macro')
    recall = recall_score(labels, predict, average='macro')
    confusion = confusion_matrix(labels, predict)
    
    print('f1_score: ', f1)
    print('accuracy_score: ', accuracy)
    print('precision: ', precision)
    print('recall_score: ', recall)
    print('confusion matrix: \n', confusion)

# KNN Classifier

In [33]:
# knn1's train set has 5000 images, knn2's train set has 10000 images
knn1 = KNeighborsClassifier(n_neighbors=3)
knn1.fit(train_set, train_labels)

knn2 = KNeighborsClassifier(n_neighbors=3)
knn2.fit(larger_train_set, larger_train_labels)

0,1,2
,n_neighbors,3
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [34]:
knn1_pred = knn1.predict(test_set)
knn2_pred = knn2.predict(larger_test_set)

display_proformance(test_labels, knn1_pred)
display_proformance(larger_test_labels, knn2_pred)

f1_score:  0.34049692751232086
accuracy_score:  0.3333333333333333
precision:  0.528857510862342
recall_score:  0.3333333333333333
confusion matrix: 
 [[31 17  0  0  4  1  3  1  0  1  2  1  2  3  0]
 [ 0 60  6  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 29 32  5  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 27 18 21  0  0  0  0  0  0  0  0  0  0  0]
 [ 3 26 12  5 15  1  1  3  0  0  0  0  0  0  0]
 [ 0 15 25 14  0  8  1  0  0  2  0  0  1  0  0]
 [ 1 50  3  1  0  0  9  0  1  1  0  0  0  0  0]
 [ 0 26  6  0  0  1  5 16  0  0 10  0  2  0  0]
 [ 0 18  1  0  0  2  0  0 45  0  0  0  0  0  0]
 [ 0 25  4  3  0  3 12  4  0 11  1  0  0  3  0]
 [ 0 21  2  2  0  0  0  1  0  0 33  0  7  0  0]
 [ 1 16  6  8  7  3  3  2  3  1  1  5  3  7  0]
 [ 0 21  4  2  0  0  1  0  0  0 22  0 16  0  0]
 [ 4 35  4  1  0  2  5  0  1  0  0  0  0 14  0]
 [ 0 19 11  2  1  3  8  3  1  1  1  0  0  2 14]]
f1_score:  0.43726359161409484
accuracy_score:  0.42323232323232324
precision:  0.6090719316346422
recall_score:  0.423232323232

### As the training set grows larger, the performance of the KNN classifier improves.

# Decision Tree classifier

In [35]:
# dt1's train set has 5000 images, dt2's train set has 10000 images
dt1 = DecisionTreeClassifier()
dt1.fit(train_set, train_labels)

dt2 = DecisionTreeClassifier()
dt2.fit(train_set, train_labels)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [36]:
dt1_pred = dt1.predict(test_set)
dt2_pred = dt2.predict(larger_test_set)

display_proformance(test_labels, dt1_pred)
display_proformance(larger_test_labels, dt2_pred)

f1_score:  0.23722359386723696
accuracy_score:  0.2404040404040404
precision:  0.2392829601101868
recall_score:  0.24040404040404037
confusion matrix: 
 [[24  0  0  2  4  6  4  4  0  2  4  8  4  3  1]
 [ 0 49  1  1  2  2  2  1  2  3  2  0  0  1  0]
 [ 0  7 19  8  0  1  2  4  7  6  5  0  1  3  3]
 [ 0  4  9  7  2  9  6  5  2  2  4  3  4  6  3]
 [ 2  1  1  4 14  8  2  2  6  3  1  6  6  4  6]
 [ 2  2  7  7  3  9  4  8  3  5  0  3  5  4  4]
 [ 0  0  7  6  3  3 15  5  2  8  0  4  4  5  4]
 [ 1  2  4  3  5  3  7  9  7 12  2  2  4  2  3]
 [ 0  5  2  4  4  3  9  2 21  4  4  1  1  3  3]
 [ 1  2  4  3  7  1  6  8  3 10  4  1  4  4  8]
 [ 1  2  1  2  2  0  2  0  2  4 30  2 12  5  1]
 [ 4  0  2  4  0  8  1  4  0  3  8  6 12  9  5]
 [ 4  0  3  5  3  2  5  0  0  4 18  3 11  5  3]
 [ 5  2  3  4  7  7  6  2  3  6  4  4  6  7  0]
 [ 1  1  3  4  5  8  7  5  4  9  3  3  2  4  7]]
f1_score:  0.543810117096307
accuracy_score:  0.5444444444444444
precision:  0.5472831272968192
recall_score:  0.5444444444444

### As the training set grows larger, the performance of the DT classifier improves.

# SGD Classifier

In [37]:
# sgd1's train set has 5000 images, sgd2's train set has 10000 images
sgd1 = SGDClassifier(max_iter=250)
sgd1.fit(train_set, train_labels)

0,1,2
,loss,'hinge'
,penalty,'l2'
,alpha,0.0001
,l1_ratio,0.15
,fit_intercept,True
,max_iter,250
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [40]:
sgd2 = SGDClassifier(max_iter=250)
sgd2.fit(larger_train_set, larger_train_labels)

0,1,2
,loss,'hinge'
,penalty,'l2'
,alpha,0.0001
,l1_ratio,0.15
,fit_intercept,True
,max_iter,250
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [41]:
sgd1_pred = sgd1.predict(test_set)
sgd2_pred = sgd2.predict(larger_test_set)

display_proformance(test_labels, sgd1_pred)
display_proformance(larger_test_labels, sgd2_pred)

f1_score:  0.2652811762386925
accuracy_score:  0.26666666666666666
precision:  0.2756485732805127
recall_score:  0.2666666666666666
confusion matrix: 
 [[36  0  0  1  4  3  2  3  0  2  3  6  4  0  2]
 [ 1 24 15  1  4  1  2  3  4  1  3  2  1  4  0]
 [ 1 10 19  6  1  9  2  3  1  3  1  3  4  0  3]
 [ 1  2 13 16  1  9  4  3  2  1  2  4  6  0  2]
 [ 4  6  5  0 11  5  4  1  3  6  4  4  2  2  9]
 [ 2  0 14 10  0 19  5  1  2  4  2  3  1  1  2]
 [ 9  1  5  3  6 10  7  4  2  2  0  2  6  2  7]
 [ 2  3 12  0  1  7  0 14  3  2  4  4  4  4  6]
 [ 4  5 13  0  1  4  2  5 22  2  2  1  3  0  2]
 [ 0  2  4  6  1  4  2  5  5 17  3  3  4  3  7]
 [ 2  5  9  2  1  4  0  4  1  3 24  3  6  0  2]
 [ 5  1  6  3  5 12  3  4  1  0  7 13  2  2  2]
 [ 2  0  6  3  0  6  3  4  1  1 15  2 16  2  5]
 [ 6  0  5  1  2  3  2  3  1  6  7 10  5  9  6]
 [ 1  0  4  3  5  3  7  4  4  9  3  2  1  3 17]]
f1_score:  0.3034342807004373
accuracy_score:  0.30606060606060603
precision:  0.3160646276988521
recall_score:  0.306060606060

### As the training set grows larger, SGD classifier gets better proformance.