I - Installs & Imports

In [None]:
### Installs ###

# !pip install --upgrade pip
# !pip install keras_preprocessing
# !pip install keras.utils
# !pip install np_utils

In [15]:
### Imports ###

import opendatasets as od                
import os

import pandas as pd
import numpy as np

from keras_preprocessing import image  
from keras.utils import to_categorical
 
import PIL
from PIL import Image


II - Data collecting

We first import our data from Kaggle. To do so, we use the `download` function from the `opendatasets` module. The Kaggle API will ask us for our credentials. Here is how to obtain them:

1) sign in to kaggle
2) go to the API section
3) select the "create token" option; a JSON file will be downloaded
4) enter your username first, then the key, both without the surrounding double quotes

In [None]:
# Fetch the Chinese MNIST dataset from Kaggle with opendatasets; as described above,
# the Kaggle API asks for the username and key when this cell runs.
od.download("https://www.kaggle.com/datasets/gpreda/chinese-mnist/download?datasetVersionNumber=7")

The cell above reports the directory where the dataset has been downloaded. We use the os module to list the files it contains, as follows:

In [None]:
# Folder created by the download cell. The path is assembled with os.path.join
# instead of a '.\...' literal: "\c" is not a valid string escape (Python warns
# about it) and a hard-coded backslash separator only works on Windows.
datadir = os.path.join('.', 'chinese-mnist')

# Show the files and sub-folders shipped with the dataset.
os.listdir(datadir)

In [None]:
# Read the CSV index that ships with the dataset. os.path.join picks the right
# separator for the current OS and avoids the invalid "\c" escape sequence.
filename = os.path.join(datadir, 'chinese_mnist.csv')
chinese_mnist_dtf = pd.read_csv(filename)

chinese_mnist_dtf.head(15)

# suite_id (= volunteer id) takes the values 1..100, i.e. range(1, 101)
# sample_id takes the values 1..10, i.e. range(1, 11)
# code takes the values 1..15, i.e. range(1, 16)

Our images are in the directory '.\chinese-mnist\data\data'. 
Each file name has the form 'input_volunteer_sample_number.jpg', where volunteer ranges from 1 to 100, sample from 1 to 10 and number from 1 to 15.

Let's create a dictionary of the pixel value matrices of our images, keyed by 'volunteer_sample_number' (with the number zero-padded to two digits).

In [16]:
### We transform the images to pixel values matrices stored in a dictionary ###

# Maps '<volunteer>_<sample>_<number>' (number zero-padded to two digits)
# to the pixel matrix returned by image.img_to_array for that image.
data = {}

# Folder holding the .jpg files. Built with os.path.join so the separator matches
# the current OS and no invalid "\c" / "\d" / "\i" escapes appear in a literal.
img_dir = os.path.join('.', 'chinese-mnist', 'data', 'data')

for nb in range(1, 16):
    for vol in range(1, 101):
        for sample in range(1, 11):

            img_path = os.path.join(img_dir, 'input_{}_{}_{}.jpg'.format(vol, sample, nb))
            img = image.load_img(img_path, target_size=(64, 64), color_mode='grayscale')
            px_mat = image.img_to_array(img)

            # Zero-pad the number so the last two characters of every key
            # are always the class number ('03', '12', ...).
            data['{}_{}_{:02d}'.format(vol, sample, nb)] = px_mat



### line to show the image from the pixel value matrix ###

# Image.fromarray(x[:,:,0]).show() with x a pixel value matrix

III - Data preprocessing

Test set :

In [17]:
# Build a test set in which every output class is equally represented.
# It holds 20% of the data (3000 images): for each of the 15 numbers and each of
# the 100 volunteers, 2 of the 10 samples are drawn at random.

np.random.seed(0)  # Fixed seed so the split can be reproduced

test_keys = []

for nb in range(1, 16):

    # Keys always end with the class number written on two digits.
    suffix = '_0' + str(nb) if nb < 10 else '_' + str(nb)

    for vol in range(1, 101):

        picked_samples = np.random.choice(np.arange(1, 11), 2, replace=False)
        test_keys.extend(str(vol) + '_' + str(spl) + suffix for spl in list(picked_samples))

# Pair every selected key with its pixel matrix, then shuffle the pairs in place.
test_set = [(data[key], key) for key in test_keys]

np.random.shuffle(test_set)

Train set :

In [18]:
### Let's get our training set

# Every key that was not drawn for the test set, kept in the insertion order of
# `data`. Filtering against a set is linear, whereas calling list.remove once
# per test key rescans the whole list each time (quadratic).
test_key_set = set(test_keys)
train_keys = [key for key in data.keys() if key not in test_key_set]

# Pair every training key with its pixel matrix, then shuffle the pairs in place.
train_set = [(data[key], key) for key in train_keys]

np.random.shuffle(train_set)

Batches if needed :

In [20]:
## With batches

N_CLASSES = 15          # numbers 1..15
N_BATCHES = 5
PER_CLASS = 800         # 1000 images per class minus the 200 moved to the test set
PER_BATCH_CLASS = PER_CLASS // N_BATCHES   # 160 images of each class in each batch

# Stable sort on the class number (last two characters of the key): each class
# then occupies one contiguous run of PER_CLASS keys in the list.
train_keys.sort(key = lambda x : int(x[-2:]))

# The offset arithmetic below relies on that exact layout.
assert len(train_keys) == N_CLASSES * PER_CLASS

# Let's now extract our training set in five class-balanced batches

batches = [[] for _ in range(N_BATCHES)]

for nb in range(N_CLASSES):

    # replace=False must be the boolean: the string 'False' is truthy, which made
    # numpy sample WITH replacement (duplicated images, others never used).
    # Without replacement the 800 offsets form a permutation, so every training
    # image of the class ends up in exactly one batch.
    indices_binned = np.random.choice(PER_CLASS, (N_BATCHES, PER_BATCH_CLASS), replace=False)

    for batch, offsets in zip(batches, indices_binned):
        for offset in offsets:
            key = train_keys[offset + nb * PER_CLASS]
            # Store each image with its OWN key; the label is read from the key
            # later on, so reusing another batch's key would mislabel the image.
            batch.append((data[key], key))

batch1, batch2, batch3, batch4, batch5 = batches

# Shuffle each batch in place so the classes are not presented in order.
np.random.shuffle(batch1)
np.random.shuffle(batch2)
np.random.shuffle(batch3)
np.random.shuffle(batch4)
np.random.shuffle(batch5)

IV - Preprocess function 

In [None]:
def preprocess(batch) : 
    """Convert a batch of (pixel matrix, key) pairs into arrays ready for the CNN.

    Parameters
    ----------
    batch : sequence of (array, str)
        Each item holds a pixel matrix that fits a (64, 64, 1) slot and its key.
        The last two characters of the key are the 1-based class number
        ('01' .. '15').

    Returns
    -------
    X : numpy array of shape (len(batch), 1, 64, 64), dtype float32
        The pixel matrices divided by 255.
    Y : array of shape (len(batch), 15, 1)
        One-hot encoded labels produced by to_categorical (class numbers are
        shifted to 0-based indices first).
    """

    nb_input = len(batch)
    X = np.zeros((nb_input, 64, 64, 1))
    Y = np.zeros((nb_input))

    for i, item in enumerate(batch):

        X[i] = item[0]
        # '01'..'15' -> 0..14
        Y[i] = float(item[1][-2:]) - 1

    X = X.reshape(nb_input, 1, 64, 64)
    # astype returns a new array: the result has to be assigned back, otherwise
    # the scaling is silently dropped and X keeps its raw pixel values.
    X = X.astype('float32') / 255

    Y = to_categorical(Y, num_classes=15)
    Y = Y.reshape(nb_input, 15, 1)

    return X, Y