# Optical Character Recognition

I did everything using Google Colab.

In [0]:
#@title
!pip install kaggle

from google.colab import files

from googleapiclient.discovery import build
import io, os
from googleapiclient.http import MediaIoBaseDownload
from google.colab import auth
auth.authenticate_user()
drive_service = build('drive', 'v3')
results = drive_service.files().list(
        q="name = 'kaggle.json'", fields="files(id)").execute()
kaggle_api_key = results.get('files', [])
filename = "/content/.kaggle/kaggle.json"
os.makedirs(os.path.dirname(filename), exist_ok=True)
request = drive_service.files().get_media(fileId=kaggle_api_key[0]['id'])
fh = io.FileIO(filename, 'wb')
downloader = MediaIoBaseDownload(fh, request)
done = False
while done is False:
    status, done = downloader.next_chunk()
    print("Download %d%%." % int(status.progress() * 100))
os.chmod(filename, 600)

!kaggle competitions download -c optical-character-recognition-2018

TRAIN_IMAGES = "/content/.kaggle/competitions/optical-character-recognition-2018/trainImages.dmp"
TRAIN_LABELS = "/content/.kaggle/competitions/optical-character-recognition-2018/trainLabels.txt"
TEST_IMAGES = "/content/.kaggle/competitions/optical-character-recognition-2018/testImages.dmp"

## Imports

In [2]:
import numpy as np
import tensorflow as tf
import csv
import warnings

from scipy.misc import imresize

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Dropout, Flatten

from matplotlib import pyplot as plt
%matplotlib inline

Using TensorFlow backend.


In [0]:
warnings.filterwarnings("ignore")

In [4]:
tf.test.is_gpu_available()

True

## Preparations

We'll write some helpful functions.

In [0]:
def read_dmp(data):
    """
        Gets the data from .dmp file
    """
    i = 0
    images = []
    while i < len(data):
        width = data[i]
        i += 1
        height = data[i]
        i += 1
        size = int(width) * height
        image = data[i:i + size]
        i += size
        images.append(image.reshape(width, height))
    return images

In [0]:
def create_dataset():
    """
        Creates dataset
    """
    y = []
    with open(TRAIN_LABELS, "rt") as csvfile:
        values = csv.reader(csvfile, delimiter=',')
        for value in values:
            y.append(value[1])
    y = y[1:]
    
    X = np.fromfile(TRAIN_IMAGES, dtype=np.uint8)
    X = read_dmp(X)
    
    X_test = np.fromfile(TEST_IMAGES, dtype=np.uint8)
    X_test = read_dmp(X_test)
    
    return X, y, X_test

In [0]:
def encode(data):
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(data)
    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    return onehot_encoded, label_encoder

## Real work

First we need to prepare the data.

In [0]:
# Some magic numbers
SIZE = 28
SQUARE = SIZE * SIZE

In [0]:
# Collect raw data
X, y, X_test = create_dataset()

In [0]:
# Make images have same size
X = list(map(lambda image: np.ravel(imresize(image, (SIZE, SIZE))), X))
X_test = list(map(lambda image: np.ravel(imresize(image, (SIZE, SIZE))), X_test))

In [0]:
# One-hot encode labels
y, encoder = encode(y)

# Remember the number of classes
NUM_CLASSES = len(y[0])

In [0]:
# Splitting datatest for train and validation
X_train, X_val, y_train, y_val = train_test_split(X,
                                                  y,
                                                  test_size=0.33,
                                                  random_state=42)

In [0]:
# Keras likes numpy
X_train = np.array(X_train)
X_val = np.array(X_val)
y_train = np.array(y_train)
y_val = np.array(y_val)
X_test = np.array(X_test)

Let's write simple model first.

In [0]:
def build_model():
    model = Sequential()
    model.add(Dense(512, input_shape=(SQUARE,), activation='relu'))
    model.add(Dense(NUM_CLASSES, activation='softmax'))
    return model

In [0]:
model = build_model()

In [36]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 512)               401920    
_________________________________________________________________
dense_4 (Dense)              (None, 316)               162108    
Total params: 564,028
Trainable params: 564,028
Non-trainable params: 0
_________________________________________________________________


In [0]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [38]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=200, verbose=1)

Train on 134000 samples, validate on 66000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

Epoch 5/10
Epoch 6/10
Epoch 7/10

Epoch 8/10
Epoch 9/10
Epoch 10/10



<keras.callbacks.History at 0x7f2f84e02cc0>

In [0]:
prediction = model.predict(X_test)

In [0]:
fd = open('baselineNN.txt', 'w')
fd.write('Id,Category\n')
for i, label in enumerate(prediction):
    fd.write(str(i) + ',' + encoder.inverse_transform(np.argmax(label)) + '\n')
fd.close()

files.download('baselineNN.txt')  

Simple model gives more than 98%.

Now let's tune the architecture.

In [0]:
# Some magic numbers
SIZE = 32
SQUARE = SIZE * SIZE

In [0]:
# Collect raw data
X, y, X_test = create_dataset()

In [0]:
# Make images have same size
X = list(map(lambda image: imresize(image, (SIZE, SIZE)), X))
X_test = list(map(lambda image: imresize(image, (SIZE, SIZE)), X_test))

In [0]:
# One-hot encode labels
y, encoder = encode(y)

# Remember the number of classes
NUM_CLASSES = len(y[0])

In [0]:
# Splitting datatest for train and validation
X_train, X_val, y_train, y_val = train_test_split(X,
                                                  y,
                                                  test_size=0.33,
                                                  random_state=42)

In [0]:
# Keras likes numpy
X_train = np.array(X_train).reshape(-1, SIZE, SIZE, 1)
X_val = np.array(X_val).reshape(-1, SIZE, SIZE, 1)
y_train = np.array(y_train)
y_val = np.array(y_val)
X_test = np.array(X_test).reshape(-1, SIZE, SIZE, 1)

In [0]:
def build_model():
    model = Sequential()
    model.add(Conv2D(32, kernel_size=(3, 3), input_shape=(SIZE, SIZE, 1), padding='same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=2))
    model.add(Conv2D(64, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=2))
    model.add(Dropout(rate=0.3))
    model.add(Conv2D(128, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=2))
    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(Dense(NUM_CLASSES, activation='softmax'))
    return model

In [0]:
model = build_model()

In [49]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 32, 32, 32)        320       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 16, 16, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 16, 16, 64)        18496     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 8, 8, 64)          0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 8, 8, 64)          0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 8, 8, 128)         73856     
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 4, 4, 128)         0         
__________

In [0]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [51]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=200, verbose=1)

Train on 134000 samples, validate on 66000 samples
Epoch 1/10
Epoch 2/10

Epoch 3/10
Epoch 4/10
 10600/134000 [=>............................] - ETA: 16s - loss: 0.1136 - acc: 0.9752

Epoch 5/10

Epoch 6/10
Epoch 7/10
 24200/134000 [====>.........................] - ETA: 15s - loss: 0.0881 - acc: 0.9843

Epoch 8/10


Epoch 9/10
Epoch 10/10



<keras.callbacks.History at 0x7f2f7b68a2b0>

In [0]:
prediction = model.predict(X_test)

In [0]:
fd = open('CNN.txt', 'w')
fd.write('Id,Category\n')
for i, label in enumerate(prediction):
    fd.write(str(i) + ',' + encoder.inverse_transform(np.argmax(label)) + '\n')
fd.close()

files.download('CNN.txt')  

One more time.

In [0]:
# Some magic numbers
SIZE = 32
SQUARE = SIZE * SIZE

In [0]:
# Collect raw data
X, y, X_test = create_dataset()

In [0]:
# Make images have same size
X = list(map(lambda image: imresize(image, (SIZE, SIZE)), X))
X_test = list(map(lambda image: imresize(image, (SIZE, SIZE)), X_test))

In [0]:
# One-hot encode labels
y, encoder = encode(y)

# Remember the number of classes
NUM_CLASSES = len(y[0])

In [0]:
# Splitting datatest for train and validation
X_train, X_val, y_train, y_val = train_test_split(X,
                                                  y,
                                                  test_size=0.33,
                                                  random_state=42)

In [0]:
# Keras likes numpy
X_train = np.array(X_train).reshape(-1, SIZE, SIZE, 1)
X_val = np.array(X_val).reshape(-1, SIZE, SIZE, 1)
y_train = np.array(y_train)
y_val = np.array(y_val)
X_test = np.array(X_test).reshape(-1, SIZE, SIZE, 1)

In [0]:
def build_model():
    model = Sequential()
    model.add(Conv2D(64, kernel_size=(3, 3), input_shape=(SIZE, SIZE, 1), padding='same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=2))
    model.add(Conv2D(128, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=2))
    model.add(Dropout(rate=0.35))
    model.add(Conv2D(256, kernel_size=(4, 4), padding='same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=2))
    model.add(Flatten())
    model.add(Dense(600, activation='relu'))
    model.add(Dense(400, activation='relu'))
    model.add(Dense(NUM_CLASSES, activation='softmax'))
    return model

In [0]:
model = build_model()

In [74]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_10 (Conv2D)           (None, 32, 32, 64)        640       
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 16, 16, 64)        0         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 16, 16, 128)       73856     
_________________________________________________________________
max_pooling2d_9 (MaxPooling2 (None, 8, 8, 128)         0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 8, 8, 128)         0         
_________________________________________________________________
conv2d_12 (Conv2D)           (None, 8, 8, 256)         524544    
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 4, 4, 256)         0         
__________

In [0]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [76]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=20, batch_size=64, verbose=1)

Train on 134000 samples, validate on 66000 samples
Epoch 1/20

Epoch 2/20

Epoch 3/20
 19776/134000 [===>..........................] - ETA: 55s - loss: 0.0878 - acc: 0.9749

Epoch 4/20
 12864/134000 [=>............................] - ETA: 59s - loss: 0.0666 - acc: 0.9812

Epoch 5/20
 10176/134000 [=>............................] - ETA: 1:00 - loss: 0.0544 - acc: 0.9840

Epoch 6/20
  9280/134000 [=>............................] - ETA: 59s - loss: 0.0501 - acc: 0.9856

Epoch 7/20
  8896/134000 [>.............................] - ETA: 1:00 - loss: 0.0378 - acc: 0.9884

Epoch 8/20
  8768/134000 [>.............................] - ETA: 1:00 - loss: 0.0365 - acc: 0.9888

Epoch 9/20
  8768/134000 [>.............................] - ETA: 1:01 - loss: 0.0254 - acc: 0.9922

Epoch 10/20
  8640/134000 [>.............................] - ETA: 1:00 - loss: 0.0268 - acc: 0.9916

Epoch 11/20
  8512/134000 [>.............................] - ETA: 1:01 - loss: 0.0296 - acc: 0.9925

Epoch 12/20
  8512/134000 [>.............................] - ETA: 1:01 - loss: 0.0316 - acc: 0.9911

Epoch 13/20
  8640/134000 [>.............................] - ETA: 1:02 - loss: 0.0332 - acc: 0.9912

Epoch 14/20
  8640/134000 [>.............................] - ETA: 1:01 - loss: 0.0233 - acc: 0.9939

Epoch 15/20
  8640/134000 [>.............................] - ETA: 1:01 - loss: 0.0257 - acc: 0.9940

Epoch 16/20
  8640/134000 [>.............................] - ETA: 1:02 - loss: 0.0177 - acc: 0.9948

Epoch 17/20
  8384/134000 [>.............................] - ETA: 1:01 - loss: 0.0313 - acc: 0.9903

Epoch 18/20
  8512/134000 [>.............................] - ETA: 1:01 - loss: 0.0210 - acc: 0.9947

Epoch 19/20
  8512/134000 [>.............................] - ETA: 1:01 - loss: 0.0322 - acc: 0.9926

Epoch 20/20
  8512/134000 [>.............................] - ETA: 1:01 - loss: 0.0258 - acc: 0.9945



<keras.callbacks.History at 0x7f4ab45d5080>

In [0]:
prediction = model.predict(X_test)

In [0]:
fd = open('CNN_Second.txt', 'w')
fd.write('Id,Category\n')
for i, label in enumerate(prediction):
    fd.write(str(i) + ',' + encoder.inverse_transform(np.argmax(label)) + '\n')
fd.close()

files.download('CNN_Second.txt')  

In [0]:
# Some magic numbers
SIZE = 28
SQUARE = SIZE * SIZE

In [0]:
# Collect raw data
X, y, X_test = create_dataset()

In [0]:
# Make images have same size
X = list(map(lambda image: imresize(image, (SIZE, SIZE)), X))
X_test = list(map(lambda image: imresize(image, (SIZE, SIZE)), X_test))

In [0]:
# One-hot encode labels
y, encoder = encode(y)

# Remember the number of classes
NUM_CLASSES = len(y[0])

In [0]:
# Splitting datatest for train and validation
X_train, X_val, y_train, y_val = train_test_split(X,
                                                  y,
                                                  test_size=0.3,
                                                  random_state=42)

In [0]:
# Keras likes numpy
X_train = np.array(X_train).reshape(-1, SIZE, SIZE, 1)
X_val = np.array(X_val).reshape(-1, SIZE, SIZE, 1)
y_train = np.array(y_train)
y_val = np.array(y_val)
X_test = np.array(X_test).reshape(-1, SIZE, SIZE, 1)

In [0]:
def build_model():
    model = Sequential()
    model.add(Conv2D(64, kernel_size=(3, 3), input_shape=(SIZE, SIZE, 1), padding='same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=2))
    model.add(Conv2D(128, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=2))
    model.add(Dropout(rate=0.35))
    model.add(Conv2D(256, kernel_size=(4, 4), padding='same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=2))
    model.add(Flatten())
    model.add(Dense(600, activation='relu'))
    model.add(Dropout(rate=0.35))
    model.add(Dense(400, activation='relu'))
    model.add(Dropout(rate=0.35))
    model.add(Dense(NUM_CLASSES, activation='softmax'))
    return model

In [0]:
model = build_model()

In [46]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 28, 28, 64)        640       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 14, 14, 64)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 14, 14, 128)       73856     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 7, 7, 128)         0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 7, 7, 128)         0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 7, 7, 256)         524544    
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 3, 3, 256)         0         
__________

In [0]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [49]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=15, batch_size=64, verbose=1)

Train on 140000 samples, validate on 60000 samples
Epoch 1/15

Epoch 2/15

Epoch 3/15
 16704/140000 [==>...........................] - ETA: 53s - loss: 0.1180 - acc: 0.9637

Epoch 4/15
  9536/140000 [=>............................] - ETA: 56s - loss: 0.0857 - acc: 0.9713

Epoch 5/15
  6848/140000 [>.............................] - ETA: 57s - loss: 0.0822 - acc: 0.9730

Epoch 6/15
  5824/140000 [>.............................] - ETA: 57s - loss: 0.0732 - acc: 0.9777

Epoch 7/15
  5440/140000 [>.............................] - ETA: 58s - loss: 0.0680 - acc: 0.9781

Epoch 8/15
  5312/140000 [>.............................] - ETA: 58s - loss: 0.0594 - acc: 0.9814

Epoch 9/15
  5184/140000 [>.............................] - ETA: 57s - loss: 0.0544 - acc: 0.9828

Epoch 10/15
  5184/140000 [>.............................] - ETA: 58s - loss: 0.0590 - acc: 0.9809

Epoch 11/15
  5184/140000 [>.............................] - ETA: 57s - loss: 0.0480 - acc: 0.9838

Epoch 12/15
  5056/140000 [>.............................] - ETA: 57s - loss: 0.0502 - acc: 0.9838

Epoch 13/15
  5056/140000 [>.............................] - ETA: 57s - loss: 0.0616 - acc: 0.9834

Epoch 14/15
  5056/140000 [>.............................] - ETA: 57s - loss: 0.0448 - acc: 0.9883

Epoch 15/15
  5056/140000 [>.............................] - ETA: 57s - loss: 0.0515 - acc: 0.9846



<keras.callbacks.History at 0x7f842be1a710>

In [0]:
prediction = model.predict(X_test)

In [0]:
fd = open('CNN_Third.txt', 'w')
fd.write('Id,Category\n')
for i, label in enumerate(prediction):
    fd.write(str(i) + ',' + encoder.inverse_transform(np.argmax(label)) + '\n')
fd.close()

files.download('CNN_Third.txt')  

In [0]:
# Some magic numbers
SIZE = 28
SQUARE = SIZE * SIZE

In [0]:
# Collect raw data
X, y, X_test = create_dataset()

In [0]:
# Make images have same size
X = list(map(lambda image: imresize(image, (SIZE, SIZE)), X))
X_test = list(map(lambda image: imresize(image, (SIZE, SIZE)), X_test))

In [0]:
# One-hot encode labels
y, encoder = encode(y)

# Remember the number of classes
NUM_CLASSES = len(y[0])

In [0]:
# Splitting datatest for train and validation
X_train, X_val, y_train, y_val = train_test_split(X,
                                                  y,
                                                  test_size=0.3,
                                                  random_state=42)

In [0]:
# Keras likes numpy
X_train = np.array(X_train).reshape(-1, SIZE, SIZE, 1)
X_val = np.array(X_val).reshape(-1, SIZE, SIZE, 1)
y_train = np.array(y_train)
y_val = np.array(y_val)
X_test = np.array(X_test).reshape(-1, SIZE, SIZE, 1)

In [0]:
def build_model():
    model = Sequential()
    model.add(Conv2D(64, kernel_size=(3, 3), input_shape=(SIZE, SIZE, 1), padding='same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=2))
    model.add(Conv2D(128, kernel_size=(3, 3), padding='same', activation='tanh'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=2))
    model.add(Dropout(rate=0.35))
    model.add(Conv2D(256, kernel_size=(3, 3), padding='valid', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=2))
    model.add(Flatten())
    model.add(Dense(600, activation='relu'))
    model.add(Dropout(rate=0.35))
    model.add(Dense(400, activation='relu'))
    model.add(Dropout(rate=0.35))
    model.add(Dense(NUM_CLASSES, activation='softmax'))
    return model

In [59]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 28, 28, 64)        640       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 14, 14, 64)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 14, 14, 128)       73856     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 7, 7, 128)         0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 7, 7, 128)         0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 7, 7, 256)         524544    
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 3, 3, 256)         0         
__________

In [0]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [62]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=64, verbose=1)

Train on 140000 samples, validate on 60000 samples
Epoch 1/10

Epoch 2/10

Epoch 3/10
 16832/140000 [==>...........................] - ETA: 53s - loss: 0.0437 - acc: 0.9884

Epoch 4/10
  9536/140000 [=>............................] - ETA: 56s - loss: 0.0632 - acc: 0.9842

Epoch 5/10
  6848/140000 [>.............................] - ETA: 58s - loss: 0.0542 - acc: 0.9861

Epoch 6/10
  5824/140000 [>.............................] - ETA: 58s - loss: 0.0445 - acc: 0.9870

Epoch 7/10
  5440/140000 [>.............................] - ETA: 58s - loss: 0.0457 - acc: 0.9849

Epoch 8/10
  5312/140000 [>.............................] - ETA: 58s - loss: 0.0472 - acc: 0.9876

Epoch 9/10
  5184/140000 [>.............................] - ETA: 58s - loss: 0.0405 - acc: 0.9878

Epoch 10/10
  5184/140000 [>.............................] - ETA: 58s - loss: 0.0615 - acc: 0.9846



<keras.callbacks.History at 0x7f81d0df8128>

In [0]:
prediction = model.predict(X_test)

In [0]:
fd = open('CNN_Forth.txt', 'w')
fd.write('Id,Category\n')
for i, label in enumerate(prediction):
    fd.write(str(i) + ',' + encoder.inverse_transform(np.argmax(label)) + '\n')
fd.close()

files.download('CNN_Forth.txt')  