In [2]:
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
from tensorflow import keras
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import os

# Fix the global NumPy and TensorFlow seeds (42) so that random operations,
# such as the np.random.shuffle of the annotation lines, are repeatable.
np.random.seed(42)
tf.random.set_seed(42)

In [3]:
base_path = "data"

# Read the annotation file inside a context manager so the file handle is
# closed as soon as the lines are loaded.
with open(f"{base_path}/words.txt", "r") as words_file:
    words = words_file.readlines()

# Keep the raw lines (trailing newline included) that are neither comments
# nor flagged "err" in their second space-separated field.
words_list = []
for line in words:
    # Lines starting with "#" are header/comment lines.
    if line.startswith("#"):
        continue
    fields = line.split(" ")
    # Blank or malformed lines have no second field; skip them instead of
    # failing with an IndexError.
    if len(fields) < 2:
        continue
    if fields[1] != "err":
        words_list.append(line)
print(len(words_list))

# In-place shuffle; repeatable because the NumPy seed is fixed earlier.
np.random.shuffle(words_list)

96456


### Splitting the dataset into three subsets: train, validation and test (90:5:5)

In [4]:
# The first 90% of the shuffled lines is used for training; the remaining
# 10% is held out and divided below.
split_index = int(0.9 * len(words_list))
train_samples, test_samples = words_list[:split_index], words_list[split_index:]

# Halve the held-out part: first half -> validation, second half -> test.
val_index = int(0.5 * len(test_samples))
val_samples, test_samples = test_samples[:val_index], test_samples[val_index:]

# Sanity check: the three splits together account for every line.
split_total = len(train_samples) + len(val_samples) + len(test_samples)
assert len(words_list) == split_total

print(f"Total training samples is {len(train_samples)}")
print(f"Total testing samples is {len(test_samples)}")
print(f"Total validation samples is {len(val_samples)}")

Total training samples is 86810
Total testing samples is 4823
Total validation samples is 4823


## Data input pipeline
#### We start building our data input pipeline by first preparing the image paths.


In [35]:
# Directory under base_path in which the word images are looked up.
base_image_path = os.path.join(base_path, "words")
def get_img_paths_and_labels(samples, image_root=None):
    """Resolve the image file for every annotation line in ``samples``.

    The first space-separated field of a line is the image id, of the form
    ``part1-part2-part3``. The corresponding file is looked up at
    ``<image_root>/part1/part1-part2/part1-part2-part3.png``.

    Args:
        samples: iterable of raw annotation lines.
        image_root: directory that contains the images. When ``None`` the
            module-level ``base_image_path`` is used.

    Returns:
        A ``(paths, corrected_samples)`` tuple of two newly created lists:
        the paths of the images whose file size is non-zero, and the matching
        annotation lines cut at the first newline.

    Raises:
        OSError: propagated from ``os.path.getsize`` when an image file does
            not exist or is inaccessible.
    """
    if image_root is None:
        image_root = base_image_path

    # The result lists are created per call. Sharing module-level lists made
    # every call return the same two objects, so the train, test and
    # validation results all ended up containing every sample.
    paths = []
    corrected_samples = []
    for file_line in samples:
        img_name = file_line.strip().split(" ")[0]
        id_parts = img_name.split("-")
        part1, part2 = id_parts[0], id_parts[1]

        img_path = os.path.join(
            image_root, part1, f"{part1}-{part2}", img_name + ".png"
        )
        # Zero-byte image files are skipped.
        if os.path.getsize(img_path):
            paths.append(img_path)
            corrected_samples.append(file_line.split("\n")[0])
    return paths, corrected_samples
# Run the image-path / annotation-line lookup once for each split
# (train, test, validation).
train_img_paths, train_labels = get_img_paths_and_labels(train_samples)
test_img_paths, test_labels = get_img_paths_and_labels(test_samples)
val_img_paths, val_labels = get_img_paths_and_labels(val_samples)

In [36]:
# Find maximum length and the size of the vocabulary in the training data.
train_labels_cleaned = []
characters = set()
max_len = 0
for label in train_labels:
    label = label.split(" ")[-1].strip()
    for char in label:
        characters.add(char)
    max_len = max(max_len, len(label))
    train_labels_cleaned.append(label)
print("Maximum length: ", max_len)
print("Vocab size: ", len(characters))

# Check some label samples.
train_labels_cleaned[:10]
print(characters)

Maximum length:  21
Vocab size:  78
{'Y', '.', 'g', 'L', '/', 'H', '&', 'J', 'I', 'p', 'z', 'C', 'i', 'k', ')', 'F', 'n', '8', 'a', '2', 'c', 'm', 'U', 'Z', '7', 'A', '3', '5', 'V', 'h', 'f', ';', '(', '-', '#', 'R', 't', 'Q', 'E', 'D', 'X', 'o', '4', '9', 'r', 'w', '0', '!', 'M', ',', 'b', '?', 'e', 'N', 'q', '6', 's', '"', 'j', '+', 'S', 'W', 'B', 'y', 'O', 'G', '*', "'", ':', 'x', 'T', '1', 'd', 'P', 'K', 'u', 'v', 'l'}


### Clean the validation and test labels as well

In [37]:
def clean_labels(labels):
    """Return the cleaned label of every line in ``labels``.

    For each line the last space-separated field is taken and surrounding
    whitespace is stripped. The result is a new list in input order.
    """
    return [raw_line.split(" ")[-1].strip() for raw_line in labels]
# Reduce the validation and test annotation lines to their cleaned labels.
val_labels_cleaned = clean_labels(val_labels)
test_labels_cleaned = clean_labels(test_labels)

['sure',
 'he',
 'during',
 'of',
 'booty',
 'gastronomy',
 'boy',
 'The',
 'and',
 'in',
 'his',
 'flying',
 'presented',
 'No',
 'as',
 'few',
 'work',
 'of',
 'items',
 'took',
 '"',
 ':',
 'in',
 '.',
 'sedate',
 'over',
 'much',
 'compelled',
 'in',
 'several',
 'a',
 'of',
 'said',
 'families',
 'While',
 'at',
 'capital',
 'A',
 'get',
 'a',
 'reasonably',
 'thought',
 'of',
 'shack',
 'method',
 '.',
 'was',
 'to',
 'killed',
 'stuff',
 'Tom',
 'Really',
 'civil',
 "brother's",
 'and',
 'any',
 'And',
 'I',
 'of',
 'interference',
 ',',
 'research',
 'to',
 'crochet',
 '.',
 'the',
 'enough',
 'her',
 ',',
 'at',
 'Powell',
 'are',
 'employing',
 'man',
 'perhaps',
 'death',
 'seem',
 'shares',
 'had',
 'something',
 'allowed',
 ',',
 'anti-Tory',
 'at',
 '.',
 'nothing',
 '!',
 'the',
 'had',
 ',',
 'Both',
 'Bills',
 'she',
 'and',
 '.',
 'to',
 'will',
 ',',
 ',',
 "'",
 '.',
 'Aubrey',
 'the',
 'Draughtsmen',
 '.',
 'forebears',
 'uses',
 'to',
 'look-out',
 'them',
 'afrai