In [None]:
import tensorflow as tf
from keras.preprocessing import image
from keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply, RepeatVector, Dense, Activation, Lambda
from keras.models import Model

In [None]:
import matplotlib.pylab as plt
import os
from PIL import Image, ImageFilter
import h5py
import tarfile
from google.colab import files
import numpy as np
import cv2
import time
from sklearn.model_selection import train_test_split
import csv  
from tqdm import tqdm

In [None]:
!pip install tensorflowjs
import tensorflowjs as tfjs

In [None]:
# Pytest for testing the functions
!pip install pytest

In [None]:
'''
   Dataset is obtained from file 'IIIT5K-Word_V3.0.tar.gz'.
   The Dataset contains several files and 2 folders train and test.
   Files named traindata.m and testdata.m contains all the information related
       to the train and test images and their annotations respectively.
'''
os.chdir('/content')
# The context manager releases the archive handle even when extraction raises.
# NOTE(review): extractall() writes whatever member paths the archive contains;
# only run this on an archive obtained from a trusted source.
with tarfile.open('IIIT5K-Word_V3.0.tar.gz') as tar:
  tar.extractall()

In [None]:
# Enter the extracted dataset folder (relative to the current working directory)
# and remember its absolute path so the split folders can be addressed later.
os.chdir('IIIT5K')
base_dir = os.getcwd()
# Absolute paths of the train / test image folders.
train_dir = os.path.join(base_dir, 'train')
test_dir = os.path.join(base_dir,'test')
print(train_dir)

In [None]:
# Count the directory entries of each split before augmentation adds files.
# os.listdir counts every entry in the folder, not only image files.
num_imgs_train = len(os.listdir(train_dir))
num_imgs_test = len(os.listdir(test_dir))
print('''
  Dataset Before Pre-processing
''')
print("number of images in training dataset is {}".format(num_imgs_train))
print("number of images in testing dataset is {}".format(num_imgs_test))

In [None]:
# Make the dataset root the working directory (label_generation chdir's here too).
os.chdir('/content/IIIT5K')

In [None]:
def label_generation(filename):
  '''
    Reads a text file that holds one entry per line and returns the entries.

    Side effect: the working directory is changed to '/content/IIIT5K' before
    the file is opened, so a relative filename is resolved against that folder.

    Args :
    filename : str. Path of the text file to read.

    Returns :
    text_list : list of str. One item per line, without the line terminator.
  '''

  os.chdir('/content/IIIT5K')
  # 'with' closes the handle deterministically; it used to be left open.
  with open(filename, "r") as text_file:
    # Strip only the newline. Slicing off the last character unconditionally
    # truncated the final entry whenever the file had no trailing newline.
    text_list = [line.rstrip('\n') for line in text_file]

  return text_list

In [None]:
train_labels = label_generation("Train_labels.txt")
print(f"few of the dataset labels are {train_labels[0:3]}")
train_label_img_locations = label_generation("Train_labels_img_loc.txt")
print(f"few of the image locations are{train_label_img_locations[0:3]}")

# Pair every image location with the label found at the same position.
# The iteration variable must not be called `image`: at notebook (global) scope
# that name is the `keras.preprocessing.image` module imported at the top, and
# rebinding it to a string breaks every later `image.load_img(...)` call.
# A comprehension keeps the iteration names out of the global namespace.
dataset = [
  (img_location, train_labels[position])
  for position, img_location in enumerate(train_label_img_locations)
]
print(f"few of the image to image-locations are {dataset[0:3]}")

In [None]:
def rotateImages(rotationAmt):
  '''
    rotateImages is used as one of the image augmentation techniques to
    increase the dataset thereby increasing the accuracy.

    Every path listed in train_label_img_locations (resolved against the
    current working directory) is opened, rotated and saved next to the
    original as '<name>rot<rotationAmt>.jpg'. The new file and its label are
    appended to the global `dataset` and `train_labels` lists.

    Args:
    rotationAmt : int. The value of rotation in the image.
  '''
  # enumerate keeps the label index tied to the position in the list; the old
  # manual counter only advanced for processed images, so every label after a
  # skipped entry was paired with the wrong image.
  for idx, img_path in enumerate(train_label_img_locations):
    # skip paths that are themselves rotation outputs
    if "rot" in img_path:
      continue
    # name = everything before the first '.'
    img_name = img_path.split(".")[0]
    # the output name carries the rotation amount
    rotated_path = img_name + "rot" + str(rotationAmt) + ".jpg"
    # NOTE(review): saving as .jpg presumably fails for image modes JPEG cannot
    # store (e.g. with an alpha channel) - confirm the dataset image modes.
    with Image.open(img_path) as img:
      img.rotate(rotationAmt).save(rotated_path)
    label = train_labels[idx]
    train_labels.append(label)
    dataset.append((rotated_path, label))


In [None]:
''' Unit tests for testing rotateImages '''


In [None]:
def addBlur():
  '''
    Adds Blur to the images.

    Every path listed in train_label_img_locations (resolved against the
    current working directory) is opened, filtered with ImageFilter.BLUR and
    saved next to the original as '<name>blur.jpg'. The new file and its label
    are appended to the global `dataset` and `train_labels` lists.
  '''
  for idx, img_path in enumerate(train_label_img_locations):
    # name = everything before the first '.'
    img_name = img_path.split(".")[0]
    blurred_path = img_name + "blur.jpg"
    # The context manager closes the source image, matching rotateImages;
    # the handle used to stay open for every processed file.
    with Image.open(img_path) as img:
      img.filter(ImageFilter.BLUR).save(blurred_path)
    label = train_labels[idx]
    train_labels.append(label)
    dataset.append((blurred_path, label))


In [None]:
def preprocessing():
  '''
    Runs the augmentation passes in order: rotation by +15 degrees, rotation
    by -15 degrees, then blur. Each pass works on the paths listed in
    train_label_img_locations, resolved against the current directory.
  '''
  for angle in (15, -15):
    rotateImages(angle)
  addBlur()

In [None]:
# The augmentation helpers open the listed paths as given, so they are run from
# inside 'train' (presumably the listed paths are relative to it - confirm).
os.chdir('train')
# Preprocessing the images located in the train directory
preprocessing()
# Return to the dataset root afterwards.
os.chdir('/content/IIIT5K')

In [None]:
# Spot-check a few entries past index 2000 and confirm that the three augmented
# variants of '1009_2' were registered in `dataset` with the label 'YOU'.
print(f"few of the dataset labels are {train_labels[2000:2003]}")
print(f"few of the image to image-locations are {dataset[2000:2003]}")
print(('1009_2'+'blur.jpg', 'YOU') in dataset)
print(('1009_2'+'rot15.jpg', 'YOU') in dataset)
print(('1009_2'+'rot-15.jpg', 'YOU') in dataset)

In [None]:
# Re-count the directory entries now that the augmented files have been written.
# os.listdir counts every entry in the folder, not only image files.
num_imgs_train = len(os.listdir(train_dir))
num_imgs_test = len(os.listdir(test_dir))
print('''
  Dataset After Pre-processing
''')
print("number of images in training dataset is {}".format(num_imgs_train))
print("number of images in testing dataset is {}".format(num_imgs_test))

In [None]:
# Total number of (image path, label) pairs, originals plus augmented copies.
print(len(dataset))

In [None]:
'''
  loading inception_resnet_v2 trained on imagenet dataset as per https://arxiv.org/pdf/1704.03549.pdf
  inception_resnet_v2 model is used as a feature extractor. 
  Later the features obtained are then passed to sequence to sequence model ( attention model ).
'''
# include_top=True keeps the ImageNet classification head in this model object;
# the head is bypassed later, when an intermediate layer is used as the output.
pre_trained_model = tf.keras.applications.InceptionResNetV2(include_top=True, weights='imagenet', pooling=None)


In [None]:
# Freeze the pre-trained weights so they are not updated during training.
# No layer is removed here; an intermediate layer is tapped as the output
# when feature_extraction_model is built.
pre_trained_model.trainable = False
pre_trained_model.summary()

In [None]:
def load_and_process_img(image_path):
  '''
    Loads the image at image_path, resizes it to 299x299 and applies the input
    scaling of the InceptionResNetV2 application so it can be passed to
    pre_trained_model / feature_extraction_model.

    Args : 
    image_path : str. location path of the image

    Return : 
    img : numpy.ndarray with a leading batch axis of size 1. pre-processed image
    image_path : the value that was passed in, returned unchanged so callers
                 can keep the image and its path together
  '''

  # target_size is (height, width); the channel count is not part of it.
  img = image.load_img(image_path, target_size=(299, 299))
  img = image.img_to_array(img)
  # add the batch axis expected by the model
  img = np.expand_dims(img, axis=0)
  # use the preprocessing that belongs to the model actually loaded
  img = tf.keras.applications.inception_resnet_v2.preprocess_input(img)
  return img, image_path

In [None]:

# Candidate layers for feature extraction from pre_trained_model.
# The layers are selected by referring to https://arxiv.org/pdf/1704.03549.pdf
# (imp_layers is informational only; it is not read by the code below.)
imp_layers = ['mixed_7a','block8_1_conv']

# The paper at https://arxiv.org/pdf/1704.03549.pdf is cited by the author for
# an accuracy of 0.819 with the layer 'mixed_7a'. The layer actually selected
# below is 'block8_1_conv'.

layer_name = 'block8_1_conv'

# Output tensor of the selected layer.
layer_output = pre_trained_model.get_layer(layer_name).output

# Sub-model sharing the input of pre_trained_model and ending at the
# selected layer.
feature_extraction_model = tf.keras.Model(inputs=pre_trained_model.input, outputs=layer_output)

In [None]:
# Save the feature extractor as a single .h5 file, relative to the current
# working directory.
export_path_keras = "./feature_extraction_model.h5"
print(export_path_keras)

feature_extraction_model.save(export_path_keras)

In [None]:
# Also export through tf.saved_model.save, relative to the current working
# directory; this path is what gets zipped for download afterwards.
export_path = "./feature_extraction_model"
print(export_path)

tf.saved_model.save(feature_extraction_model,export_path)

In [None]:
!zip -r mode.zip {export_path}

In [None]:
files.download('./mode.zip')

In [None]:
os.chdir('/content')

In [None]:
# The Keras .h5 file was saved as ./feature_extraction_model.h5 while the working
# directory was /content/IIIT5K, so it is addressed by its absolute path here.
!tensorflowjs_converter --input_format=keras /content/IIIT5K/feature_extraction_model.h5 ./web_feature_extractor

In [None]:
!zip -r js_model.zip web_feature_extractor

In [None]:
files.download('js_model.zip')

In [None]:
# Loading and processing the sample image '1009_2.png'.
# load_and_process_img returns (image_batch, path); only the image batch may be
# given to predict(). The path is built from train_dir so the cell does not
# depend on the current working directory (presumably the sample lives in the
# train folder - confirm).
input_data, _sample_path = load_and_process_img(os.path.join(train_dir, '1009_2.png'))

# getting features from the feature_extraction_model
result = feature_extraction_model.predict(input_data)

print(list(result.shape))
(m,n_H,n_W,n_C) = result.shape

# Un-rolling the 4D feature map to 3D: the two spatial axes are merged into one
reshaped_result = tf.reshape(result, shape=(m,n_H*n_W,n_C))

print(reshaped_result.shape)
os.chdir('/content/IIIT5K')

In [None]:
# Switch into the train folder and report how many entries it contains.
os.chdir('/content/IIIT5K/train')

print(len(os.listdir(os.getcwd())))

In [None]:
'''
Need to check this function. May not be needed.
'''

def load_and_process_dataset(start=0, end=8000):
  '''
    Extracts 'mixed_7a' features for the samples dataset[start:end] and appends
    them to the CSV file 'features_dataset' in the current working directory,
    one row per image.

    Each image is loaded and scaled with load_and_process_img, passed through a
    sub-model of pre_trained_model that ends at 'mixed_7a', and the resulting
    feature map is flattened into a single CSV row.

    Args :
    start : int. Index of the first sample to process (default 0).
    end : int. Index one past the last sample to process (default 8000).
  '''
  layer_name = 'mixed_7a'
  layer_output = pre_trained_model.get_layer(layer_name).output
  # local sub-model; the module-level feature_extraction_model is left untouched
  extractor = tf.keras.Model(inputs=pre_trained_model.input, outputs=layer_output)

  samples = dataset[start:end]
  # Open the output once (append mode, as before) instead of once per image.
  with open(r'features_dataset', 'a', newline='') as f:
    writer = csv.writer(f)
    # tqdm tracks progress from the iterable itself; the previous manual
    # counter was never incremented.
    for (path_to_img, label) in tqdm(samples):
      # Load and scale the image of THIS sample. The old code predicted on the
      # global `input_data`, producing the same features for every row.
      img, _ = load_and_process_img(path_to_img)
      result = extractor.predict(img)
      # Write every value of the feature map: passing the 3-D array straight
      # to writerow stored an abbreviated text repr instead of the numbers.
      writer.writerow(np.asarray(result).ravel())


In [None]:
'''
May not be needed
'''
# Runs the feature dump in the current working directory (the chdir into
# 'train' is commented out), then returns to the dataset root.
#os.chdir('train')
load_and_process_dataset()
os.chdir('/content/IIIT5K')

In [None]:
'''
May not be needed
'''
# Check whether a 'features_dataset' entry exists inside the train folder.
os.chdir('train')
print("features_dataset" in os.listdir(os.getcwd()))

In [None]:
'''
May not be needed
'''
!zip -r /content/features_dataset.zip features_dataset

In [None]:

'''	Plotting Various Features obtained from feature_extraction_model. '''

plot_limit = 8
index = 1
# channel cursor: channels i-1 (1499 downwards) of `result` are drawn, one per cell
i=1500
for _row in range(plot_limit):
	for _col in range(plot_limit):
		# specify subplot and turn off axis ticks
		ax = plt.subplot(plot_limit, plot_limit, index)
		ax.set_xticks([])
		ax.set_yticks([])
		# plot one feature channel in grayscale
		plt.imshow(result[0, :, :, i-1], cmap='gray')
		index += 1
		i -= 1
# Save BEFORE showing: once show() has rendered the figure, a following
# savefig() writes a new, empty figure.
plt.savefig('Activations.png')
# show the figure
plt.show()

In [None]:
# Plotting a second 8x8 grid of feature channels from the feature_extraction_model
plot_limit = 8
index = 1
# despite its name, layer_no indexes the channel axis of `result` (1087 downwards)
layer_no = 1087
for _row in range(plot_limit):
	for _col in range(plot_limit):
		# specify subplot and turn off axis ticks
		ax = plt.subplot(plot_limit, plot_limit, index)
		ax.set_xticks([])
		ax.set_yticks([])
		# plot one feature channel in grayscale
		plt.imshow(result[0, :, :, layer_no], cmap='gray')
		index += 1
		layer_no -= 1
# Save BEFORE showing: once show() has rendered the figure, a following
# savefig() writes a new, empty figure.
# NOTE(review): this reuses the file name 'Activations.png' of the previous
# plot cell and therefore overwrites that file.
plt.savefig('Activations.png')
# show the figure
plt.show()

In [None]:
def string_to_int(string, length, vocab):
  '''
    Converts a word to a fixed-length list of vocabulary ids.

    The word is lower-cased, cut to `length` characters, mapped character by
    character through `vocab`, and right-padded with the '<pad>' id.

    Args : 
    string : str. The word which is to be converted to list of numbers.
    length : int. Length of the returned list. Characters beyond it are dropped.
    vocab : dict. Character-to-id mapping. Must contain '<unk>' when the word
            has characters missing from it, and '<pad>' when padding is needed.

    Returns : 
    rep : list. `length` ids representing the word
  '''

  # converting the word to lowercase
  string = string.lower()

  # drop the characters beyond the threshold(length)
  if len(string) > length:
      string = string[:length]

  # Unknown characters map to the ID of '<unk>'. The previous
  # vocab.get(x, '<unk>') inserted the literal string '<unk>' into the id list.
  rep = [vocab[ch] if ch in vocab else vocab['<unk>'] for ch in string]

  # pad up to the threshold(length) with the '<pad>' id
  if len(rep) < length:
      rep += [vocab['<pad>']] * (length - len(rep))

  return rep

In [None]:
# vocabulary has been taken from internet
# Maps a character to its integer id; '<unk>' and '<pad>' are the special
# entries looked up by string_to_int.
# NOTE(review): the letters 'k', 'q', 'x' and 'z' have no entry in this table,
# so words containing them cannot be encoded exactly - confirm this is intended.
vocab = {
          ' ': 0, '.': 1, '/': 2, '0': 3, '1': 4, '2': 5, '3': 6, '4': 7, '5': 8,
         '6': 9, '7': 10, '8': 11, '9': 12, 'a': 13, 'b': 14, 'c': 15, 'd': 16, 
         'e': 17, 'f': 18, 'g': 19, 'h': 20, 'i': 21, 'j': 22, 'l': 23, 'm': 24,
         'n': 25, 'o': 26, 'p': 27, 'r': 28, 's': 29, 't': 30, 'u': 31, 'v': 32,
         'w': 33, 'y': 34, '<unk>': 35, '<pad>': 36
}

In [None]:
def load_output(filename,T_Y,vocab):
  '''
    Reads a label file (one word per line) and encodes every word with
    string_to_int.

    Side effect: the working directory is changed to '/content/IIIT5K' before
    the file is opened, so a relative filename is resolved against that folder.

    Args :
    filename : str. Path of the label file.
    T_Y : int. Length every encoded word is cut / padded to.
    vocab : dict. Character-to-id mapping handed to string_to_int.

    Returns :
    Y : numpy.ndarray built from the encoded words, one row per line.
  '''

  os.chdir('/content/IIIT5K')
  # 'with' closes the handle deterministically; it used to be left open.
  with open(filename, "r") as text_file:
    # Strip only the newline: dropping the last character unconditionally
    # truncated the final word whenever the file had no trailing newline.
    text_list = [line.rstrip('\n') for line in text_file]

  Y = np.array([string_to_int(word,T_Y,vocab) for word in text_list])

  return Y

In [None]:
# Encode every training label as a sequence of 20 vocabulary ids.
output_array = load_output("Train_labels.txt",20,vocab)

In [None]:
n_a = 64 # intended size of the pre-attention recurrent state 'a' (not read by seq2seq_model below)
n_s = 120 # size of the post-attention state inputs 's0' / 'c0' declared in seq2seq_model
# Number of positions and channels per position of the un-rolled feature map.
T_X = list(reshaped_result.shape)[1]
feature_length = list(reshaped_result.shape)[2] 
# Number of output steps (length of an encoded label).
T_Y = 20
print(T_X,feature_length)

In [None]:
# Defined shared layers as global variables
repeat = RepeatVector(T_X)
concat = Concatenate(axis=-1)
dense1 = Dense(510, activation = "tanh")
dense2 = Dense(1, activation = "relu")
activation = Activation(activation="softmax", name='attention_weights')
dot = Dot(axes = 1)

In [None]:
def attention(a, s_prev):
  '''
    One attention step built from the shared module-level layers
    (repeat, concat, dense1, dense2, activation, dot).

    Args :
    a : features to attend over.
    s_prev : previous decoder state; it is repeated and concatenated with `a`.

    Returns :
    context : result of dot([weights, a]), where weights come from applying
              dense1, dense2 and the 'attention_weights' activation.
  '''
  repeated_state = repeat(s_prev)
  joined = concat([a, repeated_state])
  scores = dense2(dense1(joined))
  weights = activation(scores)
  return dot([weights, a])

In [None]:
# Post-attention recurrent layer. Despite its name this is a GRU with 512 units
# that returns both the full output sequence and the final state.
# (A comma was missing after the unit count, which made this cell a syntax error.)
post_activation_LSTM_cell = tf.keras.layers.GRU(512,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
# Softmax layer applied at every output step.
output_layer = Dense(20, activation="softmax")

In [None]:
def seq2seq_model(T_X, T_Y, n_a, n_s,feature_length):
  '''
    Builds a Keras Model with inputs [X, s0, c0] and one output per step
    (T_Y outputs), using attention(), post_activation_LSTM_cell and
    output_layer defined at module level.

    Args :
    T_X : int. Number of positions in the input feature sequence.
    T_Y : int. Number of output steps.
    n_a : int. Not used inside this function.
    n_s : int. Size of the state inputs s0 and c0.
    feature_length : int. Number of channels per input position.

    Returns :
    model : keras Model.

    NOTE(review): this function looks unfinished - see the inline notes.
  '''

  X = Input(shape=(T_X, feature_length))
  s0 = Input(shape=(n_s,), name='s0')
  c0 = Input(shape=(n_s,), name='c0')
  s = s0
  c = c0
  outputs = []
  # the value computed on the next line is discarded
  tf.expand_dims(s, axis = 1).shape.as_list()
  print(s,c)

  # NOTE(review): `a` is bound to the GRU layer object itself; X is never
  # passed through it, so attention() receives a layer rather than features.
  a = tf.keras.layers.GRU(512,
                          return_sequences=True,
                          return_state=True,
                          recurrent_initializer='glorot_uniform')
  for t in range(T_Y):
        context = attention(a,s)
        print(context)
        # NOTE(review): three values are unpacked here; a GRU configured with
        # return_sequences and return_state presumably returns two - confirm.
        s, _, c = post_activation_LSTM_cell(context)
        out = output_layer(s)
        outputs.append(out)

  model = Model(inputs = [X,s0,c0], outputs = outputs)
  return model

In [None]:
# Build the attention model from the sizes defined above.
model = seq2seq_model(T_X, T_Y, n_a, n_s, feature_length)

In [None]:
class Attention(tf.keras.Model):
  '''
    Unfinished attention block holding two dense layers (fc1 with `units`
    outputs, fc2 with a single output). The forward pass was never written;
    BahdanauAttention further down is the implementation used by the decoder.
  '''
  def __init__(self,units):
    # super() must name this class; it previously referenced BahdanauAttention.
    super(Attention, self).__init__()
    self.fc1 = tf.keras.layers.Dense(units)
    self.fc2 = tf.keras.layers.Dense(1)

  def call(self, *args, **kwargs):
    # The original method had no `self` parameter and no body (a syntax error).
    # Fail loudly instead of guessing the intended computation.
    raise NotImplementedError(
        "Attention.call was never implemented; use BahdanauAttention instead.")

In [None]:
# Split the (image path, label) pairs into two parallel lists.
img_name_vector = [pair[0] for pair in dataset]
img_label_name = [pair[1] for pair in dataset]

In [None]:
# Every distinct image is pushed through the feature extractor once and its
# features are cached on disk next to the image as '<image path>.npy'.
encode_train = sorted(set(img_name_vector))


def _load_for_extraction(path_tensor):
  '''
    Eager helper for tf.py_function: decodes the path tensor, loads the image
    with load_and_process_img and returns (image, path).
  '''
  path = path_tensor.numpy().decode("utf-8")
  img, _ = load_and_process_img(path)
  # load_and_process_img adds a batch axis of size 1; drop it here because
  # Dataset.batch() below adds the batch axis again.
  return img[0], path_tensor


image_dataset = tf.data.Dataset.from_tensor_slices(encode_train)
# load_and_process_img opens files through Keras/PIL and needs a real Python
# string, so it is wrapped in tf.py_function instead of being traced directly.
image_dataset = image_dataset.map(
  lambda path: tf.py_function(_load_for_extraction, [path], [tf.float32, tf.string]),
  num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(16)

for img, path in image_dataset:
  # feature_extraction_model is the extractor defined in this notebook; the
  # name `image_features_extract_model` used before was never defined.
  batch_features = feature_extraction_model(img)
  # merge the two spatial axes: (batch, H, W, C) -> (batch, H*W, C)
  batch_features = tf.reshape(batch_features,
                              (batch_features.shape[0], -1, batch_features.shape[3]))

  for bf, p in zip(batch_features, path):
    path_of_feature = p.numpy().decode("utf-8")
    # np.save appends '.npy' to the image path
    np.save(path_of_feature, bf.numpy())

In [None]:
# Split image paths and labels 80/20 into training and validation parts;
# random_state=0 fixes the split.
img_name_train, img_name_val, output_label_train, output_label_val = train_test_split(
                                                                    img_name_vector,
                                                                    img_label_name,
                                                                    test_size=0.2,
                                                                    random_state=0)

# sizes of the four resulting lists
len(img_name_train), len(output_label_train), len(img_name_val), len(output_label_val)

In [None]:
BATCH_SIZE = 64
BUFFER_SIZE = 1000
embedding_dim = 256
units = 512
# One output class per vocabulary entry. `top_k` was never defined in this
# notebook; the character table `vocab` is what the labels are encoded with.
vocab_size = len(vocab)
num_steps = len(img_name_train) // BATCH_SIZE
# NOTE(review): the two values below describe an InceptionV3 feature map of
# shape (64, 2048); this notebook extracts features from InceptionResNetV2 and
# does not read these variables - confirm or remove.
features_shape = 2048
attention_features_shape = 64

In [None]:
def map_func(img_name, cap):
  '''
    Loads the cached feature array that belongs to an image.

    Args :
    img_name : bytes. UTF-8 encoded image path; '.npy' is appended to locate
               the cached features.
    cap : the label belonging to the image; passed through untouched.

    Returns :
    (features, cap) : the array read with np.load and the unchanged label.
  '''
  decoded_name = img_name.decode('utf-8')
  feature_path = decoded_name + '.npy'
  return np.load(feature_path), cap

In [None]:
# Encode the training labels as fixed-length int32 id sequences. `cap_train`
# was never defined in this notebook; the labels of the training split are
# `output_label_train`, and map_func must hand tf.int32 values to tf.data.
label_train = np.array(
    [string_to_int(word, T_Y, vocab) for word in output_label_train],
    dtype=np.int32)

# NOTE: `dataset` is rebound here from the list of (path, label) pairs to the
# tf.data pipeline consumed by the training loop; re-running earlier cells that
# expect the list requires rebuilding it first.
dataset = tf.data.Dataset.from_tensor_slices((img_name_train, label_train))

# Use map to load the numpy files in parallel
dataset = dataset.map(lambda item1, item2: tf.numpy_function(
          map_func, [item1, item2], [tf.float32, tf.int32]),
          num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Shuffle and batch
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

In [None]:
class CNN_Encoder(tf.keras.Model):
    '''
      Projects pre-extracted image features to `embedding_dim` channels with a
      single dense layer followed by ReLU. The CNN features themselves are
      computed beforehand and only loaded here.
    '''
    def __init__(self, embedding_dim):
        super().__init__()
        # maps the last axis of the incoming features to embedding_dim
        self.fc = tf.keras.layers.Dense(embedding_dim)

    def call(self, x):
        projected = self.fc(x)
        return tf.nn.relu(projected)

In [None]:
class BahdanauAttention(tf.keras.Model):
  '''
    Additive attention: scores every feature position against the decoder
    state and returns the weighted sum of the features.
  '''
  def __init__(self, units):
    super().__init__()
    self.W1 = tf.keras.layers.Dense(units)  # projects the features
    self.W2 = tf.keras.layers.Dense(units)  # projects the decoder state
    self.V = tf.keras.layers.Dense(1)       # one score per position

  def call(self, features, hidden):
    '''
      Args :
      features : encoder output, indexed as (batch, position, channels).
      hidden : decoder state, indexed as (batch, hidden_size).

      Returns :
      context_vector : features summed over the position axis, weighted by
                       attention_weights.
      attention_weights : softmax over the position axis, last axis of size 1.
    '''
    # insert a length-1 axis so the state broadcasts over all positions
    expanded_hidden = tf.expand_dims(hidden, 1)

    projected = self.W1(features) + self.W2(expanded_hidden)
    score = tf.nn.tanh(projected)

    # normalise over axis 1, the position axis
    attention_weights = tf.nn.softmax(self.V(score), axis=1)

    # weighted sum of the features over the position axis
    context_vector = tf.reduce_sum(attention_weights * features, axis=1)

    return context_vector, attention_weights

In [None]:
class RNN_Decoder(tf.keras.Model):
  '''
    Decoder step: attends over the encoded features, embeds the previous
    token, runs a GRU over the concatenation and projects to vocabulary scores.
  '''
  def __init__(self, embedding_dim, units, vocab_size):
    super().__init__()
    self.units = units

    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc1 = tf.keras.layers.Dense(self.units)
    self.fc2 = tf.keras.layers.Dense(vocab_size)

    self.attention = BahdanauAttention(self.units)

  def call(self, x, features, hidden):
    '''
      Args :
      x : previous token ids, one column per row of the batch.
      features : encoder output to attend over.
      hidden : previous decoder state; it is used for the attention only and is
               not handed to the GRU as its initial state.

      Returns :
      x : vocabulary scores with the batch and step axes merged.
      state : final GRU state.
      attention_weights : weights produced by the attention block.
    '''
    context_vector, attention_weights = self.attention(features, hidden)

    embedded = self.embedding(x)

    # prepend-style join on the channel axis: [context, embedding]
    gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)

    output, state = self.gru(gru_input)

    projected = self.fc1(output)

    # merge batch and step axes, keeping the channel axis
    flattened = tf.reshape(projected, (-1, projected.shape[2]))

    logits = self.fc2(flattened)

    return logits, state, attention_weights

  def reset_state(self, batch_size):
    '''Returns an all-zero state of shape (batch_size, units).'''
    return tf.zeros((batch_size, self.units))

In [None]:
# Model sizes; one output class per entry of the character vocabulary.
embedding_dim = 256
units = 512
vocab_size = len(vocab)
encoder = CNN_Encoder(embedding_dim)
decoder = RNN_Decoder(embedding_dim, units, vocab_size)

In [None]:
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per sample so padding can be masked out
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  '''
    Masked sparse categorical cross-entropy for one decoding step.

    Args :
    real : integer target ids for the step.
    pred : logits produced by the decoder for the step.

    Returns :
    Mean of the per-sample losses, with padded positions contributing zero.
  '''
  # Labels are padded with vocab['<pad>'] by string_to_int; id 0 is the space
  # character, so masking on 0 hid real characters and let padding count.
  mask = tf.math.not_equal(real, vocab['<pad>'])
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [None]:
# Track encoder, decoder and optimizer in one checkpoint; the manager keeps
# only the 5 most recent checkpoints under checkpoint_path.
checkpoint_path = "./checkpoints/train"
ckpt = tf.train.Checkpoint(encoder=encoder,
                           decoder=decoder,
                           optimizer = optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

In [None]:
# Resume from the latest checkpoint when one exists; the number after the last
# '-' in the checkpoint name becomes the value of start_epoch.
start_epoch = 0
if ckpt_manager.latest_checkpoint:
  start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
  # restoring the latest checkpoint in checkpoint_path
  ckpt.restore(ckpt_manager.latest_checkpoint)

In [None]:
# One entry per epoch is appended by the training loop.
loss_plot = []

In [None]:
@tf.function
def train_step(img_tensor, target):
  '''
    Runs one optimisation step on a batch with teacher forcing.

    Args :
    img_tensor : batch of cached image features, fed to the encoder.
    target : batch of integer label sequences, indexed as (batch, step).

    Returns :
    loss : loss summed over the decoded steps.
    total_loss : `loss` divided by the number of columns of `target`.
  '''
  loss = 0

  # initializing the hidden state for each batch
  # because the labels are not related from image to image
  hidden = decoder.reset_state(batch_size=target.shape[0])

  # The first decoder input is the id 1 for every row of the batch.
  # NOTE(review): in `vocab` the id 1 belongs to '.', and no dedicated start
  # token exists - confirm this is the intended start symbol.
  dec_input = tf.expand_dims([1] * target.shape[0], 1)
  
  with tf.GradientTape() as tape:
      features = encoder(img_tensor)

      # NOTE(review): the loop starts at 1, so target[:, 0] never contributes
      # to the loss - confirm against how the labels are encoded.
      for i in range(1, target.shape[1]):
          # passing the features through the decoder
          predictions, hidden, _ = decoder(dec_input, features, hidden)

          loss += loss_function(target[:, i], predictions)

          # using teacher forcing
          dec_input = tf.expand_dims(target[:, i], 1)

  total_loss = (loss / int(target.shape[1]))

  trainable_variables = encoder.trainable_variables + decoder.trainable_variables

  # gradients are taken of the summed loss, not of total_loss
  gradients = tape.gradient(loss, trainable_variables)

  optimizer.apply_gradients(zip(gradients, trainable_variables))

  return loss, total_loss

In [None]:
EPOCHS = 20

# Training resumes at start_epoch, which is non-zero after a checkpoint restore.
for epoch in range(start_epoch, EPOCHS):
    start = time.time()
    total_loss = 0

    for (batch, (img_tensor, target)) in enumerate(dataset):
        batch_loss, t_loss = train_step(img_tensor, target)
        total_loss += t_loss

        # report the per-step batch loss every 100 batches
        if batch % 100 == 0:
            print ('Epoch {} Batch {} Loss {:.4f}'.format(
              epoch + 1, batch, batch_loss.numpy() / int(target.shape[1])))
    # storing the epoch end loss value to plot later
    loss_plot.append(total_loss / num_steps)

    # a checkpoint is written on epochs 0, 5, 10, ... (zero-based index)
    if epoch % 5 == 0:
      ckpt_manager.save()

    print ('Epoch {} Loss {:.6f}'.format(epoch + 1,
                                         total_loss/num_steps))
    print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [None]:
!git clone "https://github.com/oh-my-ocr/text_renderer"

In [None]:
os.chdir("text_renderer")


In [None]:
# Append the numbers 0..11 to the file 'name', one single-field CSV row each.
with open(r'name', 'a') as f:
  writer = csv.writer(f)
  for i in range(12):
    writer.writerow([i])

In [None]:
!python3 setup.py develop

In [None]:
!pip3 install -r docker/requirements.txt

In [None]:
!python3 main.py \
    --config example_data/example.py \
    --dataset img \
    --num_processes 2 \
    --log_period 10

In [None]:
!zip -r Dataset.zip /content/output

In [None]:
files.download('Dataset.zip')

In [None]:
!git clone --single-branch --branch python3 "https://github.com/ankush-me/SynthText.git" 