<a href="https://colab.research.google.com/github/ujjawalsingh10/Neural-Machine-Translation/blob/main/Neural_Machine_Translation_with_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.layers import InputLayer, MaxPool2D, Dense, Conv2D, Conv1D, Flatten, BatchNormalization, TextVectorization,SimpleRNN, Embedding, Input,Bidirectional, LSTM, Dropout, GRU
from google.colab import drive
import re
import string
from numpy import random
import gensim.downloader as api
import datetime
from tensorboard.plugins import projector
import os
import pandas as pd
from tensorflow.keras import Model

### Data Preparation

In [3]:
!wget http://www.manythings.org/anki/fra-eng.zip

--2023-07-17 22:53:19--  http://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7420323 (7.1M) [application/zip]
Saving to: ‘fra-eng.zip’


2023-07-17 22:53:20 (20.5 MB/s) - ‘fra-eng.zip’ saved [7420323/7420323]



In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# !unzip '/content/fra-eng.zip' -d '/content/drive/MyDrive/Deep_Learning/NLP/Neural_Machine_Translation_with_RNN/dataset'

### Data Processing

In [6]:
### Stream the corpus as a tf.data dataset: one scalar string tensor per line of fra.txt.
### Each line holds tab-separated fields: English sentence, French sentence, attribution.
text_dataset = tf.data.TextLineDataset('/content/drive/MyDrive/Deep_Learning/NLP/Neural_Machine_Translation_with_RNN/dataset/fra.txt')

In [7]:
for i in text_dataset.take(3):
  print(i)

tf.Tensor(b'Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)', shape=(), dtype=string)
tf.Tensor(b'Go.\tMarche.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)', shape=(), dtype=string)
tf.Tensor(b'Go.\tEn route !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)', shape=(), dtype=string)


In [8]:
# Upper bound on each vocabulary (max_tokens of both TextVectorization layers);
# also the Embedding input size and the width of the softmax output layer.
VOCAB_SIZE = 20000
# Length (in token ids) that every English / French sentence is padded or truncated to.
ENGLISH_SEQUENCE_LENGTH = 64
FRENCH_SEQUENCE_LENGTH = 64
# Size of each learned word-embedding vector.
EMBEDDING_DIM = 300
# Number of sentence pairs per batch.
BATCH_SIZE = 64

In [9]:
# ### We can check last of the elements to see what can be the max size of the sentences
# for i in text_dataset.skip(190000):
#   print(len(tf.strings.split(i, ' ')))

In [10]:
# Maps raw English strings to fixed-length sequences of integer token ids.
# The vocabulary itself is learned later, when `adapt` is called on this layer.
english_vectorize_layer = TextVectorization(
    standardize = 'lower_and_strip_punctuation',  # lowercase and drop punctuation before tokenizing
    max_tokens = VOCAB_SIZE,  # cap on the vocabulary size
    output_mode = 'int',  # emit integer token ids
    output_sequence_length = ENGLISH_SEQUENCE_LENGTH  # pad with 0 / truncate to this length
)

In [11]:
# Maps raw French strings to fixed-length sequences of integer token ids.
# The vocabulary itself is learned later, when `adapt` is called on this layer.
# NOTE(review): punctuation stripping also removes apostrophes, so elided forms are
# fused into one token (the translator output below shows 'lai') — confirm this is intended.
french_vectorize_layer = TextVectorization(
    standardize = 'lower_and_strip_punctuation',  # lowercase and drop punctuation before tokenizing
    max_tokens = VOCAB_SIZE,  # cap on the vocabulary size
    output_mode = 'int',  # emit integer token ids
    output_sequence_length = FRENCH_SEQUENCE_LENGTH  # pad with 0 / truncate to this length
)

In [12]:
for i in text_dataset.take(1):
  print(i)

tf.Tensor(b'Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)', shape=(), dtype=string)


We have to create one vocabulary for English and another for French, so each language gets its own `TextVectorization` layer.

In [13]:
def selector(input_text):
  """Turn one raw corpus line into a teacher-forcing training example.

  The line is split on tabs: field 0 is the English sentence and field 1 the
  French sentence; any further fields (the attribution) are dropped. Slices
  are used instead of scalar indexing, so every returned tensor has shape (1,).

  Args:
    input_text: scalar string tensor holding one tab-separated line.

  Returns:
    A tuple `(inputs, target)`:
      inputs: dict with 'input_1' (the English sentence) and 'input_2'
        (the French sentence prefixed with 'starttoken ').
      target: the French sentence suffixed with ' endtoken'.
  """
  fields = tf.strings.split(input_text, '\t')
  english_text = fields[0:1]
  french_text = fields[1:2]
  model_inputs = {'input_1' : english_text, 'input_2' : 'starttoken ' + french_text}
  return model_inputs, french_text + ' endtoken'

In [14]:
split_dataset = text_dataset.map(selector)

In [15]:
def separator(input_text):
  """Split one raw corpus line into an (English, French) pair of string tensors.

  The line is split on tabs; field 0 is the English sentence and field 1 the
  French sentence. The French sentence is returned wrapped in both markers,
  i.e. 'starttoken <sentence> endtoken'. Both results are shape-(1,) slices.

  Args:
    input_text: scalar string tensor holding one tab-separated line.

  Returns:
    A tuple `(english, french_with_markers)`.
  """
  columns = tf.strings.split(input_text, '\t')
  english_text, french_text = columns[0:1], columns[1:2]
  french_with_markers = 'starttoken ' + french_text + ' endtoken'
  return english_text, french_with_markers

In [16]:
init_dataset = text_dataset.map(separator)

In [17]:
for i in split_dataset.take(3):
  print(i)

({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Va !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ! endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Marche.'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Marche. endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken En route !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'En route ! endtoken'], dtype=object)>)


In [18]:
# Keep only the English side of each (English, French) pair and learn the
# English vocabulary from it.
english_training_dataset = init_dataset.map(lambda x,y : x)
english_vectorize_layer.adapt(english_training_dataset)

In [19]:
# Keep only the French side of each pair and learn the French vocabulary from it.
# These strings come from `separator`, so they carry both 'starttoken' and
# 'endtoken' and both markers become part of the vocabulary.
french_training_dataset = init_dataset.map(lambda x,y : y)
french_vectorize_layer.adapt(french_training_dataset)

In [20]:
def vectorizer(inputs, output):
  """Replace the raw strings of one example with integer token-id sequences.

  Args:
    inputs: dict with string tensors under 'input_1' (English sentence) and
      'input_2' (French sentence with the start marker).
    output: string tensor holding the French target sentence.

  Returns:
    A tuple `(inputs, output)` with the same structure as the arguments, where
    'input_1' has been encoded by `english_vectorize_layer` and both 'input_2'
    and `output` by `french_vectorize_layer`.
  """
  encoder_ids = english_vectorize_layer(inputs['input_1'])
  decoder_ids = french_vectorize_layer(inputs['input_2'])
  target_ids = french_vectorize_layer(output)
  return {'input_1': encoder_ids, 'input_2': decoder_ids}, target_ids

In [21]:
dataset = split_dataset.map(vectorizer)

In [31]:
for i in dataset.take(3):
  print(i)

({'input_1': <tf.Tensor: shape=(64, 64), dtype=int64, numpy=
array([[ 793,   63,    0, ...,    0,    0,    0],
       [   2,  328,    0, ...,    0,    0,    0],
       [   2, 1069,    0, ...,    0,    0,    0],
       ...,
       [  44,  114,    0, ...,    0,    0,    0],
       [ 158,  431,    0, ...,    0,    0,    0],
       [  27, 3295,    0, ...,    0,    0,    0]])>, 'input_2': <tf.Tensor: shape=(64, 64), dtype=int64, numpy=
array([[    2,     1,    16, ...,     0,     0,     0],
       [    2,    24,   429, ...,     0,     0,     0],
       [    2,    24,  1483, ...,     0,     0,     0],
       ...,
       [    2, 10485,     0, ...,     0,     0,     0],
       [    2,    14,    15, ...,     0,     0,     0],
       [    2,   255,  6500, ...,     0,     0,     0]])>}, <tf.Tensor: shape=(64, 64), dtype=int64, numpy=
array([[    1,    16,     3, ...,     0,     0,     0],
       [   24,   429,     3, ...,     0,     0,     0],
       [   24,  1483,     3, ...,     0,     0,     0

In [23]:
french_vectorize_layer.get_vocabulary()[104], english_vectorize_layer.get_vocabulary()[44]

('va', 'go')

In [24]:
for i in dataset.take(10):
  print(i)

({'input_1': <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[44,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])>, 'input_2': <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[  2, 104,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])>}, <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[104,   3,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,

In [25]:
dataset

<_MapDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>

In [26]:
# Pipeline: shuffle -> flatten the per-sentence (1, 64) tensors -> batch -> prefetch.
# reshuffle_each_iteration = False keeps the shuffled order identical on every pass
# over the data. The train/validation split is made afterwards with take()/skip() on
# this dataset; with the default (reshuffle on every iteration) each epoch would see a
# different order, so examples could move between the two splits from epoch to epoch.
dataset = (
    dataset
    .shuffle(2048, reshuffle_each_iteration = False)
    .unbatch()
    .batch(BATCH_SIZE)
    .prefetch(buffer_size = tf.data.AUTOTUNE)
)

In [27]:
dataset

<_PrefetchDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>

In [28]:
# Approximate number of batches in the corpus, used to size the train/validation split.
# NOTE(review): 200000 is a hard-coded estimate of the number of sentence pairs, not a
# measured count — confirm against the actual number of lines in fra.txt.
NUM_BATCHES = int(200000/BATCH_SIZE)

In [29]:
# The first 90% of the (estimated) batches are used for training; every batch after
# that point is held out for validation.
NUM_TRAIN_BATCHES = int(0.9 * NUM_BATCHES)
train_dataset = dataset.take(NUM_TRAIN_BATCHES)
val_dataset = dataset.skip(NUM_TRAIN_BATCHES)

## Modelling

In [32]:
# Hidden size of the GRU wrapped by the bidirectional encoder; the decoder GRU is
# built with twice this many units.
NUM_UNITS = 256

In [33]:
## ENCODER
# English token ids -> embeddings -> bidirectional GRU. Only the final output is kept;
# with the default merge mode the two directions are concatenated, giving a vector of
# size 2 * NUM_UNITS that summarises the source sentence.
# mask_zero = True marks id 0 (padding) as masked so the GRU skips padded timesteps.
# The input tensor is named `encoder_input` to avoid shadowing the builtin `input`.
encoder_input = Input(shape = (ENGLISH_SEQUENCE_LENGTH, ), dtype = 'int64', name = 'input_1')
x = Embedding(VOCAB_SIZE, EMBEDDING_DIM, mask_zero = True)(encoder_input)
encoded_input = Bidirectional(GRU(NUM_UNITS, ))(x)

## DECODER
# French token ids shifted right by the start marker (teacher forcing). The GRU has
# NUM_UNITS * 2 units so the encoder summary can be used directly as its initial state.
# mask_zero = True propagates a padding mask to the output, so padded positions are
# excluded from the loss and the accuracy metric instead of dominating them.
shifted_target = Input(shape = (FRENCH_SEQUENCE_LENGTH, ), dtype = 'int64', name = 'input_2')
x = Embedding(VOCAB_SIZE, EMBEDDING_DIM, mask_zero = True)(shifted_target)
x = GRU(NUM_UNITS * 2, return_sequences = True)(x, initial_state = encoded_input)

## OUTPUT
# Per-position probability distribution over the French vocabulary.
x = Dropout(0.5)(x)
target = Dense(VOCAB_SIZE, activation = 'softmax')(x)
seq2seq_gru = Model([encoder_input, shifted_target], target)
seq2seq_gru.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 64)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 64)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 64, 300)      6000000     ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 64, 300)      6000000     ['input_2[0][0]']                
                                                                                              

In [34]:
# The targets are integer token ids rather than one-hot vectors, hence the sparse
# variant of categorical cross-entropy against the per-position softmax output.
seq2seq_gru.compile(optimizer= Adam(learning_rate= 1e-4),
              loss = 'sparse_categorical_crossentropy',
              metrics = ['accuracy']
              )

In [None]:
# Train for 15 epochs, evaluating on the held-out validation batches after each epoch.
history = seq2seq_gru.fit(train_dataset, epochs = 15, validation_data = val_dataset)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
# Persist the trained model to Google Drive so it can be reloaded without retraining.
seq2seq_gru.save('/content/drive/MyDrive/Deep_Learning/NLP/Neural_Machine_Translation_with_RNN/model')



In [35]:
# Reload the saved model from Google Drive; this replaces whatever model object
# `seq2seq_gru` referred to before this cell ran.
seq2seq_gru = tf.keras.models.load_model('/content/drive/MyDrive/Deep_Learning/NLP/Neural_Machine_Translation_with_RNN/model')

## Creating an index-to-word dictionary from the French vocabulary

In [36]:
# Lookup table from token id to French word: a word's position in the vocabulary
# list is its token id.
index_to_word = dict(enumerate(french_vectorize_layer.get_vocabulary()))

### Testing

In [37]:
def translator(english_sentence):
  """Translate an English sentence into French by greedy decoding.

  Decoding starts from the text 'starttoken'. At step i the text decoded so far
  is re-tokenized, the model is run on it, and the most probable word at output
  position i is appended. Decoding stops when the model emits 'endtoken' or the
  padding id, or after FRENCH_SEQUENCE_LENGTH steps.

  Args:
    english_sentence: the English text to translate, as a Python string.

  Returns:
    The decoded French words joined by single spaces. The string still begins
    with the literal 'starttoken' marker and never contains 'endtoken'.
  """
  tokenized_english_sentence = english_vectorize_layer([english_sentence])
  shifted_target = 'starttoken'

  for i in range(FRENCH_SEQUENCE_LENGTH):

    tokenized_shifted_target = french_vectorize_layer([shifted_target])

    # verbose = 0 keeps predict() from printing a progress bar on every decoding step.
    output = seq2seq_gru.predict([tokenized_english_sentence, tokenized_shifted_target], verbose = 0)
    # Only position i is needed: it is the prediction that follows the i + 1 tokens
    # currently in shifted_target.
    french_word_index = int(tf.argmax(output[0][i], axis = -1).numpy())

    # Id 0 is padding. Appending its (empty) word would not add a token to
    # shifted_target, so i would run ahead of the real sequence; treat it as end of output.
    if french_word_index == 0:
      break

    current_word = index_to_word[french_word_index]
    if current_word == 'endtoken':
      break
    shifted_target += ' '+ current_word

  return shifted_target

In [38]:
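### The model output has one row per position (64) and one column per vocabulary word (20000): each row is a softmax probability distribution over the vocabulary. We use argmax to select the most probable word at a position.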

In [39]:
translator("Roses are red Violet is blue I don't know what else to do")



'starttoken les roses sont [UNK] et je ne lai pas fait'

In [40]:
# Reverse lookup table: French word -> token id (its position in the vocabulary list).
word_index = {
    word: token_id
    for token_id, word in enumerate(french_vectorize_layer.get_vocabulary())
}

In [41]:
word_index['football']

1181

In [42]:
french_vectorize_layer.get_vocabulary()[918], french_vectorize_layer.get_vocabulary()[49]

('quels', 'sont')

In [43]:
french_vectorize_layer.get_vocabulary()[7]

'que'

## BLEU Score

In [40]:

# class BLEU(tf.keras.metrics.Metric):
#     def __init__(self,name='bleu_score'):
#         super(BLEU,self).__init__()
#         self.bleu_score=0

#     def update_state(self,y_true,y_pred,sample_weight=None):
#       y_pred=tf.argmax(y_pred,-1)
#       self.bleu_score=0
#       for i,j in zip(y_pred,y_true):
#         tf.autograph.experimental.set_loop_options()

#         total_words=tf.math.count_nonzero(i)
#         total_matches=0
#         for word in i:
#           if word==0:
#             break
#           for q in range(len(j)):
#             if j[q]==0:
#               break
#             if word==j[q]:
#               total_matches+=1
#               j=tf.boolean_mask(j,[False if y==q else True for y in range(len(j))])
#               break

#         self.bleu_score+=total_matches/total_words

#     def result(self):
#         return self.bleu_score/BATCH_SIZE

<!--  -->

In [None]:
# seq2seq_gru.compile(
#     loss=tf.keras.losses.SparseCategoricalCrossentropy(),
#     optimizer=tf.keras.optimizers.Adam(5e-4),)
#     #metrics=[BLEU()],
#     #run_eagerly=True)
