<a href="https://colab.research.google.com/github/umar-ali-k/Machine-Translation/blob/master/Machine_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Translate French sentences into English sentences

## Import necessary packages

In [0]:

%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
tf.__version__

import numpy as np
import math


## Install tensorflow 1.5

In [0]:
pip install tensorflow==1.5


# Import the Keras classes needed to build and train the model

In [0]:
# from tf.keras.models import Model  
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

## Get the dataset

In [2]:
import os
work_dir = "/content/TensorFlow-Tutorials/"
if os.getcwd() != work_dir:
  !git clone https://github.com/Hvass-Labs/TensorFlow-Tutorials
os.chdir(work_dir)

Cloning into 'TensorFlow-Tutorials'...
remote: Enumerating objects: 9, done.[K
remote: Counting objects:  11% (1/9)[Kremote: Counting objects:  22% (2/9)[Kremote: Counting objects:  33% (3/9)[Kremote: Counting objects:  44% (4/9)[Kremote: Counting objects:  55% (5/9)[Kremote: Counting objects:  66% (6/9)[Kremote: Counting objects:  77% (7/9)[Kremote: Counting objects:  88% (8/9)[Kremote: Counting objects: 100% (9/9)[Kremote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 480 (delta 2), reused 5 (delta 2), pack-reused 471[K
Receiving objects: 100% (480/480), 62.52 MiB | 31.49 MiB/s, done.
Resolving deltas: 100% (223/223), done.


Import the `europarl.py` helper module from the cloned repository.

In [0]:
import europarl

## The language code is 'fr', which means we are going to translate French sentences into English sentences

In [0]:

# Language code passed to the europarl helper functions below.
language_code='fr'

In [0]:

# Start- and end-markers passed to europarl.load_data() for the destination
# texts. Their stripped forms are later looked up in the destination
# tokenizer's vocabulary to get the start/end integer-tokens.
mark_start = 'ssss '
mark_end = ' eeee'

## This will automatically download and extract the data-files if you don't have them already.

In [5]:
# Fetch and unpack the data files for the chosen language pair.
europarl.maybe_download_and_extract(language_code=language_code)

- Download progress: 100.0%
Download finished. Extracting files.
Done.


## Load the texts for the source-language, here we use French.

In [0]:
# english=False selects the non-English side of the data, i.e. the French texts.
data_src = europarl.load_data(english=False,
                              language_code=language_code)

## Load the texts for the destination-language, here we use English.

In [0]:

# english=True selects the English texts. The start/end markers are handed to
# the loader; the example output for data_dest[idx] below shows each text
# beginning with 'ssss' and ending with 'eeee'.
data_dest = europarl.load_data(english=True,
                               language_code=language_code,
                               start=mark_start,
                               end=mark_end)

We will build a model to translate from the source language (French) to the destination language (English). To translate in the opposite direction, simply swap the source and destination data.

## Example Data


In [10]:
# Index of the sentence pair shown as an example in this cell and the next.
idx = 2
data_src[idx]
# This is the French source sentence.

'Comme vous avez pu le constater, le grand "bogue de l\'an 2000" ne s\'est pas produit. En revanche, les citoyens d\'un certain nombre de nos pays ont été victimes de catastrophes naturelles qui ont vraiment été terribles.'

In [11]:
data_dest[idx]
# This is the matching English sentence, wrapped in the start/end markers.

"ssss Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful. eeee"

# Tokenizer
Neural Networks cannot work directly on text-data. We use a two-step process to convert text into numbers that can be used in a neural network. The first step is to convert text-words into so-called integer-tokens. The second step is to convert integer-tokens into vectors of floating-point numbers using a so-called embedding-layer. See Tutorial #20 for a more detailed explanation.

In [0]:
# Maximum number of words in the vocabulary: only the `num_words` most
# frequent words in the data-set are used. The same number is used for both
# the source and destination languages, but these could be different.
num_words = 10000

In [0]:
import numpy as np
class TokenizerWrap(Tokenizer):
    """Keras Tokenizer extended with padded token arrays and reverse lookups."""

    def __init__(self, texts, padding,
                 reverse=False, num_words=None):
        """
        Fit the vocabulary on `texts` and pre-compute their padded tokens.

        :param texts: List of text-strings to build the vocabulary from.
        :param padding: Either 'pre' or 'post', where zeros are padded.
        :param reverse: If True, reverse every token-sequence.
        :param num_words: Max number of words to keep in the vocabulary.
        """

        Tokenizer.__init__(self, num_words=num_words)

        # Build the vocabulary.
        self.fit_on_texts(texts)

        # Inverse mapping: integer-token -> word.
        self.index_to_word = {index: word
                              for word, index in self.word_index.items()}

        # Integer-token lists for all texts (lengths vary per text).
        self.tokens = self.texts_to_sequences(texts)

        if reverse:
            self.tokens = [sequence[::-1] for sequence in self.tokens]

        # After reversal the end of the original text sits at the front,
        # so over-long sequences are cut there; otherwise cut at the end.
        truncating = 'pre' if reverse else 'post'

        # Length of every token-sequence.
        self.num_tokens = [len(sequence) for sequence in self.tokens]

        # Common length for all sequences: mean + 2 std.dev. of the lengths.
        # A compromise that saves a lot of memory while only a small share
        # of the sequences has to be truncated.
        length_mean = np.mean(self.num_tokens)
        length_std = np.std(self.num_tokens)
        self.max_tokens = int(length_mean + 2 * length_std)

        # 2-dim numpy matrix with every sequence padded / truncated
        # to the common length.
        self.tokens_padded = pad_sequences(self.tokens,
                                           maxlen=self.max_tokens,
                                           padding=padding,
                                           truncating=truncating)

    def token_to_word(self, token):
        """Return the word for one integer-token (a space for token 0)."""

        if token == 0:
            return " "
        return self.index_to_word[token]

    def tokens_to_string(self, tokens):
        """Join the words of a list of integer-tokens, skipping zeros."""

        return " ".join(self.index_to_word[token]
                        for token in tokens
                        if token != 0)

    def text_to_tokens(self, text, reverse=False, padding=False):
        """
        Convert a single text-string to a 2-dim array of integer-tokens,
        optionally reversed and optionally padded to `self.max_tokens`.
        """

        # texts_to_sequences() expects a list, so wrap the single text.
        tokens = np.array(self.texts_to_sequences([text]))

        if reverse:
            tokens = np.flip(tokens, axis=1)

        # Reversed sequences are truncated at the beginning,
        # others at the end.
        truncating = 'pre' if reverse else 'post'

        if padding:
            # Note that zeros are always padded at the beginning here.
            tokens = pad_sequences(tokens,
                                   maxlen=self.max_tokens,
                                   padding='pre',
                                   truncating=truncating)

        return tokens

Now create a tokenizer for the source-language. Note that we pad zeros at the beginning ('pre') of the sequences. We also reverse the sequences of tokens because the research literature suggests that this might improve performance, because the last words seen by the encoder match the first words produced by the decoder, so short-term dependencies are supposedly modelled more accurately.

In [16]:
%%time
# Fits the vocabulary and tokenizes all source texts, so this is a slow step
# (about two minutes in the recorded run).
tokenizer_src = TokenizerWrap(texts=data_src,
                              padding='pre',
                              reverse=True,
                              num_words=num_words)

CPU times: user 2min 3s, sys: 1.52 s, total: 2min 5s
Wall time: 2min 5s


Now create the tokenizer for the destination language. We need a tokenizer for both the source- and destination-languages because their vocabularies are different. Note that this tokenizer does not reverse the sequences and it pads zeros at the end ('post') of the arrays.

In [17]:
%%time
# Fits the vocabulary and tokenizes all destination texts; also a slow step.
tokenizer_dest = TokenizerWrap(texts=data_dest,
                               padding='post',
                               reverse=False,
                               num_words=num_words)

CPU times: user 1min 23s, sys: 1.61 s, total: 1min 25s
Wall time: 1min 25s


In [18]:
# The padded 2-dim token matrices created by the tokenizers:
# one row per text, one column per token position.
tokens_src = tokenizer_src.tokens_padded
tokens_dest = tokenizer_dest.tokens_padded
print(tokens_src.shape)
print(tokens_dest.shape)

(2007723, 55)
(2007723, 56)


This is the integer-token used to mark the beginning of a text in the destination-language.

In [19]:
# strip() drops the marker's trailing space before the vocabulary lookup.
token_start = tokenizer_dest.word_index[mark_start.strip()]
token_start

2

This is the integer-token used to mark the end of a text in the destination-language.

In [21]:
# strip() drops the marker's leading space before the vocabulary lookup.
token_end = tokenizer_dest.word_index[mark_end.strip()]
token_end

3

## Training Data
Now that the data-set has been converted to sequences of integer-tokens that are padded and truncated and saved in numpy arrays, we can easily prepare the data for use in training the neural network.

In [0]:
# The encoder is fed the reversed, pre-padded source tokens unchanged.
encoder_input_data = tokens_src


The input and output data for the decoder is identical, except shifted one time-step. We can use the same numpy array to save memory by slicing it, which merely creates different 'views' of the same data in memory.

In [24]:
# All time-steps except the last one are the decoder's input.
decoder_input_data = tokens_dest[:, :-1]
decoder_input_data.shape

(2007723, 55)

In [25]:
# All time-steps except the first one are the decoder's target, so the target
# at each position is the token that follows the input token at that position.
decoder_output_data = tokens_dest[:, 1:]
decoder_output_data.shape

(2007723, 55)

# Create the Neural Network

# Create the Encoder
First we create the encoder-part of the neural network which maps a sequence of integer-tokens to a "thought vector". We will use the so-called functional API of Keras for this, where we first create the objects for all the layers of the neural network and then we connect them later, this allows for more flexibility than the so-called sequential API in Keras, which is useful when experimenting with more complicated architectures and ways of connecting the encoder and decoder.

This is the input for the encoder which takes batches of integer-token sequences. The None indicates that the sequences can have arbitrary length.

In [0]:
# The layer name is also the key used for this input in the x_data dict
# that is passed to model_train.fit().
encoder_input = Input(shape=(None, ), name='encoder_input')

In [0]:
# Length of the embedding vector that each integer-token is mapped to.
embedding_size = 128

# input_dim is the vocabulary size, so tokens range over [0, num_words).
encoder_embedding = Embedding(input_dim=num_words,
                              output_dim=embedding_size,
                              name='encoder_embedding')
# Size of the internal state of every GRU layer, and therefore also the
# size of the "thought vector" handed from the encoder to the decoder.
state_size = 512

This creates the 3 GRU layers that will map from a sequence of embedding-vectors to a single "thought vector" which summarizes the contents of the input-text. Note that the last GRU-layer does not return a sequence.

In [0]:
# The first two layers return full sequences so the next GRU layer gets one
# input per time-step.
encoder_gru1 = GRU(state_size, name='encoder_gru1',
                   return_sequences=True)
encoder_gru2 = GRU(state_size, name='encoder_gru2',
                   return_sequences=True)
# The last layer returns only its final output: the "thought vector".
encoder_gru3 = GRU(state_size, name='encoder_gru3',
                   return_sequences=False)

In [0]:
#This helper-function connects all the layers of the encoder

def connect_encoder():
    """Chain the encoder layers together and return the encoder's output.

    Uses the module-level layer objects: the input-layer is passed through
    the embedding-layer and then through the three GRU-layers in order.

    :return: Output tensor of the last GRU-layer (the "thought vector").
    """
    encoder_layers = (encoder_embedding,
                      encoder_gru1,
                      encoder_gru2,
                      encoder_gru3)

    # Feed the output of each layer into the next one.
    tensor = encoder_input
    for layer in encoder_layers:
        tensor = layer(tensor)

    return tensor

In [31]:
# Build the encoder graph. This tensor becomes the decoder's initial state
# when the training model is connected.
encoder_output = connect_encoder()

Instructions for updating:
keep_dims is deprecated, use keepdims instead


## Create the Decoder
Create the decoder-part which maps the "thought vector" to a sequence of integer-tokens.

The decoder takes two inputs. First it needs the "thought vector" produced by the encoder which summarizes the contents of the input-text.

In [0]:
# Input for a "thought vector" of length state_size. translate() feeds the
# encoder's prediction to the decoder under this layer name.
decoder_initial_state = Input(shape=(state_size,),
                              name='decoder_initial_state')

In [0]:
# Input for the decoder's integer-token sequences. The None allows sequences
# of arbitrary length.
decoder_input = Input(shape=(None, ), name='decoder_input')

In [0]:
# Separate embedding for the destination language; same vocabulary size and
# embedding length as used for the encoder.
decoder_embedding = Embedding(input_dim=num_words,
                              output_dim=embedding_size,
                              name='decoder_embedding')

This creates the 3 GRU layers of the decoder. Note that they all return sequences because we ultimately want to output a sequence of integer-tokens that can be converted into a text-sequence.



In [0]:
# Unlike the encoder, every decoder GRU returns the full sequence, because
# one output is needed per time-step.
decoder_gru1 = GRU(state_size, name='decoder_gru1',
                   return_sequences=True)
decoder_gru2 = GRU(state_size, name='decoder_gru2',
                   return_sequences=True)
decoder_gru3 = GRU(state_size, name='decoder_gru3',
                   return_sequences=True)

In [0]:
# One output per word in the vocabulary. The activation is linear (raw
# logits) because the loss function sparse_cross_entropy() calls
# tf.nn.sparse_softmax_cross_entropy_with_logits, which takes logits.
decoder_dense = Dense(num_words,
                      activation='linear',
                      name='decoder_output')

In [0]:

def connect_decoder(initial_state):
    """Chain the decoder layers together and return the decoder's output.

    Uses the module-level layer objects: the decoder's input-layer is
    embedded, passed through the three GRU-layers (each one started from
    the same `initial_state`), and finally through the dense output layer.

    :param initial_state: Tensor used as the initial state of every
        GRU-layer in the decoder.
    :return: Output tensor of the dense layer, with one value per word in
        the vocabulary for each time-step.
    """
    tensor = decoder_embedding(decoder_input)

    # Every GRU-layer is initialised with the same state.
    for gru_layer in (decoder_gru1, decoder_gru2, decoder_gru3):
        tensor = gru_layer(tensor, initial_state=initial_state)

    return decoder_dense(tensor)

## Connect and Create the Models

In [0]:
# Training model: the encoder's output is wired directly into the decoder as
# its initial state, so encoder and decoder are trained together.
decoder_output = connect_decoder(initial_state=encoder_output)

model_train = Model(inputs=[encoder_input, decoder_input],
                    outputs=[decoder_output])

# Stand-alone encoder, needed by translate() to map the input tokens to the
# "thought vector". It shares its layers (and weights) with model_train.
model_encoder = Model(inputs=[encoder_input],
                      outputs=[encoder_output])

# Stand-alone decoder, needed by translate() to decode from a "thought
# vector" that is fed in through the decoder_initial_state input. It reuses
# the same decoder layers (and weights) as model_train. A separate name is
# used for its output so the training model's decoder_output stays intact.
decoder_output_inference = connect_decoder(initial_state=decoder_initial_state)

model_decoder = Model(inputs=[decoder_input, decoder_initial_state],
                      outputs=[decoder_output_inference])

## Loss Function

In [0]:
def sparse_cross_entropy(y_true, y_pred):
    """Mean sparse softmax cross-entropy between targets and logits.

    :param y_true: Integer class labels (the target integer-tokens).
    :param y_pred: Unscaled logits with one value per class along the last
        axis; the softmax is applied inside the TensorFlow op.
    :return: Scalar tensor: the cross-entropy averaged over all elements.
    """
    # Cross-entropy for every individual label / logits-vector pair.
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y_true,
        logits=y_pred)

    # Reduce to a single scalar, as needed for a loss value.
    return tf.reduce_mean(cross_entropy)

## Compile the Training Model

In [0]:
# RMSprop optimizer with a learning rate of 1e-3.
optimizer = RMSprop(lr=1e-3)

In [0]:
# Placeholder for the integer target tokens; both dimensions are left
# unspecified. It is passed to compile() via target_tensors.
decoder_target = tf.placeholder(dtype='int32', shape=(None, None))

In [50]:
# Compile with the custom loss, and use the integer placeholder as the target
# tensor so the targets are integer-tokens instead of one-hot arrays.
model_train.compile(optimizer=optimizer,
                    loss=sparse_cross_entropy,
                    target_tensors=[decoder_target])

Instructions for updating:
keep_dims is deprecated, use keepdims instead


## Train the Model

In [0]:
# Training inputs, keyed by the names of the model's input-layers.
x_data = {
    'encoder_input': encoder_input_data,
    'decoder_input': decoder_input_data,
}

In [0]:
# Training targets, keyed by the name of the model's output-layer.
y_data = {
    'decoder_output': decoder_output_data,
}

In [53]:
# We want a validation-set of 10000 sequences, but Keras needs this number
# as a fraction of the whole data-set.
validation_split = 10000 / len(encoder_input_data)
validation_split

0.004980766769121039

In [0]:
# Train for 7 epochs in batches of 512 sequences, holding out the
# validation_split fraction of the data for validation.
model_train.fit(x=x_data,
                y=y_data,
                batch_size=512,
                epochs=7,
                validation_split=validation_split)

# Translate Texts

In [0]:
def translate(input_text, true_output_text=None):
    """Translate a single text-string and print the result.

    The text is tokenized with `tokenizer_src`, encoded with
    `model_encoder`, and then decoded one token at a time with
    `model_decoder` by always picking the highest-scoring token, until
    the end-marker is produced or `tokenizer_dest.max_tokens` tokens
    have been generated. These are module-level objects that must exist
    before this function is called.

    :param input_text: Text-string in the source language.
    :param true_output_text: Optional reference translation, printed
        after the model's translation for comparison.
    :return: None. The texts are printed.
    """

    # Convert the input-text to integer-tokens, reversed and padded
    # in the same way as the encoder's training data.
    input_tokens = tokenizer_src.text_to_tokens(text=input_text,
                                                reverse=True,
                                                padding=True)

    # The encoder's output is used as the initial state
    # of the decoder's GRU layers.
    initial_state = model_encoder.predict(input_tokens)

    # Max number of tokens / words in the output sequence.
    max_tokens = tokenizer_dest.max_tokens

    # Pre-allocated decoder input for a batch with one sequence. It is
    # filled in one token per iteration below. The builtin int is used
    # as dtype because the np.int alias no longer exists in NumPy >= 1.24.
    shape = (1, max_tokens)
    decoder_input_data = np.zeros(shape=shape, dtype=int)

    # Wrap the input-data in a dict for clarity and safety, so we are
    # sure we input the data in the right order. The dict refers to the
    # same array that is updated in the loop, so it is created only once.
    x_data = {
        'decoder_initial_state': initial_state,
        'decoder_input': decoder_input_data
    }

    # The first input-token is the special start-token for 'ssss '.
    token_int = token_start

    # Initialize an empty output-text.
    output_text = ''

    # Initialize the number of tokens we have processed.
    count_tokens = 0

    # Generate tokens until the end-token or the max length is reached.
    while token_int != token_end and count_tokens < max_tokens:
        # Append the most recently sampled token to the decoder's input.
        decoder_input_data[0, count_tokens] = token_int

        # Input this data to the decoder and get the predicted output.
        decoder_output = model_decoder.predict(x_data)

        # Scores over the vocabulary for the current time-step.
        token_onehot = decoder_output[0, count_tokens, :]

        # Convert to an integer-token by taking the highest score.
        token_int = np.argmax(token_onehot)

        # Lookup the word corresponding to this integer-token.
        sampled_word = tokenizer_dest.token_to_word(token_int)

        # Append the word to the output-text. Note that this includes
        # the end-marker word when it is produced.
        output_text += " " + sampled_word

        # Increment the token-counter.
        count_tokens += 1

    # Print the input-text.
    print("Input text:")
    print(input_text)
    print()

    # Print the translated output-text.
    print("Translated text:")
    print(output_text)
    print()

    # Optionally print the true translated text.
    if true_output_text is not None:
        print("True output text:")
        print(true_output_text)
        print()