In [45]:
import os, sys

from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, GRU, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [49]:
#On each line, the text file contains an English sentence and its French translation, separated by a tab.
# NOTE(review): this handle is not closed in the cells visible here, and processing()
# below opens 'fra.txt' on its own -- confirm whether `f` is still needed.
f = open('fra.txt', encoding="utf-8")

In [56]:
#f.readlines()

In [202]:
def processing(total_lines=40000, path='fra.txt'):
    '''
    Description:
        Read tab-separated sentence pairs from a corpus file and build the
        three parallel lists used by the encoder/decoder.
    Parameters:
        total_lines: maximum number of lines to read from the start of the
            file; must be an int larger than 0
        path: location of the corpus file, 'fra.txt' by default
    Return:
        3 lists:
        original_sent: source sentences (first field of each line)
        translated_sent: translated sentences (second field) with " <end>" appended
        reinput_sent: translated sentences with "<start> " prepended
    Raises:
        ValueError: total_lines is not an int larger than 0
        FileNotFoundError: the corpus file does not exist
    '''
    # Validate the type before comparing so that a non-numeric argument is
    # reported as a ValueError instead of failing inside the comparison.
    # An explicit raise (rather than assert) also survives `python -O`.
    if type(total_lines) != int or total_lines <= 0:
        raise ValueError("input must be larger than 0, int datatype")

    original_sent = list()
    translated_sent = list()
    reinput_sent = list()

    with open(path, encoding='utf-8') as f:
        # zip() with range() stops at total_lines or at end of file,
        # whichever comes first.
        for _, line in zip(range(total_lines), f):
            fields = line.rstrip().split('\t')
            if len(fields) < 2:
                continue  # not a sentence pair (blank line or no tab)
            # Any field after the first two (e.g. an attribution column) is ignored,
            # so lines with two fields or with more than three are accepted too.
            o_sent, t_sent = fields[0], fields[1]
            original_sent.append(o_sent)
            translated_sent.append(t_sent+" <end>") #end of sentence token. Important for encoder/decoder
            reinput_sent.append("<start> "+t_sent) #start of sentence token. Important for encoder/decoder
    return original_sent, translated_sent, reinput_sent
            
# Process up to the first 50,000 lines of the corpus.
(original_sent,
 translated_sent,
 reinput_sent) = processing(total_lines=50000)

LSTM encoder, LSTM decoder. Seq2Seq architecture.

https://www.guru99.com/seq2seq-model.html (Theory).

A Seq2Seq model is built from an encoder and a decoder. The encoder maps the sentence, word by word, to the indices of a vocabulary of known words. The decoder predicts the output by decoding the encoded input step by step, using the previous step's result as the next input whenever possible. With this method it is also possible to predict the next word and so build up a sentence. Each sentence is assigned a token that marks the end of the sequence, and the prediction likewise ends with a token that marks the end of the output. In short, the encoder passes its state to the decoder, which uses it to predict the output.

In [203]:
# Show one aligned example from each list: source, decoder target, decoder input.
print(original_sent[51], translated_sent[51], reinput_sent[51], sep="\n")

I lost.
J'ai perdu. <end>
<start> J'ai perdu.


In [204]:
#tokenization of our words (making a dictionary of word:unique value for LSTM/GRU to use)

def tokenization_input(input_sent,max_num_words=80000,filters=None):
    '''
    Description: 
        Fit a Keras Tokenizer on the encoder sentences so that every word
        corresponds to a unique integer, then encode the sentences with it.
    Parameters:
        input_sent: list of sentences to feed to the encoder
        max_num_words: passed to the Tokenizer as num_words (vocabulary cap)
        filters: optional string of characters to strip from the text before
            splitting; None (the default) keeps the Tokenizer's own default
    Return:
        word_indices: dictionary of word to unique value
        input_mapped: input_sent encoded as lists of those values
    '''
    tokenizer_options = {"num_words": max_num_words}
    # Only forward `filters` when the caller supplied one: passing None through
    # would replace the Tokenizer's default filter string.
    if filters is not None:
        tokenizer_options["filters"] = filters

    input_token = Tokenizer(**tokenizer_options)
    input_token.fit_on_texts(input_sent)
    input_mapped = input_token.texts_to_sequences(input_sent)
    word_indices = input_token.word_index
    print("There are {0} mapped words with the max requested of {1} unique words ".format(len(word_indices),max_num_words))
    return word_indices, input_mapped

word_indices, input_mapped = tokenization_input(original_sent)
print(max(map(len, word_indices))) #length of the longest word in the English dict

There are 6009 mapped words with the max requested of 80000 unique words 
15


In [205]:
#input_mapped[:5]
sorted(word_indices) # result is discarded: not the cell's last expression, so nothing is displayed
print(sorted(word_indices, key=len)[-1]) #longest word in our English dictionary

congratulations


In [211]:
#need the reinput_sent for the <start> token
def tokenization_output(output_sent,reinput_sent,max_num_words=80000):
    '''
    Description:
        Decoder-side counterpart of tokenization_input(): fit one Tokenizer on
        the target sentences and the re-input sentences together, so that
        both the <start> and the <end> token get an index, then encode each
        list with it.
        TODO: decide whether filters should be exposed as a parameter.
        Original note: max_num_words needs to be the same as in the input
        tokenization version.
        NOTE(review): the recorded cell output shows a whole sentence stored
        as a single vocabulary entry -- presumably the corpus contains
        non-breaking spaces that are not split on; confirm.
    Return:
        the word -> index dictionary, the encoded output_sent,
        the encoded reinput_sent
    '''
    report = "There are {0} mapped words with the max requested of {1} unique words "

    # filters='' means no characters are filtered out of the text
    tokenizer = Tokenizer(filters='', num_words=max_num_words)
    tokenizer.fit_on_texts(output_sent + reinput_sent)

    encoded_targets, encoded_inputs = (
        tokenizer.texts_to_sequences(texts) for texts in (output_sent, reinput_sent)
    )

    vocabulary = tokenizer.word_index
    print(report.format(len(vocabulary), max_num_words))
    return vocabulary, encoded_targets, encoded_inputs

(word_indices2,
 output_mapped,
 reinput_mapped) = tokenization_output(translated_sent, reinput_sent, max_num_words=20000)
print(max(map(len, word_indices2))) #length of the longest entry in the French dict

There are 17891 mapped words with the max requested of 20000 unique words 
46


In [210]:
# Preview the first 80 entries instead of dumping the whole mapping into the cell output.
dict(list(word_indices2.items())[:80])

{'<end>': 1,
 '<start>': 2,
 'je': 3,
 '?': 4,
 'pas': 5,
 'de': 6,
 'ne': 7,
 'vous': 8,
 'il': 9,
 'le': 10,
 'est': 11,
 '!': 12,
 'nous': 13,
 'suis': 14,
 'la': 15,
 'à': 16,
 'un': 17,
 'que': 18,
 'tom': 19,
 "c'est": 20,
 "j'ai": 21,
 'tu': 22,
 'a': 23,
 'en': 24,
 'me': 25,
 'une': 26,
 'ce': 27,
 'les': 28,
 'elle': 29,
 'ça': 30,
 'êtes': 31,
 'tout': 32,
 'qui': 33,
 'sont': 34,
 'mon': 35,
 'te': 36,
 'est-ce': 37,
 'fait': 38,
 'très': 39,
 'sommes': 40,
 'des': 41,
 'es': 42,
 'ils': 43,
 "n'est": 44,
 'pour': 45,
 'faire': 46,
 'votre': 47,
 'du': 48,
 'y': 49,
 'pas.': 50,
 'se': 51,
 'veux': 52,
 'peux': 53,
 'elles': 54,
 'été': 55,
 'êtes-vous': 56,
 'comment': 57,
 'ma': 58,
 'personne': 59,
 'pourquoi': 60,
 'besoin': 61,
 'tom.': 62,
 'ton': 63,
 'où': 64,
 'dans': 65,
 'ça.': 66,
 'avec': 67,
 'cette': 68,
 'si': 69,
 'était': 70,
 "l'air": 71,
 'avons': 72,
 'au': 73,
 'cela': 74,
 'être': 75,
 "n'ai": 76,
 'faut': 77,
 'trop': 78,
 'ai': 79,
 'vraiment': 80,


In [208]:
# Longest entry in the French dictionary.
# NOTE(review): the recorded output is a whole sentence rather than a single word -- investigate.
print(sorted(word_indices2, key=len)[-1])

je pense que nous nous en sommes bien sorties.


We need to pad the inputs so that the [Context Vector] has a fixed size. This is required unless we use attention in the seq2seq model.