Setup: We want these packages installed beforehand

In [58]:
import numpy as np
import tensorflow as tf
import collections
import pandas as pd
import helper
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
!pip install visualkeras
import visualkeras
from sklearn.model_selection import train_test_split
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
smoothie = SmoothingFunction().method4



Loading in the smaller data-set

In [23]:
!cp drive/MyDrive/ColabNotebooks/finalproject/small_vocab_en.txt .
!cp drive/MyDrive/ColabNotebooks/finalproject/small_vocab_fr.txt .
# load data
english_sentences = []
french_sentences = []
with open('small_vocab_en.txt') as f:
    for line in f:
      english_sentences.append(line)

with open('small_vocab_fr.txt') as f:
    for line in f:
      french_sentences.append(line)

# print data stats
english_words_counter = collections.Counter([word for sentence in english_sentences for word in sentence.split()])
french_words_counter = collections.Counter([word for sentence in french_sentences for word in sentence.split()])

# this code was taken from the data source
print('{} English words.'.format(len([word for sentence in english_sentences for word in sentence.split()])))
print('{} unique English words.'.format(len(english_words_counter)))
print('10 Most common words in the English dataset:')
print('"' + '" "'.join(list(zip(*english_words_counter.most_common(10)))[0]) + '"')
print()
print('{} French words.'.format(len([word for sentence in french_sentences for word in sentence.split()])))
print('{} unique French words.'.format(len(french_words_counter)))
print('10 Most common words in the French dataset:')
print('"' + '" "'.join(list(zip(*french_words_counter.most_common(10)))[0]) + '"')




1823250 English words.
227 unique English words.
10 Most common words in the English dataset:
"is" "," "." "in" "it" "during" "the" "but" "and" "sometimes"

1961295 French words.
355 unique French words.
10 Most common words in the French dataset:
"est" "." "," "en" "il" "les" "mais" "et" "la" "parfois"


Loading in the much bigger europarl data-set.

In [25]:
# load doc into memory
!cp drive/MyDrive/ColabNotebooks/finalproject/europarl-v7.fr-en.en .
!cp drive/MyDrive/ColabNotebooks/finalproject/europarl-v7.fr-en.fr .
def load_doc(filename):
	# open the file as read only
	file = open(filename, mode='rt', encoding='utf-8')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# split a loaded document into sentences
def to_sentences(doc):
	return doc.strip().split('\n')

# shortest and longest sentence lengths
def sentence_lengths(sentences):
	lengths = [len(s.split()) for s in sentences]
	return min(lengths), max(lengths)

# load English data
filename = 'europarl-v7.fr-en.en'
doc = load_doc(filename)
en_sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(en_sentences)
print('English data: en_sentences=%d, min=%d, max=%d' % (len(en_sentences), minlen, maxlen))

# load French data
filename = 'europarl-v7.fr-en.fr'
doc = load_doc(filename)
fr_sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(fr_sentences)
print('French data: fr_sentences=%d, min=%d, max=%d' % (len(fr_sentences), minlen, maxlen))

English data: en_sentences=2007723, min=0, max=668
French data: fr_sentences=2007723, min=0, max=693


Tokenizing and Preprocessing

In [26]:
# we need to tokenize the data, we will use word id's as 
# we want to predict what a word will be
# a tokenizer converts each word into a word id. 
def tokenize(data):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(data)
  return tokenizer.texts_to_sequences(data), tokenizer

# we need to make sure all the sentences have the same the length, so we pad them
def pad(data, maxlen=None):
  return pad_sequences(data, maxlen=maxlen, padding='post')

# preprocessing is equivalent to first tokenizing, and then padding all the sentences in the data set
# we must also return the tokenizers so we can translate backward
def preprocess(x, y):
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)

    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)

    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)

    return preprocess_x, preprocess_y, x_tk, y_tk

preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer =\
    preprocess(english_sentences, french_sentences)

proc_en, proc_fr, en_tokenizer, fr_tokenizer = preprocess(en_sentences[:1000], fr_sentences[:1000])

max_en_sequence_length = proc_en.shape[1]
max_fr_sequence_length = proc_fr.shape[1]
en_vocab_size = len(en_tokenizer.word_index)
fr_vocab_size = len(fr_tokenizer.word_index)

# statistics
print('Big Data Preprocessed')
print("Max English sentence length:", max_en_sequence_length)
print("Max French sentence length:", max_fr_sequence_length)
print("English vocabulary size:", en_vocab_size)
print("French vocabulary size:", fr_vocab_size)

    
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

# statistics
print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)




Big Data Preprocessed
Max English sentence length: 116
Max French sentence length: 133
English vocabulary size: 3454
French vocabulary size: 4443
Data Preprocessed
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 199
French vocabulary size: 344


The out_to_txt function, takes in the logits from the model and returns the language representation of those logits based on the tokenizer. 

In [27]:
def out_to_txt(output, tokenizer):
  # turns output from neural network into text using the tokenizer
  index_to_words = {id: word for word, id in tokenizer.word_index.items()}
  index_to_words[0] = '<PAD>'
  return ([index_to_words[prediction] for prediction in np.argmax(output, 1)])


Building model 1, the simple GRU architecture. Please see paper for details. 

Reshaping the data

In [45]:
tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x[:, :, np.newaxis]
tmp_en = pad(proc_en, max_fr_sequence_length)
tmp_en = tmp_en[:, :, np.newaxis]

In [46]:
def GRU_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    
    # Hyperparameters
    learning_rate = 0.005
    
    model = Sequential()
    model.add(GRU(256, input_shape=input_shape[1:], return_sequences=True))
    model.add(GRU(256, return_sequences=True))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.25))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax'))) 

    # Compile model
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model

simple_rnn_small = GRU_model(
tmp_x.shape,
max_french_sequence_length,
english_vocab_size,
french_vocab_size)

simple_rnn_large = GRU_model(
tmp_en.shape,
max_fr_sequence_length,
en_vocab_size,
fr_vocab_size)


print(simple_rnn_small.summary())
print(simple_rnn_large.summary())


Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_34 (GRU)                 (None, 21, 256)           198912    
_________________________________________________________________
gru_35 (GRU)                 (None, 21, 256)           394752    
_________________________________________________________________
time_distributed_34 (TimeDis (None, 21, 1024)          263168    
_________________________________________________________________
dropout_17 (Dropout)         (None, 21, 1024)          0         
_________________________________________________________________
time_distributed_35 (TimeDis (None, 21, 344)           352600    
Total params: 1,209,432
Trainable params: 1,209,432
Non-trainable params: 0
_________________________________________________________________
None
Model: "sequential_18"
_________________________________________________________________
Layer (type)   

Fitting the model, see paper for details

In [48]:
#from sklearn.model_selection import train_test_split
#xgru_train, xgru_test, ygru_train, ygru_test = train_test_split(preproc_english_sentences, preproc_french_sentences, test_size=0.20, random_state=42)
simple_rnn_small.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fdf49a5f190>

Getting the predictions on the data-set from the model.

In [22]:
small_predictions = simple_rnn_small.predict(tmp_x)


Evaluating the simple GRU model

In [33]:
# prints example predictions with the correct ones for comparison
def print_examples(model, data):
  print("Prediction:")
  print(out_to_txt(model.predict(data[:1])[0], french_tokenizer))

  print("\nCorrect Translation:")
  print(french_sentences[:1])

  print("\nOriginal text:")
  print(english_sentences[:1])

print_examples(simple_rnn_small, tmp_x)

# removes the padding when comparing BLEU scores
def remove_pad(predictions):
  new_predictions = []
  for i, prediction in enumerate(predictions):
    new_prediction = []
    for word in prediction:
      if word == '<PAD>':
        break
      new_prediction.append(word)
    new_predictions.append(new_prediction)
  return new_predictions

# turns the logits into actual french sentences
def get_predictions(model, tokenizer, predictions):
  new_predictions = []
  print(predictions[0].shape)
  for i, prediction in enumerate(predictions):
     new_predictions.append(out_to_txt(prediction, tokenizer))
  return new_predictions

# calculates the overall BLEU score of the model
def bleu_score(predictions, originals):
  total = 0
  for i, prediction in enumerate(predictions):
    reference = [prediction]
    candidate = originals[i].split()
    score = sentence_bleu(reference, candidate, smoothing_function=smoothie)
    total += score
  return total / len(predictions) * 100


simple_rnn_predictions = get_predictions(simple_rnn_small, french_tokenizer, small_predictions)
y_predictions = french_sentences
print(simple_rnn_predictions[0])
print(y_predictions[0])

print("this is the bleu score of the simpleRNN model")
print(bleu_score(remove_pad(simple_rnn_predictions), y_predictions))














Prediction:
['new', 'jersey', 'est', 'parfois', 'chaud', 'en', 'cours', 'automne', 'il', 'est', 'il', 'est', 'avril', 'en', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']

Correct Translation:
["new jersey est parfois calme pendant l' automne , et il est neigeux en avril .\n"]

Original text:
['new jersey is sometimes quiet during autumn , and it is snowy in april .\n']
(21, 344)
['new', 'jersey', 'est', 'parfois', 'chaud', 'en', 'cours', 'automne', 'il', 'est', 'il', 'est', 'avril', 'en', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
new jersey est parfois calme pendant l' automne , et il est neigeux en avril .

this is the bleu score of the simpleRNN model
22.89173961928556


Model 2: Simple enc-dec model

In [49]:
def simple_encdec(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    learning_rate = 0.001

    model = Sequential()
    # Encoder
    model.add(GRU(256, input_shape=input_shape[1:], go_backwards=True))
    model.add(RepeatVector(output_sequence_length))
    # Decoder
    model.add(GRU(256, return_sequences=True))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))

    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    
    return model




Data Re-shaping

In [50]:
enc_x = pad(preproc_english_sentences, preproc_french_sentences.shape[1])
enc_x = enc_x.reshape((-1, preproc_french_sentences.shape[-2], 1))

In [51]:
encdec_simple = encdec(
    enc_x.shape,
    preproc_french_sentences.shape[1],
    len(english_tokenizer.word_index)+1,
    len(french_tokenizer.word_index)+1)

encdec_simple.summary()

Model: "sequential_19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_38 (GRU)                 (None, 256)               198912    
_________________________________________________________________
repeat_vector_11 (RepeatVect (None, 21, 256)           0         
_________________________________________________________________
gru_39 (GRU)                 (None, 21, 256)           394752    
_________________________________________________________________
time_distributed_38 (TimeDis (None, 21, 1024)          263168    
_________________________________________________________________
dropout_19 (Dropout)         (None, 21, 1024)          0         
_________________________________________________________________
time_distributed_39 (TimeDis (None, 21, 345)           353625    
Total params: 1,210,457
Trainable params: 1,210,457
Non-trainable params: 0
___________________________________________

Data fitting, and training

In [44]:
encdec_simple.fit(enc_x, preproc_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa65653d890>

Model 2 evaluation

In [53]:
encdec_predictions = encdec_simple.predict(enc_x)

In [62]:
def get_predictions(model, tokenizer, predictions):
  new_predictions = []
  print(predictions[0].shape)
  for i, prediction in enumerate(predictions):
     new_predictions.append(out_to_txt(prediction, tokenizer))
  return new_predictions

def bleu_score(predictions, originals):
  total = 0
  for i, prediction in enumerate(predictions):
    reference = [prediction]
    candidate = originals[i].split()
    score = sentence_bleu(reference, candidate, smoothing_function=smoothie)
    total += score
  return total / len(predictions) * 100

encdec_predictions = get_predictions(encdec_simple, french_tokenizer, encdec_predictions)
y_predictions = french_sentences



print("this is the bleu score of the simple encdec model")
print(bleu_score((encdec_predictions), y_predictions))

(21, 345)
this is the bleu score of the simple encdec model
0.3233405582777288


Example outputs from model 2

In [64]:
def print_examples(model, data):
  print("Prediction:")
  print(out_to_txt(model.predict(data[:1])[0], french_tokenizer))

  print("\nCorrect Translation:")
  print(french_sentences[:1])

  print("\nOriginal text:")
  print(english_sentences[:1])

print_examples(simple_rnn_small, tmp_x)



Prediction:
['new', 'jersey', 'est', 'parfois', 'calme', 'au', 'mois', 'de', "l'", 'automne', 'il', 'est', 'avril', 'en', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']

Correct Translation:
["new jersey est parfois calme pendant l' automne , et il est neigeux en avril .\n"]

Original text:
['new jersey is sometimes quiet during autumn , and it is snowy in april .\n']


Model 3: Combination encoder-decoder

In [81]:
def combo_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    # Hyperparameters
    learning_rate = 0.003 
    model = Sequential()
    # Embedding
    model.add(Embedding(english_vocab_size, 128, input_length=input_shape[1],
                         input_shape=input_shape[1:]))
    # Encoder
    model.add(Bidirectional(GRU(128)))
    model.add(RepeatVector(output_sequence_length))
    # Decoder
    model.add(Bidirectional(GRU(128, return_sequences=True)))
    model.add(TimeDistributed(Dense(512, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model



In [82]:
x_train, x_test, y_train, y_test = train_test_split(preproc_english_sentences, preproc_french_sentences, test_size=0.20, random_state=42)

In [84]:
improved_encdec = combo_model(x_train.shape,y_train.shape[1],
                        len(english_tokenizer.word_index)+1,
                        len(french_tokenizer.word_index)+1)

improved_encdec.summary()

Model: "sequential_22"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 15, 128)           25600     
_________________________________________________________________
bidirectional_16 (Bidirectio (None, 256)               198144    
_________________________________________________________________
repeat_vector_14 (RepeatVect (None, 21, 256)           0         
_________________________________________________________________
bidirectional_17 (Bidirectio (None, 21, 256)           296448    
_________________________________________________________________
time_distributed_44 (TimeDis (None, 21, 512)           131584    
_________________________________________________________________
dropout_22 (Dropout)         (None, 21, 512)           0         
_________________________________________________________________
time_distributed_45 (TimeDis (None, 21, 345)         

In [90]:
improved_encdec.fit(x_train, y_train, batch_size=1024, epochs=25, validation_split=0.2)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7fdc3c0639d0>

Advanced EncDec evaluation

In [116]:
improved_encdec_predictions = improved_encdec.predict(x_test)
def get_predictions(model, tokenizer, predictions):
  new_predictions = []
  for i, prediction in enumerate(predictions):
     new_predictions.append(out_to_txt(prediction, tokenizer))
  return new_predictions

def bleu_score(predictions, originals):
  total = 0
  for i, prediction in enumerate(predictions):
    reference = [prediction]
    candidate = originals[i]
    score = sentence_bleu(reference, candidate, smoothing_function=smoothie)
    total += score
  return total / len(predictions) * 100

def train_to_predictions(predictions):
  new_predictions = []
  y_id_to_word = {value: key for key, value in french_tokenizer.word_index.items()}
  y_id_to_word[0] = '<PAD>'
  for i, prediction in enumerate(predictions):
    new_predictions.append(([y_id_to_word[np.max(x)] for x in prediction]))
  return new_predictions

improved_encdec_predictions = get_predictions(improved_encdec, french_tokenizer, improved_encdec_predictions)
print(improved_encdec_predictions[0])
y_predictions = train_to_predictions(y_test)
print(y_predictions[0])



print("this is the bleu score of the advanced encdec model:")
print(bleu_score((improved_encdec_predictions), y_predictions))


['chine', 'est', 'généralement', 'occupé', 'en', 'septembre', 'mais', 'il', 'est', 'parfois', 'froid', 'au', 'printemps', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['chine', 'est', 'généralement', 'occupé', 'en', 'septembre', 'mais', 'il', 'est', 'parfois', 'froid', 'au', 'printemps', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
this is the bleu score of the advanced encdec model:
39.97476301527965
