<a href="https://colab.research.google.com/github/yahyasungur/nlp_dl_ml_projects/blob/master/en_to_fr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load IPython's autoreload extension so edited modules can be re-imported
# without restarting the kernel.
%load_ext autoreload
#%aimport helper, tests
# NOTE(review): autoreload mode 1 only reloads modules registered with
# %aimport; with the %aimport line above commented out, no module appears to
# be registered for reloading -- confirm whether mode 2 was intended.
%autoreload 1

In [None]:
import collections

import helper
import numpy as np
#import project_tests as tests
import pandas as pd
import os

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM
from keras.layers.embeddings import Embedding
from tensorflow.keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [None]:
def load_data(path):
    """Read a text file and return its content split into lines.

    The file is decoded explicitly as UTF-8 so that accented characters
    (e.g. in the French corpus) are read the same way on every platform
    instead of depending on the locale's default encoding.

    :param path: path of the text file to read
    :return: list of strings, one per newline-separated line; a file that
        ends with a newline therefore yields a trailing empty string
    """
    with open(path, "r", encoding="utf-8") as f:
        data = f.read()

    return data.split('\n')

In [None]:
# Show the compute devices TensorFlow reports for this runtime.
from tensorflow.python.client import device_lib

visible_devices = device_lib.list_local_devices()
print(visible_devices)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 4435678779321353526
]


In [None]:
# Load both sides of the corpus; each list holds one entry per line of its file.
english_sentences, french_sentences = (
    load_data(corpus_file) for corpus_file in ('small_vocab_en', 'small_vocab_fr')
)

In [None]:
# Preview five English/French pairs taken from indices 1..5 (index 0 is not shown).
for pair_index in range(1, 6):
  print("en:  ", english_sentences[pair_index])
  print("fr:  ", french_sentences[pair_index], '\n')

en:   the united states is usually chilly during july , and it is usually freezing in november .
fr:   les états-unis est généralement froid en juillet , et il gèle habituellement en novembre . 

en:   california is usually quiet during march , and it is usually hot in june .
fr:   california est généralement calme en mars , et il est généralement chaud en juin . 

en:   the united states is sometimes mild during june , and it is cold in september .
fr:   les états-unis est parfois légère en juin , et il fait froid en septembre . 

en:   your least liked fruit is the grape , but my least liked is the apple .
fr:   votre moins aimé fruit est le raisin , mais mon moins aimé est la pomme . 

en:   his favorite fruit is the orange , but my favorite is the grape .
fr:   son fruit préféré est l'orange , mais mon préféré est le raisin . 



In [None]:
#Tokenization

def tokenize(x):
    """Fit a new Keras ``Tokenizer`` on ``x`` and encode ``x`` with it.

    :param x: collection of text samples to fit on and encode
    :return: tuple ``(sequences, tokenizer)`` where ``sequences`` is the
        result of ``texts_to_sequences(x)`` and ``tokenizer`` is the fitted
        tokenizer
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(x)
    encoded = fitted_tokenizer.texts_to_sequences(x)
    return encoded, fitted_tokenizer

In [None]:
#Padding

def pad(x, length=None):
    """Pad sequences at the end (``padding='post'``) via ``pad_sequences``.

    :param x: sequences of token ids
    :param length: forwarded as ``maxlen``; ``None`` leaves the target
        length to ``pad_sequences``
    :return: the array produced by ``pad_sequences``
    """
    padded = pad_sequences(x, maxlen=length, padding='post')
    return padded

In [None]:
def preprocess(x, y):
    """Tokenize and pad the feature and label sentences.

    :param x: feature sentences
    :param y: label sentences
    :return: tuple ``(padded_x, padded_y, x_tokenizer, y_tokenizer)``;
        ``padded_y`` carries an extra trailing axis of size 1
    """
    x_ids, x_tk = tokenize(x)
    y_ids, y_tk = tokenize(y)

    x_padded = pad(x_ids)
    y_padded = pad(y_ids)

    # Keras's sparse_categorical_crossentropy expects 3-D labels, so add a
    # trailing axis of size 1 to the padded label ids.
    y_labels = y_padded.reshape(y_padded.shape + (1,))

    return x_padded, y_labels, x_tk, y_tk

(
    preproc_english_sentences,
    preproc_french_sentences,
    english_tokenizer,
    french_tokenizer,
) = preprocess(english_sentences, french_sentences)

# Padded sequence length is axis 1 of each preprocessed array.
max_english_sequence_length, max_french_sequence_length = (
    encoded.shape[1]
    for encoded in (preproc_english_sentences, preproc_french_sentences)
)

# Number of entries in each tokenizer's word_index (the model-building cell
# adds 1 to these counts when sizing its layers).
english_vocab_size, french_vocab_size = (
    len(tk.word_index) for tk in (english_tokenizer, french_tokenizer)
)

In [None]:
def logits_to_text(logits, tokenizer):
    """Turn per-position scores into a space-separated string of words.

    :param logits: 2-D scores; the arg-max along axis 1 picks the word id
        for each row
    :param tokenizer: object whose ``word_index`` maps word -> integer id
    :return: the decoded words joined by single spaces, with id 0 rendered
        as ``'<PAD>'``
    """
    id_to_word = dict(
        (word_id, word) for word, word_id in tokenizer.word_index.items()
    )
    id_to_word[0] = '<PAD>'

    best_ids = np.argmax(logits, 1)
    decoded = [id_to_word[word_id] for word_id in best_ids]
    return ' '.join(decoded)

In [None]:
#Custom model (Final version)

def model_final(input_shape, output_sequence_length, english_vocab_size, french_vocab_size, learning_rate=0.003):
  """Build and compile the embedding + bidirectional-GRU encoder/decoder model.

  :param input_shape: shape of the input array; ``input_shape[1]`` is used as
      the input sequence length
  :param output_sequence_length: number of time steps the encoder output is
      repeated for the decoder (``RepeatVector``)
  :param english_vocab_size: input dimension of the embedding layer
  :param french_vocab_size: number of units in the final softmax layer
  :param learning_rate: learning rate handed to the Adam optimizer; defaults
      to the previously hard-coded 0.003
  :return: the compiled ``Sequential`` model
  """
  model = Sequential()

  #embedding
  model.add(Embedding(english_vocab_size, 128, input_length=input_shape[1], input_shape=input_shape[1:]))

  #encoder: a single vector per sentence, repeated once per output time step
  model.add(Bidirectional(GRU(128)))
  model.add(RepeatVector(output_sequence_length))

  #decoder: one softmax distribution over the French vocabulary per time step
  model.add(Bidirectional(GRU(128, return_sequences=True)))
  model.add(TimeDistributed(Dense(512, activation='relu')))
  model.add(Dropout(0.5))
  model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))

  #compile
  model.compile(loss=sparse_categorical_crossentropy, optimizer=Adam(learning_rate), metrics=['accuracy'])

  return model

In [None]:
# Pad the English ids to the French sequence length and reshape to 2-D.
# NOTE(review): tmp_x is not referenced by the model-building or training
# cells visible in this notebook -- confirm whether it is still needed.
tmp_x = pad(preproc_english_sentences, preproc_french_sentences.shape[1]).reshape(
    (-1, preproc_french_sentences.shape[-2])
)

In [None]:
# Vocabulary sizes are the word_index counts plus one extra slot for id 0.
model = model_final(
    input_shape=preproc_english_sentences.shape,
    output_sequence_length=preproc_french_sentences.shape[1],
    english_vocab_size=len(english_tokenizer.word_index) + 1,
    french_vocab_size=len(french_tokenizer.word_index) + 1,
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 15, 128)           25600     
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               198144    
_________________________________________________________________
repeat_vector (RepeatVector) (None, 21, 256)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 21, 256)           296448    
_________________________________________________________________
time_distributed (TimeDistri (None, 21, 512)           131584    
_________________________________________________________________
dropout (Dropout)            (None, 21, 512)           0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 21, 345)           1

In [14]:
# Train on the preprocessed sentence pairs, holding out 20% for validation.
model.fit(
    preproc_english_sentences,
    preproc_french_sentences,
    batch_size=1024,
    epochs=25,
    validation_split=0.2,
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7ff3fbbf9d50>

In [15]:
# Short aliases for the preprocessed arrays and their tokenizers.
x, y = preproc_english_sentences, preproc_french_sentences
x_tk, y_tk = english_tokenizer, french_tokenizer

In [16]:
# Invert the French word -> id mapping for decoding; id 0 is shown as '<PAD>'.
y_id_to_word = dict(zip(y_tk.word_index.values(), y_tk.word_index.keys()))
y_id_to_word[0] = '<PAD>'

In [17]:
def translate(sent):
  """Translate one English sentence with the trained model.

  The sentence is encoded with ``x_tk.texts_to_sequences`` so it receives the
  same normalisation the tokenizer applied when it was fitted. With the
  default ``Tokenizer`` settings that means lower-casing and punctuation
  filtering, and words missing from the vocabulary are skipped instead of
  raising ``KeyError``.

  :param sent: English sentence as a single string
  :return: the predicted French words joined by spaces; padding positions
      appear as ``'<PAD>'``
  """
  encoded = x_tk.texts_to_sequences([sent])
  # Pad to the length the model was trained on (last axis of x).
  padded = pad_sequences(encoded, maxlen=x.shape[-1], padding='post')
  predictions = model.predict(padded, batch_size=len(padded))
  # For every output time step pick the most probable French word id.
  output = ' '.join([y_id_to_word[np.argmax(step_scores)] for step_scores in predictions[0]])
  return output

In [18]:
print(translate('he saw a old yellow truck'))

il a vu un vieux camion jaune <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
