In [0]:
from google.colab import drive
drive.mount('/content/drive')

data_loc = '/content/drive/My Drive/data_cs583_seq2seq'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Home 4: Build a seq2seq model for machine translation.

### Name: Weronika Zamlynny

### Task: Translate English to Polish

## 0. You will do the following:

1. Read and run my code.
2. Complete the code in Section 1.1 and Section 4.2.

    * Translation English to **German** is not acceptable!!! Try another language.
    
3. **Make improvements.** Directly modify the code in Section 3. Do at least one of the followings. By doing more, you will get up to 2 bonus scores to the total.

    * Bi-LSTM instead of LSTM
    
    * Multi-task learning (e.g., both English to French and English to Spanish)
    
    * Attention
    
4. Evaluate the translation using the BLEU score. 

    * Optional. Up to 2 bonus scores to the total.
    
5. Convert the notebook to .HTML file. 

    * The HTML file must contain the code and the output after execution.

6. Put the .HTML file in your own Github repo. 

7. Submit the link to the HTML file to Canvas

    * E.g., https://github.com/wangshusen/CS583A-2019Spring/blob/master/homework/HM4/seq2seq.html
    


## 1. Data preparation

1. Download data (e.g., "deu-eng.zip") from http://www.manythings.org/anki/
2. Unzip the .ZIP file.
3. Put the .TXT file (e.g., "deu.txt") in the directory "./Data/".

In [0]:
import re
import string
from unicodedata import normalize
import numpy

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text


# split a loaded document into sentences
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in  lines]
    return pairs

# THIS HAS AN ISSUE WITH POLISH CHARACTERS
def clean_data(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # THIS CAUSES WORDS TO LOSE IMPORTANT CHARACTERS!
            # normalize unicode characters
#             line = normalize('NFD', line).encode('ascii', 'ignore')          
#             line = line.decode('UTF-8')
            
            # tokenize on white space
            line = line.split()
            # convert to lowercase
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            
            # remove non-printable chars form each token
#             line = [re_print.sub('', w) for w in line]
            
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            # store as string
            clean_pair.append(' '.join(line))
        
        cleaned.append(clean_pair)
    return numpy.array(cleaned)

### 1.1. Load and clean text


#### Fill the following blanks:

Wanted to use languages in the same family as polish (Slavic Languages), but many use the Cyrillic alphabet which would complicated encodings. Tried loading in Sloviakian, but the data set only had 600 entries, which limits its usefulness in training (Would have to remove many terms from the other languages to have a consistent set of translations. 

In [0]:
# e.g., filename = 'Data/deu.txt'
filename_pol = data_loc + '/pol.txt' # Polish
filename_fra = data_loc + '/fra.txt' # French
filename_spa = data_loc + '/spa.txt' # Spanish

In [0]:
# def load_clean_text(filename):
#   # load dataset
#   doc = load_doc(filename)

#   # split into Language1-Language2 pairs
#   pairs = to_pairs(doc)

#   # clean sentences
#   clean_pairs = clean_data(pairs) #[0:n_train, :]
  
#   len_data = len(pairs) # This is how much data we have
#   print("Total number of data points:", len_data)
    
#   # Sample of read in data
#   for i in range(3000, 3010):
#     print('[' + clean_pairs[i, 0] + '] => [' + clean_pairs[i, 1] + ']')
    
#   # Create input_texts and target_texts
#   input_texts = clean_pairs[:, 0]
#   target_texts = numpy.array(['\t' + text + '\n' for text in clean_pairs[:, 1]])

#   print('Length of input_texts:  ' + str(input_texts.shape))
#   print('Length of target_texts: ' + str(input_texts.shape))
  
#   # define max sequence lengths
#   max_encoder_seq_length = max(len(line) for line in input_texts)
#   max_decoder_seq_length = max(len(line) for line in target_texts)

#   print('max length of input  sentences: %d' % (max_encoder_seq_length))
#   print('max length of target sentences: %d' % (max_decoder_seq_length))
  

In [0]:
# load dataset
doc_pol = load_doc(filename_pol)
doc_fra = load_doc(filename_fra)
doc_spa = load_doc(filename_spa)

# split into Language1-Language2 pairs
pairs_pol = to_pairs(doc_pol)
pairs_fra = to_pairs(doc_fra)
pairs_spa = to_pairs(doc_spa)

# clean sentences
clean_pairs_pol = clean_data(pairs_pol) #[0:n_train, :]
clean_pairs_fra = clean_data(pairs_fra)
clean_pairs_spa = clean_data(pairs_spa)

In [0]:
len_data = len(pairs_pol) # This is how much data we have
print("Total number of data points (pol):", len_data)

print("Total number of data points (fra):", len(pairs_fra)) # fra
print("Total number of data points (spa):", len(pairs_spa)) # spa

Total number of data points (pol): 37426
Total number of data points (fra): 167130
Total number of data points (spa): 120614


In [0]:
for i in range(3000, 3010):
    print('[' + clean_pairs_pol[i, 0] + '] => [' + clean_pairs_pol[i, 1] + ']')

[please sit down] => [usiądź proszę]
[please sit down] => [proszę usiądź]
[please tell me] => [proszę powiedz mi]
[raise your hand] => [podnieś rękę]
[remove your hat] => [zdejmij kapelusz]
[school bores me] => [szkoła mnie nudzi]
[school bores me] => [nudzę się w szkole]
[see you about] => [do zobaczenia około siódmej]
[sharks eat fish] => [rekiny jedzą ryby]
[she admired him] => [ona go podziwiała]


In [0]:
for i in range(500, 510):
    print('[' + clean_pairs_fra[i, 0] + '] => [' + clean_pairs_fra[i, 1] + ']')

[come over] => [viens chez nous]
[come over] => [venez chez nous]
[come over] => [viens chez moi]
[come over] => [venez chez moi]
[come soon] => [viens bientôt]
[come soon] => [venez bientôt]
[cool down] => [calmezvous]
[did i win] => [aije gagné]
[did i win] => [laije emporté]
[did i win] => [estce moi qui ai gagné]


In [0]:
for i in range(3000, 3010):
    print('[' + clean_pairs_spa[i, 0] + '] => [' + clean_pairs_spa[i, 1] + ']')

[im prepared] => [estoy preparada]
[im rational] => [soy racional]
[im relieved] => [me siento aliviado]
[im restless] => [soy inquieto]
[im restless] => [estoy desasosegado]
[im restless] => [soy intranquilo]
[im shopping] => [estoy comprando]
[im sleeping] => [estoy durmiendo]
[im so alone] => [estoy tan solo]
[im so happy] => [soy tan feliz]


In [0]:
# # Convert the other languages to dictionaries to easily check for translations
# # this can't handle mutliple definitions...
# dict_fra = {pair[0]: pair[1] for pair in clean_pairs_fra}
# dict_spa = {pair[0]: pair[1] for pair in clean_pairs_spa}

In [0]:
# # TODO ONLY KEEP TERMS IN ALL LANGUAGES
# # count the overlap
# count_overlap = 0

# for i in range(len(clean_pairs_pol)):
#   if clean_pairs_pol[i, 0] in dict_fra and clean_pairs_pol[i, 0] in dict_spa:
# #     print(clean_pairs_pol[i,0])
#     count_overlap += 1;
# print("Overlap:", count_overlap)

Overlap: 8465


In [0]:
input_texts = clean_pairs_pol[:, 0]
target_texts = numpy.array(['\t' + text + '\n' for text in clean_pairs_pol[:, 1]])

print('Length of input_texts:  ' + str(input_texts.shape))
print('Length of target_texts: ' + str(input_texts.shape))

Length of input_texts:  (37426,)
Length of target_texts: (37426,)


In [0]:
max_encoder_seq_length = max(len(line) for line in input_texts)
max_decoder_seq_length = max(len(line) for line in target_texts)

print('max length of input  sentences: %d' % (max_encoder_seq_length))
print('max length of target sentences: %d' % (max_decoder_seq_length))

max length of input  sentences: 255
max length of target sentences: 222


**Remark:** To this end, you have two lists of sentences: input_texts and target_texts

## 2. Text processing

### 2.1. Convert texts to sequences

- Input: A list of $n$ sentences (with max length $t$).
- It is represented by a $n\times t$ matrix after the tokenization and zero-padding.

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# encode and pad sequences
def text2sequences(max_len, lines):
    tokenizer = Tokenizer(char_level=True, filters='')
    tokenizer.fit_on_texts(lines)
    seqs = tokenizer.texts_to_sequences(lines)
    seqs_pad = pad_sequences(seqs, maxlen=max_len, padding='post')
    return seqs_pad, tokenizer.word_index


encoder_input_seq, input_token_index = text2sequences(max_encoder_seq_length, 
                                                      input_texts)
decoder_input_seq, target_token_index = text2sequences(max_decoder_seq_length, 
                                                       target_texts)

print('shape of encoder_input_seq: ' + str(encoder_input_seq.shape))
print('shape of input_token_index: ' + str(len(input_token_index)))
print('shape of decoder_input_seq: ' + str(decoder_input_seq.shape))
print('shape of target_token_index: ' + str(len(target_token_index)))

Using TensorFlow backend.


shape of encoder_input_seq: (37426, 255)
shape of input_token_index: 30
shape of decoder_input_seq: (37426, 222)
shape of target_token_index: 40


In [0]:
num_encoder_tokens = len(input_token_index) + 1
num_decoder_tokens = len(target_token_index) + 1

print('num_encoder_tokens: ' + str(num_encoder_tokens))
print('num_decoder_tokens: ' + str(num_decoder_tokens))

num_encoder_tokens: 31
num_decoder_tokens: 41


**Remark:** To this end, the input language and target language texts are converted to 2 matrices. 

- Their number of rows are both n_train.
- Their number of columns are respective max_encoder_seq_length and max_decoder_seq_length.

The followings print a sentence and its representation as a sequence.

In [0]:
target_texts[100]

'\todsuń się\n'

In [0]:
decoder_input_seq[100, :]

array([11,  4, 17, 10, 22, 34,  1, 10,  3, 24, 12,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0], dtype=int32)

## 2.2. One-hot encode

- Input: A list of $n$ sentences (with max length $t$).
- It is represented by a $n\times t$ matrix after the tokenization and zero-padding.
- It is represented by a $n\times t \times v$ tensor ($t$ is the number of unique chars) after the one-hot encoding.

In [0]:
from keras.utils import to_categorical

# one hot encode target sequence
def onehot_encode(sequences, max_len, vocab_size):
    n = len(sequences)
    data = numpy.zeros((n, max_len, vocab_size))
    for i in range(n):
        data[i, :, :] = to_categorical(sequences[i], num_classes=vocab_size)
    return data

encoder_input_data = onehot_encode(encoder_input_seq, max_encoder_seq_length, num_encoder_tokens)
decoder_input_data = onehot_encode(decoder_input_seq, max_decoder_seq_length, num_decoder_tokens)

decoder_target_seq = numpy.zeros(decoder_input_seq.shape)
decoder_target_seq[:, 0:-1] = decoder_input_seq[:, 1:]
decoder_target_data = onehot_encode(decoder_target_seq, 
                                    max_decoder_seq_length, 
                                    num_decoder_tokens)

print(encoder_input_data.shape)
print(decoder_input_data.shape)

(37426, 255, 31)
(37426, 222, 41)


### Random train / valid / test split

In [0]:
# Create a random mask that can be used to split each part of data
msk = numpy.random.rand(len_data)

train_msk = (msk <= 0.6)
valid_msk = numpy.logical_and(msk > 0.6, msk <= 0.8)
test_msk = (msk > 0.8)

del msk

In [0]:
train_encoder_input_data = encoder_input_data[train_msk,:,:]
valid_encoder_input_data = encoder_input_data[valid_msk,:,:]
test_encoder_input_data = encoder_input_data[test_msk,:,:]

del encoder_input_data


train_decoder_input_data = decoder_input_data[train_msk,:,:]
valid_decoder_input_data = decoder_input_data[valid_msk,:,:]
test_decoder_input_data = decoder_input_data[test_msk,:,:]

del decoder_input_data

train_decoder_target_data = decoder_target_data[train_msk,:,:]
valid_decoder_target_data = decoder_target_data[valid_msk,:,:]
test_decoder_target_data = decoder_target_data[test_msk,:,:]

del decoder_target_data

train_input_texts = input_texts[train_msk]
valid_input_texts = input_texts[valid_msk]
test_input_texts = input_texts[test_msk]

del input_texts

train_target_texts = target_texts[train_msk]
valid_target_texts = target_texts[valid_msk]
test_target_texts = target_texts[test_msk]

del target_texts

print('shape of train_encoder_input_seq', train_encoder_input_data.shape)
print('shape of valid_encoder_input_seq', valid_encoder_input_data.shape)
print('shape of test_encoder_input_seq', test_encoder_input_data.shape)

print('shape of train_decoder_input_seq', train_decoder_input_data.shape)
print('shape of valid_decoder_input_seq', valid_decoder_input_data.shape)
print('shape of test_decoder_input_seq', test_decoder_input_data.shape)

shape of train_encoder_input_seq (22335, 255, 31)
shape of valid_encoder_input_seq (7465, 255, 31)
shape of test_encoder_input_seq (7626, 255, 31)
shape of train_decoder_input_seq (22335, 222, 41)
shape of valid_decoder_input_seq (7465, 222, 41)
shape of test_decoder_input_seq (7626, 222, 41)


## 3. Build the networks (for training)

- Build encoder, decoder, and connect the two modules to get "model". 

- Fit the model on the bilingual data to train the parameters in the encoder and decoder.

### 3.1. Encoder network

- Input:  one-hot encode of the input language

- Return: 

    -- output (all the hidden states   $h_1, \cdots , h_t$) are always discarded
    
    -- the final hidden state  $h_t$
    
    -- the final conveyor belt $c_t$

In [0]:
from keras.layers import Input, LSTM, Bidirectional, Concatenate
from keras.models import Model

latent_dim = 256

# inputs of the encoder network
encoder_inputs = Input(shape=(None, num_encoder_tokens), 
                       name='encoder_inputs')

# # set the LSTM layer
# encoder_lstm = LSTM(latent_dim, return_state=True, 
#                     dropout=0.5, name='encoder_lstm')
# _, state_h, state_c = encoder_lstm(encoder_inputs)

encoder_bilstm = Bidirectional(LSTM(latent_dim, return_state=True, 
                                  dropout=0.5, name='encoder_lstm'))
_, forward_h, forward_c, backward_h, backward_c = encoder_bilstm(encoder_inputs)

state_h = Concatenate(name="state_h")([forward_h, backward_h])
state_c = Concatenate(name="state_c")([forward_c, backward_c])

# build the encoder network model
encoder_model = Model(inputs=encoder_inputs, 
                      outputs=[state_h, state_c],
                      name='encoder')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Print a summary and save the encoder network structure to "./encoder.pdf"

In [0]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot, plot_model

SVG(model_to_dot(encoder_model, show_shapes=False).create(prog='dot', format='svg'))

plot_model(
    model=encoder_model, show_shapes=False,
    to_file= data_loc+'/encoder.pdf'
)

encoder_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     (None, None, 31)     0                                            
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) [(None, 512), (None, 589824      encoder_inputs[0][0]             
__________________________________________________________________________________________________
state_h (Concatenate)           (None, 512)          0           bidirectional_1[0][1]            
                                                                 bidirectional_1[0][3]            
__________________________________________________________________________________________________
state_c (Concatenate)           (None, 512)          0           bidirectional_1[0][2]            
          

### 3.2. Decoder network

- Inputs:  

    -- one-hot encode of the target language
    
    -- The initial hidden state $h_t$ 
    
    -- The initial conveyor belt $c_t$ 

- Return: 

    -- output (all the hidden states) $h_1, \cdots , h_t$

    -- the final hidden state  $h_t$ (discarded in the training and used in the prediction)
    
    -- the final conveyor belt $c_t$ (discarded in the training and used in the prediction)

In [0]:
from keras.layers import Input, LSTM, Dense
from keras.models import Model

# inputs of the decoder network
decoder_input_h = Input(shape=(latent_dim*2,), name='decoder_input_h')
decoder_input_c = Input(shape=(latent_dim*2,), name='decoder_input_c')
decoder_input_x = Input(shape=(None, num_decoder_tokens), name='decoder_input_x')

# set the LSTM layer
decoder_lstm = LSTM(latent_dim*2, return_sequences=True, 
                    return_state=True, dropout=0.5, name='decoder_lstm')
decoder_lstm_outputs, state_h, state_c = decoder_lstm(decoder_input_x, 
                      initial_state=[decoder_input_h, decoder_input_c])

# set the dense layer
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_lstm_outputs)

# build the decoder network model
decoder_model = Model(inputs=[decoder_input_x, decoder_input_h, decoder_input_c],
                      outputs=[decoder_outputs, state_h, state_c],
                      name='decoder')

Print a summary and save the encoder network structure to "./decoder.pdf"

In [0]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot, plot_model

SVG(model_to_dot(decoder_model, show_shapes=False).create(prog='dot', format='svg'))

plot_model(
    model=decoder_model, show_shapes=False,
    to_file= data_loc + '/decoder.pdf'
)

decoder_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
decoder_input_x (InputLayer)    (None, None, 41)     0                                            
__________________________________________________________________________________________________
decoder_input_h (InputLayer)    (None, 512)          0                                            
__________________________________________________________________________________________________
decoder_input_c (InputLayer)    (None, 512)          0                                            
__________________________________________________________________________________________________
decoder_lstm (LSTM)             [(None, None, 512),  1134592     decoder_input_x[0][0]            
                                                                 decoder_input_h[0][0]            
          

### 3.3. Connect the encoder and decoder

In [0]:
# input layers
encoder_input_x = Input(shape=(None, num_encoder_tokens), name='encoder_input_x')
decoder_input_x = Input(shape=(None, num_decoder_tokens), name='decoder_input_x')

# connect encoder to decoder
encoder_final_states = encoder_model([encoder_input_x])
decoder_lstm_output, _, _ = decoder_lstm(decoder_input_x, initial_state=encoder_final_states)
decoder_pred = decoder_dense(decoder_lstm_output)

model = Model(inputs=[encoder_input_x, decoder_input_x], 
              outputs=decoder_pred, 
              name='model_training')

In [0]:
print(state_h)
print(decoder_input_h)

Tensor("decoder_lstm/while/Exit_2:0", shape=(?, 512), dtype=float32)
Tensor("decoder_input_h_1:0", shape=(?, 512), dtype=float32)


In [0]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot, plot_model

SVG(model_to_dot(model, show_shapes=False).create(prog='dot', format='svg'))

plot_model(
    model=model, show_shapes=False,
    to_file= data_loc + '/model_training.pdf'
)

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input_x (InputLayer)    (None, None, 31)     0                                            
__________________________________________________________________________________________________
decoder_input_x (InputLayer)    (None, None, 41)     0                                            
__________________________________________________________________________________________________
encoder (Model)                 [(None, 512), (None, 589824      encoder_input_x[0][0]            
__________________________________________________________________________________________________
decoder_lstm (LSTM)             [(None, None, 512),  1134592     decoder_input_x[0][0]            
                                                                 encoder[1][0]                    
          

### 3.5. Fit the model on the bilingual dataset

- encoder_input_data: one-hot encode of the input language

- decoder_input_data: one-hot encode of the input language

- decoder_target_data: labels (left shift of decoder_input_data)

- tune the hyper-parameters

- stop when the validation loss stop decreasing.

In [0]:
print('shape of encoder_input_data' + str(train_encoder_input_data.shape))
print('shape of decoder_input_data' + str(train_decoder_input_data.shape))
print('shape of decoder_target_data' + str(train_decoder_target_data.shape))

shape of encoder_input_data(22335, 255, 31)
shape of decoder_input_data(22335, 222, 41)
shape of decoder_target_data(22335, 222, 41)


In [0]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

model.fit([train_encoder_input_data, train_decoder_input_data],  # training data
          train_decoder_target_data,                       # labels (left shift of the target sequences)
          batch_size=64, epochs=10, 
          validation_data=[[valid_encoder_input_data, valid_decoder_input_data],
                           valid_decoder_target_data])
# TODO change back to 50

model.save('seq2seq.h5')

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 22335 samples, validate on 7465 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  '. They will not be included '


## 4. Make predictions


### 4.1. Translate English to XXX

1. Encoder read a sentence (source language) and output its final states, $h_t$ and $c_t$.
2. Take the [star] sign "\t" and the final state $h_t$ and $c_t$ as input and run the decoder.
3. Get the new states and predicted probability distribution.
4. sample a char from the predicted probability distribution
5. take the sampled char and the new states as input and repeat the process (stop if reach the [stop] sign "\n").

In [0]:
# Reverse-lookup token index to decode sequences back to something readable.
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

In [0]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)

    target_seq = numpy.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['\t']] = 1.

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # this line of code is greedy selection
        # try to use multinomial sampling instead (with temperature)
        sampled_token_index = numpy.argmax(output_tokens[0, -1, :])
        
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char
 
        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        target_seq = numpy.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        states_value = [h, c]

    return decoded_sentence


In [0]:
for seq_index in range(2100, 2120):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = valid_encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('English:       ', valid_input_texts[seq_index])
    print('Polish (true): ', valid_target_texts[seq_index][1:-1])
    print('Polish (pred): ', decoded_sentence[0:-1])

-
English:        he found me a nice tie
Polish (true):  znalazł dla mnie przyjemny krawat
Polish (pred):  powiedziałem że to jest ten zamierzy
-
English:        he ignores my problems
Polish (true):  on ignoruje moje problemy
Polish (pred):  on jest za stary za stanie
-
English:        he is a very smart boy
Polish (true):  on jest bardzo bystrym chłopcem
Polish (pred):  jest ten za stary po francusku
-
English:        he is an office worker
Polish (true):  on jest pracownikiem biurowym
Polish (pred):  on jest za stary za stanie
-
English:        he is speaking english
Polish (true):  on mówi po angielsku
Polish (pred):  on jest za mnie za mnie za mnie
-
English:        he isnt afraid to die
Polish (true):  nie boi się śmierci
Polish (pred):  on jest za stary za stanie
-
English:        he jumped on the train
Polish (true):  wskoczył do pociągu
Polish (pred):  powiedziałem że to jest ten zamierzy
-
English:        he learned how to swim
Polish (true):  nauczył się jak pływać
Polish (p

### 4.2. Translate an English sentence to the target language

1. Tokenization
2. One-hot encode
3. Translate

In [0]:
input_sentence = 'why is that'
input_sentence_polish = "dlaczego"

input_sequence_length = len(input_sentence);
input_sequence, input_token_index = text2sequences(input_sequence_length, [input_sentence]) # TODO

input_x = onehot_encode([input_sequence], input_sequence_length, num_encoder_tokens) # TODO

translated_sentence = decode_sequence([input_x]) # TODO

print('source sentence is: ' + input_sentence)
print('translated sentence is: ' + translated_sentence)

source sentence is: why is that
translated sentence is: nie mam wiele



## 5. Evaluate the translation using BLEU score

Reference: 
- https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
- https://en.wikipedia.org/wiki/BLEU


**Hint:** Randomly partition the dataset to training, validation, and test. Evaluate the BLEU score using the test set.

In [0]:
from nltk.translate import bleu_score 
# has sentence_bleu and corpus_blue, use corpus on the test set
sentence_score = bleu_score.sentence_bleu(input_sentence_polish, translated_sentence)
print("Sentence Bleu:", sentence_score)

Sentence Bleu: 0.6803749333171202


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


### Translate the test set

In [0]:
test_translated = [decode_sequence(test_encoder_input_data[seq_index: seq_index + 1]) for seq_index in range(len(test_encoder_input_data))]

In [0]:
for seq_index in range(100, 105):
    input_seq = test_encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = test_translated[seq_index: seq_index + 1]
    print('-')
    print('English:       ', test_input_texts[seq_index])
    print('Polish (true): ', test_target_texts[seq_index][1:-1])
    print('Polish (pred): ', decoded_sentence[0])

-
English:        tom cheats
Polish (true):  tom oszukuje
Polish (pred):  tom powiedział mary że jestem zajęty

-
English:        tom is shy
Polish (true):  tom jest nieśmiały
Polish (pred):  tom jest mary przyjacielem

-
English:        tom jumped
Polish (true):  tom skoczył
Polish (pred):  tom powiedział mary że to jest toma

-
English:        tom sighed
Polish (true):  tom westchnął
Polish (pred):  tom jest mary przyjacielem

-
English:        were safe
Polish (true):  jesteśmy bezpieczni
Polish (pred):  jesteśmy przyjacielem



### Final Corpus Bleu Score

In [0]:
corpus_score = bleu_score.corpus_bleu(test_target_texts, test_translated)
print("Corpus Bleu:", corpus_score)

Corpus Bleu: 0.7629149920793389


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
