### https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html

In [14]:
import tensorflow
#from tensorflow.keras.model import Model
from tensorflow.keras.layers import Input, LSTM, Dense
import numpy as np
#from tensorflow.keras.applications.vgg19 import VGG19

# Training hyperparameters and dataset location.
batch_size = 64  # number of samples per training batch
epoch = 100  # number of training epochs
latent_dim = 256  # dimensionality of the latent encoding space
num_samples = 10_000  # number of samples to train on
data_path = "fra-eng/fra.txt"  # path to the sentence-pair text file

In [13]:
# Show which TensorFlow release is loaded in this kernel.
print(tensorflow.__version__)

2.4.0


In [25]:
# Load the tab-separated sentence pairs and collect the character vocabularies.
input_texts = []
target_texts = []
input_characters = set()
output_characters = set()
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')
# Read at most num_samples lines. The "- 1" leaves out the final element of
# the split (presumably the empty string after a trailing newline).
for line in lines[:min(num_samples, len(lines) - 1)]:
    # Only the first two tab-separated fields are used. Star-unpacking accepts
    # rows with or without extra trailing columns, whereas a fixed
    # three-name unpack fails on rows that have exactly two fields.
    input_text, target_text, *_ignored = line.split('\t')
    # "\t" marks the start of a target sequence and "\n" marks its end.
    target_text = '\t' + target_text + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    # set.update adds every character and ignores duplicates, so no explicit
    # membership test is required.
    input_characters.update(input_text)
    output_characters.update(target_text)

In [33]:
# Inspect the collected vocabularies and a short preview of the sentence pairs.
# Only the first few sentences are shown: printing every sample would flood
# the cell output and bloat the saved notebook.
preview_count = 10
print(len(output_characters))
print('****** input_characters *******')
print(input_characters)
print('****** output_characters *******')
print(output_characters)
print('****** input_texts *******')
print(input_texts[:preview_count])
print('****** target_texts *******')
print(target_texts[:preview_count])

93
****** input_characters *******
{'.', 'H', 'h', 'b', 'V', '-', 'i', 'N', 'l', '3', 'P', 'E', 'e', 'z', '1', '9', '0', 'B', 'W', 'I', 'm', 'g', '?', 'L', ',', '2', 'A', 'R', 'S', 'G', 'u', 'o', 't', 'Y', ':', '7', 'q', 'D', 'T', '&', 's', 'p', 'a', '"', 'é', 'y', 'M', 'U', "'", 'r', 'F', '!', 'f', 'J', '5', '%', 'Q', 'v', '8', ' ', 'w', 'j', 'n', 'x', 'c', 'k', 'C', 'K', 'O', '$', 'd'}
****** output_characters *******
{'Ç', 'É', ')', '\n', '’', 'm', 'g', '?', 'û', 'A', 'S', 'R', 'u', ':', 'Y', 'ù', 'é', 'y', '\u2009', 'M', 'À', 'r', 'J', '5', '%', '8', '-', 'â', '\u202f', '.', 'i', 'l', 'P', 'E', 'z', 'œ', ',', 'I', '2', '\xa0', 'G', 's', 'a', 'ç', 'U', '!', '«', 'v', 'w', 'n', 'x', 'c', 'C', 'O', '$', 'd', 'H', 'h', 'b', '3', 'o', 't', 'q', 'D', 'T', 'p', 'F', 'ô', 'î', 'ê', 'j', 'K', 'è', 'V', 'N', 'e', '1', '9', '0', 'B', 'ï', 'L', '»', 'à', '&', '(', "'", 'f', '\t', 'Q', ' ', 'k', 'Ê'}
****** input_texts *******
['Go.', 'Go.', 'Go.', 'Hi.', 'Hi.', 'Run!', 'Run!', 'Run!', 'Run!', 

In [35]:
# Turn the character collections into sorted lists (deterministic order).
input_characters = sorted(input_characters)
target_characters = sorted(output_characters)

# Vocabulary sizes for the encoder (input) and decoder (target) sides.
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)

# Length of the longest input text and of the longest target text.
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

In [36]:
# Print one "label value" line per dataset statistic.
dataset_summary = [
    ('Number of samples: ', len(input_texts)),
    ('Number of unique input tokens: ', num_encoder_tokens),
    ('Number of unique output tokens: ', num_decoder_tokens),
    ('Max sequence length for input: ', max_encoder_seq_length),
    ('Max sequence length for outputs: ', max_decoder_seq_length),
]
for summary_label, summary_value in dataset_summary:
    print(summary_label, summary_value)

Number of samples:  10000
Number of unique input tokens:  71
Number of unique output tokens:  93
Max sequence length for input:  15
Max sequence length for outputs:  59


In [37]:
# Map every character to its integer position in the corresponding character list.
input_token_index = {char: idx for idx, char in enumerate(input_characters)}
target_token_index = {char: idx for idx, char in enumerate(target_characters)}

In [40]:
# Display both character-to-index mappings; the target mapping is printed
# under a label line, the input mapping is printed bare.
print(input_token_index)
print('target_token_index:\n',target_token_index)

{' ': 0, '!': 1, '"': 2, '$': 3, '%': 4, '&': 5, "'": 6, ',': 7, '-': 8, '.': 9, '0': 10, '1': 11, '2': 12, '3': 13, '5': 14, '7': 15, '8': 16, '9': 17, ':': 18, '?': 19, 'A': 20, 'B': 21, 'C': 22, 'D': 23, 'E': 24, 'F': 25, 'G': 26, 'H': 27, 'I': 28, 'J': 29, 'K': 30, 'L': 31, 'M': 32, 'N': 33, 'O': 34, 'P': 35, 'Q': 36, 'R': 37, 'S': 38, 'T': 39, 'U': 40, 'V': 41, 'W': 42, 'Y': 43, 'a': 44, 'b': 45, 'c': 46, 'd': 47, 'e': 48, 'f': 49, 'g': 50, 'h': 51, 'i': 52, 'j': 53, 'k': 54, 'l': 55, 'm': 56, 'n': 57, 'o': 58, 'p': 59, 'q': 60, 'r': 61, 's': 62, 't': 63, 'u': 64, 'v': 65, 'w': 66, 'x': 67, 'y': 68, 'z': 69, 'é': 70}
target_token_index:
 {'\t': 0, '\n': 1, ' ': 2, '!': 3, '$': 4, '%': 5, '&': 6, "'": 7, '(': 8, ')': 9, ',': 10, '-': 11, '.': 12, '0': 13, '1': 14, '2': 15, '3': 16, '5': 17, '8': 18, '9': 19, ':': 20, '?': 21, 'A': 22, 'B': 23, 'C': 24, 'D': 25, 'E': 26, 'F': 27, 'G': 28, 'H': 29, 'I': 30, 'J': 31, 'K': 32, 'L': 33, 'M': 34, 'N': 35, 'O': 36, 'P': 37, 'Q': 38, 'R': 

https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html

1) Turn the sentences into 3 NumPy arrays: `encoder_input_data`, `decoder_input_data`, and `decoder_target_data`:

encoder_input_data is a 3D array of shape (num_pairs, max_english_sentence_length, num_english_characters) containing a one-hot vectorization of the English sentences.

decoder_input_data is a 3D array of shape (num_pairs, max_french_sentence_length, num_french_characters) containing a one-hot vectorization of the French sentences.

decoder_target_data is the same as decoder_input_data but offset by one timestep. decoder_target_data[:, t, :] will be the same as decoder_input_data[:, t + 1, :].

In [42]:
# Allocate zero-filled float32 arrays shaped
# (number of texts, max sequence length, number of tokens).
num_pairs = len(input_texts)
encoder_shape = (num_pairs, max_encoder_seq_length, num_encoder_tokens)
encoder_input_data = np.zeros(encoder_shape, dtype=np.float32)

# The two decoder-side arrays share the target sequence length and target
# token count; each is a separate allocation.
decoder_shape = (num_pairs, max_decoder_seq_length, num_decoder_tokens)
decoder_input_data = np.zeros(decoder_shape, dtype=np.float32)
decoder_target_data = np.zeros(decoder_shape, dtype=np.float32)