    We followed the tutorial at https://machinelearningmastery.com/develop-neural-machine-translation-system-keras/
    to build our machine translator, adapting it to use our English-Spanish dataset.
    
    ##### Libraries needed:

In [1]:
from pickle import load
from numpy import array
import tensorflow
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
#from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
import numpy as np

#### Data Preparation for the Neural Network:
    The goal is to transform a sequence of words in English into a sequence of words in Spanish. In this case, the input (x) of the neural network is the sentences in English, and the output (y) should be the sentences in Spanish. (Note: the training cells further below encode the Spanish column as the input and the English column as the target, i.e. the reverse direction.)
    Since neural networks do not understand text, we need to transform the sentences into numbers.
    In order to do this, we will create a tokenizer for each language, from the vocabulary in the dataset.
    A tokenizer creates a numerical id for each word in its vocabulary, therefore each word will be represented by a number. This will be done for both languages (x and y).
    So, redefining what we said before, our input will be the corresponding ids (referenced by the tokenizer) of the words in the English sentences, and our output will be, for each output position, the probability of each word in the vocabulary of the target language (Spanish). We will perform the steps to do this:
    

In [2]:
def load_clean_sentences(filename):
    """Load a pickled dataset from ``filename`` and return it.

    The file is opened in binary mode and closed again as soon as the
    object has been unpickled (the previous version left the handle open).

    NOTE: unpickling can execute arbitrary code, so only use this on files
    you created yourself or otherwise trust.
    """
    with open(filename, 'rb') as handle:
        return load(handle)

In [3]:
def create_tokenizer(lines):
    """Return a Keras ``Tokenizer`` fitted on ``lines``.

    Fitting assigns an integer id to every word found in ``lines``; the
    tokenizer is created with its default settings, which filter out
    punctuation.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

##### Simple example to show how a tokenizer works:

In [5]:
# Four toy Spanish sentences used as a miniature corpus for the demo cells.
sent = ['hola que tal tal tal.', 'como estas hoy hola', 'mi perrito bello es','vamos a comer silpancho mas tarde']
# Fit a tokenizer on the toy corpus; `t` is reused by the demo cells below.
t = create_tokenizer(sent)

##### Viewing the ids assigned to each word from the example vocabulary:

In [6]:
# Number of texts the tokenizer was fitted on (4 toy sentences).
print("The document count",t.document_count)
# Number of distinct words seen while fitting.
print("The count of words",len(t.word_counts))
# Mapping word -> integer id; ids start at 1, and in the output below the
# most repeated words ('tal', 'hola') received the lowest ids.
print("The word index",t.word_index)

The document count 4
The count of words 16
The word index {'tal': 1, 'hola': 2, 'que': 3, 'como': 4, 'estas': 5, 'hoy': 6, 'mi': 7, 'perrito': 8, 'bello': 9, 'es': 10, 'vamos': 11, 'a': 12, 'comer': 13, 'silpancho': 14, 'mas': 15, 'tarde': 16}


In [7]:
def max_length(lines):
    """Return the largest number of whitespace-separated words in any line.

    Args:
        lines: iterable of strings (sentences).

    Returns:
        The word count of the longest sentence, or 0 when ``lines`` is
        empty (the previous version raised ``ValueError`` in that case).
    """
    return max((len(line.split()) for line in lines), default=0)

In [8]:
# Longest toy sentence ('vamos a comer silpancho mas tarde') has 6 words.
max_length(sent)

6

##### SENTENCE ENCODING:
    Once the tokenizer is created, we have to map the dataset's sentences to the ids from the tokenizer.
    Additionally, so that all input arrays have the same length (the maximum sentence length in the dataset), we add 0s at the end as padding:

In [9]:
def encode_sequences(tokenizer, length, lines):
    """Turn sentences into a zero-padded matrix of word ids.

    Args:
        tokenizer: fitted tokenizer used to map each word to its integer id.
        length: number of columns of the result (``maxlen`` for padding).
        lines: the sentences to encode.

    Returns:
        Array of shape ``(len(lines), length)``: one row of ids per
        sentence, with 0s appended at the end ('post' padding).
    """
    token_ids = tokenizer.texts_to_sequences(lines)
    return pad_sequences(token_ids, maxlen=length, padding='post')

In [10]:
# Encode four sentences with the toy tokenizer, padded to 10 columns.
# Words the tokenizer never saw ('asd', 'sdjid', 'chau') get no id, so the
# first row in the output below is all padding and the third keeps only 'vamos'.
ba=encode_sequences(t,10, ["asd sdjid","hola como estas", "chau vamos", "que tal estas"])
print(ba.shape)
print(ba)

(4, 10)
[[ 0  0  0  0  0  0  0  0  0  0]
 [ 2  4  5  0  0  0  0  0  0  0]
 [11  0  0  0  0  0  0  0  0  0]
 [ 3  1  5  0  0  0  0  0  0  0]]


##### The tutorial we followed used a one-hot encoding for the output y. This works for a small amount of data, but we will try to scale up the dataset later, so it is not convenient for us. Nevertheless, we did use it for the best experiment we found, in order to compare it with the simple integer-array output.

In [11]:
# to_categorical: builds the one-hot representation of an integer array.
# Here the classes 0..3 become the rows of a 4x4 identity matrix (see output).
a = tensorflow.keras.utils.to_categorical([0, 1, 2, 3], num_classes=4)
# Wrap the result in a constant tensor of shape (4, 4) before printing it.
a = tensorflow.constant(a, shape=[4, 4])
print(a)

tf.Tensor(
[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]], shape=(4, 4), dtype=float32)


In [12]:
def encode_output(sequences, vocab_size):
    """One-hot encode a matrix of word ids.

    Args:
        sequences: 2-D integer array of shape ``(n_sequences, seq_length)``,
            e.g. the result of ``encode_sequences``.
        vocab_size: number of classes; every id must be smaller than this.

    Returns:
        float64 array of shape ``(n_sequences, seq_length, vocab_size)``
        with a single 1 per (sequence, position) pair.

    The whole matrix is encoded with one ``to_categorical`` call. The
    previous version grew a flat buffer with ``np.append`` inside a loop,
    which re-copies everything encoded so far on every iteration.
    """
    n_sequences, seq_length = sequences.shape[0], sequences.shape[1]
    # float64 matches the dtype the np.append-based version produced.
    one_hot = to_categorical(sequences, num_classes=vocab_size, dtype='float64')
    # Explicit reshape keeps the 3-D layout even when seq_length == 1.
    return one_hot.reshape(n_sequences, seq_length, vocab_size)

##### As we can see in the following example, even for a small vocabulary, we already have a big one-hot matrix:

In [13]:
# One-hot encode three toy sentences padded to 10 positions with 15 classes.
# 'fine' and 'thanks' are not in the toy vocabulary, so every position of the
# first sentence is class 0 (padding) in the output below.
# NOTE(review): 15 is smaller than the toy vocabulary size; this only works
# because the ids used by these sentences are all below 15.
encode_output(encode_sequences(t,10,["fine thanks","hola como estas","hola que como tal como"]), 15)

array([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0

##### Sample model:
    This was the base model from the tutorial:
    LAYERS:
        -EMBEDDING
        -LSTM
        -REPEAT VECTOR
        -LSTM
        -TIME DISTRIBUTED (DENSE)

In [14]:
# Shape vocabulary used in the notes below:
#   None       - the batch dimension
#   batch size - number of samples in each batch during training/testing
#   timesteps  - number of values in a sequence, i.e. max words per sentence
#   features   - number of dimensions used to represent each value
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build the (uncompiled) encoder-decoder translation model.

    Args:
        src_vocab: size of the source vocabulary (word count + 1, because
            id 0 is reserved for padding).
        tar_vocab: size of the target vocabulary.
        src_timesteps: padded length of the source sentences.
        tar_timesteps: padded length of the target sentences.
        n_units: embedding dimension and number of units of both LSTMs.

    Returns:
        A ``Sequential`` model with these layers, in order:

        1. Embedding: (batch, src_timesteps) ->
           (batch, src_timesteps, n_units). ``mask_zero=True`` marks id 0
           as padding, so it cannot be used as a vocabulary index.
        2. LSTM (encoder): returns only its final output, (batch, n_units).
        3. RepeatVector: repeats that vector tar_timesteps times ->
           (batch, tar_timesteps, n_units).
        4. LSTM (decoder) with ``return_sequences=True``: returns the full
           sequence, (batch, tar_timesteps, n_units).
        5. TimeDistributed(Dense, softmax): applies the same dense layer at
           every timestep -> (batch, tar_timesteps, tar_vocab), i.e. one
           probability distribution over the target vocabulary per
           output position.

    Example sizes noted by the original author: src_vocab=3856,
    tar_vocab=2404, src_timesteps=10, tar_timesteps=5, n_units=256, which
    gives 3856*256 embedding weights and 256*2404 + 2404 = 617828 dense
    parameters.
    """
    # Earlier commented-out experiments used SimpleRNN in place of the LSTM
    # layers, and a plain Dense output instead of the TimeDistributed one.
    layer_stack = [
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    return Sequential(layer_stack)

In [16]:
# Random normal tensor of shape (32, 10, 8), printed only to show its shape.
# NOTE(review): presumably (batch, timesteps, features); `inputs` is not used
# anywhere else in the visible part of the notebook.
inputs = tensorflow.random.normal([32, 10, 8])
print("shape inputs", inputs.shape)


shape inputs (32, 10, 8)


In [None]:
def load_data(ds_filename, train_ds_fn, test_ds_fn):
    """Load the full, training and test datasets from their pickle files.

    Returns:
        Tuple ``(dataset, train, test)``, loaded in that order with
        ``load_clean_sentences``.
    """
    paths = (ds_filename, train_ds_fn, test_ds_fn)
    return tuple(load_clean_sentences(path) for path in paths)

In [None]:
def prepare_tokenizer(dataset):
    """Fit a tokenizer on the first column of ``dataset``.

    Returns:
        Tuple ``(tokenizer, vocab_size, max_sentence_length)`` where
        ``vocab_size`` is the number of known words plus one and
        ``max_sentence_length`` is the word count of the longest sentence
        in that column.
    """
    first_column = dataset[:, 0]
    fitted = create_tokenizer(first_column)
    return fitted, len(fitted.word_index) + 1, max_length(first_column)

In [None]:
def preprocess_input(origin_tok, origin_max_sent_length, target_tok, target_max_sent_length,target_vocab_size, data, one_hot=False):
    """Encode a two-column dataset into model inputs and targets.

    Column 1 of ``data`` is encoded with the origin tokenizer as the input
    X, and column 0 with the target tokenizer as the output Y.

    Args:
        one_hot: when true, Y is additionally one-hot encoded with
            ``target_vocab_size`` classes; otherwise it stays a matrix of ids.

    Returns:
        Tuple ``(dataX, dataY)``.
    """
    source_ids = encode_sequences(origin_tok, origin_max_sent_length, data[:, 1])
    target_ids = encode_sequences(target_tok, target_max_sent_length, data[:, 0])
    if not one_hot:
        return source_ids, target_ids
    return source_ids, encode_output(target_ids, target_vocab_size)


In [None]:
import matplotlib.pyplot as plt
def graph_loss_vs_epochs(history):
    """Plot training and validation loss against the epoch number.

    Args:
        history: object whose ``history`` dict holds per-epoch ``'loss'``
            and ``'val_loss'`` lists.
    """
    recorded = history.history
    train_curve, val_curve = recorded['loss'], recorded['val_loss']

    # Epochs are numbered from 1 on the x axis.
    epochs = range(1, len(train_curve) + 1)

    # Dashed red line for training loss, solid blue line for validation loss.
    for curve, line_style in ((train_curve, 'r--'), (val_curve, 'b-')):
        plt.plot(epochs, curve, line_style)
    plt.legend(['Training Loss', 'Test Loss'])
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.show()

In [None]:
def train_evaluate_model(trainX, trainY, testX,testY, epochs, batch_size, model, model_save_file_name):
    """Train ``model`` while validating on the test split.

    Args:
        trainX, trainY: training inputs and targets.
        testX, testY: validation inputs and targets.
        epochs: number of training epochs.
        batch_size: number of samples per gradient update.
        model: compiled Keras model to fit.
        model_save_file_name: path where the best model (lowest
            ``val_loss``) is checkpointed.

    Returns:
        The history object returned by ``model.fit``, which can be passed
        to ``graph_loss_vs_epochs``.

    The previous version built the checkpoint callback but never passed it
    to ``fit`` (so nothing was saved) and did not return the history.
    """
    checkpoint = ModelCheckpoint(
        model_save_file_name,
        monitor='val_loss',
        verbose=1,
        save_best_only=True,
        mode='min',
    )
    history = model.fit(
        trainX,
        trainY,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(testX, testY),
        callbacks=[checkpoint],
    )
    return history
    

In [None]:
def create_model(model, units,model_sum_im_name,loss_func='categorical_crossentropy',learning_rate=0.001):
    """Compile ``model`` with Adam, print its summary and save a diagram.

    Args:
        model: the Keras model to compile (modified in place).
        units: unused; kept so existing callers keep working.
        model_sum_im_name: file the model diagram is written to.
        loss_func: ``'categorical_crossentropy'`` for one-hot encoded
            targets, ``'sparse_categorical_crossentropy'`` for integer
            targets.
        learning_rate: learning rate of the Adam optimizer.
    """
    # The notebook only imports names *from* keras sub-modules, so the bare
    # name `keras` is not bound; import the optimizer explicitly.
    from keras.optimizers import Adam

    optimizer = Adam(learning_rate)
    model.compile(optimizer=optimizer, loss=loss_func, metrics=['acc'])
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()
    plot_model(model, to_file=model_sum_im_name, show_shapes=True)

##### Steps for training the model:

In [None]:
# Outline of the training pipeline built from the helper functions above.
# NOTE(review): this cell is a sketch and cannot run as written:
#   - ds_filename, train_ds_fn, test_ds_fn, origin_vocab_size,
#     target_vocab_size, origin_max_sent_length, target_max_sent_length and
#     units are not defined in the visible part of the notebook;
#   - the create_model call omits the model_sum_im_name argument;
#   - train_evaluate_model is called without any of its arguments.
# 1) load the full / train / test datasets
dataset,train,testset=load_data(ds_filename, train_ds_fn, test_ds_fn)
# 2) fit a tokenizer on the first dataset column
tokenizer,vocab_size,max_sentence_length = prepare_tokenizer(dataset)
# 3) build the network
model = define_model(origin_vocab_size, target_vocab_size, origin_max_sent_length, target_max_sent_length, units)
# 4) compile it and print its summary
create_model(model, units,loss_func='categorical_crossentropy',learning_rate=0.001)
# 5) train and validate
train_evaluate_model()

In [17]:
# Load the pickled datasets: the full corpus plus the train and test splits.
# Each is a 2-column array of (english, spanish) sentence pairs (see the
# shapes and sample printed in the next cell).
dataset = load_clean_sentences('english-spanish-both.txt')
train = load_clean_sentences('english-spanish-train.txt')
test = load_clean_sentences('english-spanish-test.txt')

In [18]:
# Show how many sentence pairs each split holds (rows) and the 2 columns.
print(train.shape, test.shape, dataset.shape)
# Peek at training pairs 100..199: column 0 is English, column 1 is Spanish.
print(train[100:200])

(8000, 2) (2000, 2) (117788, 2)
[['fine today' 'hoy hace bueno']
 ['can we go' 'podemos ir']
 ['you were busy' 'estabas ocupada']
 ['can you whistle' 'podes silbar']
 ['tom speak' 'tom no puede hablar']
 ['it like' 'como es']
 ['french' 'soy frances']
 ['i like them all' 'me gustan todos']
 ['it was my fault' 'fue mi culpa']
 ['who sells this' 'quien vende esto']
 ['is it a trap' 'acaso es una trampa']
 ['on a diet' 'estoy a dieta']
 ['new' 'estan nuevas']
 ['tom might faint' 'tom quizas podria desfallecer']
 ['tom saw me' 'tom me vio']
 ['how was boston' 'que tal en boston']
 ['how do you feel' 'como os sentis']
 ['do you trust me' 'confias en mi']
 ['go get it' 'vete a por ello']
 ['do it again' 'lo volveria a hacer']
 ['what a jerk' 'que pendejo']
 ['tom did not cry' 'tom no lloro']
 ['a book' 'eso es un libro']
 ['this is a pen' 'esto es un boligrafo']
 ['know' 'lo sabran']
 ['dad home' 'papa no esta en casa']
 ['i need ice' 'necesito hielo']
 ['i live in hyogo' 'yo vivo en hyogo']

In [19]:
# Prepare the English tokenizer from column 0 of the full dataset.
eng_tokenizer = create_tokenizer(dataset[:, 0])
# +1 because word ids start at 1 and id 0 is used for padding.
eng_vocab_size = len(eng_tokenizer.word_index) + 1
# Longest English sentence, in words; used as the padded sequence length.
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))


English Vocabulary Size: 12715
English Max Length: 47


In [20]:
# Prepare the Spanish tokenizer from column 1 of the full dataset.
spa_tokenizer = create_tokenizer(dataset[:, 1])
# +1 because word ids start at 1 and id 0 is used for padding.
spa_vocab_size = len(spa_tokenizer.word_index) + 1
# Longest Spanish sentence, in words; used as the padded sequence length.
spa_length = max_length(dataset[:, 1])
print('Spanish Vocabulary Size: %d' % spa_vocab_size)
print('Spanish Max Length: %d' % (spa_length))

Spanish Vocabulary Size: 24581
Spanish Max Length: 49


In [None]:
# Prepare validation data: Spanish sentences are the model input (X) and
# English sentences are the target (Y).
testX = encode_sequences(spa_tokenizer, spa_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
# One-hot encode the targets to shape (n_sentences, eng_length, eng_vocab_size).
# With the sizes printed above (2000 x 47 x 12715) that is about 1.2e9 values,
# so this step is very memory-hungry.
testY = encode_output(testY, eng_vocab_size)

In [None]:
# Original note: "uses the first 1000 sentences".
# NOTE(review): no such slicing is visible in this cell - confirm where the
# subset is taken.
# The notebook only imports names *from* keras sub-modules, so the bare name
# `keras` is not bound; import the optimizer explicitly.
from keras.optimizers import Adam

# define model: Spanish is the source language, English the target
learning_rate = 0.001
model = define_model(spa_vocab_size, eng_vocab_size, spa_length, eng_length, 256)
optimizer = Adam(learning_rate)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])
# categorical cross entropy -> one-hot encoded output
# sparse categorical cross entropy -> output as integers
# summarize defined model (summary() prints by itself and returns None)
model.summary()
#plot_model(model, to_file='model.png', show_shapes=True)

In [None]:
# Fit the model for 30 epochs with batches of 64, validating on the test split.
# NOTE(review): trainX and trainY are not defined anywhere in the visible part
# of the notebook - confirm they are prepared (like testX/testY) before this cell.
# Checkpointing of the best model is currently disabled (commented out):
#filename = 'model.h5'
#checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64,  validation_data=(testX, testY))#, callbacks=[checkpoint])#, verbose=2)

#### Making a prediction:

In [None]:
from numpy import argmax
def word_for_id(integer, tokenizer):
    """Map an integer id back to its word.

    Args:
        integer: word id as produced by the tokenizer.
        tokenizer: fitted Keras ``Tokenizer``.

    Returns:
        The word assigned to ``integer``, or ``None`` when no word has that
        id (e.g. the padding id 0).

    Uses the tokenizer's ``index_word`` reverse mapping, a single dict
    lookup, instead of scanning every entry of ``word_index`` on each call.
    """
    return tokenizer.index_word.get(integer)
 
def predict_sequence(model, tokenizer, source):
    """Translate one encoded source sentence into target-language text.

    Args:
        model: trained model; ``predict`` is called on ``source`` and only
            the first prediction of the batch is used.
        tokenizer: target-language tokenizer used to map ids back to words.
        source: encoded source sentence(s), as built by ``encode_sequences``.

    Returns:
        The predicted words joined by single spaces. Decoding stops at the
        first position whose most likely id has no word (e.g. padding id 0).
    """
    probabilities = model.predict(source, verbose=0)[0]
    words = []
    # Pick the most likely word id at every output position.
    for token_id in argmax(probabilities, axis=-1):
        word = word_for_id(token_id, tokenizer)
        if word is None:
            break
        words.append(word)
    return ' '.join(words)

In [None]:
# Encode a single Spanish sentence with the source (Spanish) tokenizer,
# padded to the Spanish maximum length.
x_prediction_test=encode_sequences(spa_tokenizer, spa_length, ['Hola como estas?'])
# Show the encoded batch and its only row.
print(x_prediction_test)
print(x_prediction_test[0])
# Translate it; the words are decoded with the target (English) tokenizer.
predict_sequence(model,eng_tokenizer,x_prediction_test)