In [88]:
from pickle import load
from numpy import array
import tensorflow
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
#from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

In [89]:
def load_clean_sentences(filename):
	"""Load a pickled object (the cleaned sentence dataset) from ``filename``.

	Args:
		filename: path of the pickle file to read.

	Returns:
		Whatever object was pickled into the file.

	Raises:
		OSError: if the file cannot be opened.
	"""
	# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
	# The context manager closes the file even if unpickling fails.
	with open(filename, 'rb') as handle:
		return load(handle)

In [90]:
def create_tokenizer(lines):
	"""Return a keras ``Tokenizer`` fitted on ``lines``.

	The tokenizer is built with its default settings (which strip
	punctuation) and assigns an integer id to every word found in ``lines``.
	"""
	fitted = Tokenizer()
	fitted.fit_on_texts(lines)
	return fitted

In [91]:
# Peek at the first ten entries of column 0 of the dataset.
# Needs `dataset`, which is loaded in a later cell of this notebook.
print(dataset[:10, 0])

['tom was pleased' 'turn it off' 'are you busy' 'are you free'
 'were all happy' 'i like horses' 'i go jogging' 'go look for it'
 'i like them both' 'this is hers']


In [46]:
# Toy corpus used to try out the tokenizer helpers.
sent = [
    'hola que tal tal tal.',
    'como estas hoy hola',
    'mi perrito bello es',
    'vamos a comer silpancho mas tarde',
]
t = create_tokenizer(sent)

In [47]:
# Show what the toy tokenizer learned: documents seen, distinct words, word -> id map.
print(
    *(
        f"{label} {value}"
        for label, value in (
            ("The document count", t.document_count),
            ("The count of words", len(t.word_counts)),
            ("The word index", t.word_index),
        )
    ),
    sep="\n",
)

The document count 4
The count of words 16
The word index {'tal': 1, 'hola': 2, 'que': 3, 'como': 4, 'estas': 5, 'hoy': 6, 'mi': 7, 'perrito': 8, 'bello': 9, 'es': 10, 'vamos': 11, 'a': 12, 'comer': 13, 'silpancho': 14, 'mas': 15, 'tarde': 16}


In [92]:
def max_length(lines):
	"""Return the largest whitespace-separated word count found in ``lines``.

	Raises:
		ValueError: if ``lines`` is empty.
	"""
	word_counts = map(lambda text: len(text.split()), lines)
	return max(word_counts)

In [45]:
# Word count of the longest sentence in the toy corpus.
max_length(sent)

6

In [93]:
def encode_sequences(tokenizer, length, lines):
	"""Turn ``lines`` into a padded array of word ids.

	Args:
		tokenizer: fitted tokenizer used to map each word to its integer id.
		length: number of ids per row after padding.
		lines: sentences to encode.

	Returns:
		Array with one row per line and ``length`` columns; rows are filled
		with 0 at the end (``padding='post'``).
	"""
	id_lists = tokenizer.texts_to_sequences(lines)
	return pad_sequences(id_lists, maxlen=length, padding='post')

In [106]:
# Encode four probe sentences with the toy tokenizer, padded to 10 ids each.
ba = encode_sequences(
    t,
    10,
    ["asd sdjid", "hola como estas", "chau vamos", "que tal estas"],
)
print(ba.shape, ba, sep="\n")

(4, 10)
[[ 0  0  0  0  0  0  0  0  0  0]
 [ 2  4  5  0  0  0  0  0  0  0]
 [11  0  0  0  0  0  0  0  0  0]
 [ 3  1  5  0  0  0  0  0  0  0]]


In [53]:
# to_categorical builds the one-hot representation of an integer array.
a = tensorflow.constant(
    to_categorical([0, 1, 2, 3], num_classes=4),
    shape=[4, 4],
)
print(a)

tf.Tensor(
[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]], shape=(4, 4), dtype=float32)


In [94]:
def encode_output(sequences, vocab_size):
	"""One-hot encode a 2-D array of word ids.

	Args:
		sequences: array of integer ids with shape (n_sequences, sequence_length).
		vocab_size: number of classes in each one-hot vector.

	Returns:
		3-D array of shape (n_sequences, sequence_length, vocab_size).
	"""
	one_hot = array([to_categorical(row, num_classes=vocab_size) for row in sequences])
	n_sequences, n_steps = sequences.shape[0], sequences.shape[1]
	return one_hot.reshape(n_sequences, n_steps, vocab_size)

In [63]:
# One-hot encode three toy sentences (padded to 10 ids each) into 15 classes.
# NOTE(review): the toy tokenizer's word index goes up to id 16, so 15 classes only
# fits because these sentences use low ids; confirm before reusing this size.
encode_output(encode_sequences(t,10,["fine thanks","hola como estas","hola que como tal como"]), 15)

array([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0

In [102]:
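# None = batch dimension (left unspecified in the model summary)
# batch size = number of samples in each batch during training/testing
# timesteps = number of values in a sequence -> max number of words in a sentence
# features = number of dimensions used to represent each value
# Example arguments used below: src_vocab=3856, tar_vocab=2404, src_timesteps=10, tar_timesteps=5, n_units=256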
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	"""Build an uncompiled encoder-decoder LSTM model for sequence translation.

	Args:
		src_vocab: size of the source vocabulary (an integer, e.g. 3856).
		tar_vocab: size of the target vocabulary; width of the softmax output.
		src_timesteps: length of each padded source sequence.
		tar_timesteps: length of each produced target sequence.
		n_units: embedding size and number of units in both LSTM layers.

	Returns:
		A ``Sequential`` model; compiling and fitting are left to the caller.
	"""
	layers = (
		# Word ids (batch, src_timesteps) -> vectors (batch, src_timesteps, n_units).
		# mask_zero=True tells the following layers that id 0 is padding.
		# Parameters: src_vocab * n_units (3856 * 256 = 987,136 in this notebook).
		Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
		# Encoder: reads the whole source sequence and keeps only the final
		# output, shape (batch, n_units).
		LSTM(n_units),
		# Copy the encoder output once per target timestep:
		# (batch, n_units) -> (batch, tar_timesteps, n_units).
		RepeatVector(tar_timesteps),
		# Decoder: return_sequences=True keeps one output per timestep,
		# shape (batch, tar_timesteps, n_units).
		LSTM(n_units, return_sequences=True),
		# Apply the same softmax Dense layer at every timestep, giving a
		# probability per target word: (batch, tar_timesteps, tar_vocab).
		# Parameters: n_units * tar_vocab + tar_vocab (256 * 2404 + 2404 = 617,828).
		TimeDistributed(Dense(tar_vocab, activation='softmax')),
	)
	model = Sequential()
	for layer in layers:
		model.add(layer)
	return model

In [85]:
# Quick test (kept commented out)
# from numpy import array
# import numpy as np
# from keras.models import Sequential
# from keras.layers import Dense, TimeDistributed, LSTM
# Input_Dim, Output_Dim = 15, 8
# Length = 64
# Sample_Size = 50
# X = np.random.random([Sample_Size,Length,Input_Dim]) #(50,64,15)
# y = np.random.random([Sample_Size,Length,Output_Dim]) #(50,64,8)
# model = Sequential()
# model.add(LSTM(32, input_shape=(64, 15), return_sequences=True)) #(10,64,32)
# model.add(TimeDistributed(Dense(8))) #(10,64,8)
# model.compile(loss='categorical_crossentropy', optimizer='adam')
# print(X.shape, y.shape)
# print(model.summary())
# model.fit(X, y, epochs=100)
# result = model.predict(X, batch_size=10, verbose=2)

(50, 64, 15) (50, 64, 8)
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn (SimpleRNN)       (None, 64, 32)            1536      
_________________________________________________________________
time_distributed_4 (TimeDist (None, 64, 8)             264       
Total params: 1,800
Trainable params: 1,800
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100

In [80]:
# Random normal tensor with shape (32, 10, 8); print its shape.
inputs = tensorflow.random.normal(shape=[32, 10, 8])
print("shape inputs", inputs.shape)


shape inputs (32, 10, 8)


In [96]:
# Load the full sentence-pair dataset plus its train and test splits.
dataset, train, test = map(
    load_clean_sentences,
    (
        'english-german-both.pkl',
        'english-german-train.pkl',
        'english-german-test.pkl',
    ),
)

In [97]:
# Fit the English tokenizer on column 0 of the full dataset and report its size.
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_length = max_length(dataset[:, 0])
# One more than the number of words, leaving room for id 0 (used for padding).
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
print(
    'English Vocabulary Size: %d' % eng_vocab_size,
    'English Max Length: %d' % (eng_length),
    sep='\n',
)


English Vocabulary Size: 2404
English Max Length: 5


In [98]:
# Fit the German tokenizer on column 1 of the full dataset and report its size.
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_length = max_length(dataset[:, 1])
# One more than the number of words, leaving room for id 0 (used for padding).
ger_vocab_size = 1 + len(ger_tokenizer.word_index)
print(
    'German Vocabulary Size: %d' % ger_vocab_size,
    'German Max Length: %d' % (ger_length),
    sep='\n',
)

German Vocabulary Size: 3856
German Max Length: 10


In [99]:
# Training pairs: German (column 1) is the input, English (column 0) the target.
# trainX holds padded integer-id sequences; trainY is one-hot encoded.
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_output(
    encode_sequences(eng_tokenizer, eng_length, train[:, 0]),
    eng_vocab_size,
)

In [100]:
# Validation pairs, encoded the same way as the training data:
# testX holds padded integer-id sequences; testY is one-hot encoded.
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_output(
    encode_sequences(eng_tokenizer, eng_length, test[:, 0]),
    eng_vocab_size,
)

In [103]:
# Build and compile the German -> English model (256 units per layer).
# NOTE(review): the original note here said "uses the first 1000 sentences";
# nothing in this cell limits the data, so confirm where that happens.
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
# categorical_crossentropy expects one-hot targets;
# sparse_categorical_crossentropy would take integer ids instead.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()
plot_model(model, to_file='model.png', show_shapes=True)

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 10, 256)           987136    
_________________________________________________________________
lstm_4 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 5, 256)            0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 5, 256)            525312    
_________________________________________________________________
time_distributed_5 (TimeDist (None, 5, 2404)           617828    
Total params: 2,655,588
Trainable params: 2,655,588
Non-trainable params: 0
_________________________________________________________________
None
('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz

In [104]:
# Train for 30 epochs in batches of 64, validating on the test split.
# Checkpointing is disabled; to re-enable it, create
#   ModelCheckpoint('model.h5', monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# and pass it as callbacks=[checkpoint] (optionally with verbose=2).
model.fit(
    trainX,
    trainY,
    epochs=30,
    batch_size=64,
    validation_data=(testX, testY),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x1e1dd5d5a30>

In [123]:
from numpy import argmax
# map an integer id back to its word
def word_for_id(integer, tokenizer):
    """Return the word whose id in ``tokenizer.word_index`` equals ``integer``.

    Returns ``None`` when no word has that id.
    """
    matches = (
        candidate
        for candidate, candidate_id in tokenizer.word_index.items()
        if candidate_id == integer
    )
    return next(matches, None)
 
# generate the target sequence for a source sequence
def predict_sequence(model, tokenizer, source):
    """Decode the model's prediction for ``source`` into a space-joined string.

    Args:
        model: object with a ``predict(source, verbose=0)`` method; only the
            first prediction of the returned batch is decoded.
        tokenizer: tokenizer whose ``word_index`` maps words to integer ids.
        source: encoded input passed straight to ``model.predict``.

    Returns:
        The predicted words joined by single spaces. Decoding stops at the
        first timestep whose most likely id has no word (e.g. padding id 0).
    """
    # Build the id -> word lookup once instead of scanning word_index for
    # every timestep. setdefault keeps the first word seen for an id, which
    # matches a front-to-back scan of word_index.
    id_to_word = {}
    for word, index in tokenizer.word_index.items():
        id_to_word.setdefault(index, word)

    prediction = model.predict(source, verbose=0)[0]
    target = []
    for vector in prediction:
        word = id_to_word.get(int(argmax(vector)))
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

In [135]:
# Translate one German sentence end to end: encode it, show the ids, decode the prediction.
x_prediction_test = encode_sequences(ger_tokenizer, ger_length, ['Mach das aus.'])
print(x_prediction_test, x_prediction_test[0], sep='\n')
predict_sequence(model, eng_tokenizer, x_prediction_test)

[[51  6 43  0  0  0  0  0  0  0]]
[51  6 43  0  0  0  0  0  0  0]


'turn it off'