In [None]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array
from pickle import load
import numpy as np
from numpy.random import rand
from numpy.random import shuffle

In [None]:
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical,plot_model
from tensorflow.keras.losses import *
from tensorflow.keras import optimizers

In [None]:
from nltk.translate.bleu_score import corpus_bleu

In [None]:
!wget http://www.manythings.org/anki/spa-eng.zip

--2020-06-16 06:26:09--  http://www.manythings.org/anki/spa-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.24.109.196, 172.67.173.198, 104.24.108.196, ...
Connecting to www.manythings.org (www.manythings.org)|104.24.109.196|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4781548 (4.6M) [application/zip]
Saving to: ‘spa-eng.zip’


2020-06-16 06:26:12 (1.61 MB/s) - ‘spa-eng.zip’ saved [4781548/4781548]



In [None]:
!unzip spa-eng.zip

Archive:  spa-eng.zip
  inflating: _about.txt              
  inflating: spa.txt                 


In [None]:
# load doc into memory
def load_doc(filename):
    """Read a UTF-8 text file and return its whole content as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text of the file.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()
def to_pairs(doc):
    """Split tab-separated text into per-line lists holding the first two columns.

    The document is stripped of surrounding whitespace, cut into lines on '\\n',
    and every line is cut on tabs; columns after the second are discarded.
    """
    pairs = []
    for row in doc.strip().split('\n'):
        columns = row.split('\t')
        pairs.append(columns[:2])
    return pairs
# clean a list of lines
def clean_pairs(lines):
    """Normalise every sentence of every pair and return the result as a numpy array.

    Each sentence is reduced to ASCII (after NFD decomposition), split on
    whitespace, lower-cased, stripped of ASCII punctuation and non-printable
    characters, and finally limited to its purely alphabetic tokens, which are
    re-joined with single spaces.
    """
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    strip_punct = str.maketrans('', '', string.punctuation)

    def _clean(sentence):
        # NFD separates base letters from combining marks; the ASCII round-trip
        # then drops the marks together with any other non-ASCII character.
        ascii_text = normalize('NFD', sentence).encode('ascii', 'ignore').decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = token.lower().translate(strip_punct)
            token = non_printable.sub('', token)
            # empty tokens and tokens containing digits are dropped here
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return array([[_clean(sentence) for sentence in pair] for pair in lines])
def save_clean_data(sentences, filename):
    """Pickle `sentences` to `filename` and print a confirmation line.

    Args:
        sentences: Any picklable object (the notebook passes numpy arrays of pairs).
        filename: Destination path; an existing file is overwritten.
    """
    # Closing the handle deterministically guarantees the pickle is fully
    # flushed to disk before another cell tries to read it back.
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print('Saved: %s' % filename)

In [None]:
# load dataset
filename = 'spa.txt'
doc = load_doc(filename)
# split into english-spanish pairs
pairs = to_pairs(doc)
# clean sentences
# NOTE(review): this assignment rebinds the name `clean_pairs` from the function to
# the array it returns, so running this cell a second time (without re-running the
# cell that defines the function) tries to call an array and fails.
clean_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(clean_pairs, 'english-spanish.pkl')
# spot check: print the first 100 cleaned pairs as [english] => [spanish]
for i in range(100):
	print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

Saved: english-spanish.pkl
[go] => [ve]
[go] => [vete]
[go] => [vaya]
[go] => [vayase]
[hi] => [hola]
[run] => [corre]
[run] => [corran]
[run] => [corra]
[run] => [corred]
[run] => [corred]
[who] => [quien]
[wow] => [orale]
[fire] => [fuego]
[fire] => [incendio]
[fire] => [disparad]
[help] => [ayuda]
[help] => [socorro auxilio]
[help] => [auxilio]
[jump] => [salta]
[jump] => [salte]
[stop] => [parad]
[stop] => [para]
[stop] => [pare]
[wait] => [espera]
[wait] => [esperen]
[go on] => [continua]
[go on] => [continue]
[hello] => [hola]
[i hid] => [me oculte]
[i hid] => [me escondi]
[i hid] => [me ocultaba]
[i hid] => [me escondia]
[i ran] => [corri]
[i ran] => [corria]
[i try] => [lo intento]
[i won] => [he ganado]
[oh no] => [oh no]
[relax] => [tomatelo con soda]
[shoot] => [fuego]
[shoot] => [disparad]
[shoot] => [disparen]
[shoot] => [dispara]
[shoot] => [dispara]
[shoot] => [dispare]
[smile] => [sonrie]
[attack] => [al ataque]
[attack] => [atacad]
[attack] => [ataque]
[attack] => [ata

In [None]:
# Show the first 10 rows of the cleaned array (each row holds one sentence pair).
for i in range(10):
	print(clean_pairs[i])

['go' 've']
['go' 'vete']
['go' 'vaya']
['go' 'vayase']
['hi' 'hola']
['run' 'corre']
['run' 'corran']
['run' 'corra']
['run' 'corred']
['run' 'corred']


In [None]:
# load a clean dataset
def load_clean_sentences(filename):
    """Unpickle and return the object stored in `filename`.

    Args:
        filename: Path of a pickle file, e.g. one written by `save_clean_data`.

    Returns:
        Whatever object the file contains.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # notebook (or another trusted source) produced.
    # The context manager closes the handle, which the bare open() call never did.
    with open(filename, 'rb') as handle:
        return load(handle)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    """Pickle `sentences` to `filename` and print a confirmation line.

    Args:
        sentences: Any picklable object (the notebook passes numpy arrays of pairs).
        filename: Destination path; an existing file is overwritten.
    """
    # Closing the handle deterministically guarantees the pickle is fully
    # flushed to disk before another cell tries to read it back.
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print('Saved: %s' % filename)

# fit a tokenizer
def create_tokenizer(lines):
    """Return a new `Tokenizer` that has been fitted on the texts in `lines`."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted
# max sentence length
def max_length(lines):
    """Return the largest whitespace-separated token count among `lines`."""
    token_counts = [len(sentence.split()) for sentence in lines]
    return max(token_counts)
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Turn texts into integer id sequences padded at the end to `length`.

    Args:
        tokenizer: Fitted tokenizer providing `texts_to_sequences`.
        length: Value passed as `maxlen` to `pad_sequences`.
        lines: Iterable of sentences to encode.

    Returns:
        The result of `pad_sequences(..., maxlen=length, padding='post')`.
    """
    return pad_sequences(tokenizer.texts_to_sequences(lines), maxlen=length, padding='post')
# one hot encode target sequence
def encode_output(sequences, vocab_size):
    """One-hot encode every id of every row in `sequences`.

    Args:
        sequences: 2D array of integer ids (its first two shape entries are used).
        vocab_size: Number of classes for the one-hot vectors.

    Returns:
        Array reshaped to (sequences.shape[0], sequences.shape[1], vocab_size).
    """
    one_hot = array([to_categorical(row, num_classes=vocab_size) for row in sequences])
    n_rows, n_steps = sequences.shape[0], sequences.shape[1]
    return one_hot.reshape(n_rows, n_steps, vocab_size)

#EVALUATION FUNCTIONS
# map an integer to a word
def word_for_id(integer, tokenizer):
    """Return the first word in `tokenizer.word_index` mapped to `integer`, else None."""
    matches = (word for word, index in tokenizer.word_index.items() if index == integer)
    return next(matches, None)
 
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    """Predict a target sentence for one encoded source and return it as text.

    The first item of `model.predict(source)` is read step by step; at each
    step the highest-scoring id is mapped to a word. Decoding stops at the
    first id that has no word in the tokenizer.
    """
    step_scores = model.predict(source, verbose=0)[0]
    words = []
    for scores in step_scores:
        word = word_for_id(np.argmax(scores), tokenizer)
        # an id without a vocabulary entry ends the sentence
        if word is None:
            break
        words.append(word)
    return ' '.join(words)
 
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset, max_samples=10):
    """Translate the leading sources, print each result and print corpus BLEU-1..4.

    Args:
        model: Trained model handed to `predict_sequence`.
        tokenizer: Target-language tokenizer used to map ids back to words.
        sources: Encoded source sequences, one row per sentence.
        raw_dataset: Rows of (source text, target text) aligned with `sources`.
        max_samples: How many leading sentences to translate and score.
            Defaults to 10, the limit that used to be hard-coded; pass None
            to evaluate every row.

    Returns:
        None. The four scores are printed on one line.
    """
    bleu_weights = (
        (1.0, 0, 0, 0),
        (0.5, 0.5, 0, 0),
        (0.33, 0.33, 0.33, 0),
        (0.25, 0.25, 0.25, 0.25),
    )
    actual = []
    predicted = []
    for i, source in enumerate(sources):
        # Check the limit before predicting so no translation is computed and discarded.
        if max_samples is not None and i >= max_samples:
            break
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        translated = predict_sequence(model, tokenizer, source)
        raw_src, raw_tgt = raw_dataset[i]
        print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_tgt, translated))
        actual.append([raw_tgt.split()])
        predicted.append(translated.split())
    if predicted:
        # Score the collected sentences once as a corpus. Summing corpus BLEU over
        # every growing prefix and dividing by the count over-weights early sentences.
        scores = [float(corpus_bleu(actual, predicted, weights=w)) for w in bleu_weights]
    else:
        # Nothing was translated: report zeros instead of dividing by zero.
        scores = [0.0, 0.0, 0.0, 0.0]
    print(*scores)
    

In [None]:
# load dataset
raw_dataset = load_clean_sentences('english-spanish.pkl')

# reduce dataset size: keep at most the first n_sentences rows
n_sentences = 100000
dataset = raw_dataset[:n_sentences, :]
# random shuffle (in place)
# NOTE(review): no random seed is set in the visible cells, so the split below differs
# from run to run. `dataset` also looks like a view of raw_dataset (basic numpy slice),
# in which case raw_dataset is reordered as well — confirm.
shuffle(dataset)
# split into train/test: the first 49000 shuffled rows train, every remaining row tests
# NOTE(review): with n_sentences = 100000 the test split receives up to 51000 rows,
# more than the train split — possibly a leftover from a 50000-sentence run; confirm.
train, test = dataset[:49000], dataset[49000:]
# save
save_clean_data(dataset, 'english-spanish-both.pkl')
save_clean_data(train, 'english-spanish-train.pkl')
save_clean_data(test, 'english-spanish-test.pkl')

Saved: english-spanish-both.pkl
Saved: english-spanish-train.pkl
Saved: english-spanish-test.pkl


In [None]:
# Reload the combined, train and test splits from their pickle files, so the cells
# below can start from disk rather than from variables left in the kernel.
dataset = load_clean_sentences('english-spanish-both.pkl')
train = load_clean_sentences('english-spanish-train.pkl')
test = load_clean_sentences('english-spanish-test.pkl')

In [None]:
# prepare english tokenizer (fitted on column 0 of the combined train+test data)
eng_tokenizer = create_tokenizer(dataset[:, 0])
# +1: presumably reserves id 0, the value pad_sequences fills with — confirm
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare spanish tokenizer (fitted on column 1 of the combined train+test data)
es_tokenizer = create_tokenizer(dataset[:, 1])
es_vocab_size = len(es_tokenizer.word_index) + 1
es_length = max_length(dataset[:, 1])
print('Spanish Vocabulary Size: %d' % es_vocab_size)
print('Spanish Max Length: %d' % (es_length))

English Vocabulary Size: 10821
English Max Length: 11
Spanish Vocabulary Size: 20432
Spanish Max Length: 15


In [None]:
# prepare training data: English ids as input, Spanish ids as target
trainX = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_sequences(es_tokenizer, es_length, train[:, 1])
# One-hot encoding of the targets is disabled; the targets stay integer ids, which
# matches the SparseCategoricalCrossentropy loss used in the training cell.
#trainY = encode_output(trainY, es_vocab_size)
# prepare validation data (taken from the test split)
testX = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_sequences(es_tokenizer, es_length, test[:, 1])
#testY = encode_output(testY, es_vocab_size)

In [None]:
!pip install keras-self-attention
from keras_self_attention import SeqSelfAttention



In [None]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build the baseline encoder-decoder model (not compiled).

    Layers, in order: masked Embedding of width `n_units`, LSTM(n_units),
    RepeatVector(tar_timesteps), LSTM(n_units) returning sequences, and a
    time-distributed softmax Dense over `tar_vocab` classes.
    """
    encoder = [
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
    ]
    decoder = [
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    return Sequential(encoder + decoder)
def define_model_drop(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units, dropout_rate, embed_dim):
    """Build the baseline encoder-decoder with dropout and a separate embedding width.

    Same layer order as `define_model`, except the Embedding has width
    `embed_dim` and both LSTM layers receive `dropout=dropout_rate`.
    The model is returned uncompiled.
    """
    encoder = [
        Embedding(src_vocab, embed_dim, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units, dropout=dropout_rate),
    ]
    decoder = [
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True, dropout=dropout_rate),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    return Sequential(encoder + decoder)
def define_model_bi_drop(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units, dropout_rate, embed_dim):
    """Build an encoder-decoder whose encoder LSTM is wrapped in `Bidirectional`.

    Layers, in order: masked Embedding(embed_dim), Bidirectional(LSTM(n_units)),
    RepeatVector(tar_timesteps), LSTM(n_units) returning sequences, and a
    time-distributed softmax Dense. Both LSTMs receive `dropout=dropout_rate`.
    The model is returned uncompiled.
    """
    encoder = [
        Embedding(src_vocab, embed_dim, input_length=src_timesteps, mask_zero=True),
        Bidirectional(LSTM(n_units, dropout=dropout_rate)),
    ]
    decoder = [
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True, dropout=dropout_rate),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    return Sequential(encoder + decoder)
def define_model_bi_1_drop(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units, n_units1, dropout_rate, embed_dim):
    """Build an encoder-decoder with a two-layer encoder.

    Encoder: masked Embedding(embed_dim), Bidirectional(LSTM(n_units)) returning
    sequences, then LSTM(n_units1) returning only its final output.
    Decoder: RepeatVector(tar_timesteps), LSTM(n_units) returning sequences and a
    time-distributed softmax Dense. Every LSTM receives `dropout=dropout_rate`.
    The model is returned uncompiled.
    """
    encoder = [
        Embedding(src_vocab, embed_dim, input_length=src_timesteps, mask_zero=True),
        Bidirectional(LSTM(n_units, dropout=dropout_rate, return_sequences=True)),
        LSTM(n_units1, dropout=dropout_rate),
    ]
    decoder = [
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True, dropout=dropout_rate),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    return Sequential(encoder + decoder)
def define_model_bi_att_drop(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units,dropout_rate,embed_dim):
    """Build a bidirectional encoder-decoder with self-attention over the encoder states.

    Args:
        src_vocab: Source vocabulary size (Embedding input dimension).
        tar_vocab: Target vocabulary size (number of softmax classes).
        src_timesteps: Length of the padded source sequences.
        tar_timesteps: Length of the padded target sequences.
        n_units: Units of each LSTM layer.
        dropout_rate: Value passed as `dropout` to every LSTM layer.
        embed_dim: Width of the source embedding.

    Returns:
        The uncompiled `Sequential` model.
    """
    model = Sequential()
    model.add(Embedding(src_vocab, embed_dim, input_length=src_timesteps, mask_zero=True))
    model.add(Bidirectional(LSTM(n_units, dropout=dropout_rate, return_sequences=True)))
    model.add(SeqSelfAttention(attention_activation='sigmoid'))
    # SeqSelfAttention keeps the time axis, but RepeatVector only accepts a 2D
    # (batch, features) input. This LSTM collapses the attended sequence to one
    # vector, as the second encoder LSTM does in define_model_bi_1_drop.
    # Without it the model cannot be built.
    model.add(LSTM(n_units, dropout=dropout_rate))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True, dropout=dropout_rate))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model

In [None]:
# Hyper-parameters of the model built in this cell
n_units = 200        # units of the bidirectional encoder LSTM and of the decoder LSTM
n_units1 = 100       # units of the second encoder LSTM
dropout_rate = 0.2   # `dropout` value given to every LSTM layer
embed_dim = 300      # width of the source-word embedding
model = define_model_bi_1_drop(eng_vocab_size, es_vocab_size, eng_length, es_length, n_units,n_units1,dropout_rate,embed_dim)
# Alternative architectures (uncomment exactly one to use it instead):
# model = define_model_bi_att_drop(eng_vocab_size, es_vocab_size, eng_length, es_length, n_units,dropout_rate,embed_dim)
# model = define_model_bi_drop(eng_vocab_size, es_vocab_size, eng_length, es_length, n_units,dropout_rate,embed_dim)
# model = define_model_drop(eng_vocab_size, es_vocab_size, eng_length, es_length, n_units,dropout_rate,embed_dim)
# model = define_model(eng_vocab_size, es_vocab_size, eng_length, es_length, n_units)
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line.
model.summary()
#plot_model(model, to_file='model.png', show_shapes=True)

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 11, 300)           3246300   
_________________________________________________________________
bidirectional_8 (Bidirection (None, 11, 400)           801600    
_________________________________________________________________
lstm_24 (LSTM)               (None, 100)               200400    
_________________________________________________________________
repeat_vector_8 (RepeatVecto (None, 15, 100)           0         
_________________________________________________________________
lstm_25 (LSTM)               (None, 15, 200)           240800    
_________________________________________________________________
time_distributed_8 (TimeDist (None, 15, 20432)         4106832   
Total params: 8,595,932
Trainable params: 8,595,932
Non-trainable params: 0
___________________________________________

In [None]:
def leave_embed(model):
    """Mark every layer whose name contains "embedding" as non-trainable.

    The model is modified in place and the same object is returned.
    """
    embedding_layers = [layer for layer in model.layers if "embedding" in layer.name]
    for layer in embedding_layers:
        layer.trainable = False
    return model

In [None]:
# NOTE(review): hard-coded working directory (presumably a Colab runtime) — confirm
# before running elsewhere.
%cd /content/
version = "bi_100_drop_0"
# With n_units = 200 this evaluates to 'model200_bi_100_drop_0.h5', the same file that
# load_model() reads below, so training resumes from and overwrites that checkpoint.
filename = 'model'+str(n_units)+'_'+str(version)+'.h5'
# Write the model to `filename` only when val_loss improves on the best value so far.
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# Sparse loss: the targets are integer ids, not one-hot vectors.
los = SparseCategoricalCrossentropy()
# NOTE(review): newer Keras releases spell this argument `learning_rate` — confirm
# against the installed version.
opt = optimizers.Adam(lr=0.001)
# NOTE(review): opt1 is not used anywhere in this cell.
opt1 = optimizers.RMSprop(0.5)
# Replaces the model built in the previous cell with a saved checkpoint; the file
# must already exist, so this line fails on a fresh run without it.
model = load_model('model200_bi_100_drop_0.h5')
# Freeze the embedding layer(s) before compiling so they are not updated by fit().
model = leave_embed(model)
model.compile(loss=los,optimizer=opt,metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data, so checkpoint selection
# is driven by the same data that is later reported as TEST.
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=1)

/content
Epoch 1/30
Epoch 00001: val_loss improved from inf to 1.69535, saving model to model200_bi_100_drop_0.h5
Epoch 2/30
Epoch 00002: val_loss did not improve from 1.69535
Epoch 3/30
Epoch 00003: val_loss did not improve from 1.69535
Epoch 4/30
157/766 [=====>........................] - ETA: 36s - loss: 1.1855 - accuracy: 0.7507

KeyboardInterrupt: ignored

In [None]:
# NOTE(review): this loads 'model150_bi_150_75_drop_0.h5', not the checkpoint written by
# the training cell ('model200_bi_100_drop_0.h5'); the file must already exist — confirm
# which model the reported scores belong to.
model = load_model('model150_bi_150_75_drop_0.h5')

print("TRAIN : \n")
# Re-encodes the English side of the train split (same call as in the data-preparation cell).
trainX = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
# evaluate_model stops after the first 10 rows, so these scores cover 10 sentences only.
evaluate_model(model, es_tokenizer, trainX, train)
print("\nTEST : \n")
testX = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
evaluate_model(model, es_tokenizer, testX, test)

TRAIN : 

src=[would that have made it better], target=[eso se hubiera hecho mejor], predicted=[eso mejor haber hecho]
src=[she got up to answer the phone], target=[ella se levanto para contestar al telefono], predicted=[ella le a a telefono telefono]
src=[dont leave the light on], target=[no dejes la luz encendida], predicted=[no dejes abiertas luz encendida]
src=[it is no use crying over spilt milk], target=[no tiene caso llorar sobre leche derramada], predicted=[no no no de en en derramada derramada]
src=[tom is really good at math], target=[tom es realmente bueno en matematicas], predicted=[tom es bueno bueno de matematicas matematicas]
src=[tom has his own room], target=[tom tiene su propio dormitorio], predicted=[tom tiene ordenando cuarto dormitorio]


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


src=[why didnt you stay with tom], target=[por que no te quedaste con tom], predicted=[por que no a a a tom]
src=[tom insisted on helping mary], target=[tom insistio en ayudar a mary], predicted=[tom insistio pedirle con mary]
src=[have you ever ridden a bicycle], target=[alguna vez te has subido a una bicicleta], predicted=[has vez has vez vez vez habitacion]
src=[tom seems to be as fit as a fiddle], target=[tom parece estar tan fino como un violin afinado], predicted=[tom parece que que fino fierro escritor]
0.4977703502125119 0.3387268006641949 0.3479552987533751 0.42412950757348283

TEST : 



Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


src=[could i have a piece of cheesecake], target=[me podrian dar un trozo de tarta de queso], predicted=[podrian podrian un pedazo pedazo de de]
src=[he finally decided to get married], target=[finalmente decidio casarse], predicted=[el se a a]
src=[that was magic], target=[eso fue magia], predicted=[eso es magico]
src=[is that too much to ask for], target=[es mucho pedir], predicted=[eso es demasiado pena para]
src=[go straight along this street], target=[siga derecho por esta calle], predicted=[vete recto en la]
src=[i wonder if tom will be at marys party], target=[me pregunto si tom ira a la fiesta de mary], predicted=[me pregunto a tom tom que a a la]
src=[you have three pens], target=[tienes tres lapices], predicted=[tienes tres]
src=[they are in the teachers room], target=[ellos estan en la sala de profesores], predicted=[ellos estan en el pieza]
src=[i dont intend do that], target=[no trato de hacerlo], predicted=[no no hacer eso]
src=[i wonder if anything happened to him], targ


<table width="900" align = "left" height="200" border = "3">
  <tr>
    <th>DATA_SIZE</th><th>n_units</th><th>dropout</th>
    <th>Bidirectional</th><th>embed_dim</th><th>performance</th>
    <th>bleu score</th>
  </tr>
  <tr><td>10000 (TOY)</td><td>256</td><td>NA</td>
    <td>NA</td><td>256</td><td>Good</td><td>NA</td></tr>
  <tr><td>50000 (TOY)</td><td>300</td><td>NA</td>
    <td>NA</td><td>300</td><td>Good</td><td>NA</td></tr>
  <tr> <td>100000</td><td>500</td><td>0.4</td><td>NA</td>
    <td>500</td><td>Fine</td><td>NA</td></tr>
  <tr><td>100000</td><td>400</td><td>0.2</td><td>NA</td>
    <td>300</td><td>Repetition but OK</td><td>NA</td></tr>
  <tr><td>100000</td><td>300</td><td>0.2</td><td>YES</td>
    <td>200</td><td>Repetition, POOR</td><td>NA</td></tr>
  <tr><td>100000</td><td>200</td><td>0.2</td><td>YES</td>
  <td>200</td> <td>IMPROVEMENT </td>
    <td>0.29 0.23 0.29 0.35</td></tr>
  <tr><td>100000</td><td>150_75</td><td>0.2</td><td>YES</td>
  <td>200</td> <td>REPETITIVE </td>
    <td>0.34 0.38 0.43 0.50</td></tr>
</table>

In [None]:
# Spot check: one hand-written (English, Spanish) pair, reshaped to (1, 2) so it can
# be indexed like a dataset row; only the English column is encoded as model input.
t = np.array(['i am cool','soy genial'],dtype='<U275').reshape((1,2))
X = encode_sequences(eng_tokenizer, eng_length,t[:,0] )
evaluate_model(model, es_tokenizer, X, t)

src=[i am cool], target=[soy genial], predicted=[estoy rezando]
0.0 0.0 0.0 0.0


In [None]:
# Spot check: one hand-written (English, Spanish) pair, reshaped to (1, 2) so it can
# be indexed like a dataset row; only the English column is encoded as model input.
t = np.array(['how are you','como estas'],dtype='<U275').reshape((1,2))
X = encode_sequences(eng_tokenizer, eng_length,t[:,0] )
evaluate_model(model, es_tokenizer, X, t)

src=[how are you], target=[como estas], predicted=[que estan]
0.0 0.0 0.0 0.0


In [None]:
# Spot check: one hand-written (English, Spanish) pair, reshaped to (1, 2) so it can
# be indexed like a dataset row; only the English column is encoded as model input.
t = np.array(['how are you brother','como estas hermano'],dtype='<U275').reshape((1,2))
X = encode_sequences(eng_tokenizer, eng_length,t[:,0] )
evaluate_model(model, es_tokenizer, X, t)

src=[how are you brother], target=[como estas hermano], predicted=[como tan como]
0.3333333333333333 0.5773502691896257 0.6959050465952276 0.7598356856515925


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
