# Imports

In [0]:
import re
import string
from pickle import dump
from unicodedata import normalize

from numpy import array
from numpy.random import rand
from numpy.random import seed
from numpy.random import shuffle
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

# Load data
Source: tab-separated English–Portuguese sentence pairs from http://www.manythings.org/anki/por-eng.zip



### From Upload System

In [0]:
# Colab-only step: ask the user to upload the corpus file through the browser.
# The import is kept in this cell (not the top import cell) so the rest of the
# notebook does not require google.colab.
# NOTE(review): `uploaded` is not read again in the visible cells; the next
# cell opens the file from disk by name instead.
from google.colab import files
uploaded = files.upload()

Saving por2.txt to por2.txt


In [0]:
# Read the whole corpus into memory as one UTF-8 string.
# The `with` block closes the handle even if read() raises, which the manual
# open()/close() pair did not guarantee.
# NOTE(review): the recorded upload output shows 'por2.txt' while this cell
# opens 'por.txt' -- confirm the file name matches the uploaded file.
with open('por.txt', mode='rt', encoding='utf-8') as file:
  data = file.read()

# Preprocessing

### Split lines and phrases

In [0]:
# One corpus entry per line; surrounding whitespace is trimmed first so no
# empty first/last entry is produced.
lines = data.strip().split('\n')

# Each entry is tab-separated; keep the fields of every entry as a list.
pairs = []
for raw_line in lines:
  pairs.append(raw_line.split('\t'))

### Clean lines

In [0]:
# Matches any character that is not in string.printable.
re_print = re.compile('[^%s]' % re.escape(string.printable))
# Translation table that deletes every ASCII punctuation character.
table = str.maketrans('', '', string.punctuation)


def _clean_sentence(sentence):
  """Return `sentence` reduced to lowercase, ASCII-only alphabetic words.

  Steps, in order: decompose accented characters (NFD) and drop everything
  non-ASCII, split on whitespace, lowercase each token, strip punctuation,
  strip non-printable characters, and discard any token that is not purely
  alphabetic (for example tokens containing digits). The surviving tokens
  are joined with single spaces.
  """
  ascii_text = normalize('NFD', sentence).encode('ascii', 'ignore').decode('UTF-8')
  kept_tokens = []
  for token in ascii_text.split():
    token = re_print.sub('', token.lower().translate(table))
    if token.isalpha():
      kept_tokens.append(token)
  return ' '.join(kept_tokens)


# Clean every field of every entry, preserving the entry structure.
cleaned = [[_clean_sentence(sentence) for sentence in pair] for pair in pairs]

pairs_cleaned = array(cleaned)

### Save cleaned text

In [69]:
def save_data(data, filename):
  """Pickle `data` to `filename` and print a confirmation line.

  Args:
    data: any picklable object.
    filename: path of the file to create or overwrite.

  The file is opened in a `with` block so the handle is flushed and closed
  before returning; previously the handle passed to dump() was never closed.
  """
  with open(filename, 'wb') as handle:
    dump(data, handle)
  print('Saved: %s' % filename)

# Persist the cleaned sentence pairs to disk.
save_data(data=pairs_cleaned, filename='eng-por.pkl')

Saved: eng-por.pkl


In [70]:
# Preview up to the first 100 cleaned pairs as "[source] => [target]".
# Slicing, rather than indexing with range(100), avoids an IndexError when
# fewer than 100 pairs are available.
for pair in pairs_cleaned[:100]:
  print('[%s] => [%s]' % (pair[0], pair[1]))

[go] => [vai]
[go] => [va]
[hi] => [oi]
[run] => [corre]
[run] => [corra]
[run] => [corram]
[run] => [corre]
[run] => [corra]
[run] => [corram]
[who] => [quem]
[who] => [que]
[wow] => [uau]
[wow] => [nossa]
[wow] => [wow]
[fire] => [fogo]
[help] => [ajuda]
[help] => [socorro]
[jump] => [pule]
[jump] => [pulem]
[jump] => [pule]
[stop] => [pare]
[stop] => [parem]
[wait] => [espere]
[wait] => [espere]
[wait] => [esperem]
[go on] => [va]
[hello] => [oi]
[hello] => [alo]
[hello] => [ola]
[hello] => [alo]
[i ran] => [eu corri]
[i see] => [estou vendo]
[i try] => [eu tento]
[i try] => [tento]
[i won] => [ganhei]
[i won] => [eu venci]
[oh no] => [ah nao]
[relax] => [relaxe]
[relax] => [relaxa]
[shoot] => [tiro]
[smile] => [sorria]
[smile] => [sorriam]
[attack] => [atacar]
[attack] => [ataquem]
[attack] => [ataque]
[cheers] => [saude]
[freeze] => [parado]
[get up] => [levantese]
[get up] => [levantemse]
[get up] => [levantate]
[get up] => [levantese]
[get up] => [levantate]
[go now] => [va agor

## Train/Test Separation

In [76]:
# Restrict the experiment to the first n_sentences pairs.
n_sentences = 10000
# Copy the slice: shuffle() permutes rows in place and a basic slice is a
# view, so without the copy the shuffle would also reorder pairs_cleaned and
# re-running this cell would silently change its own input.
dataset = pairs_cleaned[:n_sentences, :].copy()
# Seed NumPy's global RNG so the shuffle, and therefore the split, is the
# same on every Restart & Run All.
seed(42)
shuffle(dataset)
# 90/10 train/test split (9000/1000 rows for the full 10000-row subset);
# derived from the actual row count so a smaller corpus still gets a test set.
n_train = len(dataset) * 9 // 10
train, test = dataset[:n_train], dataset[n_train:]
# Persist the shuffled subset and both splits.
save_data(dataset, 'eng-por-both.pkl')
save_data(train, 'eng-por-train.pkl')
save_data(test, 'eng-por-test.pkl')

Saved: eng-por-both.pkl
Saved: eng-por-train.pkl
Saved: eng-por-test.pkl


## Tokenizer

In [0]:
def create_tokenizer(lines):
  """Return a new Keras Tokenizer fitted on the sentences in `lines`."""
  fitted = Tokenizer()
  fitted.fit_on_texts(lines)
  return fitted

## Max Length

In [0]:
# max sentence length
def max_length(lines):
  """Return the largest whitespace-separated word count among `lines`.

  Args:
    lines: iterable of sentence strings.

  Returns:
    The word count of the longest sentence, or 0 when `lines` is empty
    (previously an empty input raised ValueError from max()).
  """
  return max((len(line.split()) for line in lines), default=0)

In [77]:
# Column 0 of the dataset holds the English sentences, column 1 the Portuguese.
eng_sentences, por_sentences = dataset[:, 0], dataset[:, 1]

# English side: tokenizer, vocabulary size and longest sentence (in words).
# The +1 presumably leaves room for index 0 (padding value) -- this assumes
# Tokenizer word indices start at 1; confirm against the Keras docs.
eng_tokenizer = create_tokenizer(eng_sentences)
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
eng_length = max_length(eng_sentences)

# Portuguese side: same three quantities.
por_tokenizer = create_tokenizer(por_sentences)
por_vocab_size = 1 + len(por_tokenizer.word_index)
por_length = max_length(por_sentences)

# Report both vocabularies.
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
print('Portuguese Vocabulary Size: %d' % por_vocab_size)
print('Portuguese Max Length: %d' % (por_length))

English Vocabulary Size: 3570
English Max Length: 9
Portuguese Vocabulary Size: 5252
Portuguese Max Length: 12


# Neural Machine Translation

## Encode sequences (integer-encoded, padded inputs; one-hot encoded outputs)

In [0]:
def encode_sequences(tokenizer, length, lines):
  """Convert sentences to fixed-width rows of integer word ids.

  Args:
    tokenizer: fitted tokenizer used to map words to integer ids.
    length: number of columns every output row is padded to.
    lines: sentences to encode.

  Returns:
    The result of pad_sequences with maxlen=length and padding added after
    each sequence ('post').
  """
  word_ids = tokenizer.texts_to_sequences(lines)
  return pad_sequences(word_ids, maxlen=length, padding='post')

In [0]:
def encode_output(sequences, vocab_size):
  """One-hot encode padded integer sequences for use as softmax targets.

  Args:
    sequences: 2-D integer array of shape (n_sentences, length).
    vocab_size: number of classes, i.e. the width of each one-hot vector.

  Returns:
    Array of shape (n_sentences, length, vocab_size).

  The whole array is encoded with a single to_categorical call. The earlier
  version encoded row by row into a Python list and then copied that list
  into a new array, which roughly doubled peak memory for this large tensor.
  """
  y = to_categorical(sequences, num_classes=vocab_size)
  # Pin the (n_sentences, length, vocab_size) layout explicitly so the result
  # does not depend on how to_categorical treats axes of length 1.
  return y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)

## Prepare test and training data

In [0]:
# Inputs are padded English id sequences; targets are padded Portuguese id
# sequences expanded to one-hot vectors over the Portuguese vocabulary.
trainX = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(
    encode_sequences(por_tokenizer, por_length, train[:, 1]),
    por_vocab_size)

# Same encoding for the held-out split.
testX = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(
    encode_sequences(por_tokenizer, por_length, test[:, 1]),
    por_vocab_size)

## Model

In [0]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
  """Build the (uncompiled) encoder-decoder translation model.

  Args:
    src_vocab: source vocabulary size (Embedding input dimension).
    tar_vocab: target vocabulary size (width of the softmax output).
    src_timesteps: length of each padded source sequence.
    tar_timesteps: number of output steps to generate.
    n_units: embedding width and LSTM size, shared by encoder and decoder.

  Returns:
    A Sequential model: Embedding (zero-masked) -> LSTM -> RepeatVector ->
    LSTM returning full sequences -> per-timestep softmax Dense.
  """
  layer_stack = [
      # Encoder: embed source ids (0 is masked) and summarise in one vector.
      Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
      LSTM(n_units),
      # Feed the summary vector to the decoder once per output step.
      RepeatVector(tar_timesteps),
      # Decoder: one hidden state per output step, each mapped to a
      # distribution over the target vocabulary.
      LSTM(n_units, return_sequences=True),
      TimeDistributed(Dense(tar_vocab, activation='softmax')),
  ]
  return Sequential(layer_stack)

In [84]:
# Build the model with 256 units and compile it for per-timestep
# classification over the Portuguese vocabulary.
model = define_model(eng_vocab_size, por_vocab_size, eng_length, por_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summary() prints the table itself and returns None; wrapping it in print()
# added a stray "None" line after the table.
model.summary()
# Optional architecture diagram, left disabled:
#plot_model(model, to_file='model.png', show_shapes=True)




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 9, 256)            913920    
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 12, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 12, 256)           525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 12, 5252)          1349764   
Total params: 3,314,308
Trainable params: 3,314,308
Non-trainable params: 0
_________________________________________________________________
None


## Train the model

In [88]:
# Keep only the weights with the lowest validation loss seen so far.
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# NOTE(review): the test split doubles as validation data and drives the
# checkpoint selection, so it is not an unbiased held-out set afterwards.
# The returned History is kept so the loss curves can be inspected later,
# instead of being discarded and shown only as the cell's repr.
history = model.fit(
    trainX, trainY,
    epochs=30,
    batch_size=64,
    validation_data=(testX, testY),
    callbacks=[checkpoint],
    verbose=2)




Train on 9000 samples, validate on 1000 samples
Epoch 1/30





 - 69s - loss: 3.5600 - val_loss: 2.8094

Epoch 00001: val_loss improved from inf to 2.80940, saving model to model.h5
Epoch 2/30
 - 65s - loss: 2.7342 - val_loss: 2.7419

Epoch 00002: val_loss improved from 2.80940 to 2.74192, saving model to model.h5
Epoch 3/30
 - 66s - loss: 2.6428 - val_loss: 2.6906

Epoch 00003: val_loss improved from 2.74192 to 2.69058, saving model to model.h5
Epoch 4/30
 - 65s - loss: 2.5871 - val_loss: 2.6776

Epoch 00004: val_loss improved from 2.69058 to 2.67759, saving model to model.h5
Epoch 5/30
 - 68s - loss: 2.5426 - val_loss: 2.6527

Epoch 00005: val_loss improved from 2.67759 to 2.65270, saving model to model.h5
Epoch 6/30
 - 67s - loss: 2.5034 - val_loss: 2.6423

Epoch 00006: val_loss improved from 2.65270 to 2.64231, saving model to model.h5
Epoch 7/30
 - 67s - loss: 2.4692 - val_loss: 2.6165

Epoch 00007: val_loss improved from 2.64231 to 2.61648, saving model to model.h5
Epoch 8/30

<keras.callbacks.History at 0x7fb0f8509be0>