# Imports

In [1]:
import string
import re
from pickle import dump
from pickle import load
from unicodedata import normalize
from numpy import array
from numpy import argmax
from numpy.random import rand
from numpy.random import shuffle
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

Using TensorFlow backend.


# Load data
http://www.manythings.org/anki/por-eng.zip



###From Upload System

In [0]:
from google.colab import files
uploaded = files.upload()

###From repository

In [2]:
!git clone https://github.com/ufrpe-mineracao-textos/projeto-de-mineracao-20192-traducao-de-texto.git

Cloning into 'projeto-de-mineracao-20192-traducao-de-texto'...
remote: Enumerating objects: 134, done.[K
remote: Counting objects: 100% (134/134), done.[K
remote: Compressing objects: 100% (121/121), done.[K
remote: Total 1194 (delta 33), reused 91 (delta 11), pack-reused 1060[K
Receiving objects: 100% (1194/1194), 137.71 MiB | 44.41 MiB/s, done.
Resolving deltas: 100% (47/47), done.


In [0]:
#file = open('por.txt', mode='rt', encoding='utf-8')
file = open('/content/projeto-de-mineracao-20192-traducao-de-texto/datasets/por.txt', mode='rt', encoding='utf-8')
data = file.read()
file.close()

#Preprocessing

###Split lines and phrases

In [0]:
lines = data.strip().split('\n')
pairs = [line.split('\t') for line in lines]

###Clean lines

In [0]:
cleaned = list()
re_print = re.compile('[^%s]' % re.escape(string.printable))
table = str.maketrans('', '', string.punctuation)
for pair in pairs:
  clean_pair = list()
  for line in pair:
    # normalize unicode characters
    line = normalize('NFD', line).encode('ascii', 'ignore')
    line = line.decode('UTF-8')
    # tokenize on white space
    line = line.split()
    # convert to lowercase
    line = [word.lower() for word in line]
    # remove punctuation from each token
    line = [word.translate(table) for word in line]
    # remove non-printable chars form each token
    line = [re_print.sub('', w) for w in line]
    # remove tokens with numbers in them
    line = [word for word in line if word.isalpha()]
    # store as string
    clean_pair.append(' '.join(line))
  cleaned.append(clean_pair)

pairs_cleaned = array(cleaned)

###Save cleaned text

In [6]:
def save_data(data, filename):
  dump(data, open(filename, 'wb'))
  print('Saved: %s' % filename)

save_data(pairs_cleaned, 'eng-por.pkl')

Saved: eng-por.pkl


In [7]:
for i in range(100):
  print('[%s] => [%s]' % (pairs_cleaned[i,0], pairs_cleaned[i,1]))

[go] => [vai]
[go] => [va]
[hi] => [oi]
[run] => [corre]
[run] => [corra]
[run] => [corram]
[run] => [corre]
[run] => [corra]
[run] => [corram]
[who] => [quem]
[who] => [que]
[wow] => [uau]
[wow] => [nossa]
[wow] => [wow]
[fire] => [fogo]
[help] => [ajuda]
[help] => [socorro]
[jump] => [pule]
[jump] => [pulem]
[jump] => [pule]
[stop] => [pare]
[stop] => [parem]
[wait] => [espere]
[wait] => [espere]
[wait] => [esperem]
[go on] => [va]
[hello] => [oi]
[hello] => [alo]
[hello] => [ola]
[hello] => [alo]
[i ran] => [eu corri]
[i see] => [estou vendo]
[i try] => [eu tento]
[i try] => [tento]
[i won] => [ganhei]
[i won] => [eu venci]
[oh no] => [ah nao]
[relax] => [relaxe]
[relax] => [relaxa]
[shoot] => [tiro]
[smile] => [sorria]
[smile] => [sorriam]
[attack] => [atacar]
[attack] => [ataquem]
[attack] => [ataque]
[cheers] => [saude]
[freeze] => [parado]
[get up] => [levantese]
[get up] => [levantemse]
[get up] => [levantate]
[get up] => [levantese]
[get up] => [levantate]
[go now] => [va agor

##Train/Test Separation

In [12]:
n_sentences = 30000
dataset = pairs_cleaned[:n_sentences, :]
# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:27000], dataset[27000:]
# save
save_data(dataset, 'eng-por-both.pkl')
save_data(train, 'eng-por-train.pkl')
save_data(test, 'eng-por-test.pkl')

Saved: eng-por-both.pkl
Saved: eng-por-train.pkl
Saved: eng-por-test.pkl


## Tokenizer

In [0]:
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

## Max Lenght

In [0]:
# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

In [13]:
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare portuguese tokenizer
por_tokenizer = create_tokenizer(dataset[:, 1])
por_vocab_size = len(por_tokenizer.word_index) + 1
por_length = max_length(dataset[:, 1])
print('Portuguese Vocabulary Size: %d' % por_vocab_size)
print('Portuguese Max Length: %d' % (por_length))

English Vocabulary Size: 4257
English Max Length: 6
Portuguese Vocabulary Size: 7325
Portuguese Max Length: 12


#Neural Machine Translation

##Encode with one hot encode (word embedding)

In [0]:
def encode_sequences(tokenizer, length, lines):
  X = tokenizer.texts_to_sequences(lines)
  X = pad_sequences(X, maxlen=length, padding='post')
  return X

In [0]:
def encode_output(sequences, vocab_size):
  ylist = list()
  for sequence in sequences:
    encoded = to_categorical(sequence, num_classes=vocab_size)
    ylist.append(encoded)
  y = array(ylist)
  y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
  return y

##Prepare test and training data

In [0]:
trainX = encode_sequences(eng_tokenizer, eng_length, train[:,0])
trainY = encode_sequences(por_tokenizer, por_length, train[:,1])
trainY = encode_output(trainY, por_vocab_size)

testX = encode_sequences(eng_tokenizer, eng_length, test[:,0])
testY = encode_sequences(por_tokenizer, por_length, test[:,1])
testY = encode_output(testY, por_vocab_size)

##Model

In [0]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
  model = Sequential()
  model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
  model.add(LSTM(n_units))
  model.add(RepeatVector(tar_timesteps))
  model.add(LSTM(n_units, return_sequences=True))
  model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
  return model

In [18]:
model = define_model(eng_vocab_size, por_vocab_size, eng_length, por_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
print(model.summary())
#plot_model(model, to_file='model.png', show_shapes=True)




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 6, 256)            1089792   
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 12, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 12, 256)           525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 12, 7325)          1882525   
Total params: 4,022,941
Trainable params: 4,022,941
Non-trainable params: 0
_________________________________________________________________
None


##Train the model

In [19]:
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)




Train on 27000 samples, validate on 3000 samples
Epoch 1/30





 - 140s - loss: 2.3626 - val_loss: 1.9685

Epoch 00001: val_loss improved from inf to 1.96854, saving model to model.h5
Epoch 2/30
 - 136s - loss: 1.9034 - val_loss: 1.8833

Epoch 00002: val_loss improved from 1.96854 to 1.88327, saving model to model.h5
Epoch 3/30
 - 137s - loss: 1.8303 - val_loss: 1.8382

Epoch 00003: val_loss improved from 1.88327 to 1.83818, saving model to model.h5
Epoch 4/30
 - 136s - loss: 1.7485 - val_loss: 1.7460

Epoch 00004: val_loss improved from 1.83818 to 1.74605, saving model to model.h5
Epoch 5/30
 - 136s - loss: 1.6388 - val_loss: 1.6346

Epoch 00005: val_loss improved from 1.74605 to 1.63458, saving model to model.h5
Epoch 6/30
 - 137s - loss: 1.4953 - val_loss: 1.5026

Epoch 00006: val_loss improved from 1.63458 to 1.50255, saving model to model.h5
Epoch 7/30
 - 136s - loss: 1.3502 - val_loss: 1.3898

Epoch 00007: val_loss improved from 1.50255 to 1.38980, saving model to model.h5
Ep

<keras.callbacks.History at 0x7f6c5df33a58>

#Evaluate model

In [0]:
def word_for_id(integer, tokenizer):
  for word, index in tokenizer.word_index.items():
    if index == integer:
      return word
  return None

In [0]:
def predict_sequence(model, tokenizer, source):
  prediction = model.predict(source, verbose=0)[0]
  integers = [argmax(vector) for vector in prediction]
  target = list()
  for i in integers:
    word = word_for_id(i, tokenizer)
    if word is None:
      break
    target.append(word)
  return ' '.join(target)

In [0]:
def evaluate_model(model, tokenizer, sources, raw_dataset):
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# translate encoded source text
		source = source.reshape((1, source.shape[0]))
		translation = predict_sequence(model, tokenizer, source)
		raw_target, raw_src = raw_dataset[i,1], raw_dataset[i,0]
		if i < 10:
			print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
		actual.append([raw_target.split()])
		predicted.append(translation.split())
	# calculate BLEU score
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [23]:
model = load_model('model.h5')
print('Train')
evaluate_model(model, por_tokenizer, trainX, train)
print('Test')
evaluate_model(model, por_tokenizer, testX, test)

Train
src=[listen to the rain], target=[escute a chuva], predicted=[escutem a chuva]
src=[hang on], target=[aguardem], predicted=[aguardem]
src=[many people hunt], target=[muitas pessoas cacam], predicted=[muitas pessoas cacam]
src=[let me help you], target=[deixeme te ajudar], predicted=[deixeme ajudala]
src=[he works very hard], target=[ele trabalha bastante], predicted=[ele trabalha muito]
src=[ive been running], target=[tenho corrido], predicted=[eu estive correndo]
src=[hows tom doing], target=[como o tom esta], predicted=[como o tom esta indo]
src=[he lives in boston], target=[ele mora em boston], predicted=[ele vive em boston]
src=[we have a plan], target=[temos um plano], predicted=[nos temos plano plano]
src=[tom has arrived], target=[tom chegou], predicted=[tom chegou]
BLEU-1: 0.766687
BLEU-2: 0.672548
BLEU-3: 0.606311
BLEU-4: 0.438873
Test
src=[youre taunting me], target=[estas a tratarme com sarcasmo], predicted=[a senhora de de]
src=[ill never tell], target=[eu nunca vou c

#Translating input text

In [0]:
def translate_text(model, src_text, src_tokenizer, src_length, tar_tokenizer):
  ltext = list()
  ltext.append(src_text)
  enconded_text = encode_sequences(src_tokenizer, src_length, ltext)
  translated_text = predict_sequence(model, tar_tokenizer, enconded_text)
  print('[%s] => [%s]' % (src_text, translated_text))
  return translated_text

In [28]:
model = load_model('model.h5')
eng_text = input('Write a phrase to translate: ')
translation = translate_text(model, eng_text, eng_tokenizer, eng_length, por_tokenizer)

Write a phrase to translate: i have an idea
[i have an idea] => [tenho uma uma ideia]
