In [0]:
# Mount Google Drive inside the Colab VM at /content/gdrive/ so the notebook
# can reach files stored there.
from google.colab import drive
drive.mount('/content/gdrive/')

In [0]:
# Make the project folder on the mounted Drive the working directory; the
# cells below open their dataset, pickle and model files by relative name.
%cd /content/gdrive/My\ Drive/Colab\ Notebooks/Trabalho_2_PLN/

In [0]:
!sudo apt-get install libicu-dev
!pip install polyglot
!pip install PyICU
!pip install pycld2

In [0]:
%tensorflow_version 1.x
import string
import re
import pandas as pd

import pickle
from pickle import load
from pickle import dump

import numpy as np
from numpy import array, argmax, random, take
from numpy.random import rand
from numpy.random import shuffle
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector, TimeDistributed, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model

from unicodedata import normalize
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from polyglot.detect import Detector


# Rede 1

In [0]:
def load_doc(filename):
    """Read a whole UTF-8 text file into memory.

    Args:
        filename: path of the text file to read.

    Returns:
        The complete file content as a single string.
    """
    # The context manager closes the file even when read() raises.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

def to_pairs(doc):
    """Split a loaded document into rows of tab-separated fields.

    Surrounding whitespace is stripped from the document, it is cut into
    lines on '\\n', and every line is cut into fields on '\\t'.

    Returns:
        A list with one list of fields per line.
    """
    rows = []
    for row in doc.strip().split('\n'):
        rows.append(row.split('\t'))
    return rows

def clean_pairs(lines):
    """Normalise every sentence of every pair and return them as a numpy array.

    Each sentence is stripped of accents (NFD + ASCII), split on whitespace,
    lower-cased, stripped of punctuation and non-printable characters, and
    tokens that are not purely alphabetic are dropped. The surviving tokens
    are joined back with single spaces.

    Args:
        lines: iterable of pairs, each pair an iterable of sentence strings.

    Returns:
        numpy array built from the list of cleaned pairs.
    """
    # anything outside string.printable is removed from each token
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # translation table that deletes all ASCII punctuation
    drop_punctuation = str.maketrans('', '', string.punctuation)

    def _clean_sentence(text):
        # decompose accented characters, then discard the non-ASCII marks
        ascii_text = normalize('NFD', text).encode('ascii', 'ignore').decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = non_printable.sub('', token.lower().translate(drop_punctuation))
            # tokens containing digits (or left empty) are not alphabetic
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return array([[_clean_sentence(text) for text in pair] for pair in lines])

def clean_trash(sentences):
    """Keep only the pairs whose first sentence is detected as English.

    Language detection is done with polyglot's Detector on element 0 of each
    pair. Pairs for which detection raises are skipped (best effort) and
    counted; the count is reported so the loss is visible.

    Args:
        sentences: sequence of pairs; element 0 of each pair is the text
            that is language-checked.

    Returns:
        A list with the pairs that passed the check, in their original order.
    """
    nova_lista = list()
    skipped = 0
    for pair in sentences:
        try:
            if Detector(pair[0], quiet = True).language.name == "English":
                nova_lista.append(pair)
        except Exception:
            # detection failed for this sentence: drop the pair, keep going
            skipped += 1

    if skipped:
        print('Skipped %d pairs whose language could not be detected' % skipped)

    # preview of the first kept pairs; slicing tolerates short or empty lists
    for pair in nova_lista[:10]:
        print(pair)
    return nova_lista


def save_clean_data(sentences, filename):
    """Pickle `sentences` to `filename` and print a confirmation line.

    Args:
        sentences: object to serialise (here: the cleaned sentence pairs).
        filename: path of the pickle file to (over)write.
    """
    # The context manager flushes and closes the file before we report success.
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print('Saved: %s' % filename)

# load dataset
filename = 'dataset_en_pt.txt'
doc = load_doc(filename)
# split into english-portuguese pairs
pairs = to_pairs(doc)
# clean sentences; the result gets its own name so that the clean_pairs()
# function is not overwritten by its return value
cleaned_pairs = clean_pairs(pairs)
# keep only the pairs whose first sentence is detected as English
dataset_free = clean_trash(cleaned_pairs)

save_clean_data(dataset_free, 'english-portuguese.pkl')
# spot check the first cleaned pairs (slicing tolerates fewer than 5 rows)
for pair in cleaned_pairs[:5]:
	print('[%s] => [%s]' % (pair[0], pair[1]))

In [0]:
def load_clean_sentences(filename):
    """Load and return the object pickled in `filename`.

    Args:
        filename: path of a pickle file written by save_clean_data.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code - only load files this
    # notebook produced itself.
    with open(filename, 'rb') as handle:
        return load(handle)

def save_clean_data(sentences, filename):
    """Pickle `sentences` to `filename` and print a confirmation line.

    Args:
        sentences: object to serialise (here: sentence pairs).
        filename: path of the pickle file to (over)write.
    """
    # Close the file deterministically instead of relying on garbage collection.
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print('Saved: %s' % filename)

# load dataset
raw_dataset = load_clean_sentences('english-portuguese.pkl')

# reduce dataset size
n_sentences = 30000
dataset = raw_dataset[:n_sentences]
# random shuffle, in place, with a fixed seed so that the pickles written
# below are identical on every run (train_test_split is already seeded);
# a private RandomState leaves numpy's global random state untouched
np.random.RandomState(12).shuffle(dataset)
# split into train/test (80% / 20%)
train, test = train_test_split(dataset, test_size=0.2, random_state = 12)
# save
save_clean_data(dataset, 'english-portuguese-both.pkl')
save_clean_data(train, 'english-portuguese-train.pkl')
save_clean_data(test, 'english-portuguese-test.pkl')
# preview of each split and the resulting sizes
print(dataset[:10])
print(train[:10])
print(test[:10])
print(len(dataset), len(train), len(test))

In [0]:
def load_clean_sentences(filename):
    """Load and return the object pickled in `filename`.

    Args:
        filename: path of a pickle file written by an earlier cell.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code - only load files this
    # notebook produced itself.
    with open(filename, 'rb') as handle:
        return load(handle)

def create_tokenizer(lines):
    """Create a Keras Tokenizer and fit it on `lines`.

    Args:
        lines: the texts handed to Tokenizer.fit_on_texts.

    Returns:
        The fitted Tokenizer instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

def max_length(lines):
    """Return the largest number of whitespace-separated tokens in any line.

    Raises:
        ValueError: if `lines` is empty (propagated from max()).
    """
    token_counts = [len(sentence.split()) for sentence in lines]
    return max(token_counts)

def encode_sequences(tokenizer, length, lines):
    """Turn texts into integer sequences padded to a common length.

    Args:
        tokenizer: fitted tokenizer providing texts_to_sequences.
        length: value passed to pad_sequences as maxlen.
        lines: the texts to encode.

    Returns:
        The result of pad_sequences with padding='post' (padding is added
        after the tokens).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

def encode_output(sequences, vocab_size):
    """One-hot encode every integer sequence of the target side.

    Args:
        sequences: 2-D array of integer ids (its shape[0] and shape[1] are
            used for the final reshape).
        vocab_size: number of classes for to_categorical.

    Returns:
        Array reshaped to (sequences.shape[0], sequences.shape[1], vocab_size).
    """
    one_hot = array([to_categorical(row, num_classes=vocab_size) for row in sequences])
    n_samples, n_steps = sequences.shape[0], sequences.shape[1]
    return one_hot.reshape(n_samples, n_steps, vocab_size)

def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build the (uncompiled) encoder-decoder translation model.

    Args:
        src_vocab: input dimension of the embedding (source vocabulary size).
        tar_vocab: width of the softmax output (target vocabulary size).
        src_timesteps: length of the padded source sequences.
        tar_timesteps: number of output steps produced by RepeatVector.
        n_units: embedding size and number of units of both LSTMs.

    Returns:
        A keras Sequential model.
    """
    return Sequential([
        # mask_zero=True: id 0 is the padding value and is masked
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        Dropout(rate=0.7),
        # encoder: only the last state is kept ...
        LSTM(n_units),
        # ... and repeated once per target time step for the decoder
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ])

# Training script for network 1. The globals created here (tokenizers,
# trainX/testX, train/test) are reused as-is by the evaluation cell below.

# load datasets
dataset = load_clean_sentences('english-portuguese-both.pkl')
train = load_clean_sentences('english-portuguese-train.pkl')
test = load_clean_sentences('english-portuguese-test.pkl')
# split every pair into its English (index 0) and Portuguese (index 1) side
dataset_ingles = list()
dataset_portugues = list()
for x in range(0, len(dataset)):
  dataset_ingles.append(dataset[x][0])
  dataset_portugues.append(dataset[x][1])
print(len(dataset))
train_ingles = list()
train_portugues = list()
for x in range(0, len(train)):
  train_ingles.append(train[x][0])
  train_portugues.append(train[x][1])

test_ingles = list()
test_portugues = list()
for x in range(0, len(test)):
  test_ingles.append(test[x][0])
  test_portugues.append(test[x][1])

# prepare english tokenizer (fitted on the full dataset, i.e. train and test)
eng_tokenizer = create_tokenizer(dataset_ingles)
# +1 because id 0 is left for padding
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset_ingles)
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare portuguese tokenizer (fitted on the full dataset, i.e. train and test)
por_tokenizer = create_tokenizer(dataset_portugues)
por_vocab_size = len(por_tokenizer.word_index) + 1
por_length = max_length(dataset_portugues)
print('Portuguese Vocabulary Size: %d' % por_vocab_size)
print('Portuguese Max Length: %d' % (por_length))


# prepare training data: Portuguese is the model input (X), English the
# one-hot encoded target (Y)
trainX = encode_sequences(por_tokenizer, por_length, train_portugues)
trainY = encode_sequences(eng_tokenizer, eng_length, train_ingles)
trainY = encode_output(trainY, eng_vocab_size)
# prepare validation data (the test split doubles as the validation set)
testX = encode_sequences(por_tokenizer, por_length, test_portugues)
testY = encode_sequences(eng_tokenizer, eng_length, test_ingles)
testY = encode_output(testY, eng_vocab_size)
print('Portuguese shape: ', testX.shape)
print('English shape: ', testY.shape)
# define model: Portuguese vocabulary/length on the input side, English on the output side
model = define_model(por_vocab_size, eng_vocab_size, por_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
print(model.summary())
plot_model(model, to_file='model1.png', show_shapes=True)
# fit model; the checkpoint keeps only the weights with the lowest val_loss
filename = 'model1.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
history = model.fit(trainX, trainY, epochs=50, batch_size=32, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

# learning curves: training loss vs. validation loss per epoch
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['train','validation'])
plt.show()

In [0]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu
 
def load_clean_sentences(filename):
    """Load and return the object pickled in `filename`.

    Args:
        filename: path of a pickle file written by an earlier cell.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code - only load files this
    # notebook produced itself.
    with open(filename, 'rb') as handle:
        return load(handle)
 
def create_tokenizer(lines):
    """Return a new Keras Tokenizer fitted on the given texts."""
    text_tokenizer = Tokenizer()
    text_tokenizer.fit_on_texts(lines)
    return text_tokenizer
 
def max_length(lines):
    """Length, in whitespace-separated tokens, of the longest line.

    Raises:
        ValueError: if `lines` is empty (propagated from max()).
    """
    return max(map(lambda sentence: len(sentence.split()), lines))
 
def encode_sequences(tokenizer, length, lines):
    """Integer-encode `lines` with `tokenizer` and pad them to `length`.

    Padding is added after the tokens (padding='post').
    """
    encoded = tokenizer.texts_to_sequences(lines)
    return pad_sequences(encoded, maxlen=length, padding='post')
 
def word_for_id(integer, tokenizer):
    """Reverse lookup in tokenizer.word_index.

    Args:
        integer: the word id to look up.
        tokenizer: object exposing a `word_index` mapping of word -> id.

    Returns:
        The first word whose id equals `integer`, or None when no word
        carries that id.
    """
    candidates = (word for word, index in tokenizer.word_index.items() if index == integer)
    return next(candidates, None)
 
def predict_sequence(model, tokenizer, source):
    """Translate one encoded source sequence into a string of target words.

    Args:
        model: trained model; predict(source) must yield, for the first
            sample, one score vector per output step.
        tokenizer: tokenizer of the target language, used to map ids to words.
        source: encoded source batch; only the first prediction is used.

    Returns:
        The predicted words joined by spaces. Decoding stops at the first id
        that has no word in the tokenizer.
    """
    step_scores = model.predict(source, verbose=0)[0]
    words = []
    for scores in step_scores:
        # most probable word id at this step
        word = word_for_id(argmax(scores), tokenizer)
        if word is None:
            break
        words.append(word)
    return ' '.join(words)
 
def evaluate_model(model, tokenizer, sources, raw_dataset):
    """Translate every source sequence and report corpus BLEU scores.

    Args:
        model: trained translation model.
        tokenizer: tokenizer of the target language; used to decode the
            predictions.
        sources: array of encoded source sequences, one row per sentence.
        raw_dataset: the matching raw pairs; each row unpacks into
            (target text, source text).

    The first 10 translations are printed for inspection, followed by the
    BLEU-1 to BLEU-4 scores of the whole set.
    """
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # the model expects a batch: reshape the row to (1, timesteps)
        source = source.reshape((1, source.shape[0]))
        # decode with the tokenizer passed in, not with a notebook global
        translation = predict_sequence(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        # corpus_bleu wants a list of references per hypothesis
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
 
# # load datasets
# dataset = load_clean_sentences('english-portuguese-both.pkl')
# train = load_clean_sentences('english-portuguese-train.pkl')
# test = load_clean_sentences('english-portuguese-test.pkl')
# dataset_ingles = list()
# dataset_portugues = list()
# for x in range(0, len(dataset)):
#   dataset_ingles.append(dataset[x][0])
#   dataset_portugues.append(dataset[x][1])

# train_ingles = list()
# train_portugues = list()
# for x in range(0, len(train)):
#   train_ingles.append(train[x][0])
#   train_portugues.append(train[x][1])

# test_ingles = list()
# test_portugues = list()
# for x in range(0, len(test)):
#   test_ingles.append(test[x][0])
#   test_portugues.append(test[x][1])

# # prepare english tokenizer
# eng_tokenizer = create_tokenizer(dataset_ingles)
# eng_vocab_size = len(eng_tokenizer.word_index) + 1
# eng_length = max_length(dataset_ingles)
# # prepare portuguese tokenizer
# por_tokenizer = create_tokenizer(dataset_portugues)
# por_vocab_size = len(por_tokenizer.word_index) + 1
# por_length = max_length(dataset_portugues)
# # prepare data
# trainX = encode_sequences(eng_tokenizer, eng_length, train_ingles)
# testX = encode_sequences(eng_tokenizer, eng_length, test_ingles)
 
# load model (the checkpoint file written by the training cell)
model = load_model('model1.h5')
# eng_tokenizer, trainX, testX, train and test are NOT created in this cell:
# the loading code above is commented out, so they must still be in memory
# from the training cell.
# evaluate on the training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# evaluate on the test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)

# Rede 2

In [0]:
# NOTE(review): `eng_port` is not defined in any cell of this notebook as
# shown - it must hold the (english, portuguese) pairs; confirm which
# dataset it is meant to be and load it explicitly before this cell.

# split the pairs into two parallel lists and keep at most the first 20000
dataset_ingles = [pair[0] for pair in eng_port][:20000]
dataset_portugues = [pair[1] for pair in eng_port][:20000]

# sentence lengths in words (not characters): the histogram is used to pick
# the padded sequence length, which is counted in tokens. Iterating over the
# lists themselves also works when fewer than 20000 pairs are available.
eng_l = [len(sentence.split()) for sentence in dataset_ingles]
port_l = [len(sentence.split()) for sentence in dataset_portugues]

length_df = pd.DataFrame({'eng':eng_l, 'port':port_l})

length_df.hist(bins = 30)
plt.show()

In [0]:
def tokenization(lines):
    """Build a Keras Tokenizer fitted on `lines` and return it."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

# prepare english tokenizer
eng_tokenizer = tokenization(dataset_ingles)
# +1 because id 0 is left for padding
eng_vocab_size = len(eng_tokenizer.word_index) + 1

# fixed sequence length, later passed to encode_sequences as maxlen
eng_length = 8
print('English Vocabulary Size: %d' % eng_vocab_size)

# prepare portuguese tokenizer
port_tokenizer = tokenization(dataset_portugues)
port_vocab_size = len(port_tokenizer.word_index) + 1

# fixed sequence length, later passed to encode_sequences as maxlen
port_length = 8
print('Portuguese Vocabulary Size: %d' % port_vocab_size)

In [0]:
def encode_sequences(tokenizer, length, lines):
    """Integer-encode `lines` and pad the sequences to `length`.

    Args:
        tokenizer: fitted tokenizer providing texts_to_sequences.
        length: value passed to pad_sequences as maxlen.
        lines: the texts to encode.

    Returns:
        The padded array; padding is added after the tokens (padding='post').
    """
    return pad_sequences(tokenizer.texts_to_sequences(lines), maxlen=length, padding='post')
         
# re-pair the two parallel lists as [english, portuguese] rows
eng_pt = [list(pair) for pair in zip(dataset_ingles, dataset_portugues)]

# split data into train and test set (80% / 20%, fixed seed)
train, test = train_test_split(eng_pt, test_size=0.2, random_state = 12)

# separate each split into its English (index 0) and Portuguese (index 1) side
train_ingles = [pair[0] for pair in train]
train_portugues = [pair[1] for pair in train]

test_ingles = [pair[0] for pair in test]
test_portugues = [pair[1] for pair in test]

# prepare training data: Portuguese is the input (X), English the target (Y)
trainX = encode_sequences(port_tokenizer, port_length, train_portugues)
trainY = encode_sequences(eng_tokenizer, eng_length, train_ingles)

# prepare validation data
testX = encode_sequences(port_tokenizer, port_length, test_portugues)
testY = encode_sequences(eng_tokenizer, eng_length, test_ingles)

In [0]:
def define_model(in_vocab,out_vocab, in_timesteps,out_timesteps,units):
    """Build the (uncompiled) encoder-decoder model of network 2.

    Args:
        in_vocab: input dimension of the embedding (source vocabulary size).
        out_vocab: width of the softmax output (target vocabulary size).
        in_timesteps: length of the padded source sequences.
        out_timesteps: number of output steps produced by RepeatVector.
        units: embedding size and number of units of both LSTMs.

    Returns:
        A keras Sequential model with dropout after the embedding and after
        each LSTM.
    """
    return Sequential([
        # mask_zero=True: id 0 is the padding value and is masked
        Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True),
        Dropout(rate=0.5),
        # encoder: only the final state is kept
        LSTM(units),
        Dropout(rate=0.7),
        # feed the encoder state to the decoder once per output step
        RepeatVector(out_timesteps),
        LSTM(units, return_sequences=True),
        Dropout(rate=0.5),
        Dense(out_vocab, activation='softmax'),
    ])


# model compilation
# The network reads Portuguese (trainX is encoded with port_tokenizer) and
# predicts English (trainY is encoded with eng_tokenizer), so the Portuguese
# vocabulary/length must size the input embedding and the English
# vocabulary/length the output layer. Passing them the other way round lets
# input ids exceed the embedding size and gives the softmax the wrong width.
model = define_model(port_vocab_size, eng_vocab_size, port_length, eng_length, 512)

rms = optimizers.RMSprop(lr=0.001)
# sparse loss: the targets stay integer word ids (no one-hot encoding)
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')
#model.compile(optimizer='adam', loss='categorical_crossentropy')

# the checkpoint keeps only the weights with the lowest validation loss
filename = 'model3_20_d_30ep_b128.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
#es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
# train model; targets are reshaped to (samples, timesteps, 1) for the sparse
# loss, and 40% of the training rows are held out for validation
history = model.fit(trainX, trainY.reshape(trainY.shape[0], trainY.shape[1], 1),
                    epochs=30, batch_size=128, validation_split = 0.4,callbacks=[checkpoint], verbose=1)
# learning curves: training loss vs. validation loss per epoch
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['train','validation'])
plt.show()

In [0]:
# Load the checkpoint that the training cell of network 2 writes
# ('model3_20_d_30ep_b128.h5'); no cell of this notebook produces the
# previously referenced 'model3_20_d_20ep_b64.h5'.
model = load_model('model3_20_d_30ep_b128.h5')

# predict word ids for the first 500 encoded test sentences
# (testX is already 2-D, so no reshape is needed)
testZ = testX[:500]
preds = model.predict_classes(testZ)

def get_word(n, tokenizer):
    """Return the word whose id in tokenizer.word_index is `n`.

    Args:
        n: the word id to look up.
        tokenizer: object exposing a `word_index` mapping of word -> id.

    Returns:
        The first matching word, or None when no word has that id.
    """
    matching = (word for word, index in tokenizer.word_index.items() if index == n)
    return next(matching, None)

# Turn every row of predicted ids into text. An id without a word, and a word
# that merely repeats the word of the previous position, become empty strings.
preds_text = []
for row in preds:
    decoded = [get_word(token_id, eng_tokenizer) for token_id in row]
    words = []
    for position, word in enumerate(decoded):
        repeated = position > 0 and word == decoded[position - 1]
        words.append('' if (word is None or repeated) else word)
    preds_text.append(' '.join(words))

# reference English sentences next to the model output
pred_df = pd.DataFrame({'actual' : test_ingles[:500],'predicted' : preds_text})
# print 15 rows randomly
pred_df.sample(15)