In [None]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array
from pickle import load
import numpy as np
from numpy.random import rand
from numpy.random import shuffle

In [None]:
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical,plot_model
from tensorflow.keras.losses import *
from tensorflow.keras import optimizers

In [None]:
from nltk.translate.bleu_score import corpus_bleu

In [None]:
!wget http://www.manythings.org/anki/spa-eng.zip

--2020-06-28 18:58:41--  http://www.manythings.org/anki/spa-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.24.109.196, 172.67.173.198, 104.24.108.196, ...
Connecting to www.manythings.org (www.manythings.org)|104.24.109.196|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4781548 (4.6M) [application/zip]
Saving to: ‘spa-eng.zip’


2020-06-28 18:58:41 (12.4 MB/s) - ‘spa-eng.zip’ saved [4781548/4781548]



In [None]:
!unzip spa-eng.zip

Archive:  spa-eng.zip
  inflating: _about.txt              
  inflating: spa.txt                 


In [None]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, mode='rt', encoding='utf-8')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text
def to_pairs(doc):
	lines = doc.strip().split('\n')
	pairs = [line.split('\t')[:2] for line in  lines]
	return pairs
# clean a list of lines
def clean_pairs(lines):
	cleaned = list()
	# prepare regex for char filtering
	re_print = re.compile('[^%s]' % re.escape(string.printable))
	# prepare translation table for removing punctuation
	table = str.maketrans('', '', string.punctuation)
	for pair in lines:
		clean_pair = list()
		for line in pair:
			# normalize unicode characters
			line = normalize('NFD', line).encode('ascii', 'ignore')
			line = line.decode('UTF-8')
			# tokenize on white space
			line = line.split()
			# convert to lowercase
			line = [word.lower() for word in line]
			# remove punctuation from each token
			line = [word.translate(table) for word in line]
			# remove non-printable chars form each token
			line = [re_print.sub('', w) for w in line]
			# remove tokens with numbers in them
			line = [word for word in line if word.isalpha()]
			# store as string
			clean_pair.append(' '.join(line))
		cleaned.append(clean_pair)
	return array(cleaned)
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

In [None]:
# load dataset
filename = '/content/spa.txt'
doc = load_doc(filename)
# split into english-german pairs
pairs = to_pairs(doc)
# clean sentences
clean_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(clean_pairs, 'english-spanish.pkl')
# spot check
for i in range(10):
	print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

Saved: english-spanish.pkl
[go] => [ve]
[go] => [vete]
[go] => [vaya]
[go] => [vayase]
[hi] => [hola]
[run] => [corre]
[run] => [corran]
[run] => [corra]
[run] => [corred]
[run] => [corred]


In [None]:
for i in range(10):
	print(clean_pairs[i])

['go' 've']
['go' 'vete']
['go' 'vaya']
['go' 'vayase']
['hi' 'hola']
['run' 'corre']
['run' 'corran']
['run' 'corra']
['run' 'corred']
['run' 'corred']


In [None]:
# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer
# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X
# one hot encode target sequence
def encode_output(sequences, vocab_size):
	ylist = list()
	for sequence in sequences:
		encoded = to_categorical(sequence, num_classes=vocab_size)
		ylist.append(encoded)
	y = array(ylist)
	y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
	return y

#EVALUATION FUNCTIONS
# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None
 
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
  predict = model.predict(source, verbose=0)[0]
  integers = []
  for pred in predict:
    integers.append(np.argmax(pred))
  target = []
  for i in integers:
    word = word_for_id(i, tokenizer)
    if word is None:
      break
    target.append(word)
  return ' '.join(target)
 
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
  actual = []
  predicted = []
  b1=0
  b2=0
  b3=0
  b4=0
  count = 0
  for i, source in enumerate(sources):    # translate encoded source text
    source = source.reshape((1, source.shape[0]))
    translated = predict_sequence(model, tokenizer, source)
    raw_src, raw_tgt = raw_dataset[i]
    if i<10:
      print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_tgt, translated))
      count = count+1
    else:
      break
    actual.append([raw_tgt.split()])
    predicted.append(translated.split())
    b1=b1+ corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0))
    b2=b2+  corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0))
    b3=b3+ corpus_bleu(actual, predicted, weights=(0.33, 0.33, 0.33, 0))
    b4=b4+ corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25))
  print(b1/count,b2/count,b3/count,b4/count)
    

In [None]:
# load dataset
raw_dataset = load_clean_sentences('english-spanish.pkl')
# reduce dataset size
n_sentences = 100000
dataset = raw_dataset[:n_sentences, :]
remove_ind =[]

# choosing max length of input and target
en_length = 8
es_length  =8
for i in range(0,n_sentences):
  a = dataset[i,0].split()[:en_length]
  a = " ".join(e for e in a)
  dataset[i,0] = a
 
  a = dataset[i,1].split()[:es_length]
  a = " ".join(e for e in a)
  dataset[i,1] = a
# random shuffle
shuffle(dataset)
# # split into train/test
train, test = dataset[:49000], dataset[49000:]
# save
save_clean_data(dataset, 'english-spanish-both.pkl')
save_clean_data(train, 'english-spanish-train.pkl')
save_clean_data(test, 'english-spanish-test.pkl')

Saved: english-spanish-both.pkl
Saved: english-spanish-train.pkl
Saved: english-spanish-test.pkl


In [None]:
dataset = load_clean_sentences('english-spanish-both.pkl')
train = load_clean_sentences('english-spanish-train.pkl')
test = load_clean_sentences('english-spanish-test.pkl')

In [None]:
dataset[56,0]

'i dont follow'

In [None]:
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare german tokenizer
es_tokenizer = create_tokenizer(dataset[:, 1])
es_vocab_size = len(es_tokenizer.word_index) + 1
es_length = max_length(dataset[:, 1])
print('Spanish Vocabulary Size: %d' % es_vocab_size)
print('Spanish Max Length: %d' % (es_length))

English Vocabulary Size: 10807
English Max Length: 8
Spanish Vocabulary Size: 20323
Spanish Max Length: 8


In [None]:
# prepare training data
trainX = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_sequences(es_tokenizer, es_length, train[:, 1])
#trainY = encode_output(trainY, es_vocab_size)
# prepare validation data
testX = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_sequences(es_tokenizer, es_length, test[:, 1])
#testY = encode_output(testY, es_vocab_size)

In [None]:
import tensorflow as  tf

In [None]:
model= load_model('/content/drive/My Drive/Colab Notebooks/machine_translation_final.h5')

In [None]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 8)]          0                                            
__________________________________________________________________________________________________
embedding_en (Embedding)        (None, 8, 100)       1080700     input_7[0][0]                    
__________________________________________________________________________________________________
lstm_6 (LSTM)                   [(None, 300), (None, 481200      embedding_en[0][0]               
__________________________________________________________________________________________________
input_8 (InputLayer)            [(None, 7)]          0                                            
____________________________________________________________________________________________

In [None]:
################  MAIN MODEL CODE  ########################################3
# def create_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units,dropout_rate):
#   x_input1 = Input(shape=(src_timesteps,))
#   x_input2 = Input(shape=(tar_timesteps-1,)) 

#   x=Embedding(input_dim=src_vocab,output_dim=100,embeddings_initializer="uniform",name = "embedding_en")(x_input1)
#   x1=Embedding(input_dim=tar_vocab,output_dim=300,embeddings_initializer="uniform",name = "embedding_es")(x_input2)
#   x_enc,state_h,state_c = LSTM(n_units,return_sequences=True,return_state = True,recurrent_dropout = dropout_rate)(x)
#   enc_state = [state_h,state_c]


#   x=Concatenate(axis=1)([x_enc,x1])
#   x = LSTM(n_units,return_sequences=True,dropout=dropout_rate)(x)
#   # # x = Bidirectional(LSTM(256,return_sequences=True))(x)
#   # # x = LSTM(256,return_sequences=True,dropout=0.1)(x)
#   x = Dense(tar_vocab,activation='softmax')(x)
#   main_model = Model([x_input1,x_input2],x)

#   return main_model

# n_units = 300
# dropout_rate = 0.3
# model1 = create_model(eng_vocab_size, es_vocab_size, eng_length, es_length, n_units,dropout_rate)
# model1.summary()

In [None]:
trainX.shape , trainY.shape

((49000, 8), (49000, 8))

In [None]:
############### TO TRAIN THE MODEL  #######################################

# %cd /content/
# version = "100_drop_0.2"
# filename = 'model'+str(n_units)+'_'+str(version)+'.h5'
# checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# los = SparseCategoricalCrossentropy()
# opt = optimizers.Adam(lr=0.001)
# opt1 = optimizers.RMSprop(0.5)
# # model = load_model('model200_bi_100_drop_0.h5')
# # model = leave_embed(model)
# model.compile(loss=los,optimizer=opt,metrics=['accuracy'])

# model.fit([trainX,trainY[:,0:en_length-1]], trainY, epochs=50, batch_size=256, validation_data=([testX,testY[:,0:en_length-1]], testY), verbose=1)

In [None]:
sam=20
for j in range(0,40,5):
  sam=sam+j
  x_input11=testX[sam].reshape(1,en_length)
  x_input21=np.zeros((1,en_length-1)) 
  for i in range(0,en_length-1):
    output = model.predict([x_input11,x_input21])
    out1=np.argmax(output,axis=2)
    a = out1[0,i]
    if i !=7:
      x_input21[0,i]=a
  prediction = x_input21[0].tolist()
  prediction.append(a)
  print('\n')
  print(" predicted spanish:  ",end="")
  for w in prediction:
    try:
      print(es_tokenizer.index_word[w],end=" ")
    except:
      print("-",end=" ")
  print('\n'+" actual spanish:  ",end="")
  for i in testY[sam]:
    try:
      print(es_tokenizer.index_word[i],end=" ")
    except: 
      print("-",end=" ")

  print('\n'+" actual english:  ",end="")
  for i in testX[sam]:
    try:
      print(eng_tokenizer.index_word[i],end=" ")
    except: 
      print("-",end=" ")




 predicted spanish:  el la ensenarias - - - - - 
 actual spanish:  el salio de la habitacion - - - 
 actual english:  he left the room - - - - 

 predicted spanish:  el profesor se saltar al amamos - - 
 actual spanish:  el asesino confeso su crimen - - - 
 actual english:  the murderer confessed his crime - - - 

 predicted spanish:  ella se fue de la habil en en 
 actual spanish:  ella fallecio tranquilamente mientras dormia - - - 
 actual english:  she passed away peacefully in her sleep - 

 predicted spanish:  yo estaba tan amistad - - - - 
 actual spanish:  me estaba duchando hace un momento - - 
 actual english:  i was showering a moment ago - - 

 predicted spanish:  tu verdad lo que te gusta hacer hacer 
 actual spanish:  se supone que teneis que hacerlo vosotras - 
 actual english:  youre supposed to do that yourselves - - 

 predicted spanish:  verdad el mensaje del gran - - - 
 actual spanish:  eres la peor mentirosa del mundo - - 
 actual english:  youre the worst liar i

In [None]:
sam = 0
b1=0
b2=0
b3=0
b4=0

min_b1=2
min_b2=2
min_b3=2
min_b4=2

count = 0
check=0
for j in range(0,100):
  count = count+1
  sam =sam +20
  actual=[]
  predicted =[]


  x_input11=testX[sam].reshape(1,en_length)
  x_input21=np.zeros((1,en_length-1)) 
  for i in range(0,en_length-1):
    output = model.predict([x_input11,x_input21])
    out1=np.argmax(output,axis=2)
    a = out1[0,i]
    if i !=7:
      x_input21[0,i]=a
  prediction = x_input21[0].tolist()
  prediction.append(a)


  # print(" predicted spanish:  ",end="")
  for w in prediction:
    try:
      # print(es_tokenizer.index_word[w],end=" ")
      predicted.append(es_tokenizer.index_word[w])
    except:
      predicted.append('-')
      #  print("-",end=" ")
  # print('\n'+" actual spanish:  ",end="")
  for i in testY[sam]:
    try:
      # print(es_tokenizer.index_word[i],end=" ")
      actual.append(es_tokenizer.index_word[i])
    except: 
      actual.append('-')
      # print("-",end=" ")

  # print('\n'+" actual english:  ",end="")
  # for i in testX[sam]:
  #   try:
  #     # print(eng_tokenizer.index_word[i],end=" ")
  #   except: 
  #     pass
  #     # print("-",end=" ")
  try:
    che = corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0))
    if (che<min_b1):
      min_b1 =che
    b1=b1+ che

    che = corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0))
    if (che<min_b2):
      min_b2 =che
    b2=b2+ che

    che = corpus_bleu(actual, predicted, weights=(0.33, 0.33, 0.33, 0))
    if (che<min_b3):
      min_b3 =che
    b3=b3+ che


    che = corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25))
    if (che<min_b4):
      min_b4 =che
    b4=b4+ che

    check= check+1
  except:
    pass
print("BLEU Score:  ")
print(b1/check,b2/check,b3/check,b4/check)


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU Score:  
0.45008423882295484 0.6479264710386536 0.7444514531535646 0.7970899281491551


In [None]:
print("Minimum BLEU score:  ")
print(min_b1,min_b2,min_b3,min_b4)

Minimum BLEU score:  
0.047619047619047616 0.2182178902359924 0.3661572458236839 0.4671379777282001
