In [None]:
# Data preprocessing

# Load the en-fr paired corpus. readlines() yields one list entry per line, and
# the code below pairs the two files by line index.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which can mis-decode (or fail on) the accented
# characters of the French file. Assumes both files are UTF-8 -- TODO confirm.
with open('../corpus/europarl-v7.fr-en.en', encoding='utf-8') as en_file:
    en_corpus = en_file.readlines()

with open('../corpus/europarl-v7.fr-en.fr', encoding='utf-8') as fr_file:
    fr_corpus = fr_file.readlines()
    
# Tokenization
import nltk

en_sentences = []
fr_sentences = []

# Only the fixed block of line indices 30000..30999 is tokenized; the same
# index is used on both corpora so the pairs stay aligned.
for i in range(30000, 31000):
    en_tokens = nltk.word_tokenize(en_corpus[i].lower())
    fr_tokens = nltk.word_tokenize(fr_corpus[i].lower())

    # A pair is kept only when both sides produced at least one token;
    # otherwise the line index is printed and the pair is dropped.
    if en_tokens and fr_tokens:
        en_sentences.append(en_tokens)
        fr_sentences.append(fr_tokens)
    else:
        print(i)

# Show the first surviving pair as a quick sanity check.
print(en_sentences[0])
print(fr_sentences[0])


# Build vocabulary lists for mapping text to text IDs
# (CountVectorizer is not used in this cell; the import is left in place in
# case a cell outside this view relies on it.)
from sklearn.feature_extraction.text import CountVectorizer


def _unique_words(sentences):
    """Return every distinct word in `sentences` once, in first-seen order.

    `sentences` is an iterable of token lists. A set tracks the words already
    emitted so each membership test is a hash lookup rather than a scan of the
    growing result list; the returned list is the same one the original
    `if word not in list` loop produced.
    """
    seen = set()
    ordered = []
    for sent in sentences:
        for word in sent:
            if word not in seen:
                seen.add(word)
                ordered.append(word)
    return ordered


en_vocab_list = _unique_words(en_sentences)
fr_vocab_list = _unique_words(fr_sentences)

print('en vocabs size:', len(en_vocab_list))
print('fr vocabs size:', len(fr_vocab_list))

# Combined vocabulary: English entries first, French entries after them.
# A word present in both languages therefore appears twice in this list.
vocab_list = en_vocab_list + fr_vocab_list


# Map the words of every sentence to their integer IDs.
# X holds the French sentences, y the English ones.

# Word -> id lookup over the combined vocabulary, built once instead of calling
# vocab_list.index() (a linear scan) for every token. setdefault keeps the
# FIRST position of each word, which is exactly what list.index() returned, so
# a token that occurs in both the English and French parts of vocab_list still
# resolves to its earlier (English) id.
word_to_id = {}
for vocab_id, word in enumerate(vocab_list):
    word_to_id.setdefault(word, vocab_id)

y_test = [[word_to_id[word] for word in sent] for sent in en_sentences]
X_test = [[word_to_id[word] for word in sent] for sent in fr_sentences]

In [None]:
import tensorflow as tf
from mySeq2Seq import Seq2SeqModel
import utils

# Checkpoint prefix to restore: directory + base name + step suffix.
checkpoint = '../models/' + 'nmt.ckpt-' + '312'

batch_size = 32
beam_width = 3

# The inference model is built inside its own graph, and the session below is
# opened while that graph is the default one.
infer_graph = tf.Graph()
with infer_graph.as_default():
    model = Seq2SeqModel(
        encoder_num_units=512,
        decoder_num_units=512,
        embedding_size=512,
        num_layers=2,
        # NOTE(review): hard-coded; presumably the vocabulary size the
        # checkpoint was trained with -- confirm that the ids produced from
        # vocab_list in the preprocessing cell agree with that vocabulary.
        vocab_size=27234,
        batch_size=batch_size,
        bidirectional=False,
        attention=True,
        beam_search=True,
        beam_width=beam_width,
        mode="Infer",
    )
    print('model constructed.')

    with tf.Session() as sess:
        # Restore the saved variable values into the graph constructed above.
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint)

        # NOTE(review): the return value is unpacked into two objects and only
        # the first is fed to the model; whether this covers all of X_test or
        # a single batch depends on utils.input_generator -- confirm.
        X, y = utils.input_generator(X_test, y_test, batch_size)

        feed_dict = model.make_infer_inputs(X)
        translations = sess.run(model.decoder_predictions_inference, feed_dict)

        # The explicit comparison is kept as written because the type of
        # model.beam_search is not visible from this notebook.
        if model.beam_search == True:
            for sample_idx, beams in enumerate(translations):
                print(sample_idx)
                # Each entry of `beams` is indexed by beam number, so one
                # sentence is assembled per beam.
                for beam in range(beam_width):
                    decoded = ''.join(
                        vocab_list[step[beam]] + ' ' for step in beams
                    )
                    print(decoded, '\n')
        else:
            for token_ids in translations:
                decoded = ''.join(
                    vocab_list[token_id] + ' ' for token_id in token_ids
                )
                print(decoded, '\n')