#### Import libraries

In [5]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle
from numpy import argmax
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.layers import Dropout
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

In [6]:
## function ##
# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

#### Load clean dictionnary

We trained the model, the model was trained local, there not the fully dictionary has been loaded but only 40000 translation French-English (could be single word to sentence).

In [7]:
#load all the data
all_data = load_clean_sentences("english-french_final.pkl")

In [8]:
# amount of row selected
sentences = 40000

#get the data
data = all_data[:sentences,:]
shuffle(data)

#split train and test

train, test = data[:39000], data[39000:]

In [9]:
#save data once more for safety

save_clean_data(data, 'data_final.pkl')
save_clean_data(train, 'train_final.pkl')
save_clean_data(test, 'test_final.pkl')

Saved: data_final.pkl
Saved: train_final.pkl
Saved: test_final.pkl


#### Pre-processing the data for modelling 

In [10]:
# tokenized data4
from keras.preprocessing.text import Tokenizer

# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

# one hot encode target sequence
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

In [11]:
#english tokenizer
eng_tok = create_tokenizer(data[:,0])
eng_voc_size = len(eng_tok.word_index)+1
eng_length = max_length(data[:,0])
print('English Vocabulary Size: %d' % eng_voc_size)
print('English Max Length: %d' % (eng_length))


English Vocabulary Size: 5373
English Max Length: 7


In [12]:
#french tokenizer
fra_tok = create_tokenizer(data[:,1])
fra_voc_size = len(fra_tok.word_index)+1
fra_length = max_length(data[:,1])
print('French Vocabulary Size: %d' % fra_voc_size)
print('French Max Length: %d' % (fra_length))


French Vocabulary Size: 10818
French Max Length: 14


In [13]:
#set path
path = 'C:/Users/viret/OneDrive/IE/Third_Term/NLP/Application/'

#save data
save_clean_data(eng_tok,path + 'eng_tok_final.pkl')
save_clean_data(eng_voc_size,path + 'eng_voc_size_final.pkl')
save_clean_data(eng_length,path + 'eng_length_final.pkl')

save_clean_data(fra_tok,path + 'fra_tok_final.pkl')
save_clean_data(fra_voc_size,path + 'fra_voc_size_final.pkl')
save_clean_data(fra_length,path + 'fra_length_final.pkl')

Saved: C:/Users/viret/OneDrive/IE/Third_Term/NLP/Application/eng_tok_final.pkl
Saved: C:/Users/viret/OneDrive/IE/Third_Term/NLP/Application/eng_voc_size_final.pkl
Saved: C:/Users/viret/OneDrive/IE/Third_Term/NLP/Application/eng_length_final.pkl
Saved: C:/Users/viret/OneDrive/IE/Third_Term/NLP/Application/fra_tok_final.pkl
Saved: C:/Users/viret/OneDrive/IE/Third_Term/NLP/Application/fra_voc_size_final.pkl
Saved: C:/Users/viret/OneDrive/IE/Third_Term/NLP/Application/fra_length_final.pkl


#### Model
Encoder-Decoder LSTM model with 3 layers and drop out of 20%


In [14]:
#encode train and test data
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
trainX = encode_sequences(fra_tok,fra_length, train[:,1])
trainY = encode_sequences(eng_tok,eng_length, train[:,0])
trainY = encode_output(trainY,eng_voc_size)

testX = encode_sequences(fra_tok,fra_length, test[:,1])
testY = encode_sequences(eng_tok,eng_length, test[:,0])
testY = encode_output(testY,eng_voc_size)

In [15]:
# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(Dropout(0.2, noise_shape=None, seed=42))
   # model.add(LSTM(180))
    model.add(RepeatVector(tar_timesteps))
   # model.add(LSTM(n_units, return_sequences=False))
    model.add(LSTM(n_units,return_sequences=True))
    model.add(Dropout(0.2, noise_shape=None, seed=42))

    model.add(LSTM(n_units, return_sequences=True))
    model.add(Dropout(0.2, noise_shape=None, seed=42))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model
 
# define model
model = define_model(fra_voc_size, eng_voc_size, fra_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
print(model.summary())
#plot_model(model, to_file='model.png', show_shapes=True)

# fit model
filename = path+'model_final.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=40, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 14, 256)           2769408   
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 7, 256)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 7, 256)            525312    
_________________________________________________________________
dropout_2 (Dropout)          (None, 7, 256)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 7, 256)            525312    
__________

<keras.callbacks.History at 0x14b2d26d630>

In [13]:
# load save model
model = load_model(path+'model_final.h5')

#### Create function to get back the original nature of the date (meaning text) for the model

In [22]:

# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [23]:

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

#### Model result with BLEU (bilingual evaluation understudy) 

In [16]:

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, eng_tok, source)
        raw_target, raw_src = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        actual.append(raw_target.split())
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [17]:
# test on some training sequences
print('train')
evaluate_model(model, eng_tok, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tok, testX, test)

train
src=[voulezvous rester ], target=[do you want to stay], predicted=[do you want to stay]
src=[ils te craignaient], target=[they feared you], predicted=[they feared you]
src=[je le fais beaucoup], target=[i do that a lot], predicted=[i do it a lot]
src=[vous fiezvous a sa parole ], target=[do you believe him], predicted=[do you see him]
src=[il n'en sait encore rien], target=[he doesn't know yet], predicted=[he doesn't not yet]
src=[ou sontils ], target=[where are they], predicted=[where are they]
src=[peuxtu monter un cheval ], target=[can you ride a horse], predicted=[can you ride a horse]
src=[nous etudions le francais], target=[we study french], predicted=[we study french]
src=[il est capable de courir vite], target=[he can run fast], predicted=[he can run fast]
src=[je n'ai eu aucun doute], target=[i had no doubts], predicted=[i had no doubts]


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.084625
BLEU-2: 0.288248
BLEU-3: 0.470625
BLEU-4: 0.531988
test
src=[vous semblez occupees], target=[you seem busy], predicted=[you seem busy]
src=[je ne ronfle pas], target=[i don't snore], predicted=[i didn't gamble]
src=[c'est un etudiant], target=[he is a student], predicted=[he is a student]
src=[pars ], target=[go away], predicted=[go away]
src=[ne vous retournez pas], target=[don't turn around], predicted=[let's die at]
src=[je veux un beignet], target=[i want a donut], predicted=[i want a divorce]
src=[j'ai donne un livre a mary], target=[i gave mary a book], predicted=[i gave a water]
src=[j'allais partir], target=[i was going to go], predicted=[i was to to]
src=[elle a l'air perplexe], target=[she looks confused], predicted=[she looks confused]
src=[puisje manger ce gateau ], target=[can i eat this cake], predicted=[may i eat this cake]
BLEU-1: 0.076723
BLEU-2: 0.273682
BLEU-3: 0.455172
BLEU-4: 0.516901


### model understanding

In [1]:
text = ["tu me manques"]

In [2]:
print(text)

['tu me manques']


In [47]:
testUN = encode_sequences(fra_tok,fra_length, text)
print(testUN)

[[  18   20 1417    0    0    0    0    0    0    0    0    0    0    0]]


In [48]:
#apply model prediction
predict_sequence(model,eng_tok,testUN)

'i miss you'

**Bleu score is good and the translation seems to work fine – of course, since it has been trained locally and with a relatively small dictionary, they will be some mis translation and the sentence are not to elaborated.**