#### Import libraries

In [1]:
import numpy as np
from pickle import load
from pickle import dump
import re
from numpy.random import rand
from numpy.random import shuffle
from numpy import argmax
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.layers import Dropout
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu
from emo_utils import *
import emoji

Using TensorFlow backend.


#### Create functions

In [2]:
def load_clean_sentences(filename):
    """Load and return the object pickled in ``filename``.

    Args:
        filename: Path to a pickle file.

    Returns:
        Whatever object was stored in the pickle.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the file content is not a valid pickle.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load
    # files that come from a trusted source.
    # The context manager guarantees the handle is closed, even on error.
    with open(filename, 'rb') as handle:
        return load(handle)

# tokenized data
from keras.preprocessing.text import Tokenizer

# fit a tokenizer
def create_tokenizer(lines):
    """Create a ``Tokenizer`` with default settings and fit it on ``lines``.

    Args:
        lines: The texts handed to ``fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted


# map an integer to a word
def word_for_id(integer, tokenizer):
    """Reverse-look-up the word stored under ``integer`` in ``tokenizer.word_index``.

    Args:
        integer: The index to search for.
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The first word whose index equals ``integer``, or ``None`` when no
        entry matches.
    """
    # Lazy scan over the mapping; next() stops at the first hit.
    matches = (
        token
        for token, token_id in tokenizer.word_index.items()
        if token_id == integer
    )
    return next(matches, None)

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    """Decode the model's prediction for ``source`` into a space-joined string.

    Only the first element of ``model.predict(source)`` is decoded. Each
    vector in it is reduced to the index of its largest value, and that
    index is mapped to a word through ``word_for_id``.

    Args:
        model: Object with a ``predict(source, verbose=0)`` method.
        tokenizer: Tokenizer used to map indices back to words.
        source: Input passed to ``model.predict`` unchanged.

    Returns:
        The decoded words joined by single spaces. Decoding stops at the
        first index that has no word in the tokenizer.
    """
    # assumes each row holds per-word scores for one output position — TODO confirm
    step_scores = model.predict(source, verbose=0)[0]
    words = []
    for scores in step_scores:
        token = word_for_id(argmax(scores), tokenizer)
        if token is None:
            # An index without a word ends the sentence.
            break
        words.append(token)
    return ' '.join(words)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode ``lines`` and pad every sequence to ``length``.

    Args:
        tokenizer: Object providing ``texts_to_sequences``.
        length: Value forwarded to ``pad_sequences`` as ``maxlen``.
        lines: The texts to encode.

    Returns:
        The result of ``pad_sequences`` with ``padding='post'``, i.e. the
        padding is appended after each sequence.
    """
    sequences = tokenizer.texts_to_sequences(lines)
    return pad_sequences(sequences, maxlen=length, padding='post')

def sentences_to_indices(X, word_to_index, max_len):
    """Convert an array of sentences into a zero-padded matrix of word indices.

    Each sentence is lower-cased and split on whitespace; every word is
    replaced by ``word_to_index[word]``. Rows shorter than ``max_len`` stay
    zero-filled on the right. Sentences longer than ``max_len`` are
    truncated to their first ``max_len`` words instead of raising an
    ``IndexError``.

    Args:
        X: Array of sentence strings; ``X.shape[0]`` is the number of sentences.
        word_to_index: Mapping from lower-case word to its index.
        max_len: Number of columns in the returned matrix.

    Returns:
        A float NumPy array of shape ``(X.shape[0], max_len)``.

    Raises:
        KeyError: If a word within the first ``max_len`` words of a sentence
            is missing from ``word_to_index``.
    """
    m = X.shape[0]  # number of sentences

    X_indices = np.zeros((m, max_len))

    for i in range(m):
        sentence_words = X[i].lower().split()

        # Slice to max_len so over-long sentences cannot write past the row.
        for j, w in enumerate(sentence_words[:max_len]):
            X_indices[i, j] = word_to_index[w]

    return X_indices

def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    Args:
        phrase: The text to process.

    Returns:
        ``phrase`` with every substitution below applied, in order.
    """
    # Order matters: the two specific forms come first, and "n't" must be
    # handled before the bare "'t" rule.
    substitutions = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in substitutions:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

def application(text):
    """Translate ``text``, then print the translation with one emoji appended.

    Relies on module-level objects loaded elsewhere in the notebook:
    ``fra_tok``, ``fra_length``, ``seq2seq``, ``eng_tok``, ``word_to_index``,
    ``maxLen`` and ``model_1``. ``label_to_emoji`` is not defined in this
    notebook; presumably it comes from ``emo_utils``.

    Args:
        text: The texts handed to ``encode_sequences`` (the demo passes a
            one-element list holding a French sentence).

    Returns:
        None. Results are printed. If any step fails, a fallback message is
        printed instead of raising.
    """
    try:
        text_enc = encode_sequences(fra_tok, fra_length, text)
        prediction = predict_sequence(seq2seq, eng_tok, text_enc)
        prediction = decontracted(prediction)
        # Label text kept verbatim so re-runs match the recorded output.
        print("Transaltion:", ' '.join(map(str, text)), "==>", prediction)
        # sentences_to_indices expects an array of sentences.
        prediction = np.array([prediction])
        X_test_indices = sentences_to_indices(prediction, word_to_index, maxLen)
        # argmax over the classifier output selects the highest-scoring label.
        best_label = np.argmax(model_1.predict(X_test_indices))
        print("Adding emoji ==>", prediction[0] + ' ' + label_to_emoji(best_label))
    except Exception:
        # Best-effort demo: report and continue. ``Exception`` (rather than a
        # bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
        print("No emoji for the moment")

def application2(text):
    """Translate ``text``, then print the translation with three emojis appended.

    Same pipeline as ``application``, but the three highest-scoring labels
    of ``model_1`` are shown instead of only the best one. Relies on the
    module-level objects ``fra_tok``, ``fra_length``, ``seq2seq``,
    ``eng_tok``, ``word_to_index``, ``maxLen`` and ``model_1``.
    ``label_to_emoji`` is not defined in this notebook; presumably it comes
    from ``emo_utils``.

    Args:
        text: The texts handed to ``encode_sequences`` (the demo passes a
            one-element list holding a French sentence).

    Returns:
        None. Results are printed. If any step fails, a fallback message is
        printed instead of raising.
    """
    try:
        text_enc = encode_sequences(fra_tok, fra_length, text)
        prediction = predict_sequence(seq2seq, eng_tok, text_enc)
        prediction = decontracted(prediction)
        # Label text kept verbatim so re-runs match the recorded output.
        print("Transaltion:", ' '.join(map(str, text)), "==>", prediction)
        # sentences_to_indices expects an array of sentences.
        prediction = np.array([prediction])
        X_test_indices = sentences_to_indices(prediction, word_to_index, maxLen)
        # Negating the scores makes argsort rank labels from highest to lowest.
        # [0] selects the single sample's ranking without hard-coding the
        # number of classes (the original used reshape(5)).
        L = np.argsort(-(model_1.predict(X_test_indices)), axis=1)[0]
        top_emojis = ' '.join(label_to_emoji(label) for label in L[:3])
        print("Adding emojis ==>", prediction[0] + ' ' + top_emojis)
    except Exception:
        # Best-effort demo: report and continue. ``Exception`` (rather than a
        # bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
        print("No emoji for the moment")
    


#### Load necessary data and models

In [3]:
# Restore the pickled artifacts that the helper functions read as globals.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
fra_tok = load_clean_sentences('fra_tok_final.pkl')  # tokenizer used by application() to encode the input text
fra_length = load_clean_sentences('fra_length_final.pkl')  # padding length passed to encode_sequences
eng_tok = load_clean_sentences('eng_tok_final.pkl')  # tokenizer used by predict_sequence to map indices to words
eng_lenght = load_clean_sentences('eng_length_final.pkl')  # NOTE(review): name is misspelled ("lenght") and unused in the visible cells
word_to_index = load_clean_sentences('word_to_index_final.pkl')  # word -> index mapping consumed by sentences_to_indices
maxLen = load_clean_sentences('maxLen_final.pkl')  # row width passed to sentences_to_indices

In [4]:
# Load the two trained models; application() and application2() read them as globals.
seq2seq = load_model('model_final.h5') # translation model passed to predict_sequence
model_1 = load_model('emojis_final.h5') # emoji classifier; its output is mapped through label_to_emoji

# Application

#### Input

In [5]:
# Demo inputs: a tuple of one-element lists, each holding one French sentence.
text = (
    ["j'adore le fromage"],
    ["Tu me manques"],
    ['ne soyez pas en colere'],
    ["j'aime le football"],
    ["J'aime le soleil"],
    ["j'ai faim"],
)

#### Translation part-1

In [6]:
# Translate every demo sentence and print a separator after each result.
for sentence in text:
    application(sentence)
    print("------")

Transaltion: j'adore le fromage ==> i love sauerkraut
Adding emoji ==> i love sauerkraut ❤️
------
Transaltion: Tu me manques ==> i miss you
Adding emoji ==> i miss you ❤️
------
Transaltion: ne soyez pas en colere ==> do not be angry angry
Adding emoji ==> do not be angry angry 😞
------
Transaltion: j'aime le football ==> i like soccer
Adding emoji ==> i like soccer ⚾
------
Transaltion: J'aime le soleil ==> i like the sun
Adding emoji ==> i like the sun 😄
------
Transaltion: j'ai faim ==> i am hungry hungry
Adding emoji ==> i am hungry hungry 🍴
------


**For short, simple sentences translated from French to English, the model performed well: 3 out of 6 sentences are translated 100% correctly, and 2 are translated properly except that the last word is repeated twice (so far I could not find out why). The remaining sentence is mostly correct, but it should have been “cheese” instead of “sauerkraut” – most probably the issue comes from the very small dictionary used to train the model.**

**There is a lot of room for improvement, such as incorporating GloVe or Word2Vec into the embedding layer of the seq2seq model (a similar process could also be applied with a French version), increasing the dictionary size, and training the model on a GPU instead of locally on a CPU.**
**Link to our presentation and use cases:** https://slides.com/anniepi/deck#/11