## Importing necessary modules

In [1]:
import pandas as pd
import numpy as np
import emoji
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

%matplotlib inline

## Reading pretrained Glove embedding vectors
The vocabulary is built from the words in the pretrained GloVe embedding file; index 0 is reserved for padding (PAD).

In [2]:
# Parse the pretrained GloVe file; each line is "<word> <v1> <v2> ... <vN>".
vocab = set()
word2vec_map = {}   # word -> embedding vector (np.float64 array)
word2Ind = {}       # word -> integer index, starting at 1 (0 is used for PAD)
Ind2word = {}       # integer index -> word (inverse of word2Ind)

# The encoding is passed explicitly so that parsing does not depend on the
# platform's default locale encoding.
with open("../input/glove6b50dtxt/glove.6B.50d.txt", 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip().split()
        if not line:
            # Skip blank lines instead of failing on line[0].
            continue
        curr_word = line[0]
        vocab.add(curr_word)
        word2vec_map[curr_word] = np.array(line[1:], dtype=np.float64)

# Indices are assigned in sorted word order so the mapping is deterministic.
# Numbering starts at 1 because zero is used for PAD.
for idx, word in enumerate(sorted(vocab), start=1):
    word2Ind[word] = idx
    Ind2word[idx] = word

## Function to convert a numeric class label to its emoji character

In [4]:
def label_to_emoji(label):
    """Return the emoji string for a class label.

    Parameters
    ----------
    label : int or str
        Class label; ``str(label)`` must be one of "0".."4".

    Returns
    -------
    str
        The result of ``emoji.emojize`` applied to the label's alias code.

    Raises
    ------
    KeyError
        If ``str(label)`` is not a key of the label table.
    """
    emoji_dictionary = {"0": "\u2764\uFE0F",
                    "1": ":baseball:",
                    "2": ":smile:",
                    "3": ":disappointed:",
                    "4": ":fork_and_knife:"}
    code = emoji_dictionary[str(label)]
    try:
        return emoji.emojize(code, use_aliases=True)
    except TypeError:
        # emoji >= 2.0 removed the `use_aliases` keyword; alias codes are
        # selected with language='alias' instead.
        return emoji.emojize(code, language='alias')

## Loading dataset

In [5]:
# Load the labelled dataset and pull out the sentence and emoji-label columns.
data = pd.read_csv("../input/emojifydata/emojifier_dataset.csv")
X, y = data['sentence'], data['emoji']

# Print the first five sentences next to their rendered emoji as a sanity check.
for row in range(5):
    sentence, label = X[row], y[row]
    print(sentence, label_to_emoji(label))

## Split data into training and testing sets

In [22]:
# Hold out 25% of the examples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [7]:
# Maximum sentence length in the training set, measured in whitespace-separated
# words. The count is taken per sentence because the sentence with the most
# characters is not necessarily the one with the most words.
max_len = max(len(sentence.split()) for sentence in X_train)
max_len

## Converting sentences to integers

In [8]:
def sentences_to_indices(X, word2Ind, max_len):
    """Convert sentences into fixed-length rows of vocabulary indices.

    Each sentence is lower-cased and split on whitespace, and every word is
    looked up in ``word2Ind``.

    Parameters
    ----------
    X : iterable of str
        Sentences to encode.
    word2Ind : dict
        Mapping from word to integer index; index 0 represents PAD.
    max_len : int
        Number of indices in every output row.

    Returns
    -------
    np.ndarray
        Integer array of shape ``(len(X), max_len)``.

    Notes
    -----
    * Words missing from ``word2Ind`` are encoded as 0 (the PAD index) instead
      of raising ``KeyError``.
    * Sentences longer than ``max_len`` keep only their last ``max_len`` words,
      so every row has the same length.
    """
    pad_idx = 0  # zero represents PAD
    X_indices = []
    for sentence in X:
        sentence_indices = [word2Ind.get(word, pad_idx)
                            for word in sentence.lower().split()]

        # Drop leading words when the sentence is too long; the most recent
        # words are kept because the model reads the last hidden state.
        overflow = len(sentence_indices) - max_len
        if overflow > 0:
            sentence_indices = sentence_indices[overflow:]

        # PADs are added at beginning so that last hidden state of LSTM is more meaningful
        num_pad = max_len - len(sentence_indices)
        X_indices.append(num_pad * [pad_idx] + sentence_indices)

    # The explicit dtype and reshape keep the result a 2-D integer array even
    # when X is empty.
    return np.array(X_indices, dtype=int).reshape(len(X_indices), max_len)

In [11]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the five emoji classes for the categorical cross-entropy loss.
y_train_oh = to_categorical(y_train, num_classes=5)
# Turn each training sentence into a padded row of vocabulary indices.
X_train_indices = sentences_to_indices(X_train, word2Ind, max_len)

In [12]:
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout, Activation, Bidirectional

## Loading the pretrained GloVe vectors into a Keras Embedding layer

In [13]:
def pretrained_embedding_layer(word2vec_map, word2Ind):
    '''Create a frozen Keras Embedding layer loaded with pretrained GloVe vectors.

    Parameters
    ----------
    word2vec_map : dict
        Mapping from word to its embedding vector; all vectors are expected to
        have the same length.
    word2Ind : dict
        Mapping from word to its row index in the embedding matrix.

    Returns
    -------
    Embedding
        A built, non-trainable layer whose weights are the embedding matrix.
        Rows with no word assigned (such as row 0) stay all-zero.

    Raises
    ------
    KeyError
        If a word in ``word2Ind`` has no vector in ``word2vec_map``.
    '''
    # One more row than the largest index, because row 0 is also part of the
    # table. For indices 1..N this equals len(word2Ind) + 1.
    vocab_size = max(word2Ind.values(), default=0) + 1
    # Read the dimensionality from any stored vector instead of relying on a
    # specific word being present in the vocabulary.
    emb_dim = len(next(iter(word2vec_map.values())))

    emb_matrix = np.zeros((vocab_size, emb_dim))
    for word, idx in word2Ind.items():
        emb_matrix[idx, :] = word2vec_map[word]

    embedding_layer = Embedding(vocab_size, emb_dim, trainable = False)
    # Build the embedding layer, it is required before setting the weights of the embedding layer.
    embedding_layer.build((None,))
    # Set the weights of the embedding layer to the embedding matrix. This layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

## Defining model

In [23]:
def emojify_model(max_len, word2vec_map, word2Ind):
    """Build and compile the stacked-LSTM emoji classifier.

    The network takes ``max_len`` word indices per example, embeds them with
    the pretrained embedding layer, runs two 128-unit LSTM layers, applies
    dropout of 0.5 and ends in a 5-way softmax.

    Returns the compiled Keras ``Model`` (categorical cross-entropy loss,
    Adam optimizer, accuracy metric).
    """
    token_ids = Input(shape=(max_len,), dtype='int32')

    embedding = pretrained_embedding_layer(word2vec_map, word2Ind)
    features = embedding(token_ids)

    # The first LSTM emits the full sequence so the second one can consume it.
    features = LSTM(128, return_sequences=True)(features)
    # The second LSTM returns only its last hidden state.
    features = LSTM(128, return_sequences=False)(features)
    features = Dropout(0.5)(features)

    # Logits for the 5 emoji classes, followed by softmax.
    logits = Dense(5)(features)
    probabilities = Activation('softmax')(logits)

    classifier = Model(inputs=token_ids, outputs=probabilities)
    classifier.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

In [34]:
# Instantiate the classifier and print its layer-by-layer summary.
model = emojify_model(
    max_len=max_len,
    word2vec_map=word2vec_map,
    word2Ind=word2Ind,
)
model.summary()

## Training model with training data

In [35]:
# Fit on the padded index rows and one-hot labels, shuffling the data each epoch.
model.fit(
    X_train_indices,
    y_train_oh,
    batch_size=32,
    epochs=50,
    shuffle=True,
)

## Evaluating model performance

In [37]:
# Encode the held-out set the same way as the training set.
y_test_oh = to_categorical(y_test, num_classes=5)
X_test_indices = sentences_to_indices(X_test, word2Ind, max_len)

# With one compiled metric, evaluate() yields the loss and then the accuracy.
loss, accuracy = model.evaluate(X_test_indices, y_test_oh)
print()
print("Test accuracy = ", accuracy)

In [40]:
# Classify one hand-written sentence and print it with the predicted emoji.
sentence = 'i am not feeling happy'
test_sentence = np.array([sentence])
test_sentence_indices = sentences_to_indices(test_sentence, word2Ind, max_len)
test_pred = model.predict(test_sentence_indices)

# argmax over the single row of class probabilities gives the predicted label.
predicted_label = np.argmax(test_pred)
print(test_sentence[0] + ' ' + label_to_emoji(predicted_label))