## Importing necessary modules

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
%cd drive/MyDrive/Github/Natural-Language-Processing/Sentence Emojifier

/content/drive/MyDrive/Github/Natural-Language-Processing/Sentence Emojifier


In [4]:
import pandas as pd
import numpy as np
import emoji
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

%matplotlib inline

## Reading pretrained GloVe embedding vectors
The vocabulary is built from the words contained in the pretrained GloVe embeddings.

In [12]:
!unzip "glove.6B.50d.txt.zip"

Archive:  glove.6B.50d.txt.zip
  inflating: glove.6B.50d.txt        


In [13]:
# Parse the GloVe text file. Each non-empty line is "<word> <v1> <v2> ... <vN>".
vocab = set()        # every word seen in the embedding file
word2vec_map = {}    # word -> embedding vector (np.float64 array)
word2Ind = {}        # word -> integer index (1-based; 0 is reserved for PAD)
Ind2word = {}        # integer index -> word
# Decode explicitly as UTF-8: the file contains non-ASCII tokens and the
# platform default encoding is not guaranteed to handle them.
with open("glove.6B.50d.txt", 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.strip().split()
        if not fields:
            continue  # tolerate blank lines instead of failing on fields[0]
        curr_word = fields[0]
        vocab.add(curr_word)
        word2vec_map[curr_word] = np.array(fields[1:], dtype=np.float64)

# The file is no longer needed here, so the index is built after it is closed.
# Words are sorted so that the word <-> index assignment is deterministic.
for idx, word in enumerate(sorted(vocab), start=1):  # zero is used for PAD
    word2Ind[word] = idx
    Ind2word[idx] = word

## Function to convert an emoji label (0–4) into its emoji character

In [14]:
def label_to_emoji(label):
    """Return the emoji character that corresponds to a class label.

    Args:
        label: class id 0-4. It is normalised with ``str``, so ints, NumPy
            integers and digit strings are all accepted.

    Returns:
        The string produced by ``emoji.emojize`` for the label's emoji code.

    Raises:
        KeyError: if ``label`` is not one of the five known classes.
    """
    emoji_dictionary = {"0": "\u2764\uFE0F",
                    "1": ":baseball:",
                    "2": ":smile:",
                    "3": ":disappointed:",
                    "4": ":fork_and_knife:"}
    emoji_code = emoji_dictionary[str(label)]
    try:
        # Older releases of the emoji package select alias names this way.
        return emoji.emojize(emoji_code, use_aliases=True)
    except TypeError:
        # emoji >= 2.0 dropped the `use_aliases` keyword; alias names are
        # requested through the `language` argument instead.
        return emoji.emojize(emoji_code, language='alias')

## Loading dataset

In [15]:
# Read the labelled sentences and preview the first five next to their emoji.
data = pd.read_csv("emojifier_dataset.csv")
X, y = data['sentence'], data['emoji']
for row in range(5):
    print(X[row], label_to_emoji(y[row]))

French macaroon is so tasty 🍴
work is horrible 😞
I am upset 😞
throw the ball ⚾
Good joke 😄


## Split data into training and testing sets

In [26]:
# Hold out 25% of the examples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)

In [27]:
# Maximum sentence length in the training set, measured in words.
# Words are counted per sentence directly: the sentence with the most
# characters is not necessarily the one with the most words.
max_len = max(len(sentence.split()) for sentence in X_train)
max_len

10

## Converting sentences to integers

In [28]:
def sentences_to_indices(X, word2Ind, max_len):
    """Convert sentences into a fixed-width matrix of vocabulary indices.

    Every sentence is lower-cased, split on whitespace and mapped word by word
    through ``word2Ind``. Rows are left-padded with 0 up to ``max_len``.

    Args:
        X: iterable of sentence strings.
        word2Ind: mapping from word to its integer index; index 0 is treated
            as PAD and must not be assigned to a real word.
        max_len: number of columns of the returned matrix.

    Returns:
        Integer ``np.ndarray`` of shape ``(number of sentences, max_len)``.

    Notes:
        * Words missing from ``word2Ind`` are encoded as 0 instead of raising
          ``KeyError``.
        * Sentences with more than ``max_len`` words keep only their last
          ``max_len`` words, so every row has exactly ``max_len`` entries.
    """
    rows = []
    for sentence in X:  # loop over examples
        indices = [word2Ind.get(word, 0) for word in sentence.lower().split()]
        overflow = len(indices) - max_len
        if overflow > 0:
            # Explicit start offset (not indices[-max_len:]) so max_len == 0 works too.
            indices = indices[overflow:]
        # PADs are added at the beginning so that the last hidden state of the
        # LSTM is computed from real words rather than from padding.
        rows.append([0] * (max_len - len(indices)) + indices)  # zero represents PAD
    # The reshape is a no-op for non-empty input and yields (0, max_len) for empty input.
    return np.array(rows, dtype=int).reshape(len(rows), max_len)

In [29]:
from tensorflow.keras.utils import to_categorical
# Encode the training split: one-hot labels for the 5 emoji classes and
# padded word-index rows for the sentences.
y_train_oh = to_categorical(y_train, num_classes=5)
X_train_indices = sentences_to_indices(X_train, word2Ind, max_len)

In [30]:
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout, Activation, Bidirectional

## Loading the pretrained GloVe embeddings into a Keras Embedding layer

In [31]:
def pretrained_embedding_layer(word2vec_map, word2Ind):
    '''Build a frozen Keras Embedding layer loaded with pretrained GloVe vectors.

    Args:
        word2vec_map: mapping from word to its embedding vector; all vectors
            are expected to have the same length.
        word2Ind: mapping from word to its row index in the embedding matrix.
            Indices are expected to run from 1 to len(word2Ind); row 0 is left
            as all zeros.

    Returns:
        A built, non-trainable ``Embedding`` layer whose weights are the
        assembled embedding matrix.

    Raises:
        ValueError: if ``word2vec_map`` is empty.
        KeyError: if a word of ``word2Ind`` has no vector in ``word2vec_map``.
    '''
    if not word2vec_map:
        raise ValueError("word2vec_map is empty; cannot infer the embedding dimension")

    vocab_size = len(word2Ind) + 1   # adding 1 to fit Keras embedding (row 0 is also a row)
    # Read the dimension from any stored vector rather than from a hard-coded
    # word, which may be absent from other embedding files.
    emb_dim = len(next(iter(word2vec_map.values())))

    emb_matrix = np.zeros((vocab_size, emb_dim))
    for word, idx in word2Ind.items():
        emb_matrix[idx, :] = word2vec_map[word]

    embedding_layer = Embedding(vocab_size, emb_dim, trainable = False)
    # Build the embedding layer, it is required before setting the weights of the embedding layer.
    embedding_layer.build((None,))
    # Set the weights of the embedding layer to the embedding matrix. This layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

## Defining model

In [86]:
def emojify_model(max_len, word2vec_map, word2Ind,
                  num_classes=5, lstm_units=128, dropout_rate=0.5):
    """Build and compile the two-layer LSTM sentence classifier.

    Pipeline: word indices -> frozen pretrained embedding -> LSTM (full
    sequence) -> dropout -> LSTM (last hidden state) -> dropout -> dense
    logits -> softmax.

    Args:
        max_len: number of word indices per input sentence.
        word2vec_map: word -> embedding vector mapping, forwarded to
            ``pretrained_embedding_layer``.
        word2Ind: word -> index mapping, forwarded to
            ``pretrained_embedding_layer``.
        num_classes: number of output classes (default 5 emojis).
        lstm_units: hidden size of both LSTM layers (default 128).
        dropout_rate: rate of both dropout layers (default 0.5).

    Returns:
        A Keras ``Model`` compiled with categorical cross-entropy loss, the
        Adam optimizer and the accuracy metric.
    """
    sentence_indices = Input(shape = (max_len, ), dtype = 'int32')
    embeddings = pretrained_embedding_layer(word2vec_map, word2Ind)(sentence_indices)

    X = LSTM(lstm_units, return_sequences = True)(embeddings)
    X = Dropout(dropout_rate)(X)
    X = LSTM(lstm_units, return_sequences = False)(X)  # outputs last hidden state
    X = Dropout(dropout_rate)(X)
    X = Dense(num_classes)(X)  # output logits, one per emoji class
    output = Activation('softmax')(X)

    model = Model(inputs = sentence_indices, outputs = output)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [87]:
# Instantiate the classifier and print its layer-by-layer summary.
model = emojify_model(max_len=max_len, word2vec_map=word2vec_map, word2Ind=word2Ind)
model.summary()

Model: "model_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_12 (InputLayer)        [(None, 10)]              0         
_________________________________________________________________
embedding_11 (Embedding)     (None, 10, 50)            20000050  
_________________________________________________________________
lstm_23 (LSTM)               (None, 10, 128)           91648     
_________________________________________________________________
dropout_18 (Dropout)         (None, 10, 128)           0         
_________________________________________________________________
lstm_24 (LSTM)               (None, 128)               131584    
_________________________________________________________________
dropout_19 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 5)                 645

## Training model with training data

In [88]:
# Train for 50 epochs on the encoded training split, reshuffling every epoch.
model.fit(
    X_train_indices,
    y_train_oh,
    batch_size=32,
    epochs=50,
    shuffle=True,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f49f71d0550>

## Evaluating model performance

In [89]:
# Encode the held-out split the same way as the training data, then score the model on it.
y_test_oh = to_categorical(y_test, num_classes=5)
X_test_indices = sentences_to_indices(X_test, word2Ind, max_len)
loss, accuracy = model.evaluate(X_test_indices, y_test_oh)
print()
print("Test accuracy = ", accuracy)


Test accuracy =  0.739130437374115


## Predicting the emoji for a custom sentence


In [90]:
# Run a single hand-written sentence through the same encoding as the training
# data and print it together with the emoji of the highest-scoring class.
sentence = 'i am not feeling happy'
test_sentence = np.array([sentence])
test_sentence_indices = sentences_to_indices(test_sentence, word2Ind, max_len)
test_pred = model.predict(test_sentence_indices)
print(' '.join([test_sentence[0], label_to_emoji(np.argmax(test_pred))]))

i am not feeling happy 😞
