In [3]:
import emoji

## Defining the dictionary for 5 emojis used in the dataset

In [38]:
emoji_dictionary={
    "0":":beating_heart:",
    "1":":baseball:",
    "2":":beaming_face_with_smiling_eyes:",
    "3":":downcast_face_with_sweat:",
    "4":":fork_and_knife:"
}

# Importing the dataset

In [42]:
import pandas as pd
import numpy as np

In [103]:
train=pd.read_csv("dataset/train_emoji.csv")
test=pd.read_csv("dataset/test_emoji.csv")

In [104]:
train.head(n=10)

Unnamed: 0,never talk to me again,3,Unnamed: 2,Unnamed: 3
0,I am proud of your achievements,2,,
1,It is the worst day in my life,3,,
2,Miss you so much,0,,[0]
3,food is life,4,,
4,I love you mum,0,,
5,Stop saying bullshit,3,,
6,congratulations on your acceptance,2,,
7,The assignment is too long,3,,
8,I want to go play,1,,[3]
9,she did not answer my text,3,,


In [105]:
train=train.values

In [106]:
#only first two columns contain useful information 
x_train=train[:,0]
y_train=train[:,1]

In [107]:
print(x_train.shape,y_train.shape)

(131,) (131,)


In [108]:
test=test.values
x_test=test[:,0]
y_test=test[:,1]
print(x_test.shape,y_test.shape)

(55,) (55,)


In [109]:
#test the created data 
for i in range(5):
    print(x_train[i],emoji.emojize(emoji_dictionary.get(str(y_train[i]))))

I am proud of your achievements 😁
It is the worst day in my life 😓
Miss you so much 💓
food is life 🍴
I love you mum 💓


In [126]:
#convert y into categorical data
from keras.utils import to_categorical

In [127]:
y_train=to_categorical(y_train)
y_test=to_categorical(y_test)
print(y_train.shape,y_test.shape)

(131, 5) (55, 5)


## Using the glove word embeddings

In [110]:
#Reading from the text file "glove.6B.50d.txt"
f=open("glove.6B.50d.txt",encoding="utf8")

In [111]:
embeddings={}
for line in f:
    data=line.split()
    word=data[0]
    values=np.array(data[1:],dtype="float32")
    embeddings[word]=values
f.close()

In [112]:
#test the created dictionary against some random word
print(embeddings["hello"])

[-0.38497   0.80092   0.064106 -0.28355  -0.026759 -0.34532  -0.64253
 -0.11729  -0.33257   0.55243  -0.087813  0.9035    0.47102   0.56657
  0.6985   -0.35229  -0.86542   0.90573   0.03576  -0.071705 -0.12327
  0.54923   0.47005   0.35572   1.2611   -0.67581  -0.94983   0.68666
  0.3871   -1.3492    0.63512   0.46416  -0.48814   0.83827  -0.9246
 -0.33722   0.53741  -1.0616   -0.081403 -0.67111   0.30923  -0.3923
 -0.55002  -0.68827   0.58049  -0.11626   0.013139 -0.57654   0.048833
  0.67204 ]


In [113]:
print(embeddings["shape"].shape)

(50,)


# Converting sentences into vectors using embeddings dictionary

* We are using a pretrained embedding, therefore no need to create an embedding layer in our model.
* The output of the embedding layer is a 3D volume.
* The output volume is of the shape (batch_size,max_length_of_sentence,size_of_embedding).
* In this model we limit the max_length_of_sentence to some small number say 10 and the size of embedding is 50.

In [114]:
def embedding_output(X,maxlen=10):
    embedding_output=np.zeros((X.shape[0],maxlen,50))
    for i in range(X.shape[0]):
        X[i]=X[i].split()
        for j in range(len(X[i])):
            try:
                embedding_output[i][j]=embeddings[X[i][j].lower()]
            except:
                embedding_output[i][j]=np.zeros((50,))
    return embedding_output

In [115]:
embedded_x_train=embedding_output(x_train)
embedded_x_test=embedding_output(x_test)

In [121]:
print(embedded_x_train.shape,embedded_x_test.shape)

(131, 10, 50) (55, 10, 50)


# Define a simple LSTM model

In [157]:
from keras.models import Sequential
from keras.layers import *

In [194]:
model = Sequential()
model.add(LSTM(128,input_shape=(10,50),return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(64,return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(5))
model.add(Activation("softmax"))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_28 (LSTM)               (None, 10, 128)           91648     
_________________________________________________________________
dropout_27 (Dropout)         (None, 10, 128)           0         
_________________________________________________________________
lstm_29 (LSTM)               (None, 64)                49408     
_________________________________________________________________
dropout_28 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 5)                 325       
_________________________________________________________________
activation_14 (Activation)   (None, 5)                 0         
Total params: 141,381
Trainable params: 141,381
Non-trainable params: 0
_________________________________________________________________


In [195]:
model.compile(optimizer="adam",loss="categorical_crossentropy",metrics=['acc'])

In [196]:
from keras.callbacks import ModelCheckpoint

Checkpoint=ModelCheckpoint("best_model.h5",monitor='val_acc',save_best_only=True)

In [197]:
hist=model.fit(embedded_x_train,y_train,epochs=80,batch_size=64,shuffle=True,validation_split=0.2,callbacks=[Checkpoint])

Train on 104 samples, validate on 27 samples
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80


Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


In [199]:
model.load_weights("best_model.h5")

In [200]:
model.evaluate(embedded_x_test,y_test)



[1.236179076541554, 0.6727272738109935]

### A test accuracy of 67 percent is achieved for a 5 way classification. It is not bad considering. The accuracy could have been much higher if the dataset was larger. Even on a very small dataset our model produced a test accuracy of 67 percent.

In [218]:
#Print some test data and predicted emojis by our model
predict=model.predict_classes(embedded_x_test)

In [219]:
for i in range(10):
    print(" ".join(x_test[i]),emoji.emojize(emoji_dictionary[str(predict[i])]))

he did not answer 😓
he got a raise 😁
she got me a present 💓
ha ha ha it was so funny 😁
he is a good friend 😁
I am upset 😓
We had such a lovely dinner tonight 😁
where is the food 🍴
Stop making this joke ha ha ha 😁
where is the ball ⚾


# :)