In [65]:
import numpy as np
import pandas as pd
import keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, LSTM, SimpleRNN, Embedding
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.utils import to_categorical

In [66]:
# The CSV has no header row, so pandas assigns positional column names
# (column 0 holds the sentence, column 1 the integer emoji label).
df = pd.read_csv('emoji_data.csv', header=None)
df.head()

Unnamed: 0,0,1
0,French macaroon is so tasty,4
1,work is horrible,3
2,I am upset,3
3,throw the ball,1
4,Good joke,2


In [67]:
import emoji # pip install emoji
# Class id -> emoji shortcode understood by `emoji.emojize`.
emoji_dict = {
    0: ":red_heart:",
    1: ":baseball:",
    2: ":grinning_face_with_big_eyes:",
    3: ":disappointed_face:",
    4: ":fork_and_knife_with_plate:",
}


def label_to_emoji(label):
    """Return the emoji character for a class id.

    Looks up the shortcode registered for ``label`` in ``emoji_dict`` and
    renders it with ``emoji.emojize``. Raises ``KeyError`` when ``label`` is
    not a key of ``emoji_dict``.
    """
    shortcode = emoji_dict[label]
    return emoji.emojize(shortcode)

In [68]:
label_to_emoji(2)

'😃'

In [69]:
label_to_emoji(1)

'⚾'

In [70]:
# Column 0 -> input sentences, column 1 -> integer class labels.
X, Y = df[0].values, df[1].values

In [71]:
# Download the GloVe vectors from https://nlp.stanford.edu/projects/glove/
# and place `glove.6B.100d.txt` next to this notebook before running the cell.
# The context manager closes the file even if reading raises part-way through.
with open('glove.6B.100d.txt', 'r', encoding='utf8') as f:
    para = f.readlines()

In [72]:
# Each line is "<word> <v1> <v2> ...": the first whitespace-separated field is
# the word, the remaining fields are parsed as its float vector.
split_lines = (line.split() for line in para)
embedding = {
    fields[0]: np.array(fields[1:], dtype=float)
    for fields in split_lines
}

In [74]:
def get_maxlen(data):
    """Return the length of the longest item in ``data``.

    ``data`` is any iterable of sized items (e.g. lists of token ids).
    Returns 0 when ``data`` is empty.
    """
    return max((len(item) for item in data), default=0)

In [75]:
# Fit the tokenizer on the training sentences to build the vocabulary.
# `word_index` maps each word to an integer id; the printed mapping starts at 1,
# which is why the embedding matrix is later sized `len(word2index) + 1`.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
word2index = tokenizer.word_index
print(word2index)

{'i': 1, 'you': 2, 'is': 3, 'the': 4, 'a': 5, 'so': 6, 'am': 7, 'my': 8, 'to': 9, 'this': 10, 'are': 11, 'ha': 12, 'for': 13, 'she': 14, 'he': 15, 'me': 16, 'not': 17, 'love': 18, 'your': 19, 'want': 20, 'have': 21, 'it': 22, 'got': 23, 'like': 24, 'did': 25, 'baseball': 26, 'food': 27, 'was': 28, 'do': 29, 'joke': 30, 'stop': 31, 'will': 32, 'miss': 33, 'life': 34, 'ball': 35, 'good': 36, 'what': 37, 'go': 38, 'job': 39, 'funny': 40, 'bad': 41, 'day': 42, 'great': 43, 'dinner': 44, 'that': 45, 'with': 46, 'at': 47, 'of': 48, 'game': 49, 'we': 50, 'again': 51, 'said': 52, 'yes': 53, 'lol': 54, 'and': 55, 'down': 56, 'had': 57, 'her': 58, 'fun': 59, 'smile': 60, 'lot': 61, 'working': 62, 'him': 63, 'cute': 64, 'on': 65, 'lets': 66, 'messing': 67, 'us': 68, 'play': 69, 'exercise': 70, 'lost': 71, 'never': 72, 'where': 73, 'can': 74, 'well': 75, 'much': 76, 'valentine': 77, 'restaurant': 78, 'awesome': 79, 'likes': 80, 'such': 81, 'shouting': 82, 'proud': 83, 'bravo': 84, 'two': 85, 'fore

In [76]:
# Encode every training sentence as a list of word ids from the fitted tokenizer.
Xt=tokenizer.texts_to_sequences(X)

In [77]:
# Length of the longest encoded sentence; reused as the fixed sequence length
# for padding and for the Embedding layer's `input_length`.
maxlen=get_maxlen(Xt)
print(maxlen)

10


In [78]:
# Bring every sequence to exactly `maxlen` ids; padding and truncation are both
# applied at the end of the sequence ('post').
Xtrain = pad_sequences(Xt, maxlen = maxlen,  padding = 'post', truncating = 'post')

In [79]:
Xtrain.shape

(183, 10)

In [80]:
# One-hot encode the labels. The class count is pinned to the number of known
# emojis instead of being inferred from max(Y), so the encoded width always
# matches the classifier's output layer even if the highest label is absent.
Ytrain = to_categorical(Y, num_classes=len(emoji_dict))

In [81]:
Ytrain.shape

(183, 5)

# Model

In [83]:
embed_size=100  # must equal the length of the vectors stored in `embedding`
# Row i holds the pre-trained vector of the word whose tokenizer id is i.
# Row 0 is never assigned and stays all-zero (word ids start at 1, hence the +1).
embedding_matrix = np.zeros((len(word2index)+1, embed_size))
print(embedding_matrix.shape)
missing_words = []
for word, i in word2index.items():
    embed_vector = embedding.get(word)
    if embed_vector is None:
        # No pre-trained vector for this word: keep its row as zeros rather
        # than aborting the whole cell with a KeyError.
        missing_words.append(word)
        continue
    embedding_matrix[i] = embed_vector
if missing_words:
    print(f"{len(missing_words)} vocabulary words have no pre-trained vector "
          f"(rows left as zeros), e.g. {missing_words[:10]}")

(313, 100)


In [84]:
embedding_matrix

array([[ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [-0.046539,  0.61966 ,  0.56647 , ..., -0.37616 , -0.032502,
         0.8062  ],
       [-0.49886 ,  0.76602 ,  0.89751 , ..., -0.41179 ,  0.40539 ,
         0.78504 ],
       ...,
       [-0.46263 ,  0.069864,  0.69095 , ..., -0.29174 ,  0.32041 ,
         0.21202 ],
       [ 0.073242,  0.11134 ,  0.62281 , ...,  0.53417 , -0.1646  ,
        -0.27516 ],
       [ 0.29019 ,  0.80497 ,  0.31187 , ..., -0.33603 ,  0.45998 ,
        -0.11278 ]])

In [85]:
# Stacked-LSTM classifier on top of the frozen pre-trained embedding matrix.
model = Sequential()

# Embedding lookup initialised from `embedding_matrix`; trainable=False keeps
# these weights fixed during training.
model.add(
    Embedding(
        input_dim=len(word2index) + 1,
        output_dim=embed_size,
        input_length=maxlen,
        weights=[embedding_matrix],
        trainable=False,
    )
)

# The first two LSTMs emit one vector per time step so the next LSTM can
# consume a sequence; the last one returns only its final state.
model.add(LSTM(units=32, return_sequences=True))
model.add(LSTM(units=16, return_sequences=True))
model.add(LSTM(units=4))

# One softmax probability per emoji class.
model.add(Dense(5, activation='softmax'))

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [86]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 10, 100)           31300     
                                                                 
 lstm (LSTM)                 (None, 10, 32)            17024     
                                                                 
 lstm_1 (LSTM)               (None, 10, 16)            3136      
                                                                 
 lstm_2 (LSTM)               (None, 4)                 336       
                                                                 
 dense (Dense)               (None, 5)                 25        
                                                                 
Total params: 51,821
Trainable params: 20,521
Non-trainable params: 31,300
_________________________________________________________________


In [87]:
# Train on the first GPU when TensorFlow can see one (faster training);
# otherwise fall back to the CPU so the cell still runs on GPU-less machines.
device = '/device:GPU:0' if tf.config.list_physical_devices('GPU') else '/device:CPU:0'
with tf.device(device):
    model.fit(Xtrain, Ytrain, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [96]:
# Hand-written sentences for a quick sanity check. They are encoded with the
# tokenizer fitted on the training data and padded to the same `maxlen`.
test = ["I feel good about this", "I  feel bad", "lets play ball"]
test_seq = tokenizer.texts_to_sequences(test)
Xtest = pad_sequences(test_seq, maxlen = maxlen, padding = 'post', truncating = 'post')

In [97]:
# One row of softmax class probabilities per test sentence.
y_pred=model.predict(Xtest)



In [98]:
y_pred

array([[0.06868643, 0.00900121, 0.6156069 , 0.29306695, 0.01363848],
       [0.09808474, 0.01143513, 0.03805294, 0.85148346, 0.00094375],
       [0.1048284 , 0.74118423, 0.00294657, 0.05156041, 0.09948037]],
      dtype=float32)

In [99]:
# Store the winning class ids under a new name instead of overwriting `y_pred`.
# The probabilities stay available and the cell can be re-run safely
# (argmax with axis=1 would fail on an already-reduced 1-D array).
y_labels = np.argmax(y_pred, axis = 1)

for sentence, label in zip(test, y_labels):
    print(sentence, label_to_emoji(label))

I feel good about this 😃
I  feel bad 😞
lets play ball ⚾
