In [161]:
import numpy as np
import pandas as pd
import emoji 

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,SimpleRNN,Embedding

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


In [162]:
# Load the sentence/label table. header=None makes pandas treat the first row as
# data, so the columns are addressed by position (0, 1, ...).
csv_path = 'emoji_data.csv'
data = pd.read_csv(csv_path, header=None)
data.head()

Unnamed: 0,0,1
0,French macaroon is so tasty,4
1,work is horrible,3
2,I am upset,3
3,throw the ball,1
4,Good joke,2


In [163]:
# Quick check that the emoji package turns a shortcode into the rendered glyph.
emoji.emojize(":red_heart:")

'❤️'

In [164]:
# Class id -> emoji shortcode; the position in the list is the class id.
emoji_dict = dict(enumerate([
    ":red_heart:",
    ":baseball:",
    ":grinning_face_with_big_eyes:",
    ":disappointed_face:",
    ":fork_and_knife_plate:",
]))


def emoji_to_label(label):
    """Return the rendered emoji for an integer class label.

    `label` must be a key of `emoji_dict`; any other value raises KeyError.
    """
    shortcode = emoji_dict[label]
    return emoji.emojize(shortcode)


In [165]:
# Column 0 holds the sentences and column 1 the class labels; pull both out as arrays.
X, Y = data[0].values, data[1].values

### Embedding

In [166]:
# Read the GloVe vectors file fully into memory, one string per line.
# The context manager closes the handle even if reading raises part-way through.
with open('glove.6B.100d.txt', 'r', encoding='utf8') as file:
    content = file.readlines()

In [167]:
# Build the word -> vector lookup. Each line of `content` is a word followed by
# its whitespace-separated vector components.
embeddings = {}

for line in content:
    line = line.split()
    if not line:
        # Skip blank lines (e.g. a trailing newline at end of file) instead of
        # crashing with IndexError on line[0].
        continue
    embeddings[line[0]] = np.array(line[1:], dtype=float)

In [168]:
# Build the vocabulary from the sentences in X; word_index maps each word to an integer id.
# NOTE(review): X is only filtered for invalid labels in a later cell, so this
# vocabulary may include words from rows that are subsequently dropped.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
word2index = tokenizer.word_index
word2index

{'i': 1,
 'you': 2,
 'is': 3,
 'the': 4,
 'a': 5,
 'so': 6,
 'am': 7,
 'my': 8,
 'to': 9,
 'this': 10,
 'are': 11,
 'ha': 12,
 'for': 13,
 'she': 14,
 'he': 15,
 'me': 16,
 'not': 17,
 'love': 18,
 'your': 19,
 'want': 20,
 'have': 21,
 'it': 22,
 'got': 23,
 'like': 24,
 'did': 25,
 'baseball': 26,
 'food': 27,
 'was': 28,
 'do': 29,
 'joke': 30,
 'stop': 31,
 'will': 32,
 'miss': 33,
 'life': 34,
 'ball': 35,
 'good': 36,
 'what': 37,
 'go': 38,
 'job': 39,
 'funny': 40,
 'bad': 41,
 'day': 42,
 'great': 43,
 'dinner': 44,
 'that': 45,
 'with': 46,
 'at': 47,
 'of': 48,
 'game': 49,
 'we': 50,
 'again': 51,
 'said': 52,
 'yes': 53,
 'lol': 54,
 'and': 55,
 'down': 56,
 'had': 57,
 'her': 58,
 'fun': 59,
 'smile': 60,
 'lot': 61,
 'working': 62,
 'him': 63,
 'cute': 64,
 'on': 65,
 'lets': 66,
 'messing': 67,
 'us': 68,
 'play': 69,
 'exercise': 70,
 'lost': 71,
 'never': 72,
 'where': 73,
 'can': 74,
 'well': 75,
 'much': 76,
 'valentine': 77,
 'restaurant': 78,
 'awesome': 79,
 'lik

In [169]:
# Labels may contain non-numeric entries: coerce those to NaN, then keep only the
# rows with a valid label in both X and Y so the two stay aligned.
labels_numeric = pd.to_numeric(pd.Series(Y), errors='coerce')
valid_rows = labels_numeric.notna()
X = X[valid_rows]
Y = labels_numeric[valid_rows].astype(int)


In [170]:
# Convert each sentence into its list of word ids using the fitted tokenizer.
Xtokens = tokenizer.texts_to_sequences(X)

In [171]:
# Display the tokenised sentences (one list of word ids per sentence).
Xtokens

[[103, 104, 3, 6, 105],
 [106, 3, 107],
 [1, 7, 108],
 [109, 4, 35],
 [36, 30],
 [37, 3, 19, 110, 26, 49],
 [1, 111, 112],
 [31, 67, 113],
 [1, 20, 114, 27],
 [115, 68, 38, 69, 26],
 [2, 11, 116, 10, 70],
 [117, 50, 71, 51],
 [36, 39],
 [12, 12, 12, 22, 28, 6, 40],
 [1, 32, 21, 5, 118, 119],
 [120, 11, 2, 121, 41],
 [1, 20, 9, 30],
 [1, 72, 52, 53, 13, 10],
 [4, 122, 3, 123],
 [73, 3, 4, 35],
 [1, 7, 124],
 [12, 12, 12, 54],
 [14, 52, 53],
 [15, 23, 5, 125],
 [126, 3, 127, 1, 21],
 [15, 74, 128, 129, 75],
 [1, 18, 9, 4, 130, 55, 131],
 [29, 2, 24, 132],
 [2, 133, 134, 10, 135],
 [1, 24, 19, 136],
 [14, 23, 16, 5, 137],
 [32, 2, 138, 8, 77],
 [2, 139, 4, 140],
 [141, 3, 56, 13, 5, 78],
 [77, 42, 3, 142],
 [43, 6, 79],
 [29, 2, 21, 5, 35],
 [15, 74, 17, 29, 143],
 [15, 80, 26],
 [50, 57, 81, 5, 144, 44, 145],
 [146, 11, 147],
 [15, 3, 5, 36, 148],
 [72, 149, 9, 16, 51],
 [1, 33, 58],
 [27, 3, 34],
 [1, 7, 150, 59],
 [6, 41, 45, 2, 151, 152, 46, 68],
 [29, 2, 20, 9, 153, 16, 13, 44],
 [1,

In [172]:
def get_maxlen(data):
    """Return the length of the longest sequence in `data`.

    Args:
        data: iterable of sized sequences (e.g. lists of token ids).

    Returns:
        The largest `len(...)` found, or 0 when `data` is empty.
    """
    # default=0 avoids the ValueError that max() raises on an empty iterable.
    return max((len(sent) for sent in data), default=0)

# Length of the longest tokenised sentence; reused later as the padded sequence length.
maxlen = get_maxlen(Xtokens)
print(maxlen)


10


In [173]:
# Pad/truncate every sentence to the computed maximum length (zeros appended at the end).
# Using `maxlen` rather than a hard-coded 10 keeps training-time padding in sync
# with the padding applied at prediction time, which also uses `maxlen`.
Xtrain = pad_sequences(Xtokens, maxlen=maxlen, padding='post', truncating='post')
Xtrain

array([[103, 104,   3, ...,   0,   0,   0],
       [106,   3, 107, ...,   0,   0,   0],
       [  1,   7, 108, ...,   0,   0,   0],
       ...,
       [ 14,   3,   5, ...,   0,   0,   0],
       [ 14, 310,  26, ...,   0,   0,   0],
       [  1,  24,  22, ...,   0,   0,   0]])

In [174]:
# One-hot encode the labels. num_classes is pinned to the number of emoji classes
# so the width always matches the model's 5-unit output layer, even if the highest
# class id happens to be absent from the data.
Ytrain = to_categorical(Y, num_classes=len(emoji_dict))
Ytrain

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0.

In [175]:
# Report the dimensions of the padded inputs and the one-hot labels.
for _caption, _array in (("Shape of Xtrain:", Xtrain), ("Shape of Ytrain:", Ytrain)):
    print(_caption, _array.shape)


Shape of Xtrain: (182, 10)
Shape of Ytrain: (182, 5)


### Model

In [176]:
# Re-display the word -> id vocabulary for reference before building the embedding matrix.
word2index

{'i': 1,
 'you': 2,
 'is': 3,
 'the': 4,
 'a': 5,
 'so': 6,
 'am': 7,
 'my': 8,
 'to': 9,
 'this': 10,
 'are': 11,
 'ha': 12,
 'for': 13,
 'she': 14,
 'he': 15,
 'me': 16,
 'not': 17,
 'love': 18,
 'your': 19,
 'want': 20,
 'have': 21,
 'it': 22,
 'got': 23,
 'like': 24,
 'did': 25,
 'baseball': 26,
 'food': 27,
 'was': 28,
 'do': 29,
 'joke': 30,
 'stop': 31,
 'will': 32,
 'miss': 33,
 'life': 34,
 'ball': 35,
 'good': 36,
 'what': 37,
 'go': 38,
 'job': 39,
 'funny': 40,
 'bad': 41,
 'day': 42,
 'great': 43,
 'dinner': 44,
 'that': 45,
 'with': 46,
 'at': 47,
 'of': 48,
 'game': 49,
 'we': 50,
 'again': 51,
 'said': 52,
 'yes': 53,
 'lol': 54,
 'and': 55,
 'down': 56,
 'had': 57,
 'her': 58,
 'fun': 59,
 'smile': 60,
 'lot': 61,
 'working': 62,
 'him': 63,
 'cute': 64,
 'on': 65,
 'lets': 66,
 'messing': 67,
 'us': 68,
 'play': 69,
 'exercise': 70,
 'lost': 71,
 'never': 72,
 'where': 73,
 'can': 74,
 'well': 75,
 'much': 76,
 'valentine': 77,
 'restaurant': 78,
 'awesome': 79,
 'lik

In [177]:
# Dimensionality of the pre-trained vectors (the 100d GloVe file).
embed_size = 100

# Row i holds the vector for the word with id i. Row 0 stays all-zero because
# id 0 is the padding value and is never assigned to a word.
embedding_matrix = np.zeros((len(word2index) + 1, embed_size))

for word, i in word2index.items():
    # Words missing from the pre-trained vocabulary keep their zero row instead
    # of aborting the whole cell with a KeyError.
    embed_vector = embeddings.get(word)
    if embed_vector is not None:
        embedding_matrix[i] = embed_vector


In [178]:
# Sentence classifier: frozen pre-trained embeddings -> three stacked LSTMs ->
# softmax over the 5 emoji classes.
model = Sequential([
    Embedding(input_dim=len(word2index) + 1,  # +1 because id 0 is the padding value
              output_dim=embed_size,
              weights=[embedding_matrix],      # initialise with the pre-trained vectors
              mask_zero=True,                  # make the LSTMs skip padded timesteps
              trainable=False),                # keep the pre-trained vectors fixed
    # The deprecated `input_length` argument is intentionally omitted; the
    # sequence length is inferred from the data at fit/predict time.
    LSTM(units=16, return_sequences=True),
    LSTM(units=10, return_sequences=True),
    LSTM(units=8),                             # final LSTM emits only its last state
    Dense(5, activation='softmax')
])

# categorical_crossentropy expects one-hot encoded targets.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])




In [179]:
# Sanity-check the training arrays right before fitting, so a mismatch fails here
# with a clear message rather than inside model.fit.
assert Xtrain.shape[0] == Ytrain.shape[0], "Xtrain and Ytrain must have the same number of samples"
assert Ytrain.shape[1] == len(emoji_dict), "label width must match the number of emoji classes"
print("Shape of Xtrain:", Xtrain.shape)
print("Shape of Ytrain:", Ytrain.shape)


Shape of Xtrain: (182, 10)
Shape of Ytrain: (182, 5)


In [180]:
# Train for 100 epochs on the full training arrays (no validation data is passed here).
model.fit(Xtrain,Ytrain,epochs=100)

Epoch 1/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.2225 - loss: 1.6075
Epoch 2/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.3206 - loss: 1.5929 
Epoch 3/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.3612 - loss: 1.5719 
Epoch 4/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.3234 - loss: 1.5582 
Epoch 5/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.3036 - loss: 1.5476 
Epoch 6/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.3263 - loss: 1.5259 
Epoch 7/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.3208 - loss: 1.5168 
Epoch 8/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.3493 - loss: 1.4979 
Epoch 9/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x2bce4b355a0>

In [186]:
# Try the trained model on a few hand-written sentences.
test = ["i feel good","i feel very bad","i love you"]

# Same preprocessing as for training: tokenise, then pad to `maxlen`.
test_seq = tokenizer.texts_to_sequences(test)
Xtest = pad_sequences(test_seq, maxlen=maxlen, padding='post', truncating='post')

# Take the most probable class for each sentence.
y_pred = np.argmax(model.predict(Xtest), axis=1)

for sentence, predicted_label in zip(test, y_pred):
    print(sentence, emoji_to_label(predicted_label))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 359ms/step
i feel good 😃
i feel very bad 😞
i love you ❤️
