<a href="https://colab.research.google.com/github/zahra-zarrabi/Emoji_text_classification_RNN/blob/main/EmojiFier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import required packages

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Dense, Dropout, LSTM ,GRU
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import sequence

In [3]:
# Mount Google Drive (Colab only) so the files read from /content/drive/MyDrive below are available.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Dataset EMOJISET
Tiny dataset (X, Y) where:
- X contains 132 sentences (strings)
- Y contains an integer label between 0 and 4 corresponding to an emoji for each sentence
<img src="https://github.com/Alireza-Akhavan/rnn-notebooks/blob/master/images/data_set.png?raw=1"
style="width:700px;height:300px;">


In [2]:
def read_csv(filename):
  """Load a labelled sentence dataset from a CSV file.

  The file must provide a 'sentence' column and a 'label' column.

  Returns:
    (X, Y) where X is an array of the sentences and Y is an integer
    array holding the label of each sentence.
  """
  table = pd.read_csv(filename)
  # Column name -> dtype to force on the resulting array (None keeps the column's own dtype).
  wanted = {'sentence': None, 'label': int}
  X, Y = (np.asarray(table[column], dtype=kind) for column, kind in wanted.items())
  return X, Y

In [4]:
# Load the train and test splits (sentences and their integer labels) from Google Drive.
X_train, Y_train = read_csv('/content/drive/MyDrive/Emoji_Text_Classification/train.csv')
X_test, Y_test = read_csv('/content/drive/MyDrive/Emoji_Text_Classification/test.csv')

In [8]:
def label_to_emoji(label):
  """Return the emoji stored at position `label` of the class list."""
  heart, ball, laugh, sad, food = "❤️", '⚽', '😂', '😔', '🍽️'
  return [heart, ball, laugh, sad, food][label]

In [9]:
# Show one training sentence next to the emoji for its label.
X_train[3],label_to_emoji(Y_train[3])

('Miss you so much', '❤️')

In [10]:
# Number of words in the longest training sentence.
# Count the words of every sentence: the sentence with the most characters
# is not necessarily the one with the most words.
max_len = max(len(sentence.split()) for sentence in X_train)
max_len

10

In [11]:
# One-hot encode the integer labels of both splits (5 emoji classes).
Y_train_oh, Y_test_oh = (
    tf.keras.utils.to_categorical(labels, num_classes=5)
    for labels in (Y_train, Y_test)
)

In [12]:
# The same training sentence next to its one-hot encoded label.
X_train[3],Y_train_oh[3]

('Miss you so much', array([1., 0., 0., 0., 0.], dtype=float32))

In [13]:
# Read the GloVe feature vectors into memory.
# Each line of the file holds a word followed by the components of its vector.
def read_glove_vectors(glove_file):
  """Parse a GloVe text file into a word -> vector dictionary.

  Args:
    glove_file: path of a UTF-8 text file with one `word v1 v2 ...` entry per line.

  Returns:
    dict mapping each word to a float64 NumPy array of its components.
  """
  words_to_vec = dict()
  # The context manager closes the file even if a malformed line raises.
  with open(glove_file, encoding='utf8') as f:
    for line in f:
      parts = line.strip().split()
      if not parts:
        # Skip blank lines (e.g. a trailing newline at the end of the file).
        continue
      words_to_vec[parts[0]] = np.array(parts[1:], dtype=np.float64)
  return words_to_vec

In [None]:
# Download and extract glove.6B for feature vectors
# NOTE(review): the archive is extracted to ./glov.6B, while the loading cell reads
# /content/drive/MyDrive/glove/glove.6B.50d.txt - copy the file there or align the paths.
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip -d glov.6B

In [14]:
# Load the glove.6B.50d vectors from Drive into a word -> vector dictionary.
words_to_vec = read_glove_vectors('/content/drive/MyDrive/glove/glove.6B.50d.txt')

In [15]:
# Inspect the vector stored for a single word.
words_to_vec['flower']

array([ 0.075439 ,  1.2659   , -1.3179   ,  0.11341  ,  1.4513   ,
        0.17337  , -0.56265  , -1.0706   ,  0.54898  ,  0.30163  ,
       -0.11471  ,  0.38498  ,  0.9205   , -0.2491   ,  0.3308   ,
        0.060113 , -0.0068846,  0.086864 , -0.20535  , -0.86098  ,
        0.10007  , -0.75486  ,  0.48225  , -0.33253  , -0.23791  ,
        0.17345  ,  0.49777  ,  0.88761  ,  0.089471 , -0.56217  ,
        1.8535   , -0.0055493,  0.45845  ,  0.53943  ,  0.3247   ,
        0.43479  , -0.027253 ,  0.44744  , -0.27514  , -0.016152 ,
       -0.51024  , -0.10113  , -0.80985  , -0.31571  ,  1.5817   ,
        0.2105   , -0.1844   , -1.7266   ,  0.092685 , -0.55696  ])

## Emojifier_V1
Each word is represented by a feature vector, and in Emojifier-V1 we want to classify sentences using a multilayer perceptron:
- We take the average of the word vectors in each sentence and then forward it to the multilayer perceptron with 50 input neurons (each word has 50 features, so the average of the words in a sentence also has 50 features) and a softmax output layer with 5 neurons.
- The feature vectors can be downloaded from this link: http://nlp.stanford.edu/data/glove.6B.zip
<img src="https://github.com/Alireza-Akhavan/rnn-notebooks/blob/master/images/image_1.png?raw=1" style="width:900px;height:300px;">


In [16]:
# Convert a sentence to the average of its word vectors
def sentence_to_avg(sentence, embeddings=None, emb_dim=50):
  """Average the embedding vectors of the words in `sentence`.

  Args:
    sentence: text to encode; it is lower-cased and split on whitespace.
    embeddings: word -> vector mapping; defaults to the notebook-level `words_to_vec`.
    emb_dim: length of each embedding vector (50 for glove.6B.50d).

  Returns:
    Array of shape (emb_dim,). Words missing from `embeddings` are ignored;
    if no word is known (including an empty sentence) the zero vector is
    returned instead of raising KeyError or producing NaNs.
  """
  if embeddings is None:
    embeddings = words_to_vec
  sum_vectors = np.zeros((emb_dim,))
  known = 0
  for w in sentence.lower().split():
    if w in embeddings:
      sum_vectors += embeddings[w]
      known += 1
  if known == 0:
    # Nothing to average: avoid the 0/0 division that would yield NaNs.
    return sum_vectors
  return sum_vectors / known

In [17]:
# Averaged word vector of every training sentence, stacked into one array
X_train_avg = np.array([sentence_to_avg(sentence) for sentence in X_train])

In [18]:
# Shape check: one averaged vector per training sentence.
X_train_avg.shape

(132, 50)

In [19]:
class Emoji_Net_V1(Model):
  """Softmax classifier over averaged word vectors (50 features in, 5 classes out)."""

  def __init__(self):
    super().__init__()
    # Single dense layer producing one probability per emoji class.
    self.dense = Dense(units=5, activation='softmax', input_shape=(50,))

  def call(self, x):
    """Return the class probabilities for a batch of inputs."""
    return self.dense(x)

In [36]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and
# batch shuffling repeat from run to run.
tf.keras.utils.set_random_seed(42)
model_v1 = Emoji_Net_V1()

In [37]:
# Multi-class setup: cross-entropy on one-hot labels, Adam with default settings.
model_v1.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'],
)


In [38]:
# Train on the averaged sentence vectors.
model_v1.fit(
    x=X_train_avg,
    y=Y_train_oh,
    epochs=400,
    shuffle=True,
)


Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400
Epoch 74/400
Epoch 75/400
Epoch 76/400
Epoch 77/400
Epoch 78

<keras.callbacks.History at 0x7f4dac410d50>

In [40]:
# Average the word vectors of every test sentence, then score the model on the test split.
X_test_avg = np.array([sentence_to_avg(sentence) for sentence in X_test])
model_v1.evaluate(X_test_avg, Y_test_oh)



[0.6457489728927612, 0.8392857313156128]

In [24]:
# Hand-written sentences for a quick qualitative check of the averaged-vector model.
X_me = np.array(["not sad", "i adore you", "i love you", "funny lol", "lets play with a ball", "food is ready", "not feeling happy and funny"])
Y_me = np.array([[2], [0], [0], [2], [1], [4], [3]])

X_me_avg = np.array([sentence_to_avg(sentence) for sentence in X_me])
pred = model_v1.predict(X_me_avg)

# Print each sentence with the emoji of its most probable class.
for sentence, class_probs in zip(X_me, pred):
    print(sentence, label_to_emoji(np.argmax(class_probs)))

not sad 😔
i adore you ❤️
i love you ❤️
funny lol 😂
lets play with a ball ⚽
food is ready 🍽️
not feeling happy and funny 😂


## Emojifier-V2: Using RNNs:
Let's build an LSTM model that takes word sequences as input. This model will be able to take word ordering into account. Emojifier-V2 will continue to use pre-trained word embeddings to represent words, but will feed them into an LSTM, whose job it is to predict the most appropriate emoji.
Run the following cell to load the Keras packages.
<img src="https://github.com/Alireza-Akhavan/rnn-notebooks/blob/master/images/emojifier-v2.png?raw=1" style="width:700px;height:400px;"> <caption><center> **Figure 3**: Emojifier-V2. A 2-layer LSTM sequence classifier. </center></caption>


In [30]:
def convert_sentence_to_embeddings(X, max_len=10, emb_dim=50, embeddings=None):
  """Turn an array of sentences into a zero-padded tensor of word embeddings.

  Args:
    X: 1-D NumPy array of sentences (strings).
    max_len: number of word slots per sentence; shorter sentences are
      zero-padded and longer ones are truncated to this length.
    emb_dim: length of each embedding vector (50 for glove.6B.50d).
    embeddings: word -> vector mapping; defaults to the notebook-level `words_to_vec`.

  Returns:
    Array of shape (len(X), max_len, emb_dim). Words missing from
    `embeddings` keep an all-zero row instead of raising KeyError.
  """
  if embeddings is None:
    embeddings = words_to_vec
  emb_matrix = np.zeros((X.shape[0],  # size of dataset
                         max_len,     # word slots per sentence
                         emb_dim      # size of the embedding vector
                         ))
  for i in range(X.shape[0]):
    # Truncate so a sentence longer than max_len cannot index past the tensor.
    words = X[i].lower().split()[:max_len]
    for j, word in enumerate(words):
      if word in embeddings:
        emb_matrix[i, j, :] = embeddings[word]
  return emb_matrix

<img src="https://github.com/Alireza-Akhavan/rnn-notebooks/blob/master/images/embedding1.png?raw=1" style="width:700px;height:250px;">
<caption> **Figure 4**: Embedding layer. This example shows the propagation of two examples through the embedding layer. Both have been zero-padded to a length of `max_len=5`. The final dimension of the representation is `(2,max_len,50)` because the word embeddings we are using are 50-dimensional.</caption>

In [31]:
# Embed a few short sentences and print the tensor to inspect the zero-padded rows.
x_me = np.array(["funny lol","lets play baseball" , "food is ready for you"])
print(convert_sentence_to_embeddings(x_me))

[[[-0.014547 -0.20208  -0.75278  ... -0.13429   0.21133   1.5368  ]
  [-0.54289   0.053743 -0.46978  ...  0.20745  -0.074958  0.080575]
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  ...
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]]

 [[ 0.30423  -0.24405   1.0303   ... -0.43296  -0.096168  0.43463 ]
  [-0.73571   0.19937  -0.89408  ... -0.075279 -0.44448   0.47437 ]
  [-1.9327    1.0421   -0.78515  ...  0.55667  -0.70315   0.17157 ]
  ...
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]]

 [[ 0.47222  -0.44545  -0.51833  ...  0.34932   0.33934   0.25499 ]
  [ 0.6185    0.64254  -0.46552  ... -0.27557   0.30899   0.48497 ]
  [ 0.36825  -0.20512   0.36656 

In [32]:
# Embed every training sentence; the trailing expression displays the resulting shape.
X_train_emb = convert_sentence_to_embeddings(X_train)
X_train_emb.shape

(132, 10, 50)

In [55]:
class Emoji_Net_V2(Model):
    """Two stacked 128-unit LSTMs, each followed by dropout, with a 5-way softmax head."""

    def __init__(self):
        super().__init__()
        # The first LSTM returns the whole sequence so the second one can consume it.
        self.lstm_1 = LSTM(128, return_sequences=True)
        self.dropout_1 = Dropout(0.5)
        # The second LSTM keeps only its final output.
        self.lstm_2 = LSTM(128)
        self.dropout_2 = Dropout(0.5)
        self.dense = Dense(5, activation='softmax')

    def call(self, x):
        """Run the input through the layers in order and return the class probabilities."""
        pipeline = (self.lstm_1, self.dropout_1, self.lstm_2, self.dropout_2, self.dense)
        for layer in pipeline:
            x = layer(x)
        return x

In [62]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling repeat from run to run.
tf.keras.utils.set_random_seed(42)

model_v2 = Emoji_Net_V2()
model_v2.compile(optimizer=tf.keras.optimizers.Adam(),
              loss='categorical_crossentropy', 
              metrics=['accuracy'])
model_v2.fit(X_train_emb, Y_train_oh, epochs = 50, batch_size = 16, shuffle = True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f4d9cc59950>

In [72]:
# Persist the trained V2 model to Google Drive.
# NOTE(review): the target sits inside the glove/ folder - confirm that is the intended location.
model_v2.save('/content/drive/MyDrive/glove/model_v2')



In [63]:
# Embed the test sentences, one-hot encode their labels and score the V2 model.
X_test_emb = convert_sentence_to_embeddings(X_test)
Y_test_oh = tf.keras.utils.to_categorical(Y_test, num_classes=5)
model_v2.evaluate(x=X_test_emb, y=Y_test_oh)



[0.7699219584465027, 0.8571428656578064]

In [66]:
# Hand-written sentences for a quick qualitative check of the V2 model.
X_me = np.array(["not sad", "i adore you", "i love you", "funny lol", "lets play with a ball", "food is ready", "not feeling happy","not good"])
# Expected label of each sentence above, in the same order (one per sentence).
Y_me = np.array([[2], [0], [0], [2], [1], [4], [3], [3]])
assert len(X_me) == len(Y_me), "every sentence needs exactly one expected label"
X_me_emb = convert_sentence_to_embeddings(X_me)

pred = model_v2.predict(X_me_emb)

# Print each sentence with the emoji of its most probable class.
for sentence, class_probs in zip(X_me, pred):
    print(sentence, label_to_emoji(np.argmax(class_probs)))

not sad 😔
i adore you ❤️
i love you ❤️
funny lol 😂
lets play with a ball ⚽
food is ready 🍽️
not feeling happy 😔
not good 😔


## Emojifier-V3: a smaller 2-layer LSTM

In [102]:
class Emoji_Net_V3(Model):
    """Stacked LSTMs with 64 and 32 units, each followed by dropout, with a 5-way softmax head."""

    def __init__(self):
        super().__init__()
        # The first LSTM returns the whole sequence so the second one can consume it.
        self.lstm_1 = LSTM(64, return_sequences=True)
        self.dropout_1 = Dropout(0.5)
        # The second LSTM keeps only its final output.
        self.lstm_2 = LSTM(32)
        self.dropout_2 = Dropout(0.5)
        self.dense = Dense(5, activation='softmax')

    def call(self, x):
        """Run the input through the layers in order and return the class probabilities."""
        pipeline = (self.lstm_1, self.dropout_1, self.lstm_2, self.dropout_2, self.dense)
        for layer in pipeline:
            x = layer(x)
        return x

In [105]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling repeat from run to run.
tf.keras.utils.set_random_seed(42)

model_v3 = Emoji_Net_V3()
model_v3.compile(optimizer=tf.keras.optimizers.Adam(),
              loss='categorical_crossentropy', 
              metrics=['accuracy'])
model_v3.fit(X_train_emb, Y_train_oh, epochs = 50, batch_size = 16, shuffle = True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f4d878cc550>

In [106]:
# Embed the test sentences, one-hot encode their labels and score the V3 model.
X_test_emb = convert_sentence_to_embeddings(X_test)
Y_test_oh = tf.keras.utils.to_categorical(Y_test, num_classes=5)
model_v3.evaluate(x=X_test_emb, y=Y_test_oh)



[0.7456966042518616, 0.8035714030265808]

In [107]:
# Hand-written sentences for a quick qualitative check of the V3 model.
X_me = np.array(["not sad", "i adore you", "i love you", "funny lol", "lets play with a ball", "food is ready", "not feeling happy","not good"])
# Expected label of each sentence above, in the same order (one per sentence).
Y_me = np.array([[2], [0], [0], [2], [1], [4], [3], [3]])
assert len(X_me) == len(Y_me), "every sentence needs exactly one expected label"
X_me_emb = convert_sentence_to_embeddings(X_me)

pred = model_v3.predict(X_me_emb)

# Print each sentence with the emoji of its most probable class.
for sentence, class_probs in zip(X_me, pred):
    print(sentence, label_to_emoji(np.argmax(class_probs)))



not sad 😔
i adore you ❤️
i love you ❤️
funny lol 😂
lets play with a ball ⚽
food is ready 🍽️
not feeling happy 😔
not good 😔
