In [2]:
!pip install nlp

Collecting nlp
  Using cached nlp-0.4.0-py3-none-any.whl (1.7 MB)
Collecting pyarrow>=0.16.0
  Downloading pyarrow-9.0.0-cp39-cp39-win_amd64.whl (19.6 MB)
Collecting dill
  Downloading dill-0.3.5.1-py2.py3-none-any.whl (95 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp39-cp39-win_amd64.whl (29 kB)
Installing collected packages: xxhash, pyarrow, dill, nlp
Successfully installed dill-0.3.5.1 nlp-0.4.0 pyarrow-9.0.0 xxhash-3.0.0
Collecting nlp
  Downloading nlp-0.4.0-py3-none-any.whl (1.7 MB)
Collecting pyarrow>=0.16.0
  Downloading pyarrow-9.0.0-cp39-cp39-win_amd64.whl (19.6 MB)
Collecting xxhash
  Using cached xxhash-3.0.0-cp39-cp39-win_amd64.whl (29 kB)
Collecting dill
  Using cached dill-0.3.5.1-py2.py3-none-any.whl (95 kB)
Installing collected packages: xxhash, pyarrow, dill, nlp
Successfully installed dill-0.3.5.1 nlp-0.4.0 pyarrow-9.0.0 xxhash-3.0.0


In [3]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import nlp
import random
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [36]:
dataset = nlp.load_dataset('emotion')
train = dataset['train']
val = dataset['validation']
test = dataset['test']

Using custom data configuration default


In [40]:
def get_tweet(data):
    tweets = [x['text'] for x in data]
    labels = [x['label'] for x in data]
    return tweets, labels
tweets, labels = get_tweet(train)

In [41]:
tweets[0], labels[0]

('i didnt feel humiliated', 'sadness')

In [42]:
tokenizer = Tokenizer(num_words=10000, oov_token='<UNK>')
tokenizer.fit_on_texts(tweets)

In [43]:
tokenizer.texts_to_sequences([tweets[0]])

[[2, 139, 3, 679]]

In [44]:
maxlen=50
def get_sequences(tokenizer, tweets):
    sequences = tokenizer.texts_to_sequences(tweets)
    padded = pad_sequences(sequences, truncating = 'post', padding='post', maxlen=maxlen)
    return padded

In [45]:
padded_train_seq = get_sequences(tokenizer, tweets)
padded_train_seq[0]

array([  2, 139,   3, 679,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0])

In [46]:
classes = set(labels)
class_to_index = dict((c,i) for i, c in enumerate(classes))
index_to_class = dict((v,k) for k, v in class_to_index.items())
names_to_ids = lambda labels: np.array([class_to_index.get(x) for x in labels])
train_labels = names_to_ids(labels)

In [47]:
print(classes)

{'surprise', 'fear', 'love', 'sadness', 'anger', 'joy'}


In [48]:
class_to_index

{'surprise': 0, 'fear': 1, 'love': 2, 'sadness': 3, 'anger': 4, 'joy': 5}

In [49]:
index_to_class

{0: 'surprise', 1: 'fear', 2: 'love', 3: 'sadness', 4: 'anger', 5: 'joy'}

In [50]:
train_labels[0]

3

In [51]:
model = tf.keras.models.Sequential([
tf.keras.layers.Embedding(10000,16,input_length=maxlen),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20, return_sequences=True)),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20)),
tf.keras.layers.Dense(6, activation='softmax')
])
model.compile(
     loss='sparse_categorical_crossentropy',
     optimizer='adam',
     metrics=['accuracy']
)

In [52]:
val_tweets, val_labels = get_tweet(val)
val_seq = get_sequences(tokenizer, val_tweets)
val_labels= names_to_ids(val_labels)
h = model.fit(
     padded_train_seq, train_labels,
     validation_data=(val_seq, val_labels),
     epochs=20,
     callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=2)]
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


In [53]:
test_tweets, test_labels=get_tweet(test)
test_seq = get_sequences(tokenizer, test_tweets)
test_labels=names_to_ids(test_labels)
model.evaluate(test_seq, test_labels)



[0.39179304242134094, 0.890999972820282]

In [54]:
model.evaluate(test_seq, test_labels)



[0.39179304242134094, 0.890999972820282]

In [55]:
i = random.randint(0,len(test_labels)-1)
print('Sentence:', test_tweets[i])
print('Emotion:', index_to_class[test_labels[i]])
p = model.predict(np.expand_dims(test_seq[i], axis=0))[0]
print(test_seq[i])
pred_class=index_to_class[np.argmax(p).astype('uint8')]
print('Predicted Emotion: ', pred_class)

Sentence: i feel like even though things arent quite resolved with my major i have peace about it still
Emotion: joy
[   2    3   14   75  137   89 1222  157  686   25   11 2181    2   21
  849   27   13   72    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]
Predicted Emotion:  joy


In [61]:
sentence = 'i have to fight to be with you!'
sequence = tokenizer.texts_to_sequences([sentence])
paddedSequence = pad_sequences(sequence, truncating = 'post', padding='post', maxlen=maxlen)
p = model.predict(np.expand_dims(paddedSequence[0], axis=0))[0]
pred_class=index_to_class[np.argmax(p).astype('uint8')]
print('Sentence:', sentence)
print('Predicted Emotion: ', pred_class)

Sentence: i have to fight to be with you!
Predicted Emotion:  sadness
