In [1]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('datasets/text_emotion.csv')
df

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...
...,...,...,...,...
39995,1753918954,neutral,showMe_Heaven,@JohnLloydTaylor
39996,1753919001,love,drapeaux,Happy Mothers Day All my love
39997,1753919005,love,JenniRox,Happy Mother's Day to all the mommies out ther...
39998,1753919043,happiness,ipdaman1,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [3]:
dfClean = df[["sentiment","content"]]
dfClean.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [4]:
#Define vocabulary size
#from keras.preprocessing.text import Tokenizer
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(dfClean["content"])
vocabulary_size = len(tokenizer.word_index) + 1

sequences = tokenizer.texts_to_sequences(dfClean["content"])

In [5]:
vocabulary_size

48998

In [6]:
#from keras.preprocessing.sequence import pad_sequences
sequence_length = max([len(sequence) for sequence in sequences])
sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=sequence_length, padding='pre')

In [7]:
#from keras.utils import to_categorical
x = sequences

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
sentiment_encoded = label_encoder.fit_transform(dfClean['sentiment'])
y = tf.keras.utils.to_categorical(sentiment_encoded)

In [8]:
print(x.shape,y.shape)

(40000, 37) (40000, 13)


In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=0)

In [13]:
model = tf.keras.models.Sequential()

In [14]:
#1. An embedding layer with the following parameters:
    #The input dimension is vocabulary_size
    #The output dimension is 10
    #The input length is sequence_length
model.add(tf.keras.layers.Embedding(input_dim=vocabulary_size, output_dim=10, input_length=sequence_length))

#2. An LSTM layers with 128 and 64 units
model.add(tf.keras.layers.LSTM(128))
#model.add(tf.keras.layers.LSTM(64)) #-- Apparently this should not be added...

#3. A Dense layer with 100 units, and relu activation
model.add(tf.keras.layers.Dense(units=100, activation='relu'))

#4. A dropout layer with a dropout rate of 50%
model.add(tf.keras.layers.Dropout(0.50))

#5.A dense layer with the following parameters:
    #Activation function is softmax
    #The number of units is 13 -- It would be so much better, if there weren't so many typos in the instructions...
model.add(tf.keras.layers.Dense(units=13, activation='softmax'))

In [15]:
#Build the network using the following parameters:
    #Optimizer: Adam
    #Loss function: categorical_crossentropy
    #Metrics: accuracy
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [16]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 37, 10)            489980    
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               71168     
_________________________________________________________________
dense (Dense)                (None, 100)               12900     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 13)                1313      
Total params: 575,361
Trainable params: 575,361
Non-trainable params: 0
_________________________________________________________________


In [17]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

In [18]:
model.fit(x_train, y_train, epochs=10, batch_size=256,callbacks=[callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a4298d950>

In [19]:
loss, accuracy = model.evaluate(x_test, y_test)



In [20]:
print("loss:",loss)
print("accuracy:",accuracy)

loss: 3.243500232696533
accuracy: 0.2562499940395355
