# 1 - Download the Dataset
Download the dataset from the following link:
https://data.world/crowdflower/sentiment-analysis-in-text

In [65]:
import pandas as pd
import numpy as np
# Read the tweet-emotion CSV (expected in the working directory) and preview the first rows
csv_path = 'text_emotion.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


In [66]:
# Remove the identifier columns, which are not used as model inputs.
# errors='ignore' keeps this cell re-runnable: df is rebound to the result, so on a
# second execution the columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['tweet_id', 'author'], errors='ignore')

In [67]:
# Preview the frame after tweet_id and author have been dropped
df.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [68]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

In [69]:
# Normalise case so that e.g. "SOON" and "soon" end up as the same token
content_lower = df['content'].str.lower()
df['content'] = content_lower
df.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,layin n bed with a headache ughhhh...waitin o...
2,sadness,funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends soon!
4,neutral,@dannycastillo we want to trade with someone w...


In [70]:
# Preprocess the text
# Fit a Keras Tokenizer on every tweet to build the vocabulary.
# NOTE(review): the fit uses the whole of df['content'], i.e. before the
# train/test split performed later, so words that only occur in test tweets
# also enter the vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['content'])
# Keep a handle on the fitted word index (word -> integer id mapping)
tokens = tokenizer.word_index

In [71]:
# Calculate the vocabulary size: one slot per word in the fitted index, plus
# one extra slot for id 0 (the value that fills the padded positions of X)
word_count = len(tokenizer.word_index)
vocabulary_size = word_count + 1
print('Vocabulary Size: ', vocabulary_size)

Vocabulary Size:  48998


In [72]:
# Transform text to sequence of integers:
# each tweet becomes a list of word ids taken from the fitted tokenizer
sequences = tokenizer.texts_to_sequences(df['content'])

In [73]:
# Calculate the maximum sequence length, i.e. the token count of the longest tweet
sequence_lengths = map(len, sequences)
max_sequence_length = max(sequence_lengths)
print('Max Sequence Length: ', max_sequence_length)

Max Sequence Length:  37


In [74]:
# Pad sequences to the same length.
# maxlen equals the longest sequence, so nothing is truncated; shorter tweets
# are filled with 0s at the front (visible as the leading zeros in X_train).
X = pad_sequences(sequences, maxlen=max_sequence_length)

In [75]:
# Collect the distinct sentiment labels and count them
Y = df['sentiment']
class_names = np.unique(Y)
print(class_names)
num_classes = len(class_names)
print(num_classes)

['anger' 'boredom' 'empty' 'enthusiasm' 'fun' 'happiness' 'hate' 'love'
 'neutral' 'relief' 'sadness' 'surprise' 'worry']
13


In [76]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Instantiate the encoder
le = LabelEncoder()

# Fit and transform the labels into integer class ids.
# The labels are read straight from df['sentiment'] rather than from Y: Y is
# rebound to the one-hot matrix further down, and fit_transform on that 2-D
# array fails if this cell is executed again afterwards.
labels_encoded = le.fit_transform(df['sentiment'])


In [77]:
# Convert to one-hot encoding.
# Note: Y is rebound here, from the Series of label strings to a float32
# matrix with one column per class (num_classes columns).
Y = to_categorical(labels_encoded, num_classes=num_classes)
# Display the encoded matrix
Y

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [78]:
# Split the dataset into training and testing sets:
# 30% of the samples are held out, and the fixed random_state keeps the split repeatable
split_parts = train_test_split(X, Y, test_size=0.3, random_state=0)
X_train, X_test, Y_train, Y_test = split_parts

In [79]:
# Inspect the padded training sequences (one row of word ids per tweet)
X_train

array([[    0,     0,     0, ...,   320,   453,   625],
       [    0,     0,     0, ...,   488,  3442,     8],
       [    0,     0,     0, ...,     1,    55,    50],
       ...,
       [    0,     0,     0, ..., 40706,   438,   187],
       [    0,     0,     0, ...,    32,    24,  2947],
       [    0,     0,     0, ...,  2061,  3604,   579]])

In [80]:
# Inspect the one-hot training labels
Y_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

# 3 - Define your Recurrent Neural Network
Define an RNN with the following layers:

The input layer is an embedding layer with the following parameters:
the input dimension is the vocabulary size;
the output dimension is 10;
the input length is the maximum sequence length;

Define an LSTM layer with 128 units;

Define an LSTM layer with 64 units;

Define a fully connected layer with:
 100 units;
 activation function: ReLU;

Dropout layer with 0.5 rate;

The output layer is a fully-connected layer with:
13 units (one per sentiment class in the dataset) and activation function: softmax.

In [84]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

# Fix the random seeds (Python, NumPy and the Keras backend) before any layer
# is created, so weight initialisation and dropout start from a known state.
# The value matches the random_state used for the train/test split.
from keras.utils import set_random_seed
set_random_seed(0)

# Define the model
model = Sequential()

# Add the input Embedding layer: every word id (vocabulary_size of them) is
# mapped to a 10-dimensional vector; inputs are max_sequence_length ids long
model.add(Embedding(vocabulary_size, 10, input_length=max_sequence_length))

# Add the LSTM layers
model.add(LSTM(128, return_sequences=True))  # Return sequences for the next LSTM layer
model.add(LSTM(64))  # Only the final state is passed on to the dense layers

# Add a fully connected layer
model.add(Dense(100, activation='relu'))

# Add Dropout layer
model.add(Dropout(0.5))

# Add the output layer: one softmax unit per sentiment class.
# Sized from num_classes (the same value used for the one-hot targets) instead
# of a hard-coded 13, so the output width always matches the label matrix.
model.add(Dense(num_classes, activation='softmax'))

# Summary of the model
model.summary()


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 37, 10)            489980    
                                                                 
 lstm_4 (LSTM)               (None, 37, 128)           71168     
                                                                 
 lstm_5 (LSTM)               (None, 64)                49408     
                                                                 
 dense_4 (Dense)             (None, 100)               6500      
                                                                 
 dropout_2 (Dropout)         (None, 100)               0         
                                                                 
 dense_5 (Dense)             (None, 13)                1313      
                                                                 
Total params: 618,369
Trainable params: 618,369
Non-trainable params: 0

# 4 - Choosing Hyperparameters

Build the network using the following parameters:
- Optimizer: Adam
- Loss function: categorical_crossentropy
- Metrics: accuracy
- Batch size: 256
- Epochs: 10

In [85]:
# Configure training: Adam optimiser, categorical cross-entropy for the
# one-hot targets, and accuracy as the reported metric
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# 5 - Training Network
Use Keras to implement the network described above and train it on the training data.
Classification metrics:
Print the accuracy measure on the testing data.

In [86]:
# Train the model for 10 epochs with mini-batches of 256 samples.
# The held-out test split is also passed as validation data, so the per-epoch
# val_* metrics are computed on the same samples used for the final evaluation.
rnn = model.fit(
    X_train,
    Y_train,
    epochs=10,
    batch_size=256,
    validation_data=(X_test, Y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [87]:
# Score the trained model on the held-out test split; evaluate() returns the
# loss followed by the compiled metric (accuracy)
test_scores = model.evaluate(X_test, Y_test)
loss, accuracy = test_scores

# Report the test accuracy as a percentage
print("Accuracy on test data: {:.2%}".format(accuracy))


Accuracy on test data: 24.71%
