In [None]:
from keras.layers import Embedding, Dense, Dropout, Flatten, GRU
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tensorflow import keras
from keras.preprocessing import sequence
import pandas as pd
import numpy as np

In [3]:
def create_bow_vectors(vectorizer, train_data, test_data, input_length):
    # Create raw vectors
    train_vectors, test_vectors = vectorizer.transform(train_data).toarray(), vectorizer.transform(test_data).toarray()

    # Create bag-of-words vectors for each data
    bow_train, bow_test = [], []
    for vector in train_vectors:
        bow_train.append(np.where(vector == 1)[0])
    for vector in test_vectors:
        bow_test.append(np.where(vector == 1)[0])

    # Add padding to bow vectors
    processed_train = sequence.pad_sequences(bow_train, input_length)
    processed_test = sequence.pad_sequences(bow_test, input_length)

    return processed_train, processed_test

In [4]:
def create_embedding_vectors(filename):
    embedding_vectors = {}
    with open(filename, 'r', encoding='utf-8') as file:
        for row in file:
            values = row.split(' ')
            word = values[0]
            weights = np.asarray([float(val) for val in values[1:]])
            embedding_vectors[word] = weights
    print(f"Size of vectorized vocabulary: {len(embedding_vectors)}")
    return embedding_vectors

In [5]:
def create_embedding_matrix(vocabulary, embedding_vectors, oov_words, emb_dim=300):
    vocab_size = len(set(vocabulary))
    embedding_matrix = np.zeros((vocab_size, emb_dim))

    for word, idx in vocabulary.items():
        if idx < vocab_size:
            embedding_vector = embedding_vectors.get(word)
            if embedding_vector is not None:
                embedding_matrix[idx] = embedding_vector
            else:
                oov_words.append(word)
                # Random initialization for out of vocabulary words
                embedding_matrix[idx] = np.random.uniform(low=-1.0, high=1.0, size=emb_dim)

    return embedding_matrix

In [12]:
def get_prediction_results(model, test_data, labels_test):
    predictions = model.predict(test_data)
    predicted_labels = np.argmax(predictions, axis=1)

    accuracy = accuracy_score(labels_test, predicted_labels)
    report = classification_report(labels_test, predicted_labels, zero_division=1)
    cm = confusion_matrix(labels_test, predicted_labels)

    return accuracy, report, cm

In [7]:
# Load dataset
dataset = pd.read_csv("/kaggle/input/emotions/text.csv")

# Extract data and labels from dataset
data, labels = dataset['text'], dataset['label']

# Split data into train and test data
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.3, random_state=42)

In [8]:
# Create count vectorizer
n = 1
vectorizer = CountVectorizer(analyzer='word', ngram_range=(n, n))
vectorizer.fit(train_data)

VOCABULARY_SIZE = len(set(vectorizer.vocabulary_))
MAX_LENGTH = max(len(max(train_data, key=len)), len(max(test_data, key=len)))

# Create bow vectors for each train and test data
train_vectors, test_vectors = create_bow_vectors(vectorizer, train_data, test_data, input_length=MAX_LENGTH)

In [9]:
# Create embedding vectors
embedding_vectors = create_embedding_vectors('/kaggle/input/glove840b300dtxt/glove.840B.300d.txt')

# Create embedding matrix and out-of-vocabulary list
oov_words = []
embedding_matrix = create_embedding_matrix(vectorizer.vocabulary_, embedding_vectors, oov_words, emb_dim=300)

# Print some of the out of vocabulary words
print(f'Number of out of vocabulary words: {len(oov_words)}')
print(f'Some out of vocabulary words: {oov_words[0:5]}')

Size of vectorized vocabulary: 2196017
Number of out of vocabulary words: 9127
Some out of vocabulary words: ['bodyjar', 'glimsp', 'immoduim', 'qasmaxs', 'wnfmcgill']


In [10]:
# Create the model
model = keras.Sequential()
embedding_layer = Embedding(VOCABULARY_SIZE, output_dim=embedding_matrix.shape[1], trainable=True)
embedding_layer.build((None,))
embedding_layer.set_weights([embedding_matrix])
model.add(embedding_layer)
model.add(GRU(128, return_sequences=True))
model.add(GRU(64, return_sequences=True))
model.add(GRU(32, return_sequences=False))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(6, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [13]:
# Train the model
model.fit(train_vectors, train_labels, epochs=10, batch_size=128, validation_split=0.3,
          callbacks=[keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=1, restore_best_weights=True)])

Epoch 1/10
[1m1596/1596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m226s[0m 142ms/step - accuracy: 0.9059 - loss: 0.2195 - val_accuracy: 0.8991 - val_loss: 0.2111
Epoch 2/10
[1m1596/1596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m225s[0m 141ms/step - accuracy: 0.9161 - loss: 0.1881 - val_accuracy: 0.8947 - val_loss: 0.2184


<keras.src.callbacks.history.History at 0x7e8934159480>

In [14]:
# Print results
accuracy, report, cm = get_prediction_results(model, test_vectors, test_labels)

print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", report)
print(f"Accuracy: {accuracy:.2f}")

[1m3908/3908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 26ms/step

Confusion Matrix:
 [[33996   572   176   521   928    68]
 [  576 37150  3277   352   429   471]
 [  106   303  9685    83    34    30]
 [ 1173   252    56 15621   173    20]
 [  323   116    49   879 12654   373]
 [   92    86    16    38   922  3443]]

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94     36261
           1       0.97      0.88      0.92     42255
           2       0.73      0.95      0.82     10241
           3       0.89      0.90      0.90     17295
           4       0.84      0.88      0.86     14394
           5       0.78      0.75      0.76      4597

    accuracy                           0.90    125043
   macro avg       0.86      0.88      0.87    125043
weighted avg       0.91      0.90      0.90    125043

Accuracy: 0.90
