In [22]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
import re

In [23]:
# Load Pretrained Model
path = '../Pretrained_Models/'
w2v_model = Word2Vec.load('../Pretrained_Models/TREC_pretrain.model')

pretrained_weights = w2v_model.wv.vectors
vocab_size, embedding_size = pretrained_weights.shape

print("Vocab Size: ", vocab_size)
print("Embedding Size: ", embedding_size)

Vocab Size:  9447
Embedding Size:  100


In [24]:
path = '../Datasets/Processed/TREC'

training_dev_df = pd.read_csv(f'{path}/train.dev.csv')
training_df = pd.read_csv(f'{path}/train.csv')
test_df = pd.read_csv(f'{path}/test.csv')

In [25]:
label_encoder = LabelEncoder()

training_dev_df['label-coarse'] = label_encoder.fit_transform(training_dev_df['label-coarse'])
test_df['label-coarse'] = label_encoder.fit_transform(test_df['label-coarse'])

In [26]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(training_dev_df['text'])

# Convert text to sequences
X_train_sequences = tokenizer.texts_to_sequences(training_dev_df['text'])
X_test_sequences = tokenizer.texts_to_sequences(test_df['text'])


In [27]:
# Padding sequences
sequence_length = 50  # Choose an appropriate sequence length
X_train_padded = pad_sequences(X_train_sequences, maxlen=sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=sequence_length, padding='post')

# Convert labels to one-hot encoded format
y_train = to_categorical(training_dev_df['label-coarse'])
y_test = to_categorical(test_df['label-coarse'])


In [28]:
num_classes = 5

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=sequence_length))
model.add(LSTM(units=100))  # You can adjust the number of LSTM units
model.add(Dense(units=num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [29]:
num_epochs = 1000
batch_size = 100

model.fit(X_train_padded, y_train, epochs=num_epochs, batch_size=batch_size, validation_split=0.1)

Epoch 1/10


2023-10-17 15:20:47.718461: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8600
2023-10-17 15:20:47.828734: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f865c00a0c0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-10-17 15:20:47.828768: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 2060, Compute Capability 7.5
2023-10-17 15:20:47.833118: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-10-17 15:20:47.902217: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f87b8115d90>

In [30]:
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Test Loss: 2.793548583984375, Test Accuracy: 0.16223648190498352
