In [7]:
import pandas as pd
import numpy as np
import re

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

from sklearn.metrics import classification_report, confusion_matrix

In [8]:
train = pd.read_csv("../data/train_data.csv")
valid = pd.read_csv("../data/valid_data.csv")

X_train = train["text"].astype(str)
y_train = train["label"].astype(int).values

X_val = valid["text"].astype(str)
y_val = valid["label"].astype(int).values

print("Train size:", len(X_train))
print("Valid size:", len(X_val))


Train size: 16990
Valid size: 4117


In [9]:
def clean_text(text):
    text = text.lower()
    # remove urls
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    # remove @mentions and hashtags symbols (keep word part)
    text = re.sub(r"@\w+", " ", text)
    text = re.sub(r"#", " ", text)
    # keep only letters and spaces
    text = re.sub(r"[^a-z\s]", " ", text)
    # collapse multiple spaces
    text = re.sub(r"\s+", " ", text).strip()
    return text

X_train_clean = X_train.apply(clean_text)
X_val_clean   = X_val.apply(clean_text)

X_train_clean.head()

0    here are thursday s biggest analyst calls appl...
1    buy las vegas sands as travel to singapore bui...
2    piper sandler downgrades docusign to sell citi...
3    analysts react to tesla s latest earnings brea...
4    netflix and its peers are set for a return to ...
Name: text, dtype: object

In [10]:
max_words = 20000   # vocab size
max_len = 60      # tweets are short; shorter seq helps

# number of labels in the dataset (multi-class)
num_classes = len(np.unique(y_train))

tokenizer = Tokenizer(num_words=max_words, oov_token="<unk>")
tokenizer.fit_on_texts(X_train_clean)

X_train_seq = pad_sequences(
    tokenizer.texts_to_sequences(X_train_clean),
    maxlen=max_len,
    padding="post",
    truncating="post"
)

X_val_seq = pad_sequences(
    tokenizer.texts_to_sequences(X_val_clean),
    maxlen=max_len,
    padding="post",
    truncating="post"
)

X_train_seq.shape, X_val_seq.shape


((16990, 60), (4117, 60))

In [11]:
embedding_dim = 100

model = Sequential()
model.add(Embedding(input_dim=max_words,
                    output_dim=embedding_dim,
                    input_length=max_len))

# BiLSTM + extra Dense layer
model.add(Bidirectional(LSTM(128, return_sequences=False)))
model.add(Dropout(0.5))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(num_classes, activation="softmax"))   # multi-class output

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()




In [12]:
batch_size = 64
epochs = 15   # we will stop early if overfitting

early_stop = EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True
)

reduce_lr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=2,
    verbose=1
)

history = model.fit(
    X_train_seq, y_train,
    validation_data=(X_val_seq, y_val),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)


Epoch 1/15
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 106ms/step - accuracy: 0.3663 - loss: 2.0923 - val_accuracy: 0.5698 - val_loss: 1.4784 - learning_rate: 0.0010
Epoch 2/15
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 123ms/step - accuracy: 0.7245 - loss: 0.9326 - val_accuracy: 0.6867 - val_loss: 1.2170 - learning_rate: 0.0010
Epoch 3/15
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 122ms/step - accuracy: 0.8597 - loss: 0.4949 - val_accuracy: 0.7452 - val_loss: 1.0015 - learning_rate: 0.0010
Epoch 4/15
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 130ms/step - accuracy: 0.9149 - loss: 0.3016 - val_accuracy: 0.7450 - val_loss: 1.1209 - learning_rate: 0.0010
Epoch 5/15
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step - accuracy: 0.9436 - loss: 0.1996
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [15]:
# raw predictions
y_val_prob = model.predict(X_val_seq)
y_val_pred = np.argmax(y_val_prob, axis=1)

print("Confusion matrix:")
print(confusion_matrix(y_val, y_val_pred))

print("\nClassification report:")
print(classification_report(y_val, y_val_pred, digits=4))


[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step
Confusion matrix:
[[  5   0   7   0   0   2   0  10   0   0   0   0   0   9   2   1   0   0
    7  30]
 [  0 169   1   5   0   0   3   2   3   3   0   0   0   1  15   1   6   2
    1   2]
 [  2   1 534   2   3   3   3   7   0  20   0   0   8  29   5   3   7  35
  111  79]
 [  0   1   2  53   0   0   1   0   1   1   0   0   1   3   0   5   2   1
    4   2]
 [  0   0   1   0  91   1   0   0   0   0   0   0   0   0   0   1   0   0
    1   2]
 [  0   0   1   0   0 227   0   4   0   0   0   0   0   1   0   0   0   0
    1   8]
 [  1   2   3   0   0   0 106   0   2   6   0   0   0   0  10  11   0   0
    4   1]
 [  3   0   5   1   0   7   0 120   0   0   0   0   0   1   1   0   0   0
    0  22]
 [  0   1   0   0   0   0   0   1  19   1   0   0   0   0   3   4   0   0
    3   0]
 [  0   2  21   5   0   1   1   0   3 252   0   0   1   1  16   2   7   0
   22   2]
 [  0   1   0   1   0   0   0   1   3   1   0   0   0   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [16]:
# Save classification report to file
report_text = classification_report(y_val, y_val_pred, digits=4, zero_division=0)

with open("../results/rnn_classification_report.txt", "w") as f:
    f.write(report_text)

# Save confusion matrix
import numpy as np

np.savetxt("../results/rnn_confusion_matrix.csv", confusion_matrix(y_val, y_val_pred), fmt="%d", delimiter=",")
