In [23]:
import pandas as pd
import numpy as np
import re

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

from sklearn.metrics import classification_report, confusion_matrix

In [24]:
train = pd.read_csv("../data/train_data.csv")
valid = pd.read_csv("../data/valid_data.csv")

X_train = train["text"].astype(str)
y_train = train["label"].astype(int).values

X_val = valid["text"].astype(str)
y_val = valid["label"].astype(int).values

print("Train size:", len(X_train))
print("Valid size:", len(X_val))


Train size: 16990
Valid size: 4117


In [25]:
def clean_text(text):
    text = text.lower()
    # remove urls
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    # remove @mentions and hashtags symbols (keep word part)
    text = re.sub(r"@\w+", " ", text)
    text = re.sub(r"#", " ", text)
    # keep only letters and spaces
    text = re.sub(r"[^a-z\s]", " ", text)
    # collapse multiple spaces
    text = re.sub(r"\s+", " ", text).strip()
    return text

X_train_clean = X_train.apply(clean_text)
X_val_clean   = X_val.apply(clean_text)

X_train_clean.head()


0    here are thursday s biggest analyst calls appl...
1    buy las vegas sands as travel to singapore bui...
2    piper sandler downgrades docusign to sell citi...
3    analysts react to tesla s latest earnings brea...
4    netflix and its peers are set for a return to ...
Name: text, dtype: object

In [26]:
max_words = 20000   # vocab size
max_len = 60      # tweets are short; shorter seq helps

# number of labels in the dataset (multi-class)
num_classes = len(np.unique(y_train))

tokenizer = Tokenizer(num_words=max_words, oov_token="<unk>")
tokenizer.fit_on_texts(X_train_clean)

X_train_seq = pad_sequences(
    tokenizer.texts_to_sequences(X_train_clean),
    maxlen=max_len,
    padding="post",
    truncating="post"
)

X_val_seq = pad_sequences(
    tokenizer.texts_to_sequences(X_val_clean),
    maxlen=max_len,
    padding="post",
    truncating="post"
)

X_train_seq.shape, X_val_seq.shape


((16990, 60), (4117, 60))

In [27]:
embedding_dim = 100

model = Sequential()
model.add(Embedding(input_dim=max_words,
                    output_dim=embedding_dim,
                    input_length=max_len))

# BiLSTM + extra Dense layer
model.add(Bidirectional(LSTM(128, return_sequences=False)))
model.add(Dropout(0.5))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(num_classes, activation="softmax"))   # multi-class output

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()




In [28]:
batch_size = 64
epochs = 15   # we will stop early if overfitting

early_stop = EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True
)

reduce_lr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=2,
    verbose=1
)

history = model.fit(
    X_train_seq, y_train,
    validation_data=(X_val_seq, y_val),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)


Epoch 1/15
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 115ms/step - accuracy: 0.3625 - loss: 2.0977 - val_accuracy: 0.5120 - val_loss: 1.7195 - learning_rate: 0.0010
Epoch 2/15
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 130ms/step - accuracy: 0.7022 - loss: 0.9934 - val_accuracy: 0.6774 - val_loss: 1.2419 - learning_rate: 0.0010
Epoch 3/15
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 125ms/step - accuracy: 0.8473 - loss: 0.5277 - val_accuracy: 0.7267 - val_loss: 1.1547 - learning_rate: 0.0010
Epoch 4/15
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 125ms/step - accuracy: 0.9031 - loss: 0.3358 - val_accuracy: 0.7323 - val_loss: 1.3723 - learning_rate: 0.0010
Epoch 5/15
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step - accuracy: 0.9400 - loss: 0.2123
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [29]:
# raw predictions
y_val_prob = model.predict(X_val_seq)
y_val_pred = np.argmax(y_val_prob, axis=1)

print("Confusion matrix:")
print(confusion_matrix(y_val, y_val_pred))

print("\nClassification report:")
print(classification_report(y_val, y_val_pred, digits=4))


[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step
Confusion matrix:
[[ 17   0   5   1   0   2   0   7   0   4   0   0   0   1   1   1   0   0
   10  24]
 [  0 174   0   0   0   0   1   0   2   0   0   0   1   0  20   2   8   4
    2   0]
 [ 16   2 516   1   1   6   4   8   1  44   0   0   2  11  19   3   1  15
  175  27]
 [  1  12   2  19   0   1  12   0   3   5   0   0   1   0   5  10   0   0
    3   3]
 [  0   0   0   0  90   1   0   1   0   0   0   0   0   0   0   3   0   0
    0   2]
 [  1   0   3   0   1 229   0   4   0   0   0   0   0   0   0   1   0   0
    1   2]
 [  2   0   0   0   0   0 117   0   0   4   0   0   0   0  13   6   1   0
    3   0]
 [  4   0   4   0   0   6   0 126   2   0   0   0   0   0   0   0   0   2
    2  14]
 [  0   6   0   1   0   0   2   0  17   0   0   0   0   0   4   2   0   0
    0   0]
 [  1   3  11   0   0   0   9   0   2 242   0   0   0   0  28   1  17   0
   21   1]
 [  0   0   1   1   0   0   5   1   1   0   0   0   1   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [30]:
# Save classification report to file
report_text = classification_report(y_val, y_val_pred, digits=4, zero_division=0)

with open("../results/rnn_classification_report.txt", "w") as f:
    f.write(report_text)

# Save confusion matrix
import numpy as np

np.savetxt("../results/rnn_confusion_matrix.csv", confusion_matrix(y_val, y_val_pred), fmt="%d", delimiter=",")


FileNotFoundError: [Errno 2] No such file or directory: '../results/rnn_classification_report.txt'