In [17]:
import os
import re
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.utils import class_weight

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Embedding,
    LSTM,
    Bidirectional,
    Dense,
    Dropout,
    SpatialDropout1D,
    BatchNormalization,
)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

In [18]:
# Reproducibility: a single call seeds Python's `random`, NumPy and TensorFlow.
# (Seeding only NumPy and TensorFlow leaves Python's `random` module unseeded.)
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Show up to 100 characters per cell when DataFrames with long text are displayed.
pd.set_option("display.max_colwidth", 100)

In [19]:
# Read the train / validation CSVs. Missing `text` values are replaced with
# empty strings right after the read, so later steps always receive str values.
train_df = pd.read_csv("../data/train_data.csv").assign(
    text=lambda frame: frame["text"].fillna("")
)
valid_df = pd.read_csv("../data/valid_data.csv").assign(
    text=lambda frame: frame["text"].fillna("")
)

In [None]:
# Text cleaning
def clean_financial_text(text):
    """Normalize a raw text string before tokenization.

    Steps, in order: lowercase; drop URLs and @mentions; replace HTML entities
    (named such as ``&amp;`` and numeric such as ``&#39;`` / ``&#x27;``) with a
    space; replace every character other than a-z, 0-9, whitespace, ``$``,
    ``%``, ``.`` and ``,`` with a space; glue ``$`` to a following ticker and
    ``%`` to a preceding number; collapse runs of whitespace.

    Args:
        text: The raw text. Any non-``str`` value (e.g. ``None`` or NaN) is
            treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # URLs. `http\S+` already covers "https...", so no separate alternative is needed.
    text = re.sub(r"http\S+|www\.\S+", "", text)

    # Mentions
    text = re.sub(r"@\w+", "", text)

    # HTML entities. The optional `#` also catches numeric entities; without it
    # "&#39;" is not matched and its digits survive as a stray "39" token.
    text = re.sub(r"&#?\w+;", " ", text)

    # Keep letters, numbers, spaces, $, %, dots, commas
    text = re.sub(r"[^a-z0-9\s$%.,]", " ", text)

    # Normalize stock tickers: "$ aapl" -> "$aapl"
    text = re.sub(r"\$\s+([a-z]+)", r"$\1", text)

    # Normalize percentages: "10  %" -> "10%"
    text = re.sub(r"(\d+)\s*%", r"\1%", text)

    # Collapse whitespace left behind by the substitutions above.
    return re.sub(r"\s+", " ", text).strip()


In [None]:
# Tokenizer / sequence settings
max_words = 20000   # vocabulary cap handed to Tokenizer(num_words=...)
max_len = 60        # every sequence is padded / truncated to this many tokens

# NOTE(review): X_train_clean, X_val_clean and y_train are not defined in any
# cell visible above this one -- confirm the cell that creates them still
# exists, otherwise Restart & Run All stops here with a NameError.

# Count of distinct label values in the training labels (multi-class).
num_classes = np.unique(y_train).size

# The vocabulary is fitted on the training texts only; "<unk>" is the OOV token.
# NOTE(review): the Tokenizer is built with its default `filters`; if those
# defaults strip `$`, `%`, `.` or `,`, the symbols that clean_financial_text
# deliberately keeps are dropped again here -- verify against the Keras docs.
tokenizer = Tokenizer(oov_token="<unk>", num_words=max_words)
tokenizer.fit_on_texts(X_train_clean)

# Encode both splits identically: texts -> integer ids -> post-padded /
# post-truncated arrays of width max_len.
X_train_seq, X_val_seq = (
    pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=max_len,
        padding="post",
        truncating="post",
    )
    for texts in (X_train_clean, X_val_clean)
)

X_train_seq.shape, X_val_seq.shape


In [None]:
embedding_dim = 100

# Architecture: Embedding -> BiLSTM (final state only) -> Dropout -> Dense(64)
# -> Dropout -> softmax over num_classes.
# NOTE(review): the Embedding is created without mask_zero, so the padded
# positions produced by pad_sequences are fed to the LSTM like any other
# timestep -- confirm this is intended.
model = Sequential([
    Embedding(
        input_dim=max_words,
        output_dim=embedding_dim,
        input_length=max_len,
    ),
    Bidirectional(LSTM(128, return_sequences=False)),
    Dropout(0.5),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(num_classes, activation="softmax"),   # multi-class output
])

# Sparse categorical cross-entropy: used with the softmax output above.
# NOTE(review): presumably the labels are integer class ids -- confirm.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

model.summary()


In [None]:
# Training hyperparameters
batch_size = 64
epochs = 15   # upper bound; the EarlyStopping callback below can end training sooner

# Multiply the learning rate by 0.5 after 2 epochs without val_loss improvement.
reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, verbose=1)

# Stop after 3 epochs without val_loss improvement (restore_best_weights is on).
early_stop = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

# Callbacks are passed in the order [early_stop, reduce_lr].
history = model.fit(
    x=X_train_seq,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val_seq, y_val),
    callbacks=[early_stop, reduce_lr],
    verbose=1,
)


In [None]:
# Model outputs for the validation set; the predicted class is the argmax
# along axis 1 of each row.
y_val_prob = model.predict(X_val_seq)
y_val_pred = np.argmax(y_val_prob, axis=1)

print("Confusion matrix:")
print(confusion_matrix(y_val, y_val_pred))

# zero_division=0 matches the report that is written to disk later, so the
# printed and the saved report are produced with identical settings.
print("\nClassification report:")
print(classification_report(y_val, y_val_pred, digits=4, zero_division=0))


In [None]:
# Make sure the output directory exists; on a fresh checkout the writes below
# would otherwise fail with FileNotFoundError.
os.makedirs("../results", exist_ok=True)

# Save classification report to file
report_text = classification_report(y_val, y_val_pred, digits=4, zero_division=0)

with open("../results/rnn_classification_report.txt", "w") as f:
    f.write(report_text)

# Save confusion matrix as integer CSV (numpy is already imported as `np` in
# the top import cell, so no re-import is needed here).
np.savetxt(
    "../results/rnn_confusion_matrix.csv",
    confusion_matrix(y_val, y_val_pred),
    fmt="%d",
    delimiter=",",
)
