In [None]:
# Download the aclImdb dataset
!wget -nc http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xzf aclImdb_v1.tar.gz

In [None]:
!pip install gensim sentence_transformers

In [6]:
# Imports and logging
import os
import re
import string
import time
import logging
from datetime import datetime
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models, optimizers, callbacks

import gensim
from gensim.models import Word2Vec
import gensim.downloader as api

from sentence_transformers import SentenceTransformer

# Clean logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
logger = logging.getLogger(__name__)

# Ensure reproducibility (basic)
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)


In [7]:
DATA_DIR = "/content/aclImdb"  # update if needed

def read_reviews(base_dir: str) -> Tuple[List[str], List[int], List[str], List[int]]:
    """
    Reads IMDB reviews keeping the original split: train/test.
    Labels: pos=1, neg=0.
    """
    def read_folder(path: str, label: int):
        texts, labels = [], []
        for fname in sorted(os.listdir(path)):
            fpath = os.path.join(path, fname)
            if os.path.isfile(fpath):
                with open(fpath, "r", encoding="utf-8") as f:
                    texts.append(f.read())
                    labels.append(label)
        return texts, labels

    train_pos, y_train_pos = read_folder(os.path.join(base_dir, "train", "pos"), 1)
    train_neg, y_train_neg = read_folder(os.path.join(base_dir, "train", "neg"), 0)
    test_pos, y_test_pos = read_folder(os.path.join(base_dir, "test", "pos"), 1)
    test_neg, y_test_neg = read_folder(os.path.join(base_dir, "test", "neg"), 0)

    X_train = train_pos + train_neg
    y_train = y_train_pos + y_train_neg
    X_test = test_pos + test_neg
    y_test = y_test_pos + y_test_neg
    return X_train, y_train, X_test, y_test

logger.info("Loading dataset...")
X_train_raw, y_train, X_test_raw, y_test = read_reviews(DATA_DIR)
logger.info(f"Loaded train={len(X_train_raw)}, test={len(X_test_raw)}")


In [8]:
# Text cleaning: remove HTML, lowercase, remove punctuation, simple tokenization by whitespace
HTML_RE = re.compile(r"<.*?>")
PUNCT_TABLE = str.maketrans("", "", string.punctuation)

def clean_text(text: str) -> str:
    text = HTML_RE.sub(" ", text)
    text = text.lower()
    text = text.translate(PUNCT_TABLE)
    tokens = text.split()
    return " ".join(tokens)

logger.info("Cleaning texts...")
X_train = [clean_text(t) for t in X_train_raw]
X_test = [clean_text(t) for t in X_test_raw]


In [9]:
# Tokenizer: build vocabulary on training set for word-index sequences
VOCAB_SIZE = 50000  # cap vocab for stability
MAX_LEN = 128       # LSTM sequence length (truncate/pad to this)

tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

def to_sequences(texts: List[str], max_len: int = MAX_LEN) -> np.ndarray:
    seqs = tokenizer.texts_to_sequences(texts)
    return pad_sequences(seqs, maxlen=max_len, padding="post", truncating="post")

# Metrics and confusion matrix
def evaluate_and_log(name: str, y_true: List[int], y_pred: List[int]) -> Dict[str, float]:
    acc = accuracy_score(y_true, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
    cm = confusion_matrix(y_true, y_pred)
    logger.info(f"[{name}] Test Accuracy={acc:.4f} | Precision={prec:.4f} | Recall={rec:.4f} | F1={f1:.4f}")
    logger.info(f"[{name}] Confusion matrix:\n{cm}")
    return {"Embedding": name, "Accuracy": acc, "Precision": prec, "Recall": rec, "F1": f1}

# Simple LSTM model builder (Keras)
def build_lstm(input_dim: int, embed_dim: int, lstm_hidden: int, num_layers: int, dropout: float,
               embedding_matrix: np.ndarray = None, trainable_embed: bool = True,
               time_steps: int = MAX_LEN, feature_dim: int = None):
    """
    Builds an LSTM classifier.
    Two modes:
      - Token index input + Embedding layer (use input_dim & embedding_matrix)
      - Precomputed feature sequences (use feature_dim and skip Embedding)
    """
    inputs = layers.Input(shape=(time_steps,)) if feature_dim is None else layers.Input(shape=(time_steps, feature_dim))

    if feature_dim is None:
        # Token index input with Embedding
        if embedding_matrix is not None:
            emb = layers.Embedding(input_dim=input_dim, output_dim=embed_dim,
                                   weights=[embedding_matrix], trainable=trainable_embed, mask_zero=True)(inputs)
        else:
            emb = layers.Embedding(input_dim=input_dim, output_dim=embed_dim, trainable=trainable_embed, mask_zero=True)(inputs)
        x = emb
    else:
        # Feature sequences provided (e.g., tf-idf per token, BERT chunk features)
        x = inputs

    # Stack LSTM layers
    for i in range(num_layers):
        return_sequences = (i < num_layers - 1)
        x = layers.LSTM(lstm_hidden, return_sequences=return_sequences)(x)
        x = layers.Dropout(dropout)(x)

    out = layers.Dense(1, activation="sigmoid")(x)
    model = models.Model(inputs, out)
    return model

# Training loop with logging (per epoch)
def train_lstm(model, X_train_data, y_train, X_val_data, y_val,
               lr=1e-3, batch_size=64, epochs=5, name="model"):
    logger.info(f"[{name}] Training start: {datetime.now()}")
    model.compile(optimizer=optimizers.Adam(learning_rate=lr),
                  loss="binary_crossentropy",
                  metrics=["accuracy"])
    hist = model.fit(
        X_train_data, np.array(y_train),
        validation_data=(X_val_data, np.array(y_val)),
        batch_size=batch_size,
        epochs=epochs,
        callbacks=[callbacks.EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)],
        verbose=1
    )
    logger.info(f"[{name}] Training end: {datetime.now()}")
    return hist


In [10]:
# Build sequences for token-index inputs (used by W2V and pretrained embeddings)
X_train_seq = to_sequences(X_train, MAX_LEN)
X_test_seq = to_sequences(X_test, MAX_LEN)

# Build a small validation split from training data (stratified would be ideal; simple split here)
VAL_RATIO = 0.1
n_train = len(X_train)
n_val = int(n_train * VAL_RATIO)

X_val_seq = X_train_seq[:n_val]
y_val = y_train[:n_val]
X_train_seq_model = X_train_seq[n_val:]
y_train_model = y_train[n_val:]


In [None]:
# Compute TF-IDF for vocabulary terms, then map to per-token weights in sequence order.
tfidf_uni_vec = TfidfVectorizer(ngram_range=(1,1), max_features=VOCAB_SIZE)
tfidf_uni_vec.fit(X_train)

# Build per-token tf-idf weight sequences
def tfidf_sequence(texts, tokenizer, vectorizer, max_len=MAX_LEN):
    # Map token -> tf-idf idf weight (global). We'll use IDF for per-token static weight.
    idf_map = {t: vectorizer.idf_[i] for t, i in vectorizer.vocabulary_.items()}
    seqs = tokenizer.texts_to_sequences(texts)
    feats = []
    for seq, txt in zip(seqs, texts):
        tokens = txt.split()
        # Align indices in sequence to token list length
        weights = []
        ti = 0
        for idx in seq[:max_len]:
            # find the original token string; fallback to zero if out-of-range
            tok = tokens[ti] if ti < len(tokens) else ""
            ti += 1
            w = idf_map.get(tok, 0.0)
            weights.append([w])
        # pad
        while len(weights) < max_len:
            weights.append([0.0])
        feats.append(weights)
    return np.array(feats, dtype=np.float32)

X_train_tfidf_uni_feat_all = tfidf_sequence(X_train, tokenizer, tfidf_uni_vec, MAX_LEN)
X_test_tfidf_uni_feat = tfidf_sequence(X_test, tokenizer, tfidf_uni_vec, MAX_LEN)

# Create val split aligned with earlier split
X_val_tfidf_uni_feat = X_train_tfidf_uni_feat_all[:n_val]
X_train_tfidf_uni_feat = X_train_tfidf_uni_feat_all[n_val:]

# Hyperparameters to try (small grid)
grid_tfidf_uni = [
    {"lstm_hidden": 64, "num_layers": 1, "dropout": 0.2, "lr": 1e-3, "batch_size": 64, "epochs": 4},
    {"lstm_hidden": 128, "num_layers": 1, "dropout": 0.3, "lr": 1e-3, "batch_size": 64, "epochs": 4},
]

best_val_acc_tfidf_uni = 0.0
best_params_tfidf_uni = None
best_model_tfidf_uni = None

for p in grid_tfidf_uni:
    name = f"TF-IDF(unigram)_LSTM_{p}"
    logger.info(f"=== Training {name} ===")
    model = build_lstm(
        input_dim=None, embed_dim=None, lstm_hidden=p["lstm_hidden"], num_layers=p["num_layers"],
        dropout=p["dropout"], embedding_matrix=None, trainable_embed=False,
        time_steps=MAX_LEN, feature_dim=1  # feature sequence: 1-d per token
    )
    hist = train_lstm(model, X_train_tfidf_uni_feat, y_train_model, X_val_tfidf_uni_feat, y_val,
                      lr=p["lr"], batch_size=p["batch_size"], epochs=p["epochs"], name=name)
    val_acc = max(hist.history["val_accuracy"])
    if val_acc > best_val_acc_tfidf_uni:
        best_val_acc_tfidf_uni = val_acc
        best_params_tfidf_uni = p
        best_model_tfidf_uni = model

logger.info(f"[TF-IDF(unigram)] Best val accuracy: {best_val_acc_tfidf_uni:.4f} | Params: {best_params_tfidf_uni}")

# Test evaluation
y_pred_prob = best_model_tfidf_uni.predict(X_test_tfidf_uni_feat, batch_size=128).ravel()
y_pred = (y_pred_prob >= 0.5).astype(int)
metrics_tfidf_uni = evaluate_and_log("TF-IDF (unigram) seq", y_test, y_pred)


In [None]:
# Build bigram-augmented texts
def add_bigrams(texts):
    augmented = []
    for t in texts:
        toks = t.split()
        bigrams = ["{}_{}".format(toks[i], toks[i+1]) for i in range(len(toks)-1)]
        augmented.append(t + " " + " ".join(bigrams))
    return augmented

X_train_aug = add_bigrams(X_train)
X_test_aug = add_bigrams(X_test)

# Fit TF-IDF on augmented text (unigram+bigram)
tfidf_unibi_vec = TfidfVectorizer(ngram_range=(1,2), max_features=VOCAB_SIZE)
tfidf_unibi_vec.fit(X_train_aug)

# Build sequences using original tokenizer indexes but feature from augmented vocabulary IDF
def tfidf_unibi_sequence(texts, tokenizer, vectorizer, max_len=MAX_LEN):
    idf_map = {t: vectorizer.idf_[i] for t, i in vectorizer.vocabulary_.items()}
    seqs = tokenizer.texts_to_sequences(texts)
    feats = []
    for seq, txt in zip(seqs, texts):
        tokens = txt.split()
        weights = []
        ti = 0
        for idx in seq[:max_len]:
            tok = tokens[ti] if ti < len(tokens) else ""
            ti += 1
            # try unigram IDF first; bigram signal is indirectly captured via augmented fitting
            w = idf_map.get(tok, 0.0)
            weights.append([w])
        while len(weights) < max_len:
            weights.append([0.0])
        feats.append(weights)
    return np.array(feats, dtype=np.float32)

X_train_tfidf_unibi_feat_all = tfidf_unibi_sequence(X_train, tokenizer, tfidf_unibi_vec, MAX_LEN)
X_test_tfidf_unibi_feat = tfidf_unibi_sequence(X_test, tokenizer, tfidf_unibi_vec, MAX_LEN)
X_val_tfidf_unibi_feat = X_train_tfidf_unibi_feat_all[:n_val]
X_train_tfidf_unibi_feat = X_train_tfidf_unibi_feat_all[n_val:]

grid_tfidf_unibi = [
    {"lstm_hidden": 64, "num_layers": 1, "dropout": 0.2, "lr": 1e-3, "batch_size": 64, "epochs": 4},
    {"lstm_hidden": 128, "num_layers": 1, "dropout": 0.3, "lr": 1e-3, "batch_size": 64, "epochs": 4},
]

best_val_acc_tfidf_unibi = 0.0
best_params_tfidf_unibi = None
best_model_tfidf_unibi = None

for p in grid_tfidf_unibi:
    name = f"TF-IDF(unigram+bigram)_LSTM_{p}"
    logger.info(f"=== Training {name} ===")
    model = build_lstm(
        input_dim=None, embed_dim=None, lstm_hidden=p["lstm_hidden"], num_layers=p["num_layers"],
        dropout=p["dropout"], embedding_matrix=None, trainable_embed=False,
        time_steps=MAX_LEN, feature_dim=1
    )
    hist = train_lstm(model, X_train_tfidf_unibi_feat, y_train_model, X_val_tfidf_unibi_feat, y_val,
                      lr=p["lr"], batch_size=p["batch_size"], epochs=p["epochs"], name=name)
    val_acc = max(hist.history["val_accuracy"])
    if val_acc > best_val_acc_tfidf_unibi:
        best_val_acc_tfidf_unibi = val_acc
        best_params_tfidf_unibi = p
        best_model_tfidf_unibi = model

logger.info(f"[TF-IDF(unigram+bigram)] Best val accuracy: {best_val_acc_tfidf_unibi:.4f} | Params: {best_params_tfidf_unibi}")

y_pred_prob = best_model_tfidf_unibi.predict(X_test_tfidf_unibi_feat, batch_size=128).ravel()
y_pred = (y_pred_prob >= 0.5).astype(int)
metrics_tfidf_unibi = evaluate_and_log("TF-IDF (unigram+bigram) seq", y_test, y_pred)


In [None]:
# Train Word2Vec (CBOW)
train_tokens = [t.split() for t in X_train]
w2v_dim = 100
logger.info("[Word2Vec CBOW] Training gensim Word2Vec...")
w2v_cbow = Word2Vec(sentences=train_tokens, vector_size=w2v_dim, window=5, min_count=2, workers=os.cpu_count(), sg=0, epochs=5)

# Build embedding matrix aligned to Keras tokenizer index
word_index = tokenizer.word_index
num_words = min(VOCAB_SIZE, len(word_index) + 1)
embedding_matrix_w2v = np.random.normal(scale=0.01, size=(num_words, w2v_dim)).astype(np.float32)

for word, idx in word_index.items():
    if idx >= num_words:
        continue
    if word in w2v_cbow.wv:
        embedding_matrix_w2v[idx] = w2v_cbow.wv[word]

# Hyperparameter grid
grid_w2v = [
    {"embed_dim": w2v_dim, "lstm_hidden": 128, "num_layers": 1, "dropout": 0.3, "lr": 1e-3, "batch_size": 64, "epochs": 4, "trainable_embed": True},
    {"embed_dim": w2v_dim, "lstm_hidden": 128, "num_layers": 2, "dropout": 0.3, "lr": 1e-3, "batch_size": 64, "epochs": 4, "trainable_embed": True},
]

best_val_acc_w2v = 0.0
best_params_w2v = None
best_model_w2v = None

for p in grid_w2v:
    name = f"Word2Vec(trainable)_LSTM_{p}"
    logger.info(f"=== Training {name} ===")
    model = build_lstm(
        input_dim=num_words, embed_dim=p["embed_dim"], lstm_hidden=p["lstm_hidden"], num_layers=p["num_layers"],
        dropout=p["dropout"], embedding_matrix=embedding_matrix_w2v, trainable_embed=p["trainable_embed"],
        time_steps=MAX_LEN
    )
    hist = train_lstm(model, X_train_seq_model, y_train_model, X_val_seq, y_val,
                      lr=p["lr"], batch_size=p["batch_size"], epochs=p["epochs"], name=name)
    val_acc = max(hist.history["val_accuracy"])
    if val_acc > best_val_acc_w2v:
        best_val_acc_w2v = val_acc
        best_params_w2v = p
        best_model_w2v = model

logger.info(f"[Word2Vec(trainable)] Best val accuracy: {best_val_acc_w2v:.4f} | Params: {best_params_w2v}")

y_pred_prob = best_model_w2v.predict(X_test_seq, batch_size=128).ravel()
y_pred = (y_pred_prob >= 0.5).astype(int)
metrics_w2v = evaluate_and_log("Word2Vec (trainable)", y_test, y_pred)


In [None]:
# Load pretrained embeddings
try:
    kv = api.load("word2vec-google-news-300")  # large (~1.5GB)
    pre_dim = 300
    pre_name = "Pretrained Word2Vec (GoogleNews)"
except Exception:
    kv = api.load("fasttext-wiki-news-subwords-300")
    pre_dim = 300
    pre_name = "Pretrained FastText (WikiNews)"

word_index = tokenizer.word_index
num_words = min(VOCAB_SIZE, len(word_index) + 1)
embedding_matrix_pre = np.random.normal(scale=0.01, size=(num_words, pre_dim)).astype(np.float32)

for word, idx in word_index.items():
    if idx >= num_words:
        continue
    if word in kv:
        embedding_matrix_pre[idx] = kv[word]

grid_pre = [
    {"embed_dim": pre_dim, "lstm_hidden": 128, "num_layers": 1, "dropout": 0.3, "lr": 1e-3, "batch_size": 64, "epochs": 4, "trainable_embed": False},
    {"embed_dim": pre_dim, "lstm_hidden": 128, "num_layers": 2, "dropout": 0.3, "lr": 1e-3, "batch_size": 64, "epochs": 4, "trainable_embed": False},
]

best_val_acc_pre = 0.0
best_params_pre = None
best_model_pre = None

for p in grid_pre:
    name = f"{pre_name}_LSTM_{p}"
    logger.info(f"=== Training {name} ===")
    model = build_lstm(
        input_dim=num_words, embed_dim=p["embed_dim"], lstm_hidden=p["lstm_hidden"], num_layers=p["num_layers"],
        dropout=p["dropout"], embedding_matrix=embedding_matrix_pre, trainable_embed=p["trainable_embed"],
        time_steps=MAX_LEN
    )
    hist = train_lstm(model, X_train_seq_model, y_train_model, X_val_seq, y_val,
                      lr=p["lr"], batch_size=p["batch_size"], epochs=p["epochs"], name=name)
    val_acc = max(hist.history["val_accuracy"])
    if val_acc > best_val_acc_pre:
        best_val_acc_pre = val_acc
        best_params_pre = p
        best_model_pre = model

logger.info(f"[{pre_name}] Best val accuracy: {best_val_acc_pre:.4f} | Params: {best_params_pre}")

y_pred_prob = best_model_pre.predict(X_test_seq, batch_size=128).ravel()
y_pred = (y_pred_prob >= 0.5).astype(int)
metrics_pre = evaluate_and_log(pre_name, y_test, y_pred)


In [None]:
# Chunk helper: split review into fixed-size word chunks
def chunk_text(text: str, chunk_size: int = 32, max_chunks: int = 8):
    tokens = text.split()
    chunks = []
    for i in range(0, min(len(tokens), chunk_size * max_chunks), chunk_size):
        chunk = " ".join(tokens[i:i+chunk_size])
        if len(chunk) > 0:
            chunks.append(chunk)
    # pad with empty chunks
    while len(chunks) < max_chunks:
        chunks.append("")
    return chunks[:max_chunks]

# Build chunked sequences
CHUNK_SIZE = 32
MAX_CHUNKS = 8  # sequence length for LSTM in this experiment
logger.info("[BERT] Preparing chunked inputs...")
X_train_chunks = [chunk_text(t, CHUNK_SIZE, MAX_CHUNKS) for t in X_train]
X_test_chunks = [chunk_text(t, CHUNK_SIZE, MAX_CHUNKS) for t in X_test]
X_val_chunks = X_train_chunks[:n_val]
X_train_chunks_model = X_train_chunks[n_val:]

# Encode chunks with sentence-transformers (each chunk -> 384-d for all-MiniLM-L6-v2)
st = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
logger.info("[BERT] Encoding chunks for train/val/test...")
def encode_chunks(chunk_lists):
    # Flatten to encode in batches, then reshape back
    flat = [c for chunks in chunk_lists for c in chunks]
    embs = st.encode(flat, batch_size=128, convert_to_numpy=True, show_progress_bar=True)
    # reshape: (num_docs, MAX_CHUNKS, emb_dim)
    emb_dim = embs.shape[1]
    arr = embs.reshape((len(chunk_lists), MAX_CHUNKS, emb_dim))
    return arr, emb_dim

X_train_bert_seq, bert_dim = encode_chunks(X_train_chunks_model)
X_val_bert_seq, _ = encode_chunks(X_val_chunks)
X_test_bert_seq, _ = encode_chunks(X_test_chunks)

grid_bert = [
    {"lstm_hidden": 128, "num_layers": 1, "dropout": 0.3, "lr": 1e-3, "batch_size": 32, "epochs": 4},
    {"lstm_hidden": 128, "num_layers": 2, "dropout": 0.3, "lr": 1e-3, "batch_size": 32, "epochs": 4},
]

best_val_acc_bert = 0.0
best_params_bert = None
best_model_bert = None

for p in grid_bert:
    name = f"BERT(chunks)_LSTM_{p}"
    logger.info(f"=== Training {name} ===")
    model = build_lstm(
        input_dim=None, embed_dim=None, lstm_hidden=p["lstm_hidden"], num_layers=p["num_layers"],
        dropout=p["dropout"], embedding_matrix=None, trainable_embed=False,
        time_steps=MAX_CHUNKS, feature_dim=bert_dim  # feature sequences (chunk embeddings)
    )
    hist = train_lstm(model, X_train_bert_seq, y_train_model, X_val_bert_seq, y_val,
                      lr=p["lr"], batch_size=p["batch_size"], epochs=p["epochs"], name=name)
    val_acc = max(hist.history["val_accuracy"])
    if val_acc > best_val_acc_bert:
        best_val_acc_bert = val_acc
        best_params_bert = p
        best_model_bert = model

logger.info(f"[BERT(chunks)] Best val accuracy: {best_val_acc_bert:.4f} | Params: {best_params_bert}")

y_pred_prob = best_model_bert.predict(X_test_bert_seq, batch_size=128).ravel()
y_pred = (y_pred_prob >= 0.5).astype(int)
metrics_bert = evaluate_and_log("BERT chunked embeddings", y_test, y_pred)


In [17]:
# Collect metrics and best params
results = [
    metrics_tfidf_uni,
    metrics_tfidf_unibi,
    metrics_w2v,
    metrics_pre,
    metrics_bert
]

best_params = [
    {"Embedding": "TF-IDF (unigram) seq", "Best Val Acc": float(f"{best_val_acc_tfidf_uni:.4f}"), "Best Params": best_params_tfidf_uni},
    {"Embedding": "TF-IDF (unigram+bigram) seq", "Best Val Acc": float(f"{best_val_acc_tfidf_unibi:.4f}"), "Best Params": best_params_tfidf_unibi},
    {"Embedding": "Word2Vec (trainable)", "Best Val Acc": float(f"{best_val_acc_w2v:.4f}"), "Best Params": best_params_w2v},
    {"Embedding": f"{pre_name}", "Best Val Acc": float(f"{best_val_acc_pre:.4f}"), "Best Params": best_params_pre},
    {"Embedding": "BERT chunked embeddings", "Best Val Acc": float(f"{best_val_acc_bert:.4f}"), "Best Params": best_params_bert},
]

df_metrics = pd.DataFrame(results)
df_params = pd.DataFrame(best_params)

# Merge and present key columns
df_final = df_params[["Embedding", "Best Val Acc", "Best Params"]].merge(
    df_metrics[["Embedding", "Accuracy", "F1"]],
    on="Embedding",
    how="left"
)

df_final = df_final.sort_values(by="F1", ascending=False).reset_index(drop=True)

print("\n=== Final Comparison (sorted by Test F1) ===")
print(df_final.to_string(index=False))



=== Final Comparison (sorted by Test F1) ===
                       Embedding  Best Val Acc                                                                                                                                   Best Params  Accuracy       F1
         BERT chunked embeddings        0.8116                                             {'lstm_hidden': 128, 'num_layers': 2, 'dropout': 0.3, 'lr': 0.001, 'batch_size': 32, 'epochs': 4}   0.83896 0.836766
            Word2Vec (trainable)        0.7824  {'embed_dim': 200, 'lstm_hidden': 128, 'num_layers': 1, 'dropout': 0.3, 'lr': 0.001, 'batch_size': 64, 'epochs': 4, 'trainable_embed': True}   0.82064 0.809256
Pretrained Word2Vec (GoogleNews)        0.8916 {'embed_dim': 300, 'lstm_hidden': 128, 'num_layers': 2, 'dropout': 0.3, 'lr': 0.001, 'batch_size': 64, 'epochs': 4, 'trainable_embed': False}   0.65964 0.725188
     TF-IDF (unigram+bigram) seq        0.0228                                              {'lstm_hidden': 64, 'num_layer