# RNN Text Classification with Multiple Embeddings
This notebook trains a separate SimpleRNN binary classifier on each of three text representations: TF-IDF features, Word2Vec Skip-gram embeddings, and Word2Vec CBOW embeddings.

All three models share the same text cleaning and the same train/validation/test split, and are compared on test-set accuracy and F1.

In [None]:
# Core libraries
import os
import re
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Keras / TensorFlow
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Embedding, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Gensim for Word2Vec
from gensim.models import Word2Vec

# Reproducibility: apply one shared seed to the stdlib, NumPy and TensorFlow RNGs
SEED = 42
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

In [None]:
# Load dataset: the CSV is addressed relative to the current working directory
path_parts = ("..", "data", "Reviews.csv")
data_path = os.path.join(*path_parts)
df = pd.read_csv(data_path)

# Report (rows, columns), then preview the first rows as the cell output
print(df.shape)
df.head()

In [None]:
# Select text and label columns with a robust fallback
def pick_columns(dataframe):
    """Choose the text and label columns from lists of known candidate names.

    Candidates are tried in priority order. An exact name match always wins;
    only when no candidate matches exactly is a case-insensitive match
    accepted, so that e.g. a lowercase ``score`` or uppercase ``TEXT`` column
    is still found.

    Args:
        dataframe: pandas DataFrame whose ``columns`` are searched.

    Returns:
        Tuple ``(text_col, label_col)`` holding the column names exactly as
        they appear in ``dataframe.columns``.

    Raises:
        ValueError: if no text column or no label column can be found.
    """
    text_col_candidates = ["Text", "text", "review", "review_text", "sentence"]
    label_col_candidates = ["Score", "label", "sentiment", "Sentiment", "target"]

    # Lowercased name -> first real column carrying that name, for the fallback pass.
    columns_by_lower = {}
    for column in dataframe.columns:
        columns_by_lower.setdefault(str(column).lower(), column)

    def _first_match(candidates):
        # Pass 1: exact match, in candidate priority order.
        exact = next((c for c in candidates if c in dataframe.columns), None)
        if exact is not None:
            return exact
        # Pass 2: case-insensitive match, returning the column's real name.
        return next(
            (columns_by_lower[c.lower()] for c in candidates if c.lower() in columns_by_lower),
            None,
        )

    text_col = _first_match(text_col_candidates)
    label_col = _first_match(label_col_candidates)

    if text_col is None:
        raise ValueError("No text column found. Update text_col_candidates.")
    if label_col is None:
        raise ValueError("No label column found. Update label_col_candidates.")
    return text_col, label_col

text_col, label_col = pick_columns(df)
print("Using text column:", text_col)
print("Using label column:", label_col)

# Work on an independent copy of just the two relevant columns, without missing values.
data = df[[text_col, label_col]].dropna().copy()

# Binarize if label column is a review score
if label_col.lower() == "score":
    # Drop rows scored exactly 3, then map scores of 4 and above to 1 and the rest to 0.
    # The explicit .copy() makes the filtered frame independent, so the column
    # assignment below does not raise SettingWithCopyWarning on pandas versions
    # that run without copy-on-write.
    data = data[data[label_col] != 3].copy()
    data["label"] = (data[label_col] >= 4).astype(int)
else:
    # Assume labels are already binary or can be mapped
    unique_labels = data[label_col].unique()
    if set(unique_labels) <= {0, 1}:
        data["label"] = data[label_col].astype(int)
    else:
        # Simple mapping for common sentiment labels; rows whose label is not
        # in the map become NaN and are dropped before the integer cast.
        label_map = {"negative": 0, "positive": 1, "neg": 0, "pos": 1}
        data["label"] = data[label_col].astype(str).str.lower().map(label_map)
        data = data.dropna(subset=["label"]).copy()
        data["label"] = data["label"].astype(int)

# Build the final two-column frame without mutating in place; the copy keeps
# later column assignments on `data` from touching a view of a wider frame.
data = data.rename(columns={text_col: "text"})[["text", "label"]].copy()
print(data.head())
print(data["label"].value_counts())

In [None]:
# Basic cleaning
def clean_text(text):
    """Normalize one raw text value into lowercase, space-separated a-z words.

    The input is stringified and lowercased. Then, in order, anything between
    ``<`` and ``>`` on one line, tokens starting with ``http`` or ``www``,
    and every character that is not a-z or whitespace are each replaced by a
    space. Finally, whitespace runs collapse to one space and the ends are
    trimmed.
    """
    cleaned = str(text).lower()
    # Order matters: tags and URLs must go before the generic non-letter
    # filter, which would otherwise break up their delimiters.
    for pattern in (r"<.*?>", r"http\S+|www\S+", r"[^a-z\s]"):
        cleaned = re.sub(pattern, " ", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()

# Add the normalized text and its whitespace-token count as new columns.
data["clean_text"] = data["text"].map(clean_text)
data["text_len"] = data["clean_text"].map(lambda cleaned: len(cleaned.split()))
data.head()

In [None]:
# EDA: class balance and text length, side by side
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
ax_classes, ax_lengths = axes

# Left panel: number of rows per label
label_counts = data["label"].value_counts()
label_counts.plot.bar(ax=ax_classes, title="Class Distribution")

# Right panel: histogram of token counts per cleaned text
data["text_len"].plot.hist(bins=50, ax=ax_lengths, title="Text Length (tokens)")

plt.tight_layout()
plt.show()

In [None]:
# Train/val/test split (80% / 10% / 10%), stratified on the label at both steps
X = data["clean_text"].values
y = data["label"].values

# Step 1: hold out 20% of the rows.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
    stratify=y,
)

# Step 2: cut the holdout in half, one half for validation and one for testing.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=SEED,
    stratify=y_temp,
)

print("Train:", len(X_train), "Val:", len(X_val), "Test:", len(X_test))

In [None]:
# Shared utilities
def evaluate_binary(y_true, y_pred, label="model", threshold=0.5):
    """Threshold predicted scores and report binary classification metrics.

    Prints accuracy, F1 and the full classification report.

    Args:
        y_true: ground-truth 0/1 labels (any array-like).
        y_pred: predicted scores (any array-like; flattened before use, so a
            column-shaped ``(n, 1)`` model output is accepted).
        label: name shown in the printed summary line.
        threshold: scores greater than or equal to this value are predicted
            as class 1. Defaults to 0.5.

    Returns:
        Tuple ``(accuracy, f1, confusion_matrix)``.
    """
    # Coerce to flat arrays so plain lists and (n, 1) outputs work too;
    # a bare list would fail on the `>=` comparison.
    y_true = np.asarray(y_true).ravel()
    scores = np.asarray(y_pred).ravel()

    y_hat = (scores >= threshold).astype(int)
    acc = accuracy_score(y_true, y_hat)
    f1 = f1_score(y_true, y_hat)
    print(f"{label} Accuracy: {acc:.4f} | F1: {f1:.4f}")
    print(classification_report(y_true, y_hat))
    cm = confusion_matrix(y_true, y_hat)
    return acc, f1, cm

def build_rnn(input_shape, rnn_units=64, dropout=0.2):
    """Build and compile a SimpleRNN binary classifier.

    Architecture: Input -> SimpleRNN -> Dropout -> Dense(1, sigmoid), compiled
    with the Adam optimizer, binary cross-entropy loss and an accuracy metric.

    Args:
        input_shape: per-sample shape ``(timesteps, features)``.
        rnn_units: number of units in the SimpleRNN layer.
        dropout: dropout rate applied to the RNN output.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential([
        # An explicit Input layer replaces passing `input_shape` to the first
        # layer, which current Keras versions deprecate.
        tf.keras.Input(shape=input_shape),
        SimpleRNN(rnn_units),
        Dropout(dropout),
        Dense(1, activation="sigmoid"),
    ])
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model

def train_with_early_stop(model, X_train, y_train, X_val, y_val, epochs=10, batch_size=128):
    """Fit ``model`` with early stopping on the validation loss.

    Training stops once ``val_loss`` has failed to improve for two consecutive
    epochs, and the weights from the best epoch are restored.

    Returns:
        Whatever ``model.fit`` returns (the training history).
    """
    stopper = EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)
    fit_options = {
        "validation_data": (X_val, y_val),
        "epochs": epochs,
        "batch_size": batch_size,
        "callbacks": [stopper],
        "verbose": 1,
    }
    return model.fit(X_train, y_train, **fit_options)

In [None]:
# RNN with TF-IDF features
MAX_FEATURES = 2000

# float32 halves the memory of the dense arrays built below compared with
# the vectorizer's float64 default.
tfidf = TfidfVectorizer(max_features=MAX_FEATURES, ngram_range=(1, 2), dtype=np.float32)
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)
X_test_tfidf = tfidf.transform(X_test)

# max_features is only an upper bound: a small corpus yields fewer columns.
# Size the pseudo-sequence from the fitted matrix, because reshaping with a
# hard-coded MAX_FEATURES would then fail or silently mix values across samples.
n_tfidf_features = X_train_tfidf.shape[1]

# Convert to dense and reshape as a pseudo-sequence: (samples, timesteps, 1)
X_train_tfidf_rnn = X_train_tfidf.toarray().reshape(-1, n_tfidf_features, 1)
X_val_tfidf_rnn = X_val_tfidf.toarray().reshape(-1, n_tfidf_features, 1)
X_test_tfidf_rnn = X_test_tfidf.toarray().reshape(-1, n_tfidf_features, 1)

tfidf_rnn = build_rnn(input_shape=(n_tfidf_features, 1), rnn_units=64)
# Keep the history under its own name rather than rebinding IPython's `_`.
tfidf_history = train_with_early_stop(
    tfidf_rnn, X_train_tfidf_rnn, y_train, X_val_tfidf_rnn, y_val, epochs=8
)

tfidf_pred = tfidf_rnn.predict(X_test_tfidf_rnn).ravel()
tfidf_acc, tfidf_f1, tfidf_cm = evaluate_binary(y_test, tfidf_pred, label="TF-IDF RNN")

In [None]:
# RNN with Word2Vec embeddings (Skip-gram and CBOW)
MAX_VOCAB = 20000
MAX_LEN = 200
EMBED_DIM = 100

# The tokenizer is fitted on the training texts only.
tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Texts -> integer id sequences, one per split
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (X_train, X_val, X_test)
)

# Pad/truncate every sequence at its end to exactly MAX_LEN ids
pad_options = dict(maxlen=MAX_LEN, padding="post", truncating="post")
X_train_pad, X_val_pad, X_test_pad = (
    pad_sequences(split_seq, **pad_options)
    for split_seq in (X_train_seq, X_val_seq, X_test_seq)
)

word_index = tokenizer.word_index
# Number of embedding rows: capped at MAX_VOCAB, +1 because id 0 is not in word_index
vocab_size = min(MAX_VOCAB, len(word_index) + 1)

def train_word2vec(sentences, sg_mode=1, window=5, min_count=2, workers=4, epochs=10):
    """Train a gensim Word2Vec model with ``EMBED_DIM``-sized vectors.

    Args:
        sentences: iterable of token lists to train on.
        sg_mode: 1 for Skip-gram, 0 for CBOW (passed to gensim as ``sg``).
        window: context window size.
        min_count: words with a lower total frequency are ignored.
        workers: number of training threads.
        epochs: number of passes over ``sentences``.

    Returns:
        The trained ``Word2Vec`` model.

    Note:
        ``seed=SEED`` alone does not make training deterministic when
        ``workers > 1``. The gensim documentation states that a fully
        reproducible run also requires ``workers=1`` (and a fixed
        ``PYTHONHASHSEED``). Pass ``workers=1`` when exact repeatability
        matters more than training speed.
    """
    return Word2Vec(
        sentences=sentences,
        vector_size=EMBED_DIM,
        window=window,
        min_count=min_count,
        workers=workers,
        sg=sg_mode,
        seed=SEED,
        epochs=epochs,
    )

def build_embedding_matrix(w2v_model, word_index, vocab_size, embed_dim):
    """Assemble a ``(vocab_size, embed_dim)`` float32 matrix of word vectors.

    Row ``i`` receives the vector of the word that ``word_index`` maps to
    ``i``. Rows stay all-zero for indices no word maps to and for words that
    ``w2v_model.wv`` does not contain. Words whose index is ``vocab_size`` or
    larger are skipped.
    """
    embedding = np.zeros((vocab_size, embed_dim), dtype=np.float32)
    in_range = (
        (token, row) for token, row in word_index.items() if row < vocab_size
    )
    for token, row in in_range:
        if token in w2v_model.wv:
            embedding[row] = w2v_model.wv[token]
    return embedding

def build_rnn_with_embedding(embedding_matrix, max_len):
    """Build and compile a SimpleRNN classifier on a frozen, pre-trained embedding.

    Architecture: Input -> Embedding (initialised from ``embedding_matrix``,
    not trainable) -> SimpleRNN(64) -> Dropout(0.2) -> Dense(1, sigmoid),
    compiled with Adam, binary cross-entropy and an accuracy metric.

    Args:
        embedding_matrix: 2-D array ``(vocab_size, embed_dim)``; row ``i`` is
            the vector for token id ``i``.
        max_len: length of the padded integer id sequences fed to the model.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    vocab_rows, embed_dim = embedding_matrix.shape
    model = Sequential([
        # An explicit Input layer replaces the legacy `input_length` argument.
        tf.keras.Input(shape=(max_len,), dtype="int32"),
        Embedding(
            input_dim=vocab_rows,
            output_dim=embed_dim,
            # A Constant initializer loads the pre-trained vectors without
            # relying on the legacy `weights=` constructor argument.
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
            # Id 0 is padding: masking it makes the RNN skip padded timesteps
            # instead of running over long runs of zero vectors.
            mask_zero=True,
            trainable=False,
        ),
        SimpleRNN(64),
        Dropout(0.2),
        Dense(1, activation="sigmoid"),
    ])
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model

# Prepare tokenized sentences for Word2Vec training
train_tokens = [text.split() for text in X_train]


def run_word2vec_rnn(sentences, sg_mode, label):
    """Run one Word2Vec variant end to end: embed, train the RNN, score on test.

    Args:
        sentences: token lists used to train Word2Vec.
        sg_mode: 1 for Skip-gram, 0 for CBOW.
        label: name passed to ``evaluate_binary`` for the printed report.

    Returns:
        Tuple ``(w2v_model, embedding_matrix, rnn, test_pred, acc, f1, cm)``.
    """
    w2v_model = train_word2vec(sentences, sg_mode=sg_mode)
    matrix = build_embedding_matrix(w2v_model, word_index, vocab_size, EMBED_DIM)
    rnn = build_rnn_with_embedding(matrix, MAX_LEN)
    train_with_early_stop(rnn, X_train_pad, y_train, X_val_pad, y_val, epochs=8)
    test_pred = rnn.predict(X_test_pad).ravel()
    acc, f1, cm = evaluate_binary(y_test, test_pred, label=label)
    return w2v_model, matrix, rnn, test_pred, acc, f1, cm


# Skip-gram (sg=1)
w2v_skip, skip_matrix, skip_rnn, skip_pred, skip_acc, skip_f1, skip_cm = run_word2vec_rnn(
    train_tokens, sg_mode=1, label="Word2Vec Skip-gram RNN"
)

# CBOW (sg=0)
w2v_cbow, cbow_matrix, cbow_rnn, cbow_pred, cbow_acc, cbow_f1, cbow_cm = run_word2vec_rnn(
    train_tokens, sg_mode=0, label="Word2Vec CBOW RNN"
)

In [None]:
# Comparison table: one row per representation, best F1 first
scoreboard = {
    "Embedding": ["TF-IDF", "Word2Vec Skip-gram", "Word2Vec CBOW"],
    "Accuracy": [tfidf_acc, skip_acc, cbow_acc],
    "F1": [tfidf_f1, skip_f1, cbow_f1],
}
results = pd.DataFrame(scoreboard).sort_values(by="F1", ascending=False)

results