In [1]:
import re
import numpy as np
import pandas as pd
import random
from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [2]:
seed = 42
random.seed(seed); np.random.seed(seed)

torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

rows = []

### Load and Prepare Data


In [3]:
with open("rt-polarity.neg", encoding="utf-8", errors="ignore") as f:
    neg = f.read().splitlines()
with open("rt-polarity.pos", encoding="utf-8", errors="ignore") as f:
    pos = f.read().splitlines()

df = pd.DataFrame({
    "text": neg + pos,
    "label": [0] * len(neg) + [1] * len(pos)
})

X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)


### Method 1: TF-IDF + Logistic Regression (Baseline)


In [4]:
# Vectorize text using TF-IDF
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Train logistic regression classifier
clf = LogisticRegression(max_iter=100000)
clf.fit(X_train_tfidf, y_train)
y_pred_lr = clf.predict(X_test_tfidf)

rows.append({
    "Model": "TF-IDF + LogisticRegression",
    "Accuracy": accuracy_score(y_test, y_pred_lr),
    "Precision": precision_score(y_test, y_pred_lr, average="macro"),
    "Recall": recall_score(y_test, y_pred_lr, average="macro"),
    "F1": f1_score(y_test, y_pred_lr, average="macro"),
})

### Method 2: GloVe Embeddings + MLP

In [5]:

glove_path = "glove.6B.100d.txt"
glove = {}
with open(glove_path, "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        glove[word] = vec
dim = 100  # embedding dimension

# Preprocessing function
def preprocess(text):
    return re.sub(r"[^a-zA-Z']", " ", text.lower()).split()

# Convert sentence to averaged embedding vector
def text_to_vec(text):
    tokens = preprocess(text)
    vecs = [glove[t] for t in tokens if t in glove]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim)

# Convert all training and test samples
X_train_glove = np.array([text_to_vec(t) for t in X_train])
X_test_glove = np.array([text_to_vec(t) for t in X_test])

# Define MLP model
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(dim, 64),
            nn.ReLU(),
            nn.Linear(64, 2)
        )
    def forward(self, x):
        return self.fc(x)

# Train MLP
model = MLP()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

X_train_tensor = torch.tensor(X_train_glove, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_glove, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    output = model(X_train_tensor)
    loss = criterion(output, y_train_tensor)
    loss.backward()
    optimizer.step()

# Evaluate
model.eval()
preds = model(X_test_tensor).argmax(dim=1)

y_pred_mlp = preds.numpy() if isinstance(preds, torch.Tensor) else preds
rows.append({
    "Model": "GloVe(avg) + MLP",
    "Accuracy": accuracy_score(y_test, y_pred_mlp),
    "Precision": precision_score(y_test, y_pred_mlp, average="macro"),
    "Recall": recall_score(y_test, y_pred_mlp, average="macro"),
    "F1": f1_score(y_test, y_pred_mlp, average="macro"),
})


### Method 3: LSTM with Custom Vocabulary

In [6]:

# Tokenize and build vocabulary
tokenized = [preprocess(t) for t in X_train]
counter = Counter()
for tokens in tokenized:
    counter.update(tokens)

# Create word2idx mapping
min_freq = 2
word2idx = {"<PAD>": 0, "<UNK>": 1}
for word, freq in counter.items():
    if freq >= min_freq:
        word2idx[word] = len(word2idx)

# Encode function
def encode(text):
    return torch.tensor([word2idx.get(t, 1) for t in preprocess(text)], dtype=torch.long)

class LSTMDataset(Dataset):
    def __init__(self, texts, labels):
        self.samples = [encode(t) for t in texts]
        self.labels = torch.tensor(labels.values, dtype=torch.long)
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return self.samples[idx], self.labels[idx]

def collate(batch):
    x, y = zip(*batch)
    x_pad = pad_sequence(x, batch_first=True)
    return x_pad, torch.tensor(y)

train_ds = LSTMDataset(X_train, y_train)
test_ds = LSTMDataset(X_test, y_test)
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate)
test_dl = DataLoader(test_ds, batch_size=32, collate_fn=collate)

# Define LSTM model
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 2)
    def forward(self, x):
        emb = self.embed(x)
        _, (h, _) = self.lstm(emb)
        return self.fc(h[-1])

model = LSTMClassifier(vocab_size=len(word2idx), embed_dim=100, hidden_dim=64)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

for epoch in range(50):
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        optimizer.zero_grad()
        out = model(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# Evaluation
model.eval()
all_preds, all_true = [], []
with torch.no_grad():
    for xb, yb in test_dl:
        out = model(xb)
        preds = out.argmax(dim=1)
        all_preds.extend(preds.tolist())
        all_true.extend(yb.tolist())

rows.append({
    "Model": "LSTM",
    "Accuracy": accuracy_score(all_true, all_preds),
    "Precision": precision_score(all_true, all_preds, average="macro"),
    "Recall": recall_score(all_true, all_preds, average="macro"),
    "F1": f1_score(all_true, all_preds, average="macro"),
})

Epoch 1, Loss: 185.3014
Epoch 2, Loss: 184.5990
Epoch 3, Loss: 179.9871
Epoch 4, Loss: 164.5979
Epoch 5, Loss: 145.9251
Epoch 6, Loss: 118.2156
Epoch 7, Loss: 97.2376
Epoch 8, Loss: 77.6609
Epoch 9, Loss: 63.5090
Epoch 10, Loss: 52.9595
Epoch 11, Loss: 46.1913
Epoch 12, Loss: 39.0386
Epoch 13, Loss: 33.5620
Epoch 14, Loss: 30.2397
Epoch 15, Loss: 27.8349
Epoch 16, Loss: 25.4276
Epoch 17, Loss: 24.8146
Epoch 18, Loss: 24.6040
Epoch 19, Loss: 21.6577
Epoch 20, Loss: 24.1308
Epoch 21, Loss: 23.1204
Epoch 22, Loss: 18.8445
Epoch 23, Loss: 16.2720
Epoch 24, Loss: 16.8855
Epoch 25, Loss: 16.9305
Epoch 26, Loss: 12.1012
Epoch 27, Loss: 10.0117
Epoch 28, Loss: 13.6865
Epoch 29, Loss: 8.6229
Epoch 30, Loss: 7.2081
Epoch 31, Loss: 7.4539
Epoch 32, Loss: 10.2089
Epoch 33, Loss: 7.6111
Epoch 34, Loss: 8.8411
Epoch 35, Loss: 6.2861
Epoch 36, Loss: 6.0536
Epoch 37, Loss: 3.9208
Epoch 38, Loss: 4.1198
Epoch 39, Loss: 3.4508
Epoch 40, Loss: 4.4561
Epoch 41, Loss: 7.4784
Epoch 42, Loss: 8.5329
Epoch 43

In [7]:
summary = pd.DataFrame(rows).round(4).sort_values("F1", ascending=False)
print("\n=== Summary (macro metrics) ===")
print(summary.to_string(index=False))


=== Summary (macro metrics) ===
                      Model  Accuracy  Precision  Recall     F1
TF-IDF + LogisticRegression    0.7712     0.7714  0.7712 0.7712
                       LSTM    0.7459     0.7461  0.7459 0.7459
           GloVe(avg) + MLP    0.7271     0.7271  0.7271 0.7271
