In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from collections import Counter

# -------------------------------
# Data Loading
# -------------------------------
df = pd.read_json('Sarcasm_Headlines_Dataset.json', lines=True)
texts = df['headline']
labels = df['is_sarcastic']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# -------------------------------
# Tokenization & Vocabulary Building
# -------------------------------
def tokenize(text):
    # A simple tokenizer that lowercases and splits on whitespace.
    return text.lower().split()

# Build vocabulary from training data
all_tokens = []
for text in X_train:
    all_tokens.extend(tokenize(text))
counter = Counter(all_tokens)
# Reserve 0 for <PAD> and 1 for <UNK>
vocab = {token: idx+2 for idx, (token, count) in enumerate(counter.most_common())}
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1

# -------------------------------
# Convert Texts to Sequences & Padding
# -------------------------------
def text_to_sequence(text, vocab):
    tokens = tokenize(text)
    return [vocab.get(token, vocab["<UNK>"]) for token in tokens]

def pad_sequence(seq, max_length):
    if len(seq) < max_length:
        return seq + [vocab["<PAD>"]] * (max_length - len(seq))
    else:
        return seq[:max_length]

max_length = 20  # You can adjust this based on typical headline lengths

X_train_seq = [pad_sequence(text_to_sequence(text, vocab), max_length) for text in X_train]
X_test_seq = [pad_sequence(text_to_sequence(text, vocab), max_length) for text in X_test]

# -------------------------------
# Convert to Tensors & Create DataLoaders
# -------------------------------
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.long)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# -------------------------------
# LSTM Model Definition
# -------------------------------
class LSTMSarcasmDetector(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, dropout):
        super(LSTMSarcasmDetector, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=vocab["<PAD>"])
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                            batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text):
        # text shape: [batch_size, max_length]
        embedded = self.embedding(text)  # [batch_size, max_length, embedding_dim]
        output, (hidden, cell) = self.lstm(embedded)
        # Use the last hidden state from the final LSTM layer
        hidden_last = hidden[-1]  # [batch_size, hidden_dim]
        out = self.fc(hidden_last)
        return self.sigmoid(out)

vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 128
n_layers = 2
dropout = 0.5

model = LSTMSarcasmDetector(vocab_size, embedding_dim, hidden_dim, n_layers, dropout)
model.to(device)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# -------------------------------
# Training Loop
# -------------------------------
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch_X, batch_y in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    avg_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")


# -------------------------------
# Evaluation
# -------------------------------
def evaluate(model, loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch_X, batch_y in loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            outputs = model(batch_X)
            predictions = (outputs >= 0.5).float()
            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(batch_y.cpu().numpy())
    return np.array(all_preds).flatten(), np.array(all_labels).flatten()

preds, true_labels = evaluate(model, test_loader, device)
acc = accuracy_score(true_labels, preds)
print(f"Test Accuracy: {acc * 100:.2f}%")
print("Classification Report:")
print(classification_report(true_labels, preds))
print("Confusion Matrix:")
print(confusion_matrix(true_labels, preds))


# Evaluate training accuracy
train_preds, train_labels = evaluate(model, train_loader, device)
train_acc = accuracy_score(train_labels, train_preds)
print(f"Training Accuracy: {train_acc * 100:.2f}%")

Using device: cuda


Epoch 1/10: 100%|██████████| 668/668 [00:02<00:00, 321.58it/s]


Epoch 1/10, Loss: 0.5477


Epoch 2/10: 100%|██████████| 668/668 [00:02<00:00, 322.95it/s]


Epoch 2/10, Loss: 0.3909


Epoch 3/10: 100%|██████████| 668/668 [00:02<00:00, 322.44it/s]


Epoch 3/10, Loss: 0.2720


Epoch 4/10: 100%|██████████| 668/668 [00:02<00:00, 327.03it/s]


Epoch 4/10, Loss: 0.1795


Epoch 5/10: 100%|██████████| 668/668 [00:02<00:00, 325.55it/s]


Epoch 5/10, Loss: 0.1060


Epoch 6/10: 100%|██████████| 668/668 [00:02<00:00, 323.36it/s]


Epoch 6/10, Loss: 0.0657


Epoch 7/10: 100%|██████████| 668/668 [00:02<00:00, 317.77it/s]


Epoch 7/10, Loss: 0.0431


Epoch 8/10: 100%|██████████| 668/668 [00:02<00:00, 319.61it/s]


Epoch 8/10, Loss: 0.0314


Epoch 9/10: 100%|██████████| 668/668 [00:02<00:00, 320.88it/s]


Epoch 9/10, Loss: 0.0316


Epoch 10/10: 100%|██████████| 668/668 [00:02<00:00, 322.08it/s]


Epoch 10/10, Loss: 0.0219
Test Accuracy: 81.45%
Classification Report:
              precision    recall  f1-score   support

         0.0       0.79      0.91      0.85      2996
         1.0       0.86      0.69      0.77      2346

    accuracy                           0.81      5342
   macro avg       0.82      0.80      0.81      5342
weighted avg       0.82      0.81      0.81      5342

Confusion Matrix:
[[2731  265]
 [ 726 1620]]
Training Accuracy: 99.72%
