# HW2-1 Text classification

### preprocessing

In [3]:
import spacy
from collections import Counter
import torch
import torch.nn as nn
import torch.nn.functional as F
import json

json_file = './1_data/News_train.json'
dataset = []
with open(json_file, 'r') as f:
    for line in f:
        if line.strip():
            dataset.append(json.loads(line))

print(f"Loaded {len(dataset)} samples")
print(f"Example: {dataset[0]}")

# Load spaCy tokenizer
nlp = spacy.load("en_core_web_sm")

tokenized_texts = []
labels = []
vocab_counter = Counter()

for item in dataset:
    text = f"{item['headline']} {item['short_description']}"
    label = item['label']

    tokens = [token.text.lower() for token in nlp(text)]
    tokenized_texts.append(tokens)
    labels.append(label)

    vocab_counter.update(tokens)

# Build vocab
vocab = {"<PAD>": 0, "<UNK>": 1}
for word in vocab_counter:
    vocab[word] = len(vocab)

print(f"Vocab size: {len(vocab)}")

max_len = max(len(tokens) for tokens in tokenized_texts)
sequences = []

for tokens in tokenized_texts:
    seq = [vocab.get(token, vocab["<UNK>"]) for token in tokens]
    seq += [vocab["<PAD>"]] * (max_len - len(seq))
    sequences.append(seq)

# Convert input, label to tensor
input_tensor = torch.tensor(sequences)
labels_tensor = torch.tensor(labels)

print(f"Input tensor shape: {input_tensor.shape}")
print(f"Labels tensor shape: {labels_tensor.shape}")

Loaded 135608 samples
Example: {'id': 0, 'headline': 'Trump Officials Repeatedly Violated Hatch Act, Probe Finds', 'short_description': 'At least 13 former Trump administration officials violated the law by intermingling campaigning with their official government duties, according to a new investigation.', 'label': 0.0}
Vocab size: 75541
Input tensor shape: torch.Size([135608, 310])
Labels tensor shape: torch.Size([135608])


In [4]:
from torch.utils.data import Dataset, DataLoader

class NewsDataset(Dataset):
    def __init__(self, input_tensor, labels_tensor):
        self.inputs = input_tensor
        self.labels = labels_tensor

    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.inputs[idx], self.labels[idx]
    
class TestNewsDataset(Dataset):
    def __init__(self, iput_tensor, ids):
        self.inputs = input_tensor
        self.ids = ids

    def __len__(self):
        return len(self.ids)
    
    def __getitem__(self, idx):
        return self.inputs[idx], self.ids[idx]
    
dataset = NewsDataset(input_tensor, labels_tensor)
batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

### Build Transformer 

In [5]:
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return x
    
class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.3):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        
    def forward(self, src):
        # src shape: (seq_len, batch_size, d_model)
        src2, _ = self.self_attn(src, src, src)
        src = src + self.dropout1(src2)
        src = self.norm1(src)

        src2 = self.linear2(self.dropout(F.relu(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src
    
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, d_model=64, nhead=4, num_layers=2, num_classes=2, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.embedding_dropout = nn.Dropout(dropout)
        self.pos_encoder = PositionalEncoding(d_model)
        
        self.layers = nn.ModuleList([
            TransformerEncoderLayer(d_model, nhead, dropout=dropout) for _ in range(num_layers)
        ])

        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, src):
        # src shape: (batch_size, seq_len)
        src = self.embedding(src) # (batch_size, seq_len, d_model)
        src = self.embedding_dropout(src)
        src = self.pos_encoder(src)

        src = src.permute(1, 0, 2) # (seq_len, batch_size, d_model)

        for layer in self.layers:
            src = layer(src)

        # mean pooling
        src = src.mean(dim=0) # (batch_size, d_model)
        logits = self.classifier(src)
        return logits

### Train example

In [6]:
# Hyperparameters
vocab_size = len(vocab)
d_model = 32
nhead = 2
num_layers = 1
num_classes = len(set(labels_tensor.tolist()))

model = TransformerClassifier(vocab_size, d_model, nhead, num_layers, num_classes, dropout=0.3)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

epochs = 20
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for batch_inputs, batch_labels in dataloader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.long()
        batch_labels = batch_labels.to(device)
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        _, predicted = torch.max(outputs, 1)
        correct += (predicted == batch_labels).sum().item()
        total += batch_labels.size(0)

    avg_loss = total_loss / len(dataloader)
    acc = correct / total 
    print(f"Epoch {epoch+1}/{epochs} | Loss: {avg_loss:.4f} | Accuracy: {acc*100:.4f}%")

Using device: cuda
Epoch 1/20 | Loss: 1.8030 | Accuracy: 44.6758%
Epoch 2/20 | Loss: 1.1515 | Accuracy: 64.8575%
Epoch 3/20 | Loss: 0.9317 | Accuracy: 71.6720%
Epoch 4/20 | Loss: 0.8254 | Accuracy: 74.8171%
Epoch 5/20 | Loss: 0.7554 | Accuracy: 76.8413%
Epoch 6/20 | Loss: 0.7036 | Accuracy: 78.1503%
Epoch 7/20 | Loss: 0.6596 | Accuracy: 79.4673%
Epoch 8/20 | Loss: 0.6209 | Accuracy: 80.4709%
Epoch 9/20 | Loss: 0.5878 | Accuracy: 81.3787%
Epoch 10/20 | Loss: 0.5579 | Accuracy: 82.0505%
Epoch 11/20 | Loss: 0.5304 | Accuracy: 82.8587%
Epoch 12/20 | Loss: 0.5050 | Accuracy: 83.5710%
Epoch 13/20 | Loss: 0.4849 | Accuracy: 84.1897%
Epoch 14/20 | Loss: 0.4659 | Accuracy: 84.7354%
Epoch 15/20 | Loss: 0.4496 | Accuracy: 85.2494%
Epoch 16/20 | Loss: 0.4341 | Accuracy: 85.7678%
Epoch 17/20 | Loss: 0.4214 | Accuracy: 86.0937%
Epoch 18/20 | Loss: 0.4107 | Accuracy: 86.2840%
Epoch 19/20 | Loss: 0.4026 | Accuracy: 86.5546%
Epoch 20/20 | Loss: 0.3901 | Accuracy: 87.0413%


### Inference

In [7]:
import json
import csv

test_file = './1_data/News_test.json'

with open(test_file, 'r') as f:
    test_dataset = []
    for line in f:
        if line.strip():
            test_dataset.append(json.loads(line))

print(f"Loaded {len(test_dataset)} test samples")

test_tokenized_texts = []
test_ids = []

for item in test_dataset:
    text = f"{item['headline']} {item['short_description']}"
    tokens = [token.text.lower() for token in nlp(text)]
    test_tokenized_texts.append(tokens)
    test_ids.append(item['id'])

test_sequences = []
for tokens in test_tokenized_texts:
    seq = [vocab.get(token, vocab.get("<UNK>")) for token in tokens]
    seq += [vocab["<PAD>"]] * (max_len - len(seq))
    test_sequences.append(seq)

test_input_tensor = torch.tensor(test_sequences)

model.eval()

test_input_tensor = test_input_tensor.to(device)

test_dataset = TestNewsDataset(test_input_tensor, test_ids)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

results = []

with torch.no_grad():
    for batch_inputs, batch_ids in test_loader:
        batch_inputs = batch_inputs.to(device)
        outputs = model(batch_inputs)
        _, predicted = torch.max(outputs, 1)
        for id_, label in zip(batch_ids, predicted.cpu()):
            results.append((id_.item(), label.item()))

output_file = 'predictions.csv'

with open(output_file, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["ID", "label"])
    for id_, label in results:
        writer.writerow([id_, label])

print(f"Predictions saved to {output_file}")

Loaded 1000 test samples
Predictions saved to predictions.csv


In [32]:
num_unk_train = sum(seq.count(vocab.get("<UNK>", 1)) for seq in input_tensor.tolist())
num_unk_test = sum(seq.count(vocab.get("<UNK>", 1)) for seq in test_input_tensor.tolist())
print(f"OOV in train: {num_unk_train}, OOV in test: {num_unk_test}")


OOV in train: 0, OOV in test: 330


In [34]:
total_tokens = sum(len(seq) for seq in test_input_tensor.tolist())
oov_rate = num_unk_test / total_tokens * 100
print(f"OOV rate: {oov_rate:.2f}%")


OOV rate: 0.11%


In [35]:
# Build reverse vocab
reverse_vocab = {v: k for k, v in vocab.items()}

oov_words = set()
for tokens in test_tokenized_texts:
    for token in tokens:
        if token not in vocab:
            oov_words.add(token)

print(f"Sample OOV words: {list(oov_words)[:20]}")


Sample OOV words: ['balkovec', 'coinbase', 'urie', 'blayre', 'dwen', 'roscosmos', 'noth', 'yung', 'phillie', "k'waun", 'caan', 'zelenskyy', 'uyghurs', 'swalwell', 'schnapp', 'waititi', 'armrests', 'naturedly', 'shinn', 'meylemans']


In [33]:
from collections import Counter
print(Counter(labels_tensor.tolist()))

Counter({0.0: 31948, 1.0: 17914, 2.0: 14489, 4.0: 9782, 3.0: 9418, 5.0: 8773, 8.0: 6309, 7.0: 5601, 6.0: 5265, 9.0: 5114, 10.0: 4624, 11.0: 4332, 13.0: 4311, 12.0: 4172, 14.0: 3556})
