In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from helpers.ud_english_ewt_dataset import build_dataloader
from model import POSTagger

In [2]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [3]:
BATCH_SIZE = 16
EPOCHS = 100

In [4]:
word_df = pd.read_csv("datasets/unified_vocab.tsv", sep='\t')
word_vocab = {row['token']: row['index'] for _, row in word_df.iterrows()}

# Load tag vocabulary from tsv
tag_df = pd.read_csv("datasets/tag_vocab.tsv", sep='\t')
tag_vocab = {row['tag']: row['index'] for _, row in tag_df.iterrows()}

In [5]:
train_loader, val_loader = build_dataloader(
    "datasets/train_improved.tsv", 
    batch_size=BATCH_SIZE,
    split=True,
    shuffle=True,
    val_ratio=0.3
)

test_loader = build_dataloader(
    "datasets/test_improved.tsv",
    batch_size=BATCH_SIZE,
    split=False,
    shuffle=False,
)

Dataset split: 8781 training samples, 3762 validation samples


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = POSTagger(vocab_size=len(word_vocab), tag_count=len(tag_vocab), emb_dim=64, hidden_dim=32)
model.to(device)



POSTagger(
  (embed): Embedding(19553, 64)
  (dropout_emb): Dropout(p=0.5, inplace=False)
  (lstm): LSTM(64, 32, batch_first=True, dropout=0.5, bidirectional=True)
  (dropout_lstm): Dropout(p=0.5, inplace=False)
  (ln): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
  (fc): Linear(in_features=64, out_features=17, bias=True)
)

In [7]:
criterion = nn.CrossEntropyLoss()  # tag classification loss
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

tensor(0.)

In [8]:
def create_mask(lengths, max_len):
    mask = torch.zeros(len(lengths), max_len, device=lengths.device, dtype=torch.bool)
    for i, length in enumerate(lengths):
        # Clamp length to valid range
        valid_length = min(max(length.item(), 0), max_len)
        mask[i, :valid_length] = 1
    return mask

In [9]:
# 5. Train the model
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0
    for inputs, targets, lengths in train_loader:
        inputs, targets, lengths = inputs.to(device), targets.to(device), lengths.to(device)
        optimizer.zero_grad()
        logits = model(inputs)     # (seq_len, tag_count)

        B, T = inputs.size()
        # Create mask for real tokens (non-padding)
        mask = create_mask(lengths, T)
        
        # Apply mask to compute loss only on real tokens
        logits_masked = logits[mask]
        targets_masked = targets[mask]
        
        # Calculate loss on masked data
        loss = criterion(logits_masked, targets_masked)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * B
    avg_loss = total_loss / len(train_loader.dataset)
    print(f"Epoch {epoch+1}: training loss = {avg_loss:.3f}")
    # Validate on validation set
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for inputs, targets, lengths in test_loader:
            inputs, targets, lengths = inputs.to(device), targets.to(device), lengths.to(device)
            logits = model(inputs)

            B, T = inputs.size()

            # Create mask for real tokens (non-padding)
            mask = create_mask(lengths, T)
            
            # Apply mask to compute loss only on real tokens
            logits_masked = logits[mask]
            targets_masked = targets[mask]
            
            # Calculate loss on masked data
            loss = criterion(logits_masked, targets_masked)
            total_loss += loss.item() * B
            loss = total_loss / len(val_loader.dataset)
        print(f"Epoch {epoch+1}: validation loss = {loss:.3f}")

Epoch 1: training loss = 1.805
Epoch 1: validation loss = 0.586
Epoch 2: training loss = 1.225
Epoch 2: validation loss = 0.464
Epoch 3: training loss = 1.011
Epoch 3: validation loss = 0.402
Epoch 4: training loss = 0.878
Epoch 4: validation loss = 0.355
Epoch 5: training loss = 0.778
Epoch 5: validation loss = 0.317
Epoch 6: training loss = 0.690
Epoch 6: validation loss = 0.283
Epoch 7: training loss = 0.615
Epoch 7: validation loss = 0.253
Epoch 8: training loss = 0.552
Epoch 8: validation loss = 0.229
Epoch 9: training loss = 0.490
Epoch 9: validation loss = 0.210
Epoch 10: training loss = 0.438
Epoch 10: validation loss = 0.197
Epoch 11: training loss = 0.397
Epoch 11: validation loss = 0.187
Epoch 12: training loss = 0.363
Epoch 12: validation loss = 0.179
Epoch 13: training loss = 0.336
Epoch 13: validation loss = 0.176
Epoch 14: training loss = 0.314
Epoch 14: validation loss = 0.171
Epoch 15: training loss = 0.298
Epoch 15: validation loss = 0.168
Epoch 16: training loss = 0.

In [10]:
# 6. Evaluate on test set
model.eval()
correct_tokens = 0
total_tokens = 0
with torch.no_grad():
    for inputs, targets, lengths in test_loader:
        inputs, targets, lengths = inputs.to(device), targets.to(device), lengths.to(device)
        batch_size, max_len = inputs.size()
        
        # Forward pass (without passing lengths if not needed by model architecture)
        logits = model(inputs)  # Shape: [batch_size, max_len, num_tags]
        
        # Get predicted tags
        pred_tags = logits.argmax(dim=-1)  # Shape: [batch_size, max_len]
        
        # Create mask for real tokens (non-padding)
        mask = create_mask(lengths, max_len)
        
        # Count correct predictions only on real tokens
        correct_tokens += ((pred_tags == targets) & mask).sum().item()
        total_tokens += mask.sum().item()

accuracy = correct_tokens / total_tokens
print(f"Test Accuracy: {accuracy*100:.2f}%")
print(f"Correct tokens: {correct_tokens} / {total_tokens}")

Test Accuracy: 91.37%
Correct tokens: 22911 / 25074
