Imports

In [None]:
import json
import argparse
from pathlib import Path
from collections import Counter
from typing import Dict, List
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from datasets import Dataset, DatasetDict
import numpy as np
from sklearn.metrics import classification_report, accuracy_score

# Detect Google Colab so that data/output paths can point at the mounted Drive folder.
try:
    import google.colab  # noqa: F401 -- imported only to probe availability

    IN_COLAB = True
except ImportError:
    # Only a missing package means "not on Colab"; any other error should surface
    # instead of being silently swallowed by a bare `except`.
    IN_COLAB = False

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    # Every data/output path in this notebook is built as `path_prefix + <relative path>`.
    path_prefix = '/content/drive/MyDrive/UofT DL Term project/GreBerta-experiment-2/'
else:
    # Outside Colab, paths are relative to the current working directory.
    path_prefix = ''

Mounted at /content/drive


Fine-tune GreBerta

In [None]:
def load_pos_data(filepath: Path) -> List[Dict]:
    """Read an alignment JSON file and return one POS-tagging record per verse.

    The file is expected to hold a top-level 'data' list whose entries carry a
    'verse_id' and a 'greek_tokens' list of {'word': ..., 'pos': ...} dicts.

    Returns:
        A list of dicts with keys 'tokens' (Greek word forms), 'pos_tags'
        (their tags, same order) and 'verse_id'. Verses without any Greek
        token are left out.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        verses = json.load(handle)['data']

    records = []
    for verse in verses:
        greek = verse['greek_tokens']
        words = [entry['word'] for entry in greek]
        tags = [entry['pos'] for entry in greek]

        # Nothing to tag in an empty verse
        if not words:
            continue

        records.append({
            'tokens': words,
            'pos_tags': tags,
            'verse_id': verse['verse_id']
        })

    return records


def create_label_mapping(train_data: List[Dict]) -> tuple[Dict[str, int], Dict[int, str]]:
    """Build the POS tag <-> integer id mappings from the training examples.

    Args:
        train_data: examples as produced by `load_pos_data`; only the
            'pos_tags' list of each example is read.

    Returns:
        A pair `(tag_to_id, id_to_tag)`. Ids are assigned in sorted tag order,
        so the mapping is identical across runs for the same set of tags.
        Both dicts are empty when `train_data` contains no tags.
    """
    unique_tags = {tag for example in train_data for tag in example['pos_tags']}

    # Sorting makes the id assignment deterministic (set iteration order is not).
    tag_to_id = {tag: idx for idx, tag in enumerate(sorted(unique_tags))}
    id_to_tag = {idx: tag for tag, idx in tag_to_id.items()}

    return tag_to_id, id_to_tag


def tokenize_and_align_labels(examples, tokenizer, tag_to_id):
    """
    Tokenize a batch of pre-split sentences and project word-level POS tags
    onto the resulting subword tokens.

    Only the first subword of each word carries the word's tag id. Special
    tokens and the remaining subwords of a word are labelled -100 so that the
    loss ignores them.

    Args:
        examples: batch with 'tokens' (list of word lists) and 'pos_tags'
            (list of tag lists, parallel to 'tokens').
        tokenizer: tokenizer whose output exposes `word_ids(batch_index=...)`.
        tag_to_id: mapping from tag string to integer label id.

    Returns:
        The tokenizer output with an added 'labels' entry (one list per sentence).
    """
    encoded = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
        padding=False,
        max_length=512
    )

    def align(row, tags):
        """Return the per-subword label list for sentence number `row`."""
        aligned = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            # A label is emitted only where a new word begins; `word is None`
            # marks a special token.
            begins_word = word is not None and word != last_word
            aligned.append(tag_to_id[tags[word]] if begins_word else -100)
            last_word = word
        return aligned

    encoded['labels'] = [
        align(row, tags) for row, tags in enumerate(examples['pos_tags'])
    ]
    return encoded


def compute_metrics(eval_pred, id_to_tag):
    """Compute token-level accuracy over the positions that carry a real label.

    Args:
        eval_pred: pair `(predictions, labels)`; `predictions` holds per-class
            scores with the class axis at index 2, `labels` holds integer label
            ids with -100 marking positions to ignore (special tokens and
            non-initial subwords).
        id_to_tag: id -> tag mapping. Not needed for accuracy (comparing ids is
            equivalent to comparing the tags they map to); kept so existing
            callers keep working.

    Returns:
        Dict with 'accuracy' (float in [0, 1]; 0.0 when no position is scored)
        and 'num_examples' (the number of scored *tokens*, not sentences).
    """
    scores, labels = eval_pred
    predicted_ids = np.argmax(scores, axis=2)
    labels = np.asarray(labels)

    # Keep only positions with a real label
    scored = labels != -100
    num_scored = int(scored.sum())

    if num_scored == 0:
        # Avoid a NaN mean on an empty selection
        accuracy = 0.0
    else:
        accuracy = float((predicted_ids[scored] == labels[scored]).mean())

    return {
        'accuracy': accuracy,
        'num_examples': num_scored
    }


def train_pos_tagger():
    """Fine-tune GreBerta for POS tagging and report dev/test accuracy.

    Steps: load the train/dev/test JSON files under `path_prefix + 'data/'`,
    build the tag <-> id mapping from the training split, tokenize with
    first-subword label alignment, train with the Hugging Face `Trainer`,
    evaluate on dev and test, then save the model, the tokenizer and
    `label_mapping.json` into the output directory.

    Progress is printed; nothing is returned.
    """
    # Local, aliased import: the word-alignment cell rebinds the module-level
    # name `Dataset` to `torch.utils.data.Dataset` (which has no `from_list`),
    # so re-running this function after that cell would otherwise fail.
    from datasets import Dataset as HFDataset

    # Fine-tune GreBerta for POS tagging
    args = {
        'epochs': 3,  # Number of training epochs
        'batch_size': 16,  # Training batch size
        'lr': 2e-5,  # Learning rate
        'model_name': 'bowphs/GreBerta',  # Base model name
        'output_dir': path_prefix + 'pos_tagger_output',  # Output directory
        'seed': 42,  # Seed for the classifier-head init and the Trainer
    }

    print("=" * 80)
    print("FINE-TUNING GREBERTA FOR POS TAGGING")
    print("=" * 80)

    # 1. Load data
    print("\n1. Loading data...")
    train_data = load_pos_data(Path(path_prefix + 'data/train.json'))
    dev_data = load_pos_data(Path(path_prefix + 'data/dev.json'))
    test_data = load_pos_data(Path(path_prefix + 'data/test.json'))

    print(f"  Train: {len(train_data):,} verses")
    print(f"  Dev:   {len(dev_data):,} verses")
    print(f"  Test:  {len(test_data):,} verses")

    # 2. Create label mapping
    print("\n2. Creating label mapping...")
    tag_to_id, id_to_tag = create_label_mapping(train_data)
    num_labels = len(tag_to_id)
    print(f"  Found {num_labels} POS tags:")
    # Count every tag in a single pass instead of rescanning the corpus per tag
    tag_counts = Counter(tag for ex in train_data for tag in ex['pos_tags'])
    for tag, idx in sorted(tag_to_id.items(), key=lambda item: item[1]):
        print(f"    {idx:2d}. {tag:5s} ({tag_counts[tag]:,} tokens)")

    # 3. Load tokenizer and model
    print(f"\n3. Loading {args['model_name']}...")
    # The classification head is freshly initialised inside from_pretrained,
    # before the Trainer gets a chance to seed anything, so seed here.
    torch.manual_seed(args['seed'])
    tokenizer = AutoTokenizer.from_pretrained(args['model_name'], add_prefix_space=True)
    model = AutoModelForTokenClassification.from_pretrained(
        args['model_name'],
        num_labels=num_labels,
        id2label=id_to_tag,
        label2id=tag_to_id
    )
    print(f"  ✓ Model loaded with {num_labels} labels")
    print(f"  ✓ Model has {sum(p.numel() for p in model.parameters()):,} parameters")

    # 4. Create datasets
    print("\n4. Creating Hugging Face datasets...")

    def encode(split_data):
        """Wrap one split in a HF dataset and tokenize it with aligned labels."""
        return HFDataset.from_list(split_data).map(
            lambda batch: tokenize_and_align_labels(batch, tokenizer, tag_to_id),
            batched=True,
            remove_columns=['tokens', 'pos_tags', 'verse_id']
        )

    # Tokenize
    print("  Tokenizing...")
    train_dataset = encode(train_data)
    dev_dataset = encode(dev_data)
    test_dataset = encode(test_data)
    print(f"  ✓ Tokenized {len(train_dataset):,} training examples")

    # 5. Setup training
    print(f"\n5. Setting up training...")
    print(f"  Epochs: {args['epochs']}")
    print(f"  Batch size: {args['batch_size']}")
    print(f"  Learning rate: {args['lr']}")
    print(f"  Output dir: {args['output_dir']}")

    training_args = TrainingArguments(
        output_dir=args['output_dir'],
        learning_rate=args['lr'],
        per_device_train_batch_size=args['batch_size'],
        per_device_eval_batch_size=args['batch_size'],
        num_train_epochs=args['epochs'],
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        push_to_hub=False,
        # Outer double quotes: reusing the same quote character inside an
        # f-string expression is a SyntaxError before Python 3.12.
        logging_dir=f"{args['output_dir']}/logs",
        logging_steps=50,
        seed=args['seed'],
        report_to="none"  # Added to prevent W&B login prompt
    )

    data_collator = DataCollatorForTokenClassification(tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=lambda eval_pred: compute_metrics(eval_pred, id_to_tag),
    )

    # 6. Train!
    print("\n" + "=" * 80)
    print("6. TRAINING STARTED")
    print("=" * 80)

    train_result = trainer.train()

    print("\n" + "=" * 80)
    print("TRAINING COMPLETE!")
    print("=" * 80)
    print(f"\nTraining metrics:")
    for key, value in train_result.metrics.items():
        print(f"  {key}: {value}")

    # 7. Evaluate on dev set
    print("\n7. Evaluating on dev set...")
    dev_results = trainer.evaluate(eval_dataset=dev_dataset)
    print(f"Dev accuracy: {dev_results['eval_accuracy']:.4f}")

    # 8. Evaluate on test set
    print("\n8. Evaluating on test set...")
    test_results = trainer.evaluate(eval_dataset=test_dataset)
    print(f"Test accuracy: {test_results['eval_accuracy']:.4f}")

    # 9. Save model
    print(f"\n9. Saving model to {args['output_dir']}...")
    trainer.save_model(args['output_dir'])
    tokenizer.save_pretrained(args['output_dir'])

    # Save label mappings. JSON object keys are always strings, so the integer
    # keys of `id_to_tag` come back as strings when this file is re-loaded.
    with open(Path(args['output_dir']) / 'label_mapping.json', 'w', encoding='utf-8') as f:
        json.dump({'tag_to_id': tag_to_id, 'id_to_tag': id_to_tag}, f, indent=2)

    print("\n" + "=" * 80)
    print("✓ FINE-TUNING COMPLETE!")
    print("=" * 80)
    print(f"\nModel saved to: {args['output_dir']}")
    print(f"Dev accuracy:   {dev_results['eval_accuracy']:.4f}")
    print(f"Test accuracy:  {test_results['eval_accuracy']:.4f}")
    print("\nTo use the model:")
    print(f"  from transformers import AutoModelForTokenClassification, AutoTokenizer")
    print(f"  model = AutoModelForTokenClassification.from_pretrained('{args['output_dir']}')")
    print(f"  tokenizer = AutoTokenizer.from_pretrained('{args['output_dir']}')")
    print("=" * 80)


# Run the POS-tagging pipeline defined above: load data, fine-tune, evaluate, save.
train_pos_tagger()

FINE-TUNING GREBERTA FOR POS TAGGING

1. Loading data...
  Train: 7,198 verses
  Dev:   284 verses
  Test:  443 verses

2. Creating label mapping...
  Found 13 POS tags:
     0. A-    (7,860 tokens)
     1. C-    (16,112 tokens)
     2. D-    (5,655 tokens)
     3. I-    (15 tokens)
     4. N-    (24,573 tokens)
     5. P-    (9,662 tokens)
     6. RA    (17,088 tokens)
     7. RD    (1,579 tokens)
     8. RI    (1,102 tokens)
     9. RP    (10,446 tokens)
    10. RR    (1,490 tokens)
    11. V-    (25,398 tokens)
    12. X-    (905 tokens)

3. Loading bowphs/GreBerta...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/675 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/504M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at bowphs/GreBerta and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  ✓ Model loaded with 13 labels
  ✓ Model has 125,397,517 parameters

4. Creating Hugging Face datasets...
  Tokenizing...


Map:   0%|          | 0/7198 [00:00<?, ? examples/s]

Map:   0%|          | 0/284 [00:00<?, ? examples/s]

Map:   0%|          | 0/443 [00:00<?, ? examples/s]

  ✓ Tokenized 7,198 training examples

5. Setting up training...
  Epochs: 3
  Batch size: 16
  Learning rate: 2e-05
  Output dir: /content/drive/MyDrive/UofT DL Term project/GreBerta-experiment-2/pos_tagger_output


  trainer = Trainer(



6. TRAINING STARTED


Epoch,Training Loss,Validation Loss,Accuracy,Num Examples
1,0.0405,0.04778,0.987786,5158
2,0.0236,0.034769,0.988561,5158
3,0.0127,0.035012,0.9905,5158



TRAINING COMPLETE!

Training metrics:
  train_runtime: 109.1574
  train_samples_per_second: 197.824
  train_steps_per_second: 12.367
  total_flos: 429776895059172.0
  train_loss: 0.09515314415649131
  epoch: 3.0

7. Evaluating on dev set...


Dev accuracy: 0.9905

8. Evaluating on test set...
Test accuracy: 0.9928

9. Saving model to /content/drive/MyDrive/UofT DL Term project/GreBerta-experiment-2/pos_tagger_output...

✓ FINE-TUNING COMPLETE!

Model saved to: /content/drive/MyDrive/UofT DL Term project/GreBerta-experiment-2/pos_tagger_output
Dev accuracy:   0.9905
Test accuracy:  0.9928

To use the model:
  from transformers import AutoModelForTokenClassification, AutoTokenizer
  model = AutoModelForTokenClassification.from_pretrained('/content/drive/MyDrive/UofT DL Term project/GreBerta-experiment-2/pos_tagger_output')
  tokenizer = AutoTokenizer.from_pretrained('/content/drive/MyDrive/UofT DL Term project/GreBerta-experiment-2/pos_tagger_output')


Word Alignment

In [None]:
import json
import argparse
import random
from pathlib import Path
from typing import Dict, List, Tuple
from dataclasses import dataclass

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModel,
    get_linear_schedule_with_warmup
)
import numpy as np
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support, classification_report

@dataclass
class AlignmentExample:
    """Single verse with its Greek tokens, English tokens and gold alignment pairs."""
    verse_id: str  # verse identifier, copied from the 'verse_id' field of the JSON record
    greek_tokens: List[str]  # Greek word forms, in verse order
    english_tokens: List[str]  # English word forms, in verse order
    # (greek_idx, english_idx) pairs; presumably positions into the two token
    # lists above -- confirm against the data-preparation code
    alignments: List[Tuple[int, int]]


class AlignmentModel(nn.Module):
    """Cross-lingual word alignment model"""

    def __init__(self, greek_model_name='bowphs/GreBerta',
                 english_model_name='bert-base-uncased',
                 hidden_dim=256):
        super().__init__()

        # Encoders
        self.greek_encoder = AutoModel.from_pretrained(greek_model_name)
        self.english_encoder = AutoModel.from_pretrained(english_model_name)

        # Get hidden sizes
        greek_hidden = self.greek_encoder.config.hidden_size
        english_hidden = self.english_encoder.config.hidden_size

        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(greek_hidden + english_hidden, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, 2)  # Binary: aligned or not
        )

    def forward(self, greek_input_ids, greek_attention_mask,
                english_input_ids, english_attention_mask,
                greek_indices, english_indices):
        """
        Args:
            greek_input_ids: [batch_size, max_greek_len]
            greek_attention_mask: [batch_size, max_greek_len]
            english_input_ids: [batch_size, max_english_len]
            english_attention_mask: [batch_size, max_english_len]
            greek_indices: [batch_size, num_pairs] - which Greek token for each pair
            english_indices: [batch_size, num_pairs] - which English token for each pair

        Returns:
            logits: [batch_size, num_pairs, 2] - alignment scores
        """
        # Encode Greek
        greek_outputs = self.greek_encoder(
            input_ids=greek_input_ids,
            attention_mask=greek_attention_mask
        )
        greek_embeddings = greek_outputs.last_hidden_state  # [batch, greek_len, hidden]

        # Encode English
        english_outputs = self.english_encoder(
            input_ids=english_input_ids,
            attention_mask=english_attention_mask
        )
        english_embeddings = english_outputs.last_hidden_state  # [batch, eng_len, hidden]

        # Gather embeddings for specified pairs
        batch_size, num_pairs = greek_indices.shape

        # Get Greek embeddings for each pair
        greek_pair_embeddings = torch.gather(
            greek_embeddings,
            dim=1,
            index=greek_indices.unsqueeze(-1).expand(-1, -1, greek_embeddings.size(-1))
        )  # [batch, num_pairs, greek_hidden]

        # Get English embeddings for each pair
        english_pair_embeddings = torch.gather(
            english_embeddings,
            dim=1,
            index=english_indices.unsqueeze(-1).expand(-1, -1, english_embeddings.size(-1))
        )  # [batch, num_pairs, english_hidden]

        # Concatenate embeddings
        combined = torch.cat([greek_pair_embeddings, english_pair_embeddings], dim=-1)

        # Classify each pair
        logits = self.classifier(combined)  # [batch, num_pairs, 2]

        return logits


class AlignmentDataset(Dataset):
    """Dataset for word alignment training.

    Each item is one verse: both sides tokenized, plus a list of
    (Greek token position, English token position, label) pairs made of the
    gold alignments (label 1) and randomly sampled non-aligned pairs (label 0).

    Args:
        examples: verses to serve.
        greek_tokenizer: tokenizer for the Greek side; its output must expose
            `word_ids()`.
        english_tokenizer: tokenizer for the English side; same requirement.
        max_pairs_per_verse: upper bound on the pairs returned per verse.
        seed: optional integer. When given, the negative sampling and shuffling
            for item `idx` use a private generator derived from `(seed, idx)`,
            so the same item is returned on every access (useful for dev/test
            sets, whose metrics otherwise change from one pass to the next).
            When None (default), the global `random` module is used and every
            access draws fresh negatives.
    """

    def __init__(self, examples: List["AlignmentExample"],
                 greek_tokenizer, english_tokenizer,
                 max_pairs_per_verse=50, seed=None):
        self.examples = examples
        self.greek_tokenizer = greek_tokenizer
        self.english_tokenizer = english_tokenizer
        self.max_pairs_per_verse = max_pairs_per_verse
        self.seed = seed

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        example = self.examples[idx]
        rng = self._rng_for(idx)

        # Tokenize both sides (words joined with spaces)
        greek_encoded = self._encode(self.greek_tokenizer, example.greek_tokens)
        english_encoded = self._encode(self.english_tokenizer, example.english_tokens)

        # Map original word indices to subword indices
        greek_word_to_token = self._get_word_to_token_map(
            example.greek_tokens, greek_encoded
        )
        english_word_to_token = self._get_word_to_token_map(
            example.english_tokens, english_encoded
        )

        # Positive examples: gold alignments whose two words both survived
        # tokenization (words cut off by truncation have no entry in the maps).
        positive_pairs = [
            (greek_word_to_token[greek_idx], english_word_to_token[english_idx], 1)
            for greek_idx, english_idx in example.alignments
            if greek_idx in greek_word_to_token and english_idx in english_word_to_token
        ]

        # Negative examples: up to two random non-aligned pairs per positive
        num_negatives = min(len(positive_pairs) * 2, self.max_pairs_per_verse)
        negative_pairs = self._sample_negatives(
            rng,
            list(greek_word_to_token.values()),
            list(english_word_to_token.values()),
            positive_pairs,
            num_negatives,
        )

        # Combine, shuffle, then cap the total (the cap may drop positives too)
        all_pairs = positive_pairs + negative_pairs
        rng.shuffle(all_pairs)
        all_pairs = all_pairs[:self.max_pairs_per_verse]

        if not all_pairs:
            # Dummy negative pair so the item is never empty; position 1 is
            # the token right after the leading special token.
            all_pairs = [(1, 1, 0)]

        greek_indices = torch.tensor([p[0] for p in all_pairs], dtype=torch.long)
        english_indices = torch.tensor([p[1] for p in all_pairs], dtype=torch.long)
        labels = torch.tensor([p[2] for p in all_pairs], dtype=torch.long)

        return {
            'greek_input_ids': greek_encoded['input_ids'].squeeze(0),
            'greek_attention_mask': greek_encoded['attention_mask'].squeeze(0),
            'english_input_ids': english_encoded['input_ids'].squeeze(0),
            'english_attention_mask': english_encoded['attention_mask'].squeeze(0),
            'greek_indices': greek_indices,
            'english_indices': english_indices,
            'labels': labels,
            'verse_id': example.verse_id
        }

    def _rng_for(self, idx):
        """Return the random source for item `idx` (global module if unseeded)."""
        if self.seed is None:
            return random
        return random.Random(self.seed * 1_000_003 + int(idx))

    @staticmethod
    def _encode(tokenizer, words):
        """Tokenize the space-joined words into a batch-of-one tensor encoding."""
        return tokenizer(
            ' '.join(words),
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )

    @staticmethod
    def _sample_negatives(rng, greek_positions, english_positions,
                          positive_pairs, num_negatives):
        """Draw up to `num_negatives` random pairs that are not gold alignments.

        Gives up after `num_negatives * 10` draws, so fewer pairs may be
        returned; the same negative pair can be drawn more than once.
        """
        # Set of positive pairs for fast lookup
        positive_set = {(g, e) for g, e, _ in positive_pairs}

        negatives = []
        attempts = 0
        while len(negatives) < num_negatives and attempts < num_negatives * 10:
            g_idx = rng.choice(greek_positions)
            e_idx = rng.choice(english_positions)
            if (g_idx, e_idx) not in positive_set:
                negatives.append((g_idx, e_idx, 0))  # label: not aligned
            attempts += 1
        return negatives

    def _get_word_to_token_map(self, words, encoded):
        """Map word indices to their first subword token index.

        `words` is not used; it is kept for signature compatibility.

        NOTE(review): `word_ids()` numbers the words found by the tokenizer's
        own pre-tokenization of the space-joined text. That matches positions
        in the original token list only if no token is split further (e.g. on
        punctuation) -- confirm against the data.
        """
        word_to_token = {}
        for token_idx, word_idx in enumerate(encoded.word_ids()):
            if word_idx is not None and word_idx not in word_to_token:
                word_to_token[word_idx] = token_idx

        return word_to_token


def collate_fn(batch, greek_pad_id=0, english_pad_id=0):
    """Pad a list of `AlignmentDataset` items to common lengths and stack them.

    Args:
        batch: list of item dicts holding 1-D tensors under 'greek_input_ids',
            'greek_attention_mask', 'english_input_ids',
            'english_attention_mask', 'greek_indices', 'english_indices' and
            'labels', plus a 'verse_id'.
        greek_pad_id: fill value for padded Greek input ids (default 0).
        english_pad_id: fill value for padded English input ids (default 0).
            To pad with the tokenizers' own pad tokens, bind these with
            `functools.partial(collate_fn, greek_pad_id=tok.pad_token_id, ...)`.
            NOTE(review): 0 may not be the Greek tokenizer's pad id; padded
            positions are masked out by the attention mask either way.

    Returns:
        Dict of stacked tensors under the same keys, each right-padded to the
        longest entry in the batch, plus 'verse_ids' (list). Attention masks
        and pair indices are padded with 0; labels are padded with -100 so the
        loss and the metrics skip the padded pairs.
    """
    def pad_and_stack(key, fill):
        """Right-pad the 1-D tensors stored under `key` and stack them."""
        return torch.nn.utils.rnn.pad_sequence(
            [item[key] for item in batch],
            batch_first=True,
            padding_value=fill,
        )

    return {
        'greek_input_ids': pad_and_stack('greek_input_ids', greek_pad_id),
        'greek_attention_mask': pad_and_stack('greek_attention_mask', 0),
        'english_input_ids': pad_and_stack('english_input_ids', english_pad_id),
        'english_attention_mask': pad_and_stack('english_attention_mask', 0),
        # Padded pairs point at position 0 and are ignored through their label
        'greek_indices': pad_and_stack('greek_indices', 0),
        'english_indices': pad_and_stack('english_indices', 0),
        'labels': pad_and_stack('labels', -100),  # -100 = ignore
        'verse_ids': [item['verse_id'] for item in batch]
    }


def load_alignment_data(filepath: Path) -> List[AlignmentExample]:
    """Read the alignment JSON file and build one example per aligned verse.

    Verses whose 'alignments' list is empty are skipped. For the others, the
    'word' of each Greek/English token and the ('greek_idx', 'english_idx')
    of each alignment entry are collected.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    def to_example(verse):
        """Convert one verse record into an AlignmentExample."""
        greek_words = [tok['word'] for tok in verse['greek_tokens']]
        english_words = [tok['word'] for tok in verse['english_tokens']]
        links = [(link['greek_idx'], link['english_idx'])
                 for link in verse['alignments']]
        return AlignmentExample(
            verse_id=verse['verse_id'],
            greek_tokens=greek_words,
            english_tokens=english_words,
            alignments=links
        )

    return [to_example(verse) for verse in payload['data'] if verse['alignments']]


def train_epoch(model, dataloader, optimizer, scheduler, device):
    """Run one optimisation pass over `dataloader`.

    Returns a dict with the mean batch 'loss' and the binary 'precision',
    'recall' and 'f1' of the predictions made during the pass (computed over
    the pairs whose label is not -100).
    """
    model.train()

    # Padded pairs carry label -100 and are left out of the loss
    criterion = nn.CrossEntropyLoss(ignore_index=-100)
    # Tensors handed to the model, in the positional order it expects
    input_keys = (
        'greek_input_ids', 'greek_attention_mask',
        'english_input_ids', 'english_attention_mask',
        'greek_indices', 'english_indices',
    )

    running_loss = 0
    predicted, gold = [], []

    batches = tqdm(dataloader, desc='Training')
    for batch in batches:
        # Move to device
        inputs = [batch[key].to(device) for key in input_keys]
        labels = batch['labels'].to(device)

        # Forward pass and loss over the flattened pairs
        logits = model(*inputs)
        loss = criterion(logits.view(-1, 2), labels.view(-1))

        # Backward pass with gradient clipping, then optimizer + LR schedule
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        step_loss = loss.item()
        running_loss += step_loss

        # Collect predictions for the real (non-padded) pairs only
        keep = labels != -100
        predicted.extend(torch.argmax(logits, dim=-1)[keep].cpu().numpy())
        gold.extend(labels[keep].cpu().numpy())

        batches.set_postfix({'loss': f'{step_loss:.4f}'})

    precision, recall, f1 = precision_recall_fscore_support(
        gold, predicted, average='binary'
    )[:3]

    return {
        'loss': running_loss / len(dataloader),
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


def evaluate(model, dataloader, device):
    """Score `model` on `dataloader` without updating it.

    Returns a dict with the binary 'precision', 'recall' and 'f1' over all
    pairs whose label is not -100.
    """
    model.eval()

    # Tensors handed to the model, in the positional order it expects
    input_keys = (
        'greek_input_ids', 'greek_attention_mask',
        'english_input_ids', 'english_attention_mask',
        'greek_indices', 'english_indices',
    )
    predicted, gold = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Evaluating'):
            inputs = [batch[key].to(device) for key in input_keys]
            labels = batch['labels'].to(device)

            logits = model(*inputs)

            # Keep the real (non-padded) pairs only
            keep = labels != -100
            predicted.extend(torch.argmax(logits, dim=-1)[keep].cpu().numpy())
            gold.extend(labels[keep].cpu().numpy())

    precision, recall, f1 = precision_recall_fscore_support(
        gold, predicted, average='binary'
    )[:3]

    return {
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


def train_alignment():
    """Train, select and evaluate the Greek-English word alignment model.

    Steps: load the train/dev/test alignment data under `path_prefix + 'data/'`,
    build datasets and dataloaders, train `AlignmentModel` with AdamW and a
    linear warm-up schedule, keep the checkpoint with the best dev F1, then
    reload it, report dev/test precision/recall/F1 and save the weights, both
    tokenizers and a small `config.json` into the output directory.

    Progress is printed; nothing is returned.
    """
    args = {
        'epochs': 3,
        'batch_size': 8,
        'lr': 2e-5,
        'greek_model': 'bowphs/GreBerta',
        'english_model': 'bert-base-uncased',
        'seed': 42,
        'output_dir': path_prefix+'alignment_model_output_colab'
    }

    # Seed the sources of randomness used here: `random` drives the negative
    # sampling in the dataset, torch drives weight init, dropout and shuffling.
    random.seed(args['seed'])
    torch.manual_seed(args['seed'])

    print("=" * 80)
    print("TRAINING WORD ALIGNMENT MODEL")
    print("=" * 80)

    # Device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"\nUsing device: {device}")

    # Load data
    print("\n1. Loading data...")
    train_examples = load_alignment_data(Path(path_prefix+'data/train.json'))
    dev_examples = load_alignment_data(Path(path_prefix+'data/dev.json'))
    test_examples = load_alignment_data(Path(path_prefix+'data/test.json'))

    print(f"  Train: {len(train_examples):,} verses with alignments")
    print(f"  Dev:   {len(dev_examples):,} verses")
    print(f"  Test:  {len(test_examples):,} verses")

    # Load tokenizers
    print("\n2. Loading tokenizers...")
    greek_tokenizer = AutoTokenizer.from_pretrained(args['greek_model'])
    english_tokenizer = AutoTokenizer.from_pretrained(args['english_model'])
    print("  ✓ Tokenizers loaded")

    # Create datasets
    print("\n3. Creating datasets...")
    train_dataset = AlignmentDataset(train_examples, greek_tokenizer, english_tokenizer)
    dev_dataset = AlignmentDataset(dev_examples, greek_tokenizer, english_tokenizer)
    test_dataset = AlignmentDataset(test_examples, greek_tokenizer, english_tokenizer)

    train_dataloader = DataLoader(
        train_dataset, batch_size=args['batch_size'],
        shuffle=True, collate_fn=collate_fn
    )
    dev_dataloader = DataLoader(
        dev_dataset, batch_size=args['batch_size'],
        shuffle=False, collate_fn=collate_fn
    )
    test_dataloader = DataLoader(
        test_dataset, batch_size=args['batch_size'],
        shuffle=False, collate_fn=collate_fn
    )

    print(f"  ✓ Created {len(train_dataloader):,} training batches")

    # Create model
    print("\n4. Creating model...")
    model = AlignmentModel(
        greek_model_name=args['greek_model'],
        english_model_name=args['english_model']
    )
    model = model.to(device)

    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"  ✓ Model created")
    print(f"  Total parameters: {total_params:,}")
    print(f"  Trainable parameters: {trainable_params:,}")

    # Setup training
    print("\n5. Setting up training...")
    optimizer = torch.optim.AdamW(model.parameters(), lr=args['lr'])
    num_training_steps = len(train_dataloader) * args['epochs']
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_training_steps // 10,  # warm up over the first 10% of steps
        num_training_steps=num_training_steps
    )

    print(f"  Epochs: {args['epochs']}")
    print(f"  Batch size: {args['batch_size']}")
    print(f"  Learning rate: {args['lr']}")
    print(f"  Output dir: {args['output_dir']}")

    output_dir = Path(args['output_dir'])
    best_checkpoint = output_dir / 'best_model.pt'

    def evaluate_reproducibly(dataloader):
        """Evaluate with the `random` module pinned to the configured seed.

        The dataset draws its negative pairs from the global `random` module
        on every access, so without pinning the same weights are scored
        against a different set of negatives on every pass. The previous
        generator state is restored afterwards so training is unaffected.
        Relies on the dataloader fetching items in the main process (the
        default, `num_workers=0`).
        """
        saved_state = random.getstate()
        random.seed(args['seed'])
        try:
            return evaluate(model, dataloader, device)
        finally:
            random.setstate(saved_state)

    # Train!
    print("\n" + "=" * 80)
    print("6. TRAINING STARTED")
    print("=" * 80)

    # Start below any reachable F1 so the first epoch always writes a
    # checkpoint; otherwise a dev F1 of 0 would leave nothing to reload below.
    best_f1 = -1.0
    for epoch in range(args['epochs']):
        print(f"\nEpoch {epoch + 1}/{args['epochs']}")
        print("-" * 80)

        # Train
        train_metrics = train_epoch(model, train_dataloader, optimizer, scheduler, device)
        print(f"Train - Loss: {train_metrics['loss']:.4f}, "
              f"P: {train_metrics['precision']:.4f}, "
              f"R: {train_metrics['recall']:.4f}, "
              f"F1: {train_metrics['f1']:.4f}")

        # Evaluate
        dev_metrics = evaluate_reproducibly(dev_dataloader)
        print(f"Dev   - P: {dev_metrics['precision']:.4f}, "
              f"R: {dev_metrics['recall']:.4f}, "
              f"F1: {dev_metrics['f1']:.4f}")

        # Save best model
        if dev_metrics['f1'] > best_f1:
            best_f1 = dev_metrics['f1']
            output_dir.mkdir(parents=True, exist_ok=True)
            torch.save(model.state_dict(), best_checkpoint)
            print(f"  ✓ Saved best model (F1: {best_f1:.4f})")

    # Final evaluation
    print("\n" + "=" * 80)
    print("7. FINAL EVALUATION")
    print("=" * 80)

    # Load best model (map_location keeps this working if the device changed)
    model.load_state_dict(torch.load(best_checkpoint, map_location=device))

    print("\nDev set:")
    dev_metrics = evaluate_reproducibly(dev_dataloader)
    print(f"  Precision: {dev_metrics['precision']:.4f}")
    print(f"  Recall:    {dev_metrics['recall']:.4f}")
    print(f"  F1 Score:  {dev_metrics['f1']:.4f}")

    print("\nTest set:")
    test_metrics = evaluate_reproducibly(test_dataloader)
    print(f"  Precision: {test_metrics['precision']:.4f}")
    print(f"  Recall:    {test_metrics['recall']:.4f}")
    print(f"  F1 Score:  {test_metrics['f1']:.4f}")

    # Save final artifacts
    print(f"\n8. Saving model and tokenizers...")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Same weights as best_model.pt, written under the final artifact name
    torch.save(model.state_dict(), output_dir / 'model.pt')
    greek_tokenizer.save_pretrained(output_dir / 'greek_tokenizer')
    english_tokenizer.save_pretrained(output_dir / 'english_tokenizer')

    # Save config
    config = {
        'greek_model': args['greek_model'],
        'english_model': args['english_model'],
        'dev_f1': dev_metrics['f1'],
        'test_f1': test_metrics['f1'],
    }
    with open(output_dir / 'config.json', 'w', encoding='utf-8') as f:
        json.dump(config, f, indent=2)

    print("\n" + "=" * 80)
    print("✓ TRAINING COMPLETE!")
    print("=" * 80)
    print(f"\nModel saved to: {args['output_dir']}")
    print(f"Test F1 Score: {test_metrics['f1']:.4f}")
    print("\nTo use the model, see test_alignment.py")
    print("=" * 80)


# Run the word-alignment pipeline defined above: load data, train, evaluate, save.
train_alignment()

TRAINING WORD ALIGNMENT MODEL

Using device: cuda

1. Loading data...
  Train: 5,846 verses with alignments
  Dev:   284 verses
  Test:  380 verses

2. Loading tokenizers...
  ✓ Tokenizers loaded

3. Creating datasets...
  ✓ Created 731 training batches

4. Creating model...


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/GreBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  ✓ Model created
  Total parameters: 235,886,978
  Trainable parameters: 235,886,978

5. Setting up training...
  Epochs: 3
  Batch size: 8
  Learning rate: 2e-05
  Output dir: /content/drive/MyDrive/UofT DL Term project/GreBerta-experiment-2/alignment_model_output_colab

6. TRAINING STARTED

Epoch 1/3
--------------------------------------------------------------------------------


Training: 100%|██████████| 731/731 [02:44<00:00,  4.45it/s, loss=0.3639]


Train - Loss: 0.4655, P: 0.7482, R: 0.5743, F1: 0.6498


Evaluating: 100%|██████████| 36/36 [00:00<00:00, 39.19it/s]


Dev   - P: 0.7770, R: 0.7843, F1: 0.7806
  ✓ Saved best model (F1: 0.7806)

Epoch 2/3
--------------------------------------------------------------------------------


Training: 100%|██████████| 731/731 [02:45<00:00,  4.41it/s, loss=0.2221]


Train - Loss: 0.2964, P: 0.8195, R: 0.8487, F1: 0.8338


Evaluating: 100%|██████████| 36/36 [00:00<00:00, 39.02it/s]


Dev   - P: 0.8122, R: 0.8456, F1: 0.8286
  ✓ Saved best model (F1: 0.8286)

Epoch 3/3
--------------------------------------------------------------------------------


Training: 100%|██████████| 731/731 [02:44<00:00,  4.43it/s, loss=0.2497]


Train - Loss: 0.2486, P: 0.8485, R: 0.8846, F1: 0.8662


Evaluating: 100%|██████████| 36/36 [00:00<00:00, 38.75it/s]


Dev   - P: 0.8337, R: 0.8552, F1: 0.8443
  ✓ Saved best model (F1: 0.8443)

7. FINAL EVALUATION

Dev set:


Evaluating: 100%|██████████| 36/36 [00:00<00:00, 38.10it/s]


  Precision: 0.8390
  Recall:    0.8619
  F1 Score:  0.8503

Test set:


Evaluating: 100%|██████████| 48/48 [00:01<00:00, 31.18it/s]


  Precision: 0.8835
  Recall:    0.9108
  F1 Score:  0.8969

8. Saving model and tokenizers...

✓ TRAINING COMPLETE!

Model saved to: /content/drive/MyDrive/UofT DL Term project/GreBerta-experiment-2/alignment_model_output_colab
Test F1 Score: 0.8969

To use the model, see test_alignment.py
