In [1]:
pip install datasets seqeval tqdm

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=8130d68b34f2b72d1158a5cb0d5a7d7d0d19ff15e6c2475048816501038569ae
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected pack

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from transformers import BertTokenizer, BertModel, logging, BertTokenizerFast
from seqeval.metrics import f1_score, classification_report
import numpy as np
from tqdm import tqdm
import warnings

# Ignore specific warnings
warnings.filterwarnings("ignore", message=".*Unable to register.*", category=UserWarning)
warnings.filterwarnings("ignore", message=".*Xet Storage.*", category=UserWarning)
warnings.filterwarnings("ignore", message=".*You are using `torch.load`.*", category=FutureWarning)


# Reduce verbosity of transformers warnings
logging.set_verbosity_error()

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load dataset
# Use trust_remote_code=True to avoid the prompt
dataset = load_dataset("eriktks/conll2003", trust_remote_code=True)
print(f"Dataset loaded: {dataset.keys()}")

# --- Define NER tags with START/END ---
START_TAG = "<START>"
END_TAG = "<END>"
tag2idx = {
    'O': 0,
    'B-PER': 1, 'I-PER': 2,
    'B-ORG': 3, 'I-ORG': 4,
    'B-LOC': 5, 'I-LOC': 6,
    'B-MISC': 7, 'I-MISC': 8,
    START_TAG: 9, END_TAG: 10
}
idx2tag = {v: k for k, v in tag2idx.items()}
num_tags = len(tag2idx) # Now 11
print(f"Number of tags (including START/END): {num_tags}")

# Load tokenizer and BERT model
try:
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
    print("Using fast tokenizer")
except ImportError:
    print("Fast tokenizer not available, using standard tokenizer")
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
bert_model = BertModel.from_pretrained('bert-base-cased')

class NERDataset(Dataset):
    def __init__(self, dataset_split, tokenizer, tag2idx, max_len=128):
        self.dataset = dataset_split
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.tag2idx = tag2idx
        # Use index 0 ('O') for unknown tags encountered
        self.unknown_tag_idx = self.tag2idx['O']

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        words = self.dataset[idx]['tokens']
        original_tags = self.dataset[idx]['ner_tags']

        # Convert CoNLL numeric tags to string tags using the dataset's feature info
        conll_tag_names = self.dataset.features['ner_tags'].feature.names
        tags = [conll_tag_names[tag_idx] for tag_idx in original_tags]

        # Map string tags to our tag indices
        tag_ids = [self.tag2idx.get(tag, self.unknown_tag_idx) for tag in tags]

        # Tokenize words using the tokenizer
        # Important: Use is_split_into_words=True and get word_ids
        encoding = self.tokenizer(
            words,
            is_split_into_words=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_offsets_mapping=False # Not needed here, word_ids is better
        )

        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']

        # Align tags with tokenized input_ids using word_ids
        aligned_tags = [-100] * len(input_ids)  # Initialize with -100 (ignore index)
        word_ids = encoding.word_ids()

        previous_word_idx = None
        for i, word_idx in enumerate(word_ids):
            # Handle special tokens ([CLS], [SEP], [PAD]) which have word_idx = None
            if word_idx is None:
                aligned_tags[i] = -100
            # Handle tokens corresponding to actual words
            elif word_idx != previous_word_idx:
                # Only label the first token of a given word
                if word_idx < len(tag_ids):
                     aligned_tags[i] = tag_ids[word_idx]
                else:
                     # Should not happen if word_ids aligns correctly with input words
                     aligned_tags[i] = -100 # Or self.unknown_tag_idx if preferred, but -100 is standard
                previous_word_idx = word_idx
            else:
                 # Label subsequent tokens of the same word with -100
                 aligned_tags[i] = -100

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(aligned_tags, dtype=torch.long)
        }

# --- Updated Model: BERT + BiLSTM + CRF with START/END tags ---
class BERT_BiLSTM_CRF(nn.Module):
    def __init__(self, bert_model, lstm_hidden_dim, num_tags, tag2idx):
        super(BERT_BiLSTM_CRF, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        self.lstm = nn.LSTM(
            input_size=bert_model.config.hidden_size,
            hidden_size=lstm_hidden_dim,
            num_layers=1, # Reduced layers for simplicity, can be tuned
            bidirectional=True,
            batch_first=True
        )
        self.hidden2tag = nn.Linear(lstm_hidden_dim * 2, num_tags)

        self.tag2idx = tag2idx
        self.num_tags = num_tags
        self.start_tag_idx = self.tag2idx[START_TAG]
        self.end_tag_idx = self.tag2idx[END_TAG]

        # CRF transitions parameter
        self.transitions = nn.Parameter(torch.randn(num_tags, num_tags))

        # Constrain transitions:
        # - No transition *to* START_TAG
        # - No transition *from* END_TAG
        self.transitions.data[self.start_tag_idx, :] = -10000.0
        self.transitions.data[:, self.end_tag_idx] = -10000.0
        # Optional: Add BIO constraints later if needed

    def _forward_alg(self, feats, mask):
        batch_size, seq_len, tag_size = feats.size()
        assert tag_size == self.num_tags

        # Initialize alpha: log probability of being in state j at step 0
        # Set START_TAG probability to 0 (-inf for others)
        log_alpha = torch.full((batch_size, self.num_tags), -10000.0, device=device)
        log_alpha[:, self.start_tag_idx] = 0.0

        # Iterate through the sequence
        for t in range(seq_len):
            # Get emission scores for timestep t for all tags and batches
            emit_scores_t = feats[:, t].unsqueeze(1)  # (batch_size, 1, num_tags)

            # Get transition scores (independent of batch)
            # trans_scores[i, j] = transition score from tag i to tag j
            trans_scores = self.transitions.unsqueeze(0) # (1, num_tags, num_tags)

            # Combine previous alpha, transition, and emission scores
            # log_alpha has shape (batch_size, num_tags)
            # alpha_t[j] = log P(path ending at state j at time t)
            # next_alpha_t[k] = log P(path ending at state k at time t+1)
            #                 = logsumexp_j (alpha_t[j] + transition[j, k] + emission[k])
            log_alpha_t = log_alpha.unsqueeze(2) # (batch_size, num_tags, 1)
            next_log_alpha_t = log_alpha_t + trans_scores + emit_scores_t # (batch_size, num_tags, num_tags)
            next_log_alpha_t = torch.logsumexp(next_log_alpha_t, dim=1) # (batch_size, num_tags)

            # Apply mask: If mask is 0, keep the previous log_alpha
            mask_t = mask[:, t].unsqueeze(1).float() # (batch_size, 1)
            log_alpha = mask_t * next_log_alpha_t + (1 - mask_t) * log_alpha

        # Add final transition to END_TAG
        # Note: We consider the transition to END_TAG *after* the last emission score
        # has been incorporated into log_alpha at the final valid timestep.
        # The current log_alpha holds scores for paths ending at the last token.
        log_alpha += self.transitions[self.end_tag_idx, :].unsqueeze(0)

        # Log-sum-exp over the final scores for all tags to get the partition function Z(x)
        # This represents the log probability of all possible paths.
        log_partition_function = torch.logsumexp(log_alpha, dim=1) # (batch_size,)
        return log_partition_function

    def _score_sentence(self, feats, tags, mask):
        batch_size, seq_len = tags.size()
        assert feats.size(0) == batch_size and feats.size(1) == seq_len

        # Initialize score with transition from START_TAG to the first actual tag
        start_tags = torch.full((batch_size,), self.start_tag_idx, dtype=torch.long, device=device)
        # Get transition score from START to first tag (tags[:, 0])
        score = self.transitions[start_tags, tags[:, 0]]

        # Add emission score for the first tag, only if it's not masked
        score += torch.gather(feats[:, 0], 1, tags[:, 0].unsqueeze(1)).squeeze(1) * mask[:, 0].float()

        # Iterate through the rest of the sequence (from t=1 to seq_len-1)
        for t in range(1, seq_len):
            mask_t = mask[:, t].float()
            # Transition score from previous tag (tags[:, t-1]) to current tag (tags[:, t])
            trans_score = self.transitions[tags[:, t-1], tags[:, t]]
            # Emission score for the current tag (tags[:, t]) at timestep t
            emit_score = torch.gather(feats[:, t], 1, tags[:, t].unsqueeze(1)).squeeze(1)
            # Add scores only for non-masked positions
            score += mask_t * (trans_score + emit_score)

        # Add transition score to END_TAG from the last valid tag in the sequence
        # Find the index of the last valid token for each sequence in the batch
        last_valid_idx = mask.sum(dim=1).long() - 1 # (batch_size,)
        # Get the tag at the last valid index for each sequence
        # Need to handle cases where seq len is 0 (all masked) -> last_valid_idx = -1
        valid_seq_mask = last_valid_idx >= 0
        if valid_seq_mask.any():
            last_valid_tags = torch.gather(tags[valid_seq_mask], 1, last_valid_idx[valid_seq_mask].unsqueeze(1)).squeeze(1)
            # Add transition score from last_valid_tag to END_TAG
            score[valid_seq_mask] += self.transitions[last_valid_tags, self.end_tag_idx]

        return score

    def _viterbi_decode(self, feats, mask):
        batch_size, seq_len, tag_size = feats.size()
        assert tag_size == self.num_tags

        # Initialize Viterbi path scores (log probabilities)
        # Set START_TAG score to 0, others to -inf
        log_delta = torch.full((batch_size, self.num_tags), -10000.0, device=device)
        log_delta[:, self.start_tag_idx] = 0.0

        # Initialize backpointers matrix (to store the best previous tag index)
        psi = torch.zeros(batch_size, seq_len, self.num_tags, dtype=torch.long, device=device)

        # Iterate through the sequence
        for t in range(seq_len):
            # Get emission scores for timestep t
            emit_scores_t = feats[:, t].unsqueeze(1) # (batch_size, 1, num_tags)

            # Get transition scores
            trans_scores = self.transitions.unsqueeze(0) # (1, num_tags, num_tags)

            # Combine previous delta, transition, and emission scores
            # log_delta_t has shape (batch_size, num_tags)
            # log_delta_t[j] = max log P(best path ending at state j at time t)
            # next_log_delta_t[k] = max_j (log_delta_t[j] + transition[j, k] + emission[k])
            log_delta_t = log_delta.unsqueeze(2) # (batch_size, num_tags, 1)
            next_log_delta_t = log_delta_t + trans_scores + emit_scores_t # (batch_size, num_tags, num_tags)

            # Find the maximum score and the corresponding previous tag index (backpointer)
            max_log_delta_t, psi[:, t] = torch.max(next_log_delta_t, dim=1) # (batch_size, num_tags), (batch_size, num_tags)

            # Apply mask: If mask is 0, keep the previous log_delta
            mask_t = mask[:, t].unsqueeze(1).float() # (batch_size, 1)
            log_delta = mask_t * max_log_delta_t + (1 - mask_t) * log_delta

        # Add final transition to END_TAG
        log_delta += self.transitions[self.end_tag_idx, :].unsqueeze(0)

        # Find the best score and the tag index at the end of the sequence (before END transition)
        # The best score should correspond to the path ending in END_TAG, but we find the max over all tags
        # The tag index corresponds to the tag at the *last valid position* that leads to the best overall score.
        best_path_score, last_tag = torch.max(log_delta, dim=1) # (batch_size,), (batch_size,)

        # Backtrack using backpointers (psi) to find the best path
        best_paths = torch.zeros(batch_size, seq_len, dtype=torch.long, device=device)
        seq_ends = mask.sum(dim=1).long() # Length of each sequence

        for b in range(batch_size):
            # Start backtracking from the last valid position
            seq_end = seq_ends[b].item()
            if seq_end == 0: # Handle empty sequences after masking
                continue

            # The 'last_tag' is the best tag index at the last valid timestep (seq_end - 1)
            best_tag_b = last_tag[b].item()
            best_paths[b, seq_end - 1] = best_tag_b

            # Follow backpointers from t = seq_end - 1 down to 0
            for t in range(seq_end - 2, -1, -1):
                best_tag_b = psi[b, t + 1, best_tag_b].item()
                best_paths[b, t] = best_tag_b

        return best_path_score, best_paths

    def neg_log_likelihood(self, feats, tags, mask):
        # Calculate the log partition function (sum over all paths)
        forward_score = self._forward_alg(feats, mask)
        # Calculate the score of the true path
        gold_score = self._score_sentence(feats, tags, mask)

        # Negative log likelihood = log Z(x) - score(true_path)
        # Average over the batch
        return torch.mean(forward_score - gold_score)

    def forward(self, input_ids, attention_mask, labels=None):
        # Get BERT embeddings
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_outputs.last_hidden_state
        sequence_output = self.dropout(sequence_output)

        # Pass through BiLSTM
        lstm_output, _ = self.lstm(sequence_output)
        lstm_output = self.dropout(lstm_output)

        # Project to tag space (get emission scores)
        emissions = self.hidden2tag(lstm_output) # (batch, seq_len, num_tags)

        if labels is not None:
            # --- Training/Validation ---
            # Create mask for loss calculation: ignore padding AND -100 labels
            # Mask should be 1 for valid tokens, 0 otherwise
            loss_mask = (attention_mask == 1) & (labels != -100)
            loss_mask = loss_mask.long() # Convert boolean mask to long/float for CRF methods

            # Clone labels and replace -100 with a valid index (e.g., 'O')
            # This index won't contribute to the loss because of the mask,
            # but CRF methods expect valid indices.
            valid_labels = labels.clone()
            valid_labels[labels == -100] = self.tag2idx['O'] # Replace -100 with 'O' index

            # Calculate negative log likelihood loss using the mask
            loss = self.neg_log_likelihood(emissions, valid_labels, loss_mask)

            # Decode best path using the same mask (for potential inspection during training)
            # Note: Decoding during training adds overhead. Usually only done for evaluation.
            # We return it here as the original code did.
            _, best_path = self._viterbi_decode(emissions, loss_mask)

            return loss, best_path
        else:
            # --- Inference ---
            # Use attention mask directly for decoding (assumes no -100 labels in input)
            # Mask should be 1 for non-padding tokens, 0 for padding
            inference_mask = attention_mask.long()
            _, best_path = self._viterbi_decode(emissions, inference_mask)
            return best_path

def prepare_datasets(tag2idx):
    # Prepare datasets
    train_dataset = NERDataset(dataset['train'], tokenizer, tag2idx)
    val_dataset = NERDataset(dataset['validation'], tokenizer, tag2idx)
    test_dataset = NERDataset(dataset['test'], tokenizer, tag2idx)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True) # Reduced batch size
    val_loader = DataLoader(val_dataset, batch_size=16)
    test_loader = DataLoader(test_dataset, batch_size=16)

    return train_loader, val_loader, test_loader

def train_model(model, train_loader, val_loader, epochs=4, learning_rate=3e-5): # Adjusted LR and epochs
    # Prepare optimizer
    # Separate parameters for BERT and other layers if desired (differential learning rates)
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
    # Scheduler (optional, but often helpful)
    # Example: Linear warmup and decay
    # num_training_steps = len(train_loader) * epochs
    # scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9) # Simpler scheduler

    best_f1 = 0.0
    model.to(device)

    for epoch in range(epochs):
        # Training
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{epochs}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            optimizer.zero_grad()
            loss, _ = model(input_ids, attention_mask, labels=labels) # Pass labels

            # Backward pass
            # Check for NaN/inf loss
            if torch.isnan(loss) or torch.isinf(loss):
                print(f"Warning: NaN or Inf loss detected: {loss.item()}. Skipping batch.")
                torch.cuda.empty_cache() # Clear cache if memory issues might be related
                continue

            loss.backward()
            # Gradient clipping (optional but recommended for stability)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            # scheduler.step() # Step scheduler per batch if using linear warmup/decay

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader) if len(train_loader) > 0 else 0
        print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_loss:.4f}")

        # Validation
        val_f1 = evaluate(model, val_loader, idx2tag) # Pass idx2tag
        print(f"Epoch {epoch+1}/{epochs}, Validation F1: {val_f1:.4f}")

        # Save best model based on validation F1
        if val_f1 > best_f1:
            best_f1 = val_f1
            print(f"New best validation F1: {best_f1:.4f}. Saving model...")
            torch.save(model.state_dict(), 'best_ner_model.pt')

        scheduler.step() # Step scheduler per epoch if using StepLR

    print(f"Training finished. Best Validation F1: {best_f1:.4f}")
    return model

def evaluate(model, data_loader, idx2tag_map):
    model.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device) # Ground truth labels

            # Get predictions from the model (forward pass without labels)
            best_paths = model(input_ids, attention_mask) # Returns predicted tag indices

            # Process batch results
            for i in range(input_ids.size(0)):
                # Create mask to select valid tokens (not padding, not ignored -100 labels)
                valid_mask = (attention_mask[i] == 1) & (labels[i] != -100)

                # Get predicted tags for valid tokens
                valid_preds = best_paths[i][valid_mask]
                # Get ground truth tags for valid tokens
                valid_labels = labels[i][valid_mask]

                # Convert indices to tag names, handling potential out-of-bounds
                pred_tags = [idx2tag_map.get(tag_idx.item(), 'O') for tag_idx in valid_preds]
                gold_tags = [idx2tag_map.get(tag_idx.item(), 'O') for tag_idx in valid_labels]

                # Add the sequence tags to the overall lists
                # Ensure we don't add empty lists if a sequence had no valid tokens
                if pred_tags and gold_tags:
                    all_predictions.append(pred_tags)
                    all_labels.append(gold_tags)

    # Check if we collected any results
    if not all_labels or not all_predictions:
        print("Warning: No valid labels or predictions found during evaluation.")
        return 0.0

    try:
        # Calculate F1 score using seqeval
        # Ensure labels are in the correct BIO format if needed by seqeval (they should be)
        f1 = f1_score(all_labels, all_predictions, average='macro') # Use macro F1 for overall performance
        print("\nClassification Report (Validation/Test):")
        print(classification_report(all_labels, all_predictions, digits=4))
        return f1
    except Exception as e:
        print(f"Error calculating F1 score: {e}")
        # Print sample data for debugging if error occurs
        print("Sample Gold:", all_labels[0][:20] if all_labels else "N/A")
        print("Sample Pred:", all_predictions[0][:20] if all_predictions else "N/A")
        return 0.0


def main():
    # Set seeds for reproducibility
    torch.manual_seed(42)
    np.random.seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)

    # Prepare datasets using the updated tag mapping
    train_loader, val_loader, test_loader = prepare_datasets(tag2idx)
    
    # Initialize model with updated parameters
    model = BERT_BiLSTM_CRF(
        bert_model,
        lstm_hidden_dim=256,
        num_tags=num_tags, # Pass the updated num_tags (including START/END)
        tag2idx=tag2idx     # Pass the tag mapping
    )

    # Train model with potentially adjusted hyperparameters
    print("Starting training...")
    model = train_model(model, train_loader, val_loader, epochs=3, learning_rate=3e-5) # Reduced epochs, common LR

    # Load best model saved during training for final evaluation
    print("\nLoading best model for final test evaluation...")
    model.load_state_dict(torch.load('best_ner_model.pt'))
    model.to(device) # Ensure model is on the correct device after loading

    # Evaluate on the test set
    print("Evaluating on Test Set...")
    test_f1 = evaluate(model, test_loader, idx2tag) # Pass idx2tag
    print(f"\nFinal Test F1 Score (Macro): {test_f1:.4f}")

if __name__ == "__main__":
    main()

Using device: cuda
Dataset loaded: dict_keys(['train', 'validation', 'test'])
Number of tags (including START/END): 11
Using fast tokenizer
Starting training...


Training Epoch 1/3: 100%|██████████| 878/878 [05:10<00:00,  2.82it/s]


Epoch 1/3, Training Loss: 10003.9656


Evaluating: 100%|██████████| 204/204 [00:19<00:00, 10.53it/s]



Classification Report (Validation/Test):
              precision    recall  f1-score   support

         LOC     0.9706    0.9341    0.9520      1837
        MISC     0.8596    0.8633    0.8615       922
         ORG     0.8783    0.9038    0.8908      1341
         PER     0.9420    0.9739    0.9577      1836

   micro avg     0.9230    0.9286    0.9258      5936
   macro avg     0.9126    0.9188    0.9155      5936
weighted avg     0.9237    0.9286    0.9259      5936

Epoch 1/3, Validation F1: 0.9155
New best validation F1: 0.9155. Saving model...


Training Epoch 2/3: 100%|██████████| 878/878 [05:11<00:00,  2.82it/s]


Epoch 2/3, Training Loss: 9997.2098


Evaluating: 100%|██████████| 204/204 [00:19<00:00, 10.51it/s]



Classification Report (Validation/Test):
              precision    recall  f1-score   support

         LOC     0.9712    0.9352    0.9529      1837
        MISC     0.8720    0.9089    0.8901       922
         ORG     0.8611    0.9336    0.8959      1341
         PER     0.9762    0.9602    0.9681      1836

   micro avg     0.9301    0.9385    0.9343      5936
   macro avg     0.9201    0.9345    0.9267      5936
weighted avg     0.9324    0.9385    0.9350      5936

Epoch 2/3, Validation F1: 0.9267
New best validation F1: 0.9267. Saving model...


Training Epoch 3/3: 100%|██████████| 878/878 [05:12<00:00,  2.81it/s]


Epoch 3/3, Training Loss: 9994.4451


Evaluating: 100%|██████████| 204/204 [00:19<00:00, 10.37it/s]



Classification Report (Validation/Test):
              precision    recall  f1-score   support

         LOC     0.9643    0.9559    0.9601      1837
        MISC     0.8818    0.9143    0.8978       922
         ORG     0.8789    0.9366    0.9069      1341
         PER     0.9777    0.9542    0.9658      1836

   micro avg     0.9348    0.9446    0.9397      5936
   macro avg     0.9257    0.9403    0.9326      5936
weighted avg     0.9363    0.9446    0.9402      5936

Epoch 3/3, Validation F1: 0.9326
New best validation F1: 0.9326. Saving model...
Training finished. Best Validation F1: 0.9326

Loading best model for final test evaluation...
Evaluating on Test Set...


Evaluating: 100%|██████████| 216/216 [00:20<00:00, 10.66it/s]



Classification Report (Validation/Test):
              precision    recall  f1-score   support

         LOC     0.9272    0.9178    0.9225      1666
        MISC     0.7445    0.8177    0.7794       702
         ORG     0.8403    0.9121    0.8747      1661
         PER     0.9734    0.9276    0.9499      1615

   micro avg     0.8879    0.9064    0.8971      5644
   macro avg     0.8713    0.8938    0.8816      5644
weighted avg     0.8921    0.9064    0.8985      5644


Final Test F1 Score (Macro): 0.8816
