In [1]:
!pip install git+https://github.com/kmkurn/pytorch-crf.git

Collecting git+https://github.com/kmkurn/pytorch-crf.git
  Cloning https://github.com/kmkurn/pytorch-crf.git to /tmp/pip-req-build-n3lnukj1
  Running command git clone --filter=blob:none --quiet https://github.com/kmkurn/pytorch-crf.git /tmp/pip-req-build-n3lnukj1
  Resolved https://github.com/kmkurn/pytorch-crf.git to commit 623e3402d00a2728e99d6e8486010d67c754267b
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pytorch-crf
  Building wheel for pytorch-crf (setup.py) ... [?25l[?25hdone
  Created wheel for pytorch-crf: filename=pytorch_crf-0.7.2-py3-none-any.whl size=6410 sha256=1c34c717aaeda2f41e418db9966a6e95725928e6ed0748797631650c4537fd43
  Stored in directory: /tmp/pip-ephem-wheel-cache-dhxzrify/wheels/39/5f/f6/4b48b35895d914f4f5fff5b600f87658c11693e37b6a4f118e
Successfully built pytorch-crf
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


In [2]:
# Experiment: Named Entity Recognition (NER) using BiLSTM-CRF with FastText Embeddings(Frozen)
# Configuration Settings:
# - Embedding Dimension: 300 (FastText)
# - Hidden Dimension: 256/128 (BiLSTM)
# - Batch Size: 64/32
# - Model: BiLSTM-CRF (Single NER MOdel/Joint Model for POS and NER)
# - Optimizer: Adam with L2 regularization (weight_decay=1e-5)
# - Learning Rate: 0.001
# - Dropout: 0.5
# - Early Stopping: Patience of 3 epochs
# - Training Epochs: up to 30
# - Dataset: Pre-split CoNLL format (train_v5.conll, val_v5.conll, test_v5.conll)
# - FastText Model: cc.my.300.bin (pre-trained FastText embeddings)
# - Evaluation Metrics: Precision, Recall, F1-Score for both POS and NER tasks
# - Hardware: NVIDIA Tesla T4 GPU

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchcrf import CRF
from sklearn.metrics import classification_report
from gensim.models.fasttext import load_facebook_model
import random

In [4]:
#embedding300/hiddendim256/batch64(Single NER Model)
# Load FastText binary model
fasttext_bin_file = "/kaggle/input/glove-100d/cc.my.300.bin"  
fasttext_model = load_facebook_model(fasttext_bin_file)

# Extract the word vectors
fasttext_vectors = fasttext_model.wv
print(f"Loaded {len(fasttext_vectors)} words from FastText binary model.")

# Define Dataset Class (NER-only)
class CoNLLDataset(Dataset):
    def __init__(self, file_path):
        self.sentences, self.ner_tags = self.load_data(file_path)

    def load_data(self, file_path):
        sentences, ner_tags = [], []
        with open(file_path, "r", encoding="utf-8") as f:
            sentence, ner_tag = [], []
            for line in f:
                if line.strip():
                    word, _, ner = line.strip().split("\t")  # Ignore POS tags
                    sentence.append(word)
                    ner_tag.append(ner)
                else:
                    if sentence:
                        sentences.append(sentence)
                        ner_tags.append(ner_tag)
                    sentence, ner_tag = [], []
            if sentence:
                sentences.append(sentence)
                ner_tags.append(ner_tag)
        return sentences, ner_tags

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.ner_tags[idx]

# Collate function for dynamic padding (NER-only)
def collate_fn(batch):
    sentences, ner_tags = zip(*batch)
    max_len = max(len(s) for s in sentences)

    sentence_tensors = []
    ner_tensors = []

    for s, n in zip(sentences, ner_tags):
        padded_sentence = s + ["<PAD>"] * (max_len - len(s))
        padded_ner = n + ["<PAD>"] * (max_len - len(n))

        sentence_tensors.append(torch.tensor([vocab.get(word, vocab["<UNK>"]) for word in padded_sentence], dtype=torch.long))
        ner_tensors.append(torch.tensor([ner_tag_to_ix[tag] for tag in padded_ner], dtype=torch.long))

    return torch.stack(sentence_tensors), torch.stack(ner_tensors)

# Define BiLSTM-CRF Model for NER-only
class BiLSTMCRF_NER(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_ner_tags, fasttext_embeddings):
        super(BiLSTMCRF_NER, self).__init__()
        # Initialize embedding layer with FastText embeddings (frozen)
        self.embedding = nn.Embedding.from_pretrained(fasttext_embeddings, freeze=True)  # Freeze embeddings
        self.dropout = nn.Dropout(0.5)  # Add dropout
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.ner_fc = nn.Linear(hidden_dim * 2, num_ner_tags)
        self.ner_crf = CRF(num_ner_tags, batch_first=True)

    def forward(self, x):
        embeddings = self.embedding(x)
        embeddings = self.dropout(embeddings)  # Apply dropout
        lstm_out, _ = self.bilstm(embeddings)
        lstm_out = self.dropout(lstm_out)  # Apply dropout
        ner_logits = self.ner_fc(lstm_out)
        return ner_logits

    def compute_loss(self, x, ner_tags):
        ner_logits = self.forward(x)
        ner_loss = -self.ner_crf(ner_logits, ner_tags, mask=(x != vocab["<PAD>"]))
        return ner_loss

    def decode(self, x):
        ner_logits = self.forward(x)
        ner_tags = self.ner_crf.decode(ner_logits)
        return ner_tags

# Paths to pre-split datasets
train_file_path = "/kaggle/input/split-fix-data/train_v5.conll"
val_file_path = "/kaggle/input/split-fix-data/val_v5.conll"
test_file_path = "/kaggle/input/split-fix-data/test_v5.conll"

# Load datasets
train_dataset = CoNLLDataset(train_file_path)
val_dataset = CoNLLDataset(val_file_path)
test_dataset = CoNLLDataset(test_file_path)

# Create vocabulary and tag-to-index mappings
vocab = {"<PAD>": 0, "<UNK>": 1}
ner_tag_to_ix = {"<PAD>": 0}  # Add <PAD> to NER tags

# Build vocab and tag mappings
for dataset in [train_dataset, val_dataset, test_dataset]:
    for sentence, ner_tags in zip(dataset.sentences, dataset.ner_tags):
        for word in sentence:
            if word not in vocab:
                vocab[word] = len(vocab)
        for ner_tag in ner_tags:
            if ner_tag not in ner_tag_to_ix:
                ner_tag_to_ix[ner_tag] = len(ner_tag_to_ix)

# Create embedding matrix using FastText (dimension: 300)
embedding_dim = 300  # FastText embedding dimension
fasttext_embeddings = torch.zeros((len(vocab), embedding_dim))  # Initialize with zeros

for word, idx in vocab.items():
    if word in fasttext_vectors:
        fasttext_embeddings[idx] = torch.tensor(fasttext_vectors[word])  # Use full 300 dimensions
    elif word == "<PAD>":
        fasttext_embeddings[idx] = torch.zeros(embedding_dim)  # Zero vector for padding
    else:
        fasttext_embeddings[idx] = torch.randn(embedding_dim)  # Random vector for unknown words

# Initialize model
hidden_dim = 256
vocab_size = len(vocab)
num_ner_tags = len(ner_tag_to_ix)

model = BiLSTMCRF_NER(vocab_size, embedding_dim, hidden_dim, num_ner_tags, fasttext_embeddings).to("cuda")
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)  # Add weight decay for L2 regularization

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

# Training loop with early stopping
def train_model(model, train_loader, val_loader, test_loader, epochs):
    best_val_loss = float('inf')
    patience = 3
    epochs_without_improvement = 0

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for sentences, ner_tags in train_loader:
            sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
            optimizer.zero_grad()
            loss = model.compute_loss(sentences, ner_tags)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss = 0
        model.eval()
        with torch.no_grad():
            for sentences, ner_tags in val_loader:
                sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
                val_loss += model.compute_loss(sentences, ner_tags).item()

        print(f"Epoch {epoch+1}/{epochs}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            print("Early stopping triggered!")
            break

    # Evaluate on test set after training
    model.eval()
    all_ner_preds, all_ner_targets = [], []
    with torch.no_grad():
        for sentences, ner_tags in test_loader:
            sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
            ner_preds = model.decode(sentences)

            # Flatten the predictions and targets, excluding <PAD> tokens
            for i in range(len(sentences)):
                sentence_length = (sentences[i] != vocab["<PAD>"]).sum().item()  # Length of the actual sentence
                all_ner_preds.extend(ner_preds[i][:sentence_length])  # Truncate predictions to sentence length
                all_ner_targets.extend(ner_tags[i][:sentence_length].cpu().numpy())  # Truncate targets to sentence length

    # Convert predictions and targets to tag names
    idx_to_ner = {v: k for k, v in ner_tag_to_ix.items()}

    # Filter out padding tokens from predictions and targets
    all_ner_preds_filtered = [idx_to_ner[idx] for idx in all_ner_preds]
    all_ner_targets_filtered = [idx_to_ner[idx] for idx in all_ner_targets]

    # Generate classification report
    print("NER Classification Report:")
    print(classification_report(all_ner_targets_filtered, all_ner_preds_filtered, zero_division=0, digits=4))

# Function to display random sentences with true and predicted NER tags
def display_random_samples(model, test_loader, vocab, ner_tag_to_ix, num_samples=5):
    model.eval()
    idx_to_ner = {v: k for k, v in ner_tag_to_ix.items()}
    vocab_inv = {v: k for k, v in vocab.items()}

    # Collect all sentences and their true/predicted tags
    all_sentences = []
    all_true_ner = []
    all_pred_ner = []

    with torch.no_grad():
        for sentences, ner_tags in test_loader:
            sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
            ner_preds = model.decode(sentences)

            # Convert indices to words and tags
            for i in range(len(sentences)):
                sentence_length = (sentences[i] != vocab["<PAD>"]).sum().item()  # Length of the actual sentence
                words = [vocab_inv[idx.item()] for idx in sentences[i][:sentence_length]]
                true_ner = [idx_to_ner[idx.item()] for idx in ner_tags[i][:sentence_length]]
                pred_ner = [idx_to_ner[idx] for idx in ner_preds[i][:sentence_length]]

                all_sentences.append(words)
                all_true_ner.append(true_ner)
                all_pred_ner.append(pred_ner)

    # Randomly select `num_samples` sentences
    random_indices = random.sample(range(len(all_sentences)), num_samples)
    for idx in random_indices:
        print(f"\nSample {idx + 1}:")
        print("Sentence:    ", " ".join(all_sentences[idx]))
        print("True NER:    ", " ".join(all_true_ner[idx]))
        print("Predicted NER:", " ".join(all_pred_ner[idx]))

# Train the model and evaluate on the test set
train_model(model, train_loader, val_loader, test_loader, epochs=30)

# Display 5 random samples from the test set
display_random_samples(model, test_loader, vocab, ner_tag_to_ix, num_samples=5)

Loaded 341179 words from FastText binary model.
Epoch 1/30: Train Loss = 128557.3785, Val Loss = 6385.1956
Epoch 2/30: Train Loss = 46451.4199, Val Loss = 5261.7667
Epoch 3/30: Train Loss = 36191.9777, Val Loss = 3552.7402
Epoch 4/30: Train Loss = 31046.8064, Val Loss = 3619.1624
Epoch 5/30: Train Loss = 27484.7393, Val Loss = 2780.2071
Epoch 6/30: Train Loss = 25561.2284, Val Loss = 2807.4914
Epoch 7/30: Train Loss = 23871.3128, Val Loss = 2489.1845
Epoch 8/30: Train Loss = 21958.7037, Val Loss = 2266.7023
Epoch 9/30: Train Loss = 20439.4832, Val Loss = 2206.0556
Epoch 10/30: Train Loss = 19394.0990, Val Loss = 2064.2375
Epoch 11/30: Train Loss = 18439.5933, Val Loss = 2156.3890
Epoch 12/30: Train Loss = 17787.5161, Val Loss = 1975.4298
Epoch 13/30: Train Loss = 17017.7447, Val Loss = 1882.4904
Epoch 14/30: Train Loss = 16050.4395, Val Loss = 1845.0240
Epoch 15/30: Train Loss = 15350.2478, Val Loss = 1766.6904
Epoch 16/30: Train Loss = 15010.3573, Val Loss = 1788.3697
Epoch 17/30: Tra

  score = torch.where(mask[i].unsqueeze(1), next_score, score)


NER Classification Report:
              precision    recall  f1-score   support

      B-DATE     0.7887    0.8485    0.8175        66
       B-LOC     0.9793    0.9619    0.9706      1182
       B-NUM     1.0000    0.1333    0.2353        15
       B-ORG     0.5000    0.2917    0.3684        48
       B-PER     0.8182    0.7941    0.8060        34
      B-TIME     0.8889    0.8889    0.8889         9
      E-DATE     0.7606    0.8182    0.7883        66
       E-LOC     0.9793    0.9619    0.9706      1182
       E-NUM     1.0000    0.1333    0.2353        15
       E-ORG     0.5000    0.2917    0.3684        48
       E-PER     0.8182    0.7941    0.8060        34
      E-TIME     0.8889    0.8889    0.8889         9
      I-DATE     0.7037    0.5000    0.5846        38
       I-LOC     0.9918    0.9642    0.9778       503
       I-ORG     0.3667    0.2821    0.3188        39
           O     0.9836    0.9914    0.9875     21324
      S-DATE     1.0000    0.8523    0.9202        88


In [5]:
#embedding300/hiddendim128/batch32 (Single NER)
# Load FastText binary model
fasttext_bin_file = "/kaggle/input/glove-100d/cc.my.300.bin"  # Replace with your .bin file path
fasttext_model = load_facebook_model(fasttext_bin_file)

# Extract the word vectors
fasttext_vectors = fasttext_model.wv
print(f"Loaded {len(fasttext_vectors)} words from FastText binary model.")

# Define Dataset Class (NER-only)
class CoNLLDataset(Dataset):
    def __init__(self, file_path):
        self.sentences, self.ner_tags = self.load_data(file_path)

    def load_data(self, file_path):
        sentences, ner_tags = [], []
        with open(file_path, "r", encoding="utf-8") as f:
            sentence, ner_tag = [], []
            for line in f:
                if line.strip():
                    word, _, ner = line.strip().split("\t")  # Ignore POS tags
                    sentence.append(word)
                    ner_tag.append(ner)
                else:
                    if sentence:
                        sentences.append(sentence)
                        ner_tags.append(ner_tag)
                    sentence, ner_tag = [], []
            if sentence:
                sentences.append(sentence)
                ner_tags.append(ner_tag)
        return sentences, ner_tags

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.ner_tags[idx]

# Collate function for dynamic padding (NER-only)
def collate_fn(batch):
    sentences, ner_tags = zip(*batch)
    max_len = max(len(s) for s in sentences)

    sentence_tensors = []
    ner_tensors = []

    for s, n in zip(sentences, ner_tags):
        padded_sentence = s + ["<PAD>"] * (max_len - len(s))
        padded_ner = n + ["<PAD>"] * (max_len - len(n))

        sentence_tensors.append(torch.tensor([vocab.get(word, vocab["<UNK>"]) for word in padded_sentence], dtype=torch.long))
        ner_tensors.append(torch.tensor([ner_tag_to_ix[tag] for tag in padded_ner], dtype=torch.long))

    return torch.stack(sentence_tensors), torch.stack(ner_tensors)

# Define BiLSTM-CRF Model for NER-only
class BiLSTMCRF_NER(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_ner_tags, fasttext_embeddings):
        super(BiLSTMCRF_NER, self).__init__()
        # Initialize embedding layer with FastText embeddings (frozen)
        self.embedding = nn.Embedding.from_pretrained(fasttext_embeddings, freeze=True)  # Freeze embeddings
        self.dropout = nn.Dropout(0.5)  # Add dropout
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.ner_fc = nn.Linear(hidden_dim * 2, num_ner_tags)
        self.ner_crf = CRF(num_ner_tags, batch_first=True)

    def forward(self, x):
        embeddings = self.embedding(x)
        embeddings = self.dropout(embeddings)  # Apply dropout
        lstm_out, _ = self.bilstm(embeddings)
        lstm_out = self.dropout(lstm_out)  # Apply dropout
        ner_logits = self.ner_fc(lstm_out)
        return ner_logits

    def compute_loss(self, x, ner_tags):
        ner_logits = self.forward(x)
        ner_loss = -self.ner_crf(ner_logits, ner_tags, mask=(x != vocab["<PAD>"]))
        return ner_loss

    def decode(self, x):
        ner_logits = self.forward(x)
        ner_tags = self.ner_crf.decode(ner_logits)
        return ner_tags

# Paths to pre-split datasets
train_file_path = "/kaggle/input/split-fix-data/train_v5.conll"
val_file_path = "/kaggle/input/split-fix-data/val_v5.conll"
test_file_path = "/kaggle/input/split-fix-data/test_v5.conll"

# Load datasets
train_dataset = CoNLLDataset(train_file_path)
val_dataset = CoNLLDataset(val_file_path)
test_dataset = CoNLLDataset(test_file_path)

# Create vocabulary and tag-to-index mappings
vocab = {"<PAD>": 0, "<UNK>": 1}
ner_tag_to_ix = {"<PAD>": 0}  # Add <PAD> to NER tags

# Build vocab and tag mappings
for dataset in [train_dataset, val_dataset, test_dataset]:
    for sentence, ner_tags in zip(dataset.sentences, dataset.ner_tags):
        for word in sentence:
            if word not in vocab:
                vocab[word] = len(vocab)
        for ner_tag in ner_tags:
            if ner_tag not in ner_tag_to_ix:
                ner_tag_to_ix[ner_tag] = len(ner_tag_to_ix)

# Create embedding matrix using FastText (dimension: 300)
embedding_dim = 300  # FastText embedding dimension
fasttext_embeddings = torch.zeros((len(vocab), embedding_dim))  # Initialize with zeros

for word, idx in vocab.items():
    if word in fasttext_vectors:
        fasttext_embeddings[idx] = torch.tensor(fasttext_vectors[word])  # Use full 300 dimensions
    elif word == "<PAD>":
        fasttext_embeddings[idx] = torch.zeros(embedding_dim)  # Zero vector for padding
    else:
        fasttext_embeddings[idx] = torch.randn(embedding_dim)  # Random vector for unknown words

# Initialize model
hidden_dim = 128
vocab_size = len(vocab)
num_ner_tags = len(ner_tag_to_ix)

model = BiLSTMCRF_NER(vocab_size, embedding_dim, hidden_dim, num_ner_tags, fasttext_embeddings).to("cuda")
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)  # Add weight decay for L2 regularization

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# Training loop with early stopping
def train_model(model, train_loader, val_loader, test_loader, epochs):
    best_val_loss = float('inf')
    patience = 3
    epochs_without_improvement = 0

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for sentences, ner_tags in train_loader:
            sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
            optimizer.zero_grad()
            loss = model.compute_loss(sentences, ner_tags)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss = 0
        model.eval()
        with torch.no_grad():
            for sentences, ner_tags in val_loader:
                sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
                val_loss += model.compute_loss(sentences, ner_tags).item()

        print(f"Epoch {epoch+1}/{epochs}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            print("Early stopping triggered!")
            break

    # Evaluate on test set after training
    model.eval()
    all_ner_preds, all_ner_targets = [], []
    with torch.no_grad():
        for sentences, ner_tags in test_loader:
            sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
            ner_preds = model.decode(sentences)

            # Flatten the predictions and targets, excluding <PAD> tokens
            for i in range(len(sentences)):
                sentence_length = (sentences[i] != vocab["<PAD>"]).sum().item()  # Length of the actual sentence
                all_ner_preds.extend(ner_preds[i][:sentence_length])  # Truncate predictions to sentence length
                all_ner_targets.extend(ner_tags[i][:sentence_length].cpu().numpy())  # Truncate targets to sentence length

    # Convert predictions and targets to tag names
    idx_to_ner = {v: k for k, v in ner_tag_to_ix.items()}

    # Filter out padding tokens from predictions and targets
    all_ner_preds_filtered = [idx_to_ner[idx] for idx in all_ner_preds]
    all_ner_targets_filtered = [idx_to_ner[idx] for idx in all_ner_targets]

    # Generate classification report
    print("NER Classification Report:")
    print(classification_report(all_ner_targets_filtered, all_ner_preds_filtered, zero_division=0, digits=4))

# Function to display random sentences with true and predicted NER tags
def display_random_samples(model, test_loader, vocab, ner_tag_to_ix, num_samples=5):
    model.eval()
    idx_to_ner = {v: k for k, v in ner_tag_to_ix.items()}
    vocab_inv = {v: k for k, v in vocab.items()}

    # Collect all sentences and their true/predicted tags
    all_sentences = []
    all_true_ner = []
    all_pred_ner = []

    with torch.no_grad():
        for sentences, ner_tags in test_loader:
            sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
            ner_preds = model.decode(sentences)

            # Convert indices to words and tags
            for i in range(len(sentences)):
                sentence_length = (sentences[i] != vocab["<PAD>"]).sum().item()  # Length of the actual sentence
                words = [vocab_inv[idx.item()] for idx in sentences[i][:sentence_length]]
                true_ner = [idx_to_ner[idx.item()] for idx in ner_tags[i][:sentence_length]]
                pred_ner = [idx_to_ner[idx] for idx in ner_preds[i][:sentence_length]]

                all_sentences.append(words)
                all_true_ner.append(true_ner)
                all_pred_ner.append(pred_ner)

    # Randomly select `num_samples` sentences
    random_indices = random.sample(range(len(all_sentences)), num_samples)
    for idx in random_indices:
        print(f"\nSample {idx + 1}:")
        print("Sentence:    ", " ".join(all_sentences[idx]))
        print("True NER:    ", " ".join(all_true_ner[idx]))
        print("Predicted NER:", " ".join(all_pred_ner[idx]))

# Train the model and evaluate on the test set
train_model(model, train_loader, val_loader, test_loader, epochs=30)

# Display 5 random samples from the test set
display_random_samples(model, test_loader, vocab, ner_tag_to_ix, num_samples=5)

Loaded 341179 words from FastText binary model.
Epoch 1/30: Train Loss = 113844.7515, Val Loss = 5773.6969
Epoch 2/30: Train Loss = 43958.6649, Val Loss = 4143.4077
Epoch 3/30: Train Loss = 34416.4559, Val Loss = 3327.3332
Epoch 4/30: Train Loss = 29143.5760, Val Loss = 2860.9609
Epoch 5/30: Train Loss = 25652.2860, Val Loss = 2532.6725
Epoch 6/30: Train Loss = 23208.4379, Val Loss = 2275.7706
Epoch 7/30: Train Loss = 21526.7472, Val Loss = 2125.9115
Epoch 8/30: Train Loss = 19911.2338, Val Loss = 2011.8973
Epoch 9/30: Train Loss = 18970.7070, Val Loss = 1924.4657
Epoch 10/30: Train Loss = 17862.6278, Val Loss = 1807.8838
Epoch 11/30: Train Loss = 17252.2050, Val Loss = 1774.6386
Epoch 12/30: Train Loss = 16479.5349, Val Loss = 1698.1722
Epoch 13/30: Train Loss = 15917.6387, Val Loss = 1700.0676
Epoch 14/30: Train Loss = 14929.2227, Val Loss = 1634.9834
Epoch 15/30: Train Loss = 14547.7274, Val Loss = 1637.9751
Epoch 16/30: Train Loss = 14084.8317, Val Loss = 1565.4626
Epoch 17/30: Tra

In [3]:
#embedding300/hiddendim128/batch32(Joint_Model)
# Load FastText binary model
fasttext_bin_file = "/kaggle/input/glove-100d/cc.my.300.bin"  
fasttext_model = load_facebook_model(fasttext_bin_file)

# Extract the word vectors
fasttext_vectors = fasttext_model.wv
print(f"Loaded {len(fasttext_vectors)} words from FastText binary model.")

# Define Dataset Class
class CoNLLDataset(Dataset):
    def __init__(self, file_path):
        self.sentences, self.pos_tags, self.ner_tags = self.load_data(file_path)

    def load_data(self, file_path):
        sentences, pos_tags, ner_tags = [], [], []
        with open(file_path, "r", encoding="utf-8") as f:
            sentence, pos_tag, ner_tag = [], [], []
            for line in f:
                if line.strip():
                    word, pos, ner = line.strip().split("\t")
                    sentence.append(word)
                    pos_tag.append(pos)
                    ner_tag.append(ner)
                else:
                    if sentence:
                        sentences.append(sentence)
                        pos_tags.append(pos_tag)
                        ner_tags.append(ner_tag)
                    sentence, pos_tag, ner_tag = [], [], []
            if sentence:
                sentences.append(sentence)
                pos_tags.append(pos_tag)
                ner_tags.append(ner_tag)
        return sentences, pos_tags, ner_tags

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.pos_tags[idx], self.ner_tags[idx]

# Collate function for dynamic padding
def collate_fn(batch):
    sentences, pos_tags, ner_tags = zip(*batch)
    max_len = max(len(s) for s in sentences)

    sentence_tensors = []
    pos_tensors = []
    ner_tensors = []

    for s, p, n in zip(sentences, pos_tags, ner_tags):
        padded_sentence = s + ["<PAD>"] * (max_len - len(s))
        padded_pos = p + ["<PAD>"] * (max_len - len(p))
        padded_ner = n + ["<PAD>"] * (max_len - len(n))

        sentence_tensors.append(torch.tensor([vocab.get(word, vocab["<UNK>"]) for word in padded_sentence], dtype=torch.long))
        pos_tensors.append(torch.tensor([pos_tag_to_ix[tag] for tag in padded_pos], dtype=torch.long))
        ner_tensors.append(torch.tensor([ner_tag_to_ix[tag] for tag in padded_ner], dtype=torch.long))

    return torch.stack(sentence_tensors), torch.stack(pos_tensors), torch.stack(ner_tensors)

# Define BiLSTM-CRF Model with Frozen FastText Embeddings
class BiLSTMCRF(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_pos_tags, num_ner_tags, fasttext_embeddings):
        super(BiLSTMCRF, self).__init__()
        # Initialize embedding layer with FastText embeddings (frozen)
        self.embedding = nn.Embedding.from_pretrained(fasttext_embeddings, freeze=True)  # Freeze embeddings
        self.dropout = nn.Dropout(0.5)  # Add dropout
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.pos_fc = nn.Linear(hidden_dim * 2, num_pos_tags)
        self.ner_fc = nn.Linear(hidden_dim * 2, num_ner_tags)
        self.pos_crf = CRF(num_pos_tags, batch_first=True)
        self.ner_crf = CRF(num_ner_tags, batch_first=True)

    def forward(self, x):
        embeddings = self.embedding(x)
        embeddings = self.dropout(embeddings)  # Apply dropout
        lstm_out, _ = self.bilstm(embeddings)
        lstm_out = self.dropout(lstm_out)  # Apply dropout
        pos_logits = self.pos_fc(lstm_out)
        ner_logits = self.ner_fc(lstm_out)
        return pos_logits, ner_logits

    def compute_loss(self, x, pos_tags, ner_tags, alpha=0.5):
        pos_logits, ner_logits = self.forward(x)
        pos_loss = -self.pos_crf(pos_logits, pos_tags, mask=(x != vocab["<PAD>"]))
        ner_loss = -self.ner_crf(ner_logits, ner_tags, mask=(x != vocab["<PAD>"]))
        return alpha * pos_loss + (1 - alpha) * ner_loss

    def decode(self, x):
        pos_logits, ner_logits = self.forward(x)
        pos_tags = self.pos_crf.decode(pos_logits)
        ner_tags = self.ner_crf.decode(ner_logits)
        return pos_tags, ner_tags

# Paths to pre-split datasets
train_file_path = "/kaggle/input/split-fix-data/train_v5.conll"
val_file_path = "/kaggle/input/split-fix-data/val_v5.conll"
test_file_path = "/kaggle/input/split-fix-data/test_v5.conll"

# Load datasets
train_dataset = CoNLLDataset(train_file_path)
val_dataset = CoNLLDataset(val_file_path)
test_dataset = CoNLLDataset(test_file_path)

# Create vocabulary and tag-to-index mappings
vocab = {"<PAD>": 0, "<UNK>": 1}
pos_tag_to_ix = {"<PAD>": 0}  # Add <PAD> to POS tags
ner_tag_to_ix = {"<PAD>": 0}  # Add <PAD> to NER tags

# Build vocab and tag mappings
for dataset in [train_dataset, val_dataset, test_dataset]:
    for sentence, pos_tags, ner_tags in zip(dataset.sentences, dataset.pos_tags, dataset.ner_tags):
        for word in sentence:
            if word not in vocab:
                vocab[word] = len(vocab)
        for pos_tag in pos_tags:
            if pos_tag not in pos_tag_to_ix:
                pos_tag_to_ix[pos_tag] = len(pos_tag_to_ix)
        for ner_tag in ner_tags:
            if ner_tag not in ner_tag_to_ix:
                ner_tag_to_ix[ner_tag] = len(ner_tag_to_ix)

# Create embedding matrix using FastText (dimension: 300)
embedding_dim = 300  # FastText embedding dimension
fasttext_embeddings = torch.zeros((len(vocab), embedding_dim))  # Initialize with zeros

for word, idx in vocab.items():
    if word in fasttext_vectors:
        fasttext_embeddings[idx] = torch.tensor(fasttext_vectors[word])  # Use full 300 dimensions
    elif word == "<PAD>":
        fasttext_embeddings[idx] = torch.zeros(embedding_dim)  # Zero vector for padding
    else:
        fasttext_embeddings[idx] = torch.randn(embedding_dim)  # Random vector for unknown words

# Initialize model
hidden_dim = 128  
vocab_size = len(vocab)
num_pos_tags = len(pos_tag_to_ix)
num_ner_tags = len(ner_tag_to_ix)

model = BiLSTMCRF(vocab_size, embedding_dim, hidden_dim, num_pos_tags, num_ner_tags, fasttext_embeddings).to("cuda")
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)  # Add weight decay for L2 regularization

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# Training loop with early stopping
def train_model(model, train_loader, val_loader, test_loader, epochs):
    best_val_loss = float('inf')
    patience = 10
    epochs_without_improvement = 0

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for sentences, pos_tags, ner_tags in train_loader:
            sentences, pos_tags, ner_tags = sentences.to("cuda"), pos_tags.to("cuda"), ner_tags.to("cuda")
            optimizer.zero_grad()
            loss = model.compute_loss(sentences, pos_tags, ner_tags, alpha=0.5)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss = 0
        model.eval()
        with torch.no_grad():
            for sentences, pos_tags, ner_tags in val_loader:
                sentences, pos_tags, ner_tags = sentences.to("cuda"), pos_tags.to("cuda"), ner_tags.to("cuda")
                val_loss += model.compute_loss(sentences, pos_tags, ner_tags).item()

        print(f"Epoch {epoch+1}/{epochs}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            print("Early stopping triggered!")
            break

    # Evaluate on test set after training
    model.eval()
    all_pos_preds, all_pos_targets, all_ner_preds, all_ner_targets = [], [], [], []
    with torch.no_grad():
        for sentences, pos_tags, ner_tags in test_loader:
            sentences, pos_tags, ner_tags = sentences.to("cuda"), pos_tags.to("cuda"), ner_tags.to("cuda")
            pos_preds, ner_preds = model.decode(sentences)

            # Flatten the predictions and targets, excluding <PAD> tokens
            for i in range(len(sentences)):
                sentence_length = (sentences[i] != vocab["<PAD>"]).sum().item()  # Length of the actual sentence
                all_pos_preds.extend(pos_preds[i][:sentence_length])  # Truncate predictions to sentence length
                all_pos_targets.extend(pos_tags[i][:sentence_length].cpu().numpy())  # Truncate targets to sentence length
                all_ner_preds.extend(ner_preds[i][:sentence_length])  # Truncate predictions to sentence length
                all_ner_targets.extend(ner_tags[i][:sentence_length].cpu().numpy())  # Truncate targets to sentence length

    # Convert predictions and targets to tag names
    idx_to_pos = {v: k for k, v in pos_tag_to_ix.items()}
    idx_to_ner = {v: k for k, v in ner_tag_to_ix.items()}

    # Filter out padding tokens from predictions and targets
    all_pos_preds_filtered = [idx_to_pos[idx] for idx in all_pos_preds]
    all_pos_targets_filtered = [idx_to_pos[idx] for idx in all_pos_targets]
    all_ner_preds_filtered = [idx_to_ner[idx] for idx in all_ner_preds]
    all_ner_targets_filtered = [idx_to_ner[idx] for idx in all_ner_targets]

    # Generate classification reports
    print("POS Classification Report:")
    print(classification_report(all_pos_targets_filtered, all_pos_preds_filtered, zero_division=0,digits=4))

    print("NER Classification Report:")
    print(classification_report(all_ner_targets_filtered, all_ner_preds_filtered, zero_division=0,digits=4))

# Train the model and evaluate on the test set
train_model(model, train_loader, val_loader, test_loader, epochs=30)

Loaded 341179 words from FastText binary model.
Epoch 1/30: Train Loss = 169418.3035, Val Loss = 10607.5950
Epoch 2/30: Train Loss = 82694.9764, Val Loss = 7511.4112
Epoch 3/30: Train Loss = 67745.3278, Val Loss = 6173.5165
Epoch 4/30: Train Loss = 60399.0341, Val Loss = 5515.0616
Epoch 5/30: Train Loss = 54845.8512, Val Loss = 5047.3391
Epoch 6/30: Train Loss = 50815.8288, Val Loss = 4643.5349
Epoch 7/30: Train Loss = 48205.2372, Val Loss = 4397.6679
Epoch 8/30: Train Loss = 45515.3411, Val Loss = 4165.3558
Epoch 9/30: Train Loss = 43414.4916, Val Loss = 3932.5429
Epoch 10/30: Train Loss = 41382.9853, Val Loss = 3770.7794
Epoch 11/30: Train Loss = 39823.3249, Val Loss = 3625.9923
Epoch 12/30: Train Loss = 38628.3483, Val Loss = 3537.8195
Epoch 13/30: Train Loss = 37081.8537, Val Loss = 3424.8073
Epoch 14/30: Train Loss = 36049.0337, Val Loss = 3318.6395
Epoch 15/30: Train Loss = 34948.4065, Val Loss = 3257.7700
Epoch 16/30: Train Loss = 34094.1119, Val Loss = 3175.0503
Epoch 17/30: Tr

  score = torch.where(mask[i].unsqueeze(1), next_score, score)


POS Classification Report:
              precision    recall  f1-score   support

         abb     0.0000    0.0000    0.0000        18
         adj     0.8615    0.6450    0.7377       569
         adv     0.8093    0.5365    0.6453       356
        conj     0.9057    0.8972    0.9014       739
          fw     0.8929    0.7692    0.8264        65
         int     0.7857    0.6471    0.7097        17
           n     0.9313    0.9747    0.9525      7694
         num     0.9953    0.9906    0.9930       641
        part     0.9585    0.9635    0.9610      4461
         ppm     0.9861    0.9861    0.9861      4114
        pron     0.9551    0.8651    0.9079       467
        punc     1.0000    0.9966    0.9983      2919
          sb     1.0000    0.9231    0.9600        13
          tn     0.9387    0.9107    0.9245       168
           v     0.9306    0.9255    0.9280      3302

    accuracy                         0.9515     25543
   macro avg     0.8634    0.8021    0.8288     25543

In [8]:
#embedding300/hiddendim256/batch64 (Best REsults_joint_model)
import time
# Load FastText binary model
fasttext_bin_file = "/kaggle/input/glove-100d/cc.my.300.bin"  
fasttext_model = load_facebook_model(fasttext_bin_file)

# Extract the word vectors
fasttext_vectors = fasttext_model.wv
print(f"Loaded {len(fasttext_vectors)} words from FastText binary model.")

# Define Dataset Class
class CoNLLDataset(Dataset):
    def __init__(self, file_path):
        self.sentences, self.pos_tags, self.ner_tags = self.load_data(file_path)

    def load_data(self, file_path):
        sentences, pos_tags, ner_tags = [], [], []
        with open(file_path, "r", encoding="utf-8") as f:
            sentence, pos_tag, ner_tag = [], [], []
            for line in f:
                if line.strip():
                    word, pos, ner = line.strip().split("\t")
                    sentence.append(word)
                    pos_tag.append(pos)
                    ner_tag.append(ner)
                else:
                    if sentence:
                        sentences.append(sentence)
                        pos_tags.append(pos_tag)
                        ner_tags.append(ner_tag)
                    sentence, pos_tag, ner_tag = [], [], []
            if sentence:
                sentences.append(sentence)
                pos_tags.append(pos_tag)
                ner_tags.append(ner_tag)
        return sentences, pos_tags, ner_tags

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.pos_tags[idx], self.ner_tags[idx]

# Collate function for dynamic padding
def collate_fn(batch):
    sentences, pos_tags, ner_tags = zip(*batch)
    max_len = max(len(s) for s in sentences)

    sentence_tensors = []
    pos_tensors = []
    ner_tensors = []

    for s, p, n in zip(sentences, pos_tags, ner_tags):
        padded_sentence = s + ["<PAD>"] * (max_len - len(s))
        padded_pos = p + ["<PAD>"] * (max_len - len(p))
        padded_ner = n + ["<PAD>"] * (max_len - len(n))

        sentence_tensors.append(torch.tensor([vocab.get(word, vocab["<UNK>"]) for word in padded_sentence], dtype=torch.long))
        pos_tensors.append(torch.tensor([pos_tag_to_ix[tag] for tag in padded_pos], dtype=torch.long))
        ner_tensors.append(torch.tensor([ner_tag_to_ix[tag] for tag in padded_ner], dtype=torch.long))

    return torch.stack(sentence_tensors), torch.stack(pos_tensors), torch.stack(ner_tensors)

# Define BiLSTM-CRF Model with Frozen FastText Embeddings
class BiLSTMCRF(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_pos_tags, num_ner_tags, fasttext_embeddings):
        super(BiLSTMCRF, self).__init__()
        # Initialize embedding layer with FastText embeddings (frozen)
        self.embedding = nn.Embedding.from_pretrained(fasttext_embeddings, freeze=True)  # Freeze embeddings
        self.dropout = nn.Dropout(0.5)  # Add dropout
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.pos_fc = nn.Linear(hidden_dim * 2, num_pos_tags)
        self.ner_fc = nn.Linear(hidden_dim * 2, num_ner_tags)
        self.pos_crf = CRF(num_pos_tags, batch_first=True)
        self.ner_crf = CRF(num_ner_tags, batch_first=True)

    def forward(self, x):
        embeddings = self.embedding(x)
        embeddings = self.dropout(embeddings)  # Apply dropout
        lstm_out, _ = self.bilstm(embeddings)
        lstm_out = self.dropout(lstm_out)  # Apply dropout
        pos_logits = self.pos_fc(lstm_out)
        ner_logits = self.ner_fc(lstm_out)
        return pos_logits, ner_logits

    def compute_loss(self, x, pos_tags, ner_tags, alpha=0.5):
        pos_logits, ner_logits = self.forward(x)
        pos_loss = -self.pos_crf(pos_logits, pos_tags, mask=(x != vocab["<PAD>"]))
        ner_loss = -self.ner_crf(ner_logits, ner_tags, mask=(x != vocab["<PAD>"]))
        return alpha * pos_loss + (1 - alpha) * ner_loss

    def decode(self, x):
        pos_logits, ner_logits = self.forward(x)
        pos_tags = self.pos_crf.decode(pos_logits)
        ner_tags = self.ner_crf.decode(ner_logits)
        return pos_tags, ner_tags

# Paths to pre-split datasets
train_file_path = "/kaggle/input/split-fix-data/train_v5.conll"
val_file_path = "/kaggle/input/split-fix-data/val_v5.conll"
test_file_path = "/kaggle/input/split-fix-data/test_v5.conll"

# Load datasets
train_dataset = CoNLLDataset(train_file_path)
val_dataset = CoNLLDataset(val_file_path)
test_dataset = CoNLLDataset(test_file_path)

# Create vocabulary and tag-to-index mappings
vocab = {"<PAD>": 0, "<UNK>": 1}
pos_tag_to_ix = {"<PAD>": 0}  # Add <PAD> to POS tags
ner_tag_to_ix = {"<PAD>": 0}  # Add <PAD> to NER tags

# Build vocab and tag mappings
for dataset in [train_dataset, val_dataset, test_dataset]:
    for sentence, pos_tags, ner_tags in zip(dataset.sentences, dataset.pos_tags, dataset.ner_tags):
        for word in sentence:
            if word not in vocab:
                vocab[word] = len(vocab)
        for pos_tag in pos_tags:
            if pos_tag not in pos_tag_to_ix:
                pos_tag_to_ix[pos_tag] = len(pos_tag_to_ix)
        for ner_tag in ner_tags:
            if ner_tag not in ner_tag_to_ix:
                ner_tag_to_ix[ner_tag] = len(ner_tag_to_ix)

# Create embedding matrix using FastText (dimension: 300)
embedding_dim = 300  # FastText embedding dimension
fasttext_embeddings = torch.zeros((len(vocab), embedding_dim))  # Initialize with zeros

for word, idx in vocab.items():
    if word in fasttext_vectors:
        fasttext_embeddings[idx] = torch.tensor(fasttext_vectors[word])  # Use full 300 dimensions
    elif word == "<PAD>":
        fasttext_embeddings[idx] = torch.zeros(embedding_dim)  # Zero vector for padding
    else:
        fasttext_embeddings[idx] = torch.randn(embedding_dim)  # Random vector for unknown words

# Initialize model
hidden_dim = 256
vocab_size = len(vocab)
num_pos_tags = len(pos_tag_to_ix)
num_ner_tags = len(ner_tag_to_ix)

model = BiLSTMCRF(vocab_size, embedding_dim, hidden_dim, num_pos_tags, num_ner_tags, fasttext_embeddings).to("cuda")
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)  # Add weight decay for L2 regularization

# Create data loaders with batch size 64
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

# Training loop with early stopping and training time measurement
def train_model(model, train_loader, val_loader, test_loader, epochs):
    best_val_loss = float('inf')
    patience = 3
    epochs_without_improvement = 0

    start_time = time.time() 

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for sentences, pos_tags, ner_tags in train_loader:
            sentences, pos_tags, ner_tags = sentences.to("cuda"), pos_tags.to("cuda"), ner_tags.to("cuda")
            optimizer.zero_grad()
            loss = model.compute_loss(sentences, pos_tags, ner_tags, alpha=0.5)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss = 0
        model.eval()
        with torch.no_grad():
            for sentences, pos_tags, ner_tags in val_loader:
                sentences, pos_tags, ner_tags = sentences.to("cuda"), pos_tags.to("cuda"), ner_tags.to("cuda")
                val_loss += model.compute_loss(sentences, pos_tags, ner_tags).item()

        print(f"Epoch {epoch+1}/{epochs}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            print("Early stopping triggered!")
            break

    end_time = time.time()  # End measuring training time
    training_time = end_time - start_time  # Calculate total training time
    print(f"Total training time: {training_time:.2f} seconds")

    # Evaluate on test set after training
    model.eval()
    all_pos_preds, all_pos_targets, all_ner_preds, all_ner_targets = [], [], [], []
    with torch.no_grad():
        for sentences, pos_tags, ner_tags in test_loader:
            sentences, pos_tags, ner_tags = sentences.to("cuda"), pos_tags.to("cuda"), ner_tags.to("cuda")
            pos_preds, ner_preds = model.decode(sentences)

            # Flatten the predictions and targets, excluding <PAD> tokens
            for i in range(len(sentences)):
                sentence_length = (sentences[i] != vocab["<PAD>"]).sum().item()  # Length of the actual sentence
                all_pos_preds.extend(pos_preds[i][:sentence_length])  # Truncate predictions to sentence length
                all_pos_targets.extend(pos_tags[i][:sentence_length].cpu().numpy())  # Truncate targets to sentence length
                all_ner_preds.extend(ner_preds[i][:sentence_length])  # Truncate predictions to sentence length
                all_ner_targets.extend(ner_tags[i][:sentence_length].cpu().numpy())  # Truncate targets to sentence length

    # Convert predictions and targets to tag names
    idx_to_pos = {v: k for k, v in pos_tag_to_ix.items()}
    idx_to_ner = {v: k for k, v in ner_tag_to_ix.items()}

    # Filter out padding tokens from predictions and targets
    all_pos_preds_filtered = [idx_to_pos[idx] for idx in all_pos_preds]
    all_pos_targets_filtered = [idx_to_pos[idx] for idx in all_pos_targets]
    all_ner_preds_filtered = [idx_to_ner[idx] for idx in all_ner_preds]
    all_ner_targets_filtered = [idx_to_ner[idx] for idx in all_ner_targets]

    # Generate classification reports with 4-digit precision
    print("POS Classification Report:")
    print(classification_report(all_pos_targets_filtered, all_pos_preds_filtered, zero_division=0, digits=4))

    print("NER Classification Report:")
    print(classification_report(all_ner_targets_filtered, all_ner_preds_filtered, zero_division=0, digits=4))

# Train the model and evaluate on the test set
train_model(model, train_loader, val_loader, test_loader, epochs=30)

Loaded 341179 words from FastText binary model.
Epoch 1/30: Train Loss = 189753.5316, Val Loss = 12197.6097
Epoch 2/30: Train Loss = 90291.5476, Val Loss = 8242.5269
Epoch 3/30: Train Loss = 72547.8926, Val Loss = 6888.6232
Epoch 4/30: Train Loss = 64029.1100, Val Loss = 6070.9285
Epoch 5/30: Train Loss = 58769.2189, Val Loss = 5614.3379
Epoch 6/30: Train Loss = 54536.0291, Val Loss = 5305.4430
Epoch 7/30: Train Loss = 51391.5148, Val Loss = 4844.2746
Epoch 8/30: Train Loss = 48234.5868, Val Loss = 4592.1749
Epoch 9/30: Train Loss = 45719.6047, Val Loss = 4316.2368
Epoch 10/30: Train Loss = 43768.6254, Val Loss = 4184.5578
Epoch 11/30: Train Loss = 41894.0872, Val Loss = 4009.8388
Epoch 12/30: Train Loss = 40241.6173, Val Loss = 3840.2436
Epoch 13/30: Train Loss = 38795.6116, Val Loss = 3687.9126
Epoch 14/30: Train Loss = 37579.8927, Val Loss = 3629.5074
Epoch 15/30: Train Loss = 36288.8907, Val Loss = 3450.6194
Epoch 16/30: Train Loss = 34845.4041, Val Loss = 3399.1958
Epoch 17/30: Tr