In [1]:
!pip install git+https://github.com/kmkurn/pytorch-crf.git

Collecting git+https://github.com/kmkurn/pytorch-crf.git
  Cloning https://github.com/kmkurn/pytorch-crf.git to /tmp/pip-req-build-llv3qnsv
  Running command git clone --filter=blob:none --quiet https://github.com/kmkurn/pytorch-crf.git /tmp/pip-req-build-llv3qnsv
  Resolved https://github.com/kmkurn/pytorch-crf.git to commit 623e3402d00a2728e99d6e8486010d67c754267b
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pytorch-crf
  Building wheel for pytorch-crf (setup.py) ... [?25l[?25hdone
  Created wheel for pytorch-crf: filename=pytorch_crf-0.7.2-py3-none-any.whl size=6410 sha256=46e4a4f1fea294db3737617d29eb14fa7f4955e61bf6ae0ec8522c393996b226
  Stored in directory: /tmp/pip-ephem-wheel-cache-2m_7rwps/wheels/39/5f/f6/4b48b35895d914f4f5fff5b600f87658c11693e37b6a4f118e
Successfully built pytorch-crf
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


In [None]:
# Experiment: Named Entity Recognition (NER) using BiLSTM-CRF with FastText Embeddings(Unfrozen)
# Configuration Settings:
# - Embedding Dimension: 300 (FastText)
# - Hidden Dimension: 256/128 (BiLSTM)
# - Batch Size: 64/32
# - Model: BiLSTM-CRF (Single NER MOdel/Joint Model for POS and NER)
# - Optimizer: Adam with L2 regularization (weight_decay=1e-5)
# - Learning Rate: 0.001
# - Dropout: 0.5
# - Early Stopping: Patience of 3 epochs
# - Training Epochs: 20 up to 30
# - Dataset: Pre-split CoNLL format (train_v5.conll, val_v5.conll, test_v5.conll)
# - FastText Model: cc.my.300.bin (pre-trained FastText embeddings)
# - Evaluation Metrics: Precision, Recall, F1-Score for both POS and NER tasks
# - Hardware: NVIDIA Tesla T4 GPU

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchcrf import CRF
from sklearn.metrics import classification_report
from gensim.models.fasttext import load_facebook_model
import random

In [3]:
#embedding300/hiddendim256/batch64(Single NER Model)
# Load FastText binary model
fasttext_bin_file = "/kaggle/input/glove-100d/cc.my.300.bin"  
fasttext_model = load_facebook_model(fasttext_bin_file)

# Extract the word vectors
fasttext_vectors = fasttext_model.wv
print(f"Loaded {len(fasttext_vectors)} words from FastText binary model.")

# Define Dataset Class (NER-only)
class CoNLLDataset(Dataset):
    def __init__(self, file_path):
        self.sentences, self.ner_tags = self.load_data(file_path)

    def load_data(self, file_path):
        sentences, ner_tags = [], []
        with open(file_path, "r", encoding="utf-8") as f:
            sentence, ner_tag = [], []
            for line in f:
                if line.strip():
                    word, _, ner = line.strip().split("\t")  # Ignore POS tags
                    sentence.append(word)
                    ner_tag.append(ner)
                else:
                    if sentence:
                        sentences.append(sentence)
                        ner_tags.append(ner_tag)
                    sentence, ner_tag = [], []
            if sentence:
                sentences.append(sentence)
                ner_tags.append(ner_tag)
        return sentences, ner_tags

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.ner_tags[idx]

# Collate function for dynamic padding (NER-only)
def collate_fn(batch):
    sentences, ner_tags = zip(*batch)
    max_len = max(len(s) for s in sentences)

    sentence_tensors = []
    ner_tensors = []

    for s, n in zip(sentences, ner_tags):
        padded_sentence = s + ["<PAD>"] * (max_len - len(s))
        padded_ner = n + ["<PAD>"] * (max_len - len(n))

        sentence_tensors.append(torch.tensor([vocab.get(word, vocab["<UNK>"]) for word in padded_sentence], dtype=torch.long))
        ner_tensors.append(torch.tensor([ner_tag_to_ix[tag] for tag in padded_ner], dtype=torch.long))

    return torch.stack(sentence_tensors), torch.stack(ner_tensors)

# Define BiLSTM-CRF Model for NER-only
class BiLSTMCRF_NER(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_ner_tags, fasttext_embeddings):
        super(BiLSTMCRF_NER, self).__init__()
        # Initialize embedding layer with FastText embeddings (frozen)
        self.embedding = nn.Embedding.from_pretrained(fasttext_embeddings, freeze=False)  # Freeze embeddings
        self.dropout = nn.Dropout(0.5)  # Add dropout
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.ner_fc = nn.Linear(hidden_dim * 2, num_ner_tags)
        self.ner_crf = CRF(num_ner_tags, batch_first=True)

    def forward(self, x):
        embeddings = self.embedding(x)
        embeddings = self.dropout(embeddings)  # Apply dropout
        lstm_out, _ = self.bilstm(embeddings)
        lstm_out = self.dropout(lstm_out)  # Apply dropout
        ner_logits = self.ner_fc(lstm_out)
        return ner_logits

    def compute_loss(self, x, ner_tags):
        ner_logits = self.forward(x)
        ner_loss = -self.ner_crf(ner_logits, ner_tags, mask=(x != vocab["<PAD>"]))
        return ner_loss

    def decode(self, x):
        ner_logits = self.forward(x)
        ner_tags = self.ner_crf.decode(ner_logits)
        return ner_tags

# Paths to pre-split datasets
train_file_path = "/kaggle/input/split-fix-data/train_v5.conll"
val_file_path = "/kaggle/input/split-fix-data/val_v5.conll"
test_file_path = "/kaggle/input/split-fix-data/test_v5.conll"

# Load datasets
train_dataset = CoNLLDataset(train_file_path)
val_dataset = CoNLLDataset(val_file_path)
test_dataset = CoNLLDataset(test_file_path)

# Create vocabulary and tag-to-index mappings
vocab = {"<PAD>": 0, "<UNK>": 1}
ner_tag_to_ix = {"<PAD>": 0}  # Add <PAD> to NER tags

# Build vocab and tag mappings
for dataset in [train_dataset, val_dataset, test_dataset]:
    for sentence, ner_tags in zip(dataset.sentences, dataset.ner_tags):
        for word in sentence:
            if word not in vocab:
                vocab[word] = len(vocab)
        for ner_tag in ner_tags:
            if ner_tag not in ner_tag_to_ix:
                ner_tag_to_ix[ner_tag] = len(ner_tag_to_ix)

# Create embedding matrix using FastText (dimension: 300)
embedding_dim = 300  # FastText embedding dimension
fasttext_embeddings = torch.zeros((len(vocab), embedding_dim))  # Initialize with zeros

for word, idx in vocab.items():
    if word in fasttext_vectors:
        fasttext_embeddings[idx] = torch.tensor(fasttext_vectors[word])  # Use full 300 dimensions
    elif word == "<PAD>":
        fasttext_embeddings[idx] = torch.zeros(embedding_dim)  # Zero vector for padding
    else:
        fasttext_embeddings[idx] = torch.randn(embedding_dim)  # Random vector for unknown words

# Initialize model
hidden_dim = 256
vocab_size = len(vocab)
num_ner_tags = len(ner_tag_to_ix)

model = BiLSTMCRF_NER(vocab_size, embedding_dim, hidden_dim, num_ner_tags, fasttext_embeddings).to("cuda")
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)  # Add weight decay for L2 regularization

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

# Training loop with early stopping
def train_model(model, train_loader, val_loader, test_loader, epochs):
    best_val_loss = float('inf')
    patience = 3
    epochs_without_improvement = 0

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for sentences, ner_tags in train_loader:
            sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
            optimizer.zero_grad()
            loss = model.compute_loss(sentences, ner_tags)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss = 0
        model.eval()
        with torch.no_grad():
            for sentences, ner_tags in val_loader:
                sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
                val_loss += model.compute_loss(sentences, ner_tags).item()

        print(f"Epoch {epoch+1}/{epochs}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            print("Early stopping triggered!")
            break

    # Evaluate on test set after training
    model.eval()
    all_ner_preds, all_ner_targets = [], []
    with torch.no_grad():
        for sentences, ner_tags in test_loader:
            sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
            ner_preds = model.decode(sentences)

            # Flatten the predictions and targets, excluding <PAD> tokens
            for i in range(len(sentences)):
                sentence_length = (sentences[i] != vocab["<PAD>"]).sum().item()  # Length of the actual sentence
                all_ner_preds.extend(ner_preds[i][:sentence_length])  # Truncate predictions to sentence length
                all_ner_targets.extend(ner_tags[i][:sentence_length].cpu().numpy())  # Truncate targets to sentence length

    # Convert predictions and targets to tag names
    idx_to_ner = {v: k for k, v in ner_tag_to_ix.items()}

    # Filter out padding tokens from predictions and targets
    all_ner_preds_filtered = [idx_to_ner[idx] for idx in all_ner_preds]
    all_ner_targets_filtered = [idx_to_ner[idx] for idx in all_ner_targets]

    # Generate classification report
    print("NER Classification Report:")
    print(classification_report(all_ner_targets_filtered, all_ner_preds_filtered, zero_division=0, digits=4))

# Function to display random sentences with true and predicted NER tags
def display_random_samples(model, test_loader, vocab, ner_tag_to_ix, num_samples=5):
    model.eval()
    idx_to_ner = {v: k for k, v in ner_tag_to_ix.items()}
    vocab_inv = {v: k for k, v in vocab.items()}

    # Collect all sentences and their true/predicted tags
    all_sentences = []
    all_true_ner = []
    all_pred_ner = []

    with torch.no_grad():
        for sentences, ner_tags in test_loader:
            sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
            ner_preds = model.decode(sentences)

            # Convert indices to words and tags
            for i in range(len(sentences)):
                sentence_length = (sentences[i] != vocab["<PAD>"]).sum().item()  # Length of the actual sentence
                words = [vocab_inv[idx.item()] for idx in sentences[i][:sentence_length]]
                true_ner = [idx_to_ner[idx.item()] for idx in ner_tags[i][:sentence_length]]
                pred_ner = [idx_to_ner[idx] for idx in ner_preds[i][:sentence_length]]

                all_sentences.append(words)
                all_true_ner.append(true_ner)
                all_pred_ner.append(pred_ner)

    # Randomly select `num_samples` sentences
    random_indices = random.sample(range(len(all_sentences)), num_samples)
    for idx in random_indices:
        print(f"\nSample {idx + 1}:")
        print("Sentence:    ", " ".join(all_sentences[idx]))
        print("True NER:    ", " ".join(all_true_ner[idx]))
        print("Predicted NER:", " ".join(all_pred_ner[idx]))

# Train the model and evaluate on the test set
train_model(model, train_loader, val_loader, test_loader, epochs=20)

# Display 5 random samples from the test set
display_random_samples(model, test_loader, vocab, ner_tag_to_ix, num_samples=5)

Loaded 341179 words from FastText binary model.
Epoch 1/20: Train Loss = 107106.1570, Val Loss = 4207.9764
Epoch 2/20: Train Loss = 26054.9429, Val Loss = 2864.9137
Epoch 3/20: Train Loss = 16470.2239, Val Loss = 2481.7054
Epoch 4/20: Train Loss = 11656.9056, Val Loss = 2276.5809
Epoch 5/20: Train Loss = 8383.0836, Val Loss = 2156.6266
Epoch 6/20: Train Loss = 6405.9056, Val Loss = 2255.2282
Epoch 7/20: Train Loss = 5265.9274, Val Loss = 2252.1882
Epoch 8/20: Train Loss = 4318.5959, Val Loss = 2280.8368
Early stopping triggered!


  score = torch.where(mask[i].unsqueeze(1), next_score, score)


NER Classification Report:
              precision    recall  f1-score   support

      B-DATE     0.8462    0.8333    0.8397        66
       B-LOC     0.9583    0.9729    0.9656      1182
       B-NUM     0.3571    0.3333    0.3448        15
       B-ORG     0.5357    0.6250    0.5769        48
       B-PER     0.7750    0.9118    0.8378        34
      B-TIME     0.7778    0.7778    0.7778         9
      E-DATE     0.9032    0.8485    0.8750        66
       E-LOC     0.9729    0.9712    0.9721      1182
       E-NUM     0.3750    0.4000    0.3871        15
       E-ORG     0.5357    0.6250    0.5769        48
       E-PER     0.7317    0.8824    0.8000        34
      E-TIME     0.7778    0.7778    0.7778         9
      I-DATE     0.7826    0.9474    0.8571        38
       I-LOC     0.9624    0.9662    0.9643       503
       I-ORG     0.6000    0.4615    0.5217        39
           O     0.9884    0.9890    0.9887     21324
      S-DATE     0.9868    0.8523    0.9146        88


In [5]:
#embedding300/hiddendim128/batch32 (Single NER)
# Load FastText binary model
fasttext_bin_file = "/kaggle/input/glove-100d/cc.my.300.bin"  # Replace with your .bin file path
fasttext_model = load_facebook_model(fasttext_bin_file)

# Extract the word vectors
fasttext_vectors = fasttext_model.wv
print(f"Loaded {len(fasttext_vectors)} words from FastText binary model.")

# Define Dataset Class (NER-only)
class CoNLLDataset(Dataset):
    def __init__(self, file_path):
        self.sentences, self.ner_tags = self.load_data(file_path)

    def load_data(self, file_path):
        sentences, ner_tags = [], []
        with open(file_path, "r", encoding="utf-8") as f:
            sentence, ner_tag = [], []
            for line in f:
                if line.strip():
                    word, _, ner = line.strip().split("\t")  # Ignore POS tags
                    sentence.append(word)
                    ner_tag.append(ner)
                else:
                    if sentence:
                        sentences.append(sentence)
                        ner_tags.append(ner_tag)
                    sentence, ner_tag = [], []
            if sentence:
                sentences.append(sentence)
                ner_tags.append(ner_tag)
        return sentences, ner_tags

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.ner_tags[idx]

# Collate function for dynamic padding (NER-only)
def collate_fn(batch):
    sentences, ner_tags = zip(*batch)
    max_len = max(len(s) for s in sentences)

    sentence_tensors = []
    ner_tensors = []

    for s, n in zip(sentences, ner_tags):
        padded_sentence = s + ["<PAD>"] * (max_len - len(s))
        padded_ner = n + ["<PAD>"] * (max_len - len(n))

        sentence_tensors.append(torch.tensor([vocab.get(word, vocab["<UNK>"]) for word in padded_sentence], dtype=torch.long))
        ner_tensors.append(torch.tensor([ner_tag_to_ix[tag] for tag in padded_ner], dtype=torch.long))

    return torch.stack(sentence_tensors), torch.stack(ner_tensors)

# Define BiLSTM-CRF Model for NER-only
class BiLSTMCRF_NER(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_ner_tags, fasttext_embeddings):
        super(BiLSTMCRF_NER, self).__init__()
        # Initialize embedding layer with FastText embeddings (frozen)
        self.embedding = nn.Embedding.from_pretrained(fasttext_embeddings, freeze=False)  # Freeze embeddings
        self.dropout = nn.Dropout(0.5)  # Add dropout
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.ner_fc = nn.Linear(hidden_dim * 2, num_ner_tags)
        self.ner_crf = CRF(num_ner_tags, batch_first=True)

    def forward(self, x):
        embeddings = self.embedding(x)
        embeddings = self.dropout(embeddings)  # Apply dropout
        lstm_out, _ = self.bilstm(embeddings)
        lstm_out = self.dropout(lstm_out)  # Apply dropout
        ner_logits = self.ner_fc(lstm_out)
        return ner_logits

    def compute_loss(self, x, ner_tags):
        ner_logits = self.forward(x)
        ner_loss = -self.ner_crf(ner_logits, ner_tags, mask=(x != vocab["<PAD>"]))
        return ner_loss

    def decode(self, x):
        ner_logits = self.forward(x)
        ner_tags = self.ner_crf.decode(ner_logits)
        return ner_tags

# Paths to pre-split datasets
train_file_path = "/kaggle/input/split-fix-data/train_v5.conll"
val_file_path = "/kaggle/input/split-fix-data/val_v5.conll"
test_file_path = "/kaggle/input/split-fix-data/test_v5.conll"

# Load datasets
train_dataset = CoNLLDataset(train_file_path)
val_dataset = CoNLLDataset(val_file_path)
test_dataset = CoNLLDataset(test_file_path)

# Create vocabulary and tag-to-index mappings
vocab = {"<PAD>": 0, "<UNK>": 1}
ner_tag_to_ix = {"<PAD>": 0}  # Add <PAD> to NER tags

# Build vocab and tag mappings
for dataset in [train_dataset, val_dataset, test_dataset]:
    for sentence, ner_tags in zip(dataset.sentences, dataset.ner_tags):
        for word in sentence:
            if word not in vocab:
                vocab[word] = len(vocab)
        for ner_tag in ner_tags:
            if ner_tag not in ner_tag_to_ix:
                ner_tag_to_ix[ner_tag] = len(ner_tag_to_ix)

# Create embedding matrix using FastText (dimension: 300)
embedding_dim = 300  # FastText embedding dimension
fasttext_embeddings = torch.zeros((len(vocab), embedding_dim))  # Initialize with zeros

for word, idx in vocab.items():
    if word in fasttext_vectors:
        fasttext_embeddings[idx] = torch.tensor(fasttext_vectors[word])  # Use full 300 dimensions
    elif word == "<PAD>":
        fasttext_embeddings[idx] = torch.zeros(embedding_dim)  # Zero vector for padding
    else:
        fasttext_embeddings[idx] = torch.randn(embedding_dim)  # Random vector for unknown words

# Initialize model
hidden_dim = 128
vocab_size = len(vocab)
num_ner_tags = len(ner_tag_to_ix)

model = BiLSTMCRF_NER(vocab_size, embedding_dim, hidden_dim, num_ner_tags, fasttext_embeddings).to("cuda")
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)  # Add weight decay for L2 regularization

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# Training loop with early stopping
def train_model(model, train_loader, val_loader, test_loader, epochs):
    best_val_loss = float('inf')
    patience = 3
    epochs_without_improvement = 0

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for sentences, ner_tags in train_loader:
            sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
            optimizer.zero_grad()
            loss = model.compute_loss(sentences, ner_tags)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss = 0
        model.eval()
        with torch.no_grad():
            for sentences, ner_tags in val_loader:
                sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
                val_loss += model.compute_loss(sentences, ner_tags).item()

        print(f"Epoch {epoch+1}/{epochs}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            print("Early stopping triggered!")
            break

    # Evaluate on test set after training
    model.eval()
    all_ner_preds, all_ner_targets = [], []
    with torch.no_grad():
        for sentences, ner_tags in test_loader:
            sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
            ner_preds = model.decode(sentences)

            # Flatten the predictions and targets, excluding <PAD> tokens
            for i in range(len(sentences)):
                sentence_length = (sentences[i] != vocab["<PAD>"]).sum().item()  # Length of the actual sentence
                all_ner_preds.extend(ner_preds[i][:sentence_length])  # Truncate predictions to sentence length
                all_ner_targets.extend(ner_tags[i][:sentence_length].cpu().numpy())  # Truncate targets to sentence length

    # Convert predictions and targets to tag names
    idx_to_ner = {v: k for k, v in ner_tag_to_ix.items()}

    # Filter out padding tokens from predictions and targets
    all_ner_preds_filtered = [idx_to_ner[idx] for idx in all_ner_preds]
    all_ner_targets_filtered = [idx_to_ner[idx] for idx in all_ner_targets]

    # Generate classification report
    print("NER Classification Report:")
    print(classification_report(all_ner_targets_filtered, all_ner_preds_filtered, zero_division=0, digits=4))

# Function to display random sentences with true and predicted NER tags
def display_random_samples(model, test_loader, vocab, ner_tag_to_ix, num_samples=5):
    model.eval()
    idx_to_ner = {v: k for k, v in ner_tag_to_ix.items()}
    vocab_inv = {v: k for k, v in vocab.items()}

    # Collect all sentences and their true/predicted tags
    all_sentences = []
    all_true_ner = []
    all_pred_ner = []

    with torch.no_grad():
        for sentences, ner_tags in test_loader:
            sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
            ner_preds = model.decode(sentences)

            # Convert indices to words and tags
            for i in range(len(sentences)):
                sentence_length = (sentences[i] != vocab["<PAD>"]).sum().item()  # Length of the actual sentence
                words = [vocab_inv[idx.item()] for idx in sentences[i][:sentence_length]]
                true_ner = [idx_to_ner[idx.item()] for idx in ner_tags[i][:sentence_length]]
                pred_ner = [idx_to_ner[idx] for idx in ner_preds[i][:sentence_length]]

                all_sentences.append(words)
                all_true_ner.append(true_ner)
                all_pred_ner.append(pred_ner)

    # Randomly select `num_samples` sentences
    random_indices = random.sample(range(len(all_sentences)), num_samples)
    for idx in random_indices:
        print(f"\nSample {idx + 1}:")
        print("Sentence:    ", " ".join(all_sentences[idx]))
        print("True NER:    ", " ".join(all_true_ner[idx]))
        print("Predicted NER:", " ".join(all_pred_ner[idx]))

# Train the model and evaluate on the test set
train_model(model, train_loader, val_loader, test_loader, epochs=30)

# Display 5 random samples from the test set
display_random_samples(model, test_loader, vocab, ner_tag_to_ix, num_samples=5)

Loaded 341179 words from FastText binary model.
Epoch 1/30: Train Loss = 79217.3216, Val Loss = 3492.5098
Epoch 2/30: Train Loss = 22002.7420, Val Loss = 2513.7625
Epoch 3/30: Train Loss = 13313.3383, Val Loss = 2305.9815
Epoch 4/30: Train Loss = 8987.3853, Val Loss = 2012.5785
Epoch 5/30: Train Loss = 6550.1772, Val Loss = 2125.9027
Epoch 6/30: Train Loss = 5100.0334, Val Loss = 2082.6652
Epoch 7/30: Train Loss = 4124.5737, Val Loss = 2119.9131
Early stopping triggered!
NER Classification Report:
              precision    recall  f1-score   support

      B-DATE     0.8667    0.7879    0.8254        66
       B-LOC     0.9779    0.9721    0.9750      1182
       B-NUM     0.3000    0.4000    0.3429        15
       B-ORG     0.6774    0.4375    0.5316        48
       B-PER     0.9394    0.9118    0.9254        34
      B-TIME     0.8750    0.7778    0.8235         9
      E-DATE     0.8689    0.8030    0.8346        66
       E-LOC     0.9795    0.9687    0.9741      1182
       E-N

In [6]:
#embedding300/hiddendim128/batch32(Joint_Model)
# Load FastText binary model
fasttext_bin_file = "/kaggle/input/glove-100d/cc.my.300.bin"  
fasttext_model = load_facebook_model(fasttext_bin_file)

# Extract the word vectors
fasttext_vectors = fasttext_model.wv
print(f"Loaded {len(fasttext_vectors)} words from FastText binary model.")

# Define Dataset Class
class CoNLLDataset(Dataset):
    def __init__(self, file_path):
        self.sentences, self.pos_tags, self.ner_tags = self.load_data(file_path)

    def load_data(self, file_path):
        sentences, pos_tags, ner_tags = [], [], []
        with open(file_path, "r", encoding="utf-8") as f:
            sentence, pos_tag, ner_tag = [], [], []
            for line in f:
                if line.strip():
                    word, pos, ner = line.strip().split("\t")
                    sentence.append(word)
                    pos_tag.append(pos)
                    ner_tag.append(ner)
                else:
                    if sentence:
                        sentences.append(sentence)
                        pos_tags.append(pos_tag)
                        ner_tags.append(ner_tag)
                    sentence, pos_tag, ner_tag = [], [], []
            if sentence:
                sentences.append(sentence)
                pos_tags.append(pos_tag)
                ner_tags.append(ner_tag)
        return sentences, pos_tags, ner_tags

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.pos_tags[idx], self.ner_tags[idx]

# Collate function for dynamic padding
def collate_fn(batch):
    sentences, pos_tags, ner_tags = zip(*batch)
    max_len = max(len(s) for s in sentences)

    sentence_tensors = []
    pos_tensors = []
    ner_tensors = []

    for s, p, n in zip(sentences, pos_tags, ner_tags):
        padded_sentence = s + ["<PAD>"] * (max_len - len(s))
        padded_pos = p + ["<PAD>"] * (max_len - len(p))
        padded_ner = n + ["<PAD>"] * (max_len - len(n))

        sentence_tensors.append(torch.tensor([vocab.get(word, vocab["<UNK>"]) for word in padded_sentence], dtype=torch.long))
        pos_tensors.append(torch.tensor([pos_tag_to_ix[tag] for tag in padded_pos], dtype=torch.long))
        ner_tensors.append(torch.tensor([ner_tag_to_ix[tag] for tag in padded_ner], dtype=torch.long))

    return torch.stack(sentence_tensors), torch.stack(pos_tensors), torch.stack(ner_tensors)

# Define BiLSTM-CRF Model with Frozen FastText Embeddings
class BiLSTMCRF(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_pos_tags, num_ner_tags, fasttext_embeddings):
        super(BiLSTMCRF, self).__init__()
        # Initialize embedding layer with FastText embeddings (frozen)
        self.embedding = nn.Embedding.from_pretrained(fasttext_embeddings, freeze=False)  # Freeze embeddings
        self.dropout = nn.Dropout(0.5)  # Add dropout
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.pos_fc = nn.Linear(hidden_dim * 2, num_pos_tags)
        self.ner_fc = nn.Linear(hidden_dim * 2, num_ner_tags)
        self.pos_crf = CRF(num_pos_tags, batch_first=True)
        self.ner_crf = CRF(num_ner_tags, batch_first=True)

    def forward(self, x):
        embeddings = self.embedding(x)
        embeddings = self.dropout(embeddings)  # Apply dropout
        lstm_out, _ = self.bilstm(embeddings)
        lstm_out = self.dropout(lstm_out)  # Apply dropout
        pos_logits = self.pos_fc(lstm_out)
        ner_logits = self.ner_fc(lstm_out)
        return pos_logits, ner_logits

    def compute_loss(self, x, pos_tags, ner_tags, alpha=0.5):
        pos_logits, ner_logits = self.forward(x)
        pos_loss = -self.pos_crf(pos_logits, pos_tags, mask=(x != vocab["<PAD>"]))
        ner_loss = -self.ner_crf(ner_logits, ner_tags, mask=(x != vocab["<PAD>"]))
        return alpha * pos_loss + (1 - alpha) * ner_loss

    def decode(self, x):
        pos_logits, ner_logits = self.forward(x)
        pos_tags = self.pos_crf.decode(pos_logits)
        ner_tags = self.ner_crf.decode(ner_logits)
        return pos_tags, ner_tags

# Paths to pre-split datasets
train_file_path = "/kaggle/input/split-fix-data/train_v5.conll"
val_file_path = "/kaggle/input/split-fix-data/val_v5.conll"
test_file_path = "/kaggle/input/split-fix-data/test_v5.conll"

# Load datasets
train_dataset = CoNLLDataset(train_file_path)
val_dataset = CoNLLDataset(val_file_path)
test_dataset = CoNLLDataset(test_file_path)

# Create vocabulary and tag-to-index mappings
vocab = {"<PAD>": 0, "<UNK>": 1}
pos_tag_to_ix = {"<PAD>": 0}  # Add <PAD> to POS tags
ner_tag_to_ix = {"<PAD>": 0}  # Add <PAD> to NER tags

# Build vocab and tag mappings
for dataset in [train_dataset, val_dataset, test_dataset]:
    for sentence, pos_tags, ner_tags in zip(dataset.sentences, dataset.pos_tags, dataset.ner_tags):
        for word in sentence:
            if word not in vocab:
                vocab[word] = len(vocab)
        for pos_tag in pos_tags:
            if pos_tag not in pos_tag_to_ix:
                pos_tag_to_ix[pos_tag] = len(pos_tag_to_ix)
        for ner_tag in ner_tags:
            if ner_tag not in ner_tag_to_ix:
                ner_tag_to_ix[ner_tag] = len(ner_tag_to_ix)

# Create embedding matrix using FastText (dimension: 300)
embedding_dim = 300  # FastText embedding dimension
fasttext_embeddings = torch.zeros((len(vocab), embedding_dim))  # Initialize with zeros

for word, idx in vocab.items():
    if word in fasttext_vectors:
        fasttext_embeddings[idx] = torch.tensor(fasttext_vectors[word])  # Use full 300 dimensions
    elif word == "<PAD>":
        fasttext_embeddings[idx] = torch.zeros(embedding_dim)  # Zero vector for padding
    else:
        fasttext_embeddings[idx] = torch.randn(embedding_dim)  # Random vector for unknown words

# Initialize model
hidden_dim = 128  
vocab_size = len(vocab)
num_pos_tags = len(pos_tag_to_ix)
num_ner_tags = len(ner_tag_to_ix)

model = BiLSTMCRF(vocab_size, embedding_dim, hidden_dim, num_pos_tags, num_ner_tags, fasttext_embeddings).to("cuda")
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)  # Add weight decay for L2 regularization

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# Training loop with early stopping
def train_model(model, train_loader, val_loader, test_loader, epochs):
    best_val_loss = float('inf')
    patience = 10
    epochs_without_improvement = 0

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for sentences, pos_tags, ner_tags in train_loader:
            sentences, pos_tags, ner_tags = sentences.to("cuda"), pos_tags.to("cuda"), ner_tags.to("cuda")
            optimizer.zero_grad()
            loss = model.compute_loss(sentences, pos_tags, ner_tags, alpha=0.5)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss = 0
        model.eval()
        with torch.no_grad():
            for sentences, pos_tags, ner_tags in val_loader:
                sentences, pos_tags, ner_tags = sentences.to("cuda"), pos_tags.to("cuda"), ner_tags.to("cuda")
                val_loss += model.compute_loss(sentences, pos_tags, ner_tags).item()

        print(f"Epoch {epoch+1}/{epochs}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            print("Early stopping triggered!")
            break

    # Evaluate on test set after training
    model.eval()
    all_pos_preds, all_pos_targets, all_ner_preds, all_ner_targets = [], [], [], []
    with torch.no_grad():
        for sentences, pos_tags, ner_tags in test_loader:
            sentences, pos_tags, ner_tags = sentences.to("cuda"), pos_tags.to("cuda"), ner_tags.to("cuda")
            pos_preds, ner_preds = model.decode(sentences)

            # Flatten the predictions and targets, excluding <PAD> tokens
            for i in range(len(sentences)):
                sentence_length = (sentences[i] != vocab["<PAD>"]).sum().item()  # Length of the actual sentence
                all_pos_preds.extend(pos_preds[i][:sentence_length])  # Truncate predictions to sentence length
                all_pos_targets.extend(pos_tags[i][:sentence_length].cpu().numpy())  # Truncate targets to sentence length
                all_ner_preds.extend(ner_preds[i][:sentence_length])  # Truncate predictions to sentence length
                all_ner_targets.extend(ner_tags[i][:sentence_length].cpu().numpy())  # Truncate targets to sentence length

    # Convert predictions and targets to tag names
    idx_to_pos = {v: k for k, v in pos_tag_to_ix.items()}
    idx_to_ner = {v: k for k, v in ner_tag_to_ix.items()}

    # Filter out padding tokens from predictions and targets
    all_pos_preds_filtered = [idx_to_pos[idx] for idx in all_pos_preds]
    all_pos_targets_filtered = [idx_to_pos[idx] for idx in all_pos_targets]
    all_ner_preds_filtered = [idx_to_ner[idx] for idx in all_ner_preds]
    all_ner_targets_filtered = [idx_to_ner[idx] for idx in all_ner_targets]

    # Generate classification reports
    print("POS Classification Report:")
    print(classification_report(all_pos_targets_filtered, all_pos_preds_filtered, zero_division=0))

    print("NER Classification Report:")
    print(classification_report(all_ner_targets_filtered, all_ner_preds_filtered, zero_division=0))

# Train the model and evaluate on the test set
train_model(model, train_loader, val_loader, test_loader, epochs=30)

Loaded 341179 words from FastText binary model.
Epoch 1/30: Train Loss = 113667.3239, Val Loss = 4140.6803
Epoch 2/30: Train Loss = 26897.9645, Val Loss = 2721.7103
Epoch 3/30: Train Loss = 16614.9794, Val Loss = 2285.2573
Epoch 4/30: Train Loss = 12265.3234, Val Loss = 2209.2708
Epoch 5/30: Train Loss = 9739.6690, Val Loss = 2204.4113
Epoch 6/30: Train Loss = 8198.7894, Val Loss = 2065.8556
Epoch 7/30: Train Loss = 7078.0542, Val Loss = 2162.6499
Epoch 8/30: Train Loss = 6150.7442, Val Loss = 2250.2693
Epoch 9/30: Train Loss = 5564.5062, Val Loss = 2137.7373
Epoch 10/30: Train Loss = 4969.9363, Val Loss = 2162.1399
Epoch 11/30: Train Loss = 4453.3281, Val Loss = 2257.2618
Epoch 12/30: Train Loss = 4076.7058, Val Loss = 2268.3211
Epoch 13/30: Train Loss = 3557.5349, Val Loss = 2270.6549
Epoch 14/30: Train Loss = 3309.6896, Val Loss = 2425.7447
Epoch 15/30: Train Loss = 3134.9647, Val Loss = 2425.7703
Epoch 16/30: Train Loss = 2847.1326, Val Loss = 2445.4997
Early stopping triggered!
PO

In [9]:
#embedding300/hiddendim256/batch64 (Best REsults_joint_model)

# Load FastText binary model
fasttext_bin_file = "/kaggle/input/glove-100d/cc.my.300.bin"  
fasttext_model = load_facebook_model(fasttext_bin_file)

# Extract the word vectors
fasttext_vectors = fasttext_model.wv
print(f"Loaded {len(fasttext_vectors)} words from FastText binary model.")

# Define Dataset Class
class CoNLLDataset(Dataset):
    def __init__(self, file_path):
        self.sentences, self.pos_tags, self.ner_tags = self.load_data(file_path)

    def load_data(self, file_path):
        sentences, pos_tags, ner_tags = [], [], []
        with open(file_path, "r", encoding="utf-8") as f:
            sentence, pos_tag, ner_tag = [], [], []
            for line in f:
                if line.strip():
                    word, pos, ner = line.strip().split("\t")
                    sentence.append(word)
                    pos_tag.append(pos)
                    ner_tag.append(ner)
                else:
                    if sentence:
                        sentences.append(sentence)
                        pos_tags.append(pos_tag)
                        ner_tags.append(ner_tag)
                    sentence, pos_tag, ner_tag = [], [], []
            if sentence:
                sentences.append(sentence)
                pos_tags.append(pos_tag)
                ner_tags.append(ner_tag)
        return sentences, pos_tags, ner_tags

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.pos_tags[idx], self.ner_tags[idx]

# Collate function for dynamic padding
def collate_fn(batch):
    sentences, pos_tags, ner_tags = zip(*batch)
    max_len = max(len(s) for s in sentences)

    sentence_tensors = []
    pos_tensors = []
    ner_tensors = []

    for s, p, n in zip(sentences, pos_tags, ner_tags):
        padded_sentence = s + ["<PAD>"] * (max_len - len(s))
        padded_pos = p + ["<PAD>"] * (max_len - len(p))
        padded_ner = n + ["<PAD>"] * (max_len - len(n))

        sentence_tensors.append(torch.tensor([vocab.get(word, vocab["<UNK>"]) for word in padded_sentence], dtype=torch.long))
        pos_tensors.append(torch.tensor([pos_tag_to_ix[tag] for tag in padded_pos], dtype=torch.long))
        ner_tensors.append(torch.tensor([ner_tag_to_ix[tag] for tag in padded_ner], dtype=torch.long))

    return torch.stack(sentence_tensors), torch.stack(pos_tensors), torch.stack(ner_tensors)

# Define BiLSTM-CRF Model with Frozen FastText Embeddings
class BiLSTMCRF(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_pos_tags, num_ner_tags, fasttext_embeddings):
        super(BiLSTMCRF, self).__init__()
        # Initialize embedding layer with FastText embeddings (frozen)
        self.embedding = nn.Embedding.from_pretrained(fasttext_embeddings, freeze=False)  # Freeze embeddings
        self.dropout = nn.Dropout(0.5)  # Add dropout
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.pos_fc = nn.Linear(hidden_dim * 2, num_pos_tags)
        self.ner_fc = nn.Linear(hidden_dim * 2, num_ner_tags)
        self.pos_crf = CRF(num_pos_tags, batch_first=True)
        self.ner_crf = CRF(num_ner_tags, batch_first=True)

    def forward(self, x):
        embeddings = self.embedding(x)
        embeddings = self.dropout(embeddings)  # Apply dropout
        lstm_out, _ = self.bilstm(embeddings)
        lstm_out = self.dropout(lstm_out)  # Apply dropout
        pos_logits = self.pos_fc(lstm_out)
        ner_logits = self.ner_fc(lstm_out)
        return pos_logits, ner_logits

    def compute_loss(self, x, pos_tags, ner_tags, alpha=0.5):
        pos_logits, ner_logits = self.forward(x)
        pos_loss = -self.pos_crf(pos_logits, pos_tags, mask=(x != vocab["<PAD>"]))
        ner_loss = -self.ner_crf(ner_logits, ner_tags, mask=(x != vocab["<PAD>"]))
        return alpha * pos_loss + (1 - alpha) * ner_loss

    def decode(self, x):
        pos_logits, ner_logits = self.forward(x)
        pos_tags = self.pos_crf.decode(pos_logits)
        ner_tags = self.ner_crf.decode(ner_logits)
        return pos_tags, ner_tags

# Paths to pre-split datasets
train_file_path = "/kaggle/input/split-fix-data/train_v5.conll"
val_file_path = "/kaggle/input/split-fix-data/val_v5.conll"
test_file_path = "/kaggle/input/split-fix-data/test_v5.conll"

# Load datasets
train_dataset = CoNLLDataset(train_file_path)
val_dataset = CoNLLDataset(val_file_path)
test_dataset = CoNLLDataset(test_file_path)

# Create vocabulary and tag-to-index mappings
vocab = {"<PAD>": 0, "<UNK>": 1}
pos_tag_to_ix = {"<PAD>": 0}  # Add <PAD> to POS tags
ner_tag_to_ix = {"<PAD>": 0}  # Add <PAD> to NER tags

# Build vocab and tag mappings
for dataset in [train_dataset, val_dataset, test_dataset]:
    for sentence, pos_tags, ner_tags in zip(dataset.sentences, dataset.pos_tags, dataset.ner_tags):
        for word in sentence:
            if word not in vocab:
                vocab[word] = len(vocab)
        for pos_tag in pos_tags:
            if pos_tag not in pos_tag_to_ix:
                pos_tag_to_ix[pos_tag] = len(pos_tag_to_ix)
        for ner_tag in ner_tags:
            if ner_tag not in ner_tag_to_ix:
                ner_tag_to_ix[ner_tag] = len(ner_tag_to_ix)

# Create embedding matrix using FastText (dimension: 300)
embedding_dim = 300  # FastText embedding dimension
fasttext_embeddings = torch.zeros((len(vocab), embedding_dim))  # Initialize with zeros

for word, idx in vocab.items():
    if word in fasttext_vectors:
        fasttext_embeddings[idx] = torch.tensor(fasttext_vectors[word])  # Use full 300 dimensions
    elif word == "<PAD>":
        fasttext_embeddings[idx] = torch.zeros(embedding_dim)  # Zero vector for padding
    else:
        fasttext_embeddings[idx] = torch.randn(embedding_dim)  # Random vector for unknown words

# Initialize model
hidden_dim = 256
vocab_size = len(vocab)
num_pos_tags = len(pos_tag_to_ix)
num_ner_tags = len(ner_tag_to_ix)

model = BiLSTMCRF(vocab_size, embedding_dim, hidden_dim, num_pos_tags, num_ner_tags, fasttext_embeddings).to("cuda")
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)  # Add weight decay for L2 regularization

# Create data loaders with batch size 64
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

# Training loop with early stopping and training time measurement
def train_model(model, train_loader, val_loader, test_loader, epochs):
    best_val_loss = float('inf')
    patience = 3
    epochs_without_improvement = 0

    start_time = time.time() 

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for sentences, pos_tags, ner_tags in train_loader:
            sentences, pos_tags, ner_tags = sentences.to("cuda"), pos_tags.to("cuda"), ner_tags.to("cuda")
            optimizer.zero_grad()
            loss = model.compute_loss(sentences, pos_tags, ner_tags, alpha=0.5)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss = 0
        model.eval()
        with torch.no_grad():
            for sentences, pos_tags, ner_tags in val_loader:
                sentences, pos_tags, ner_tags = sentences.to("cuda"), pos_tags.to("cuda"), ner_tags.to("cuda")
                val_loss += model.compute_loss(sentences, pos_tags, ner_tags).item()

        print(f"Epoch {epoch+1}/{epochs}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            print("Early stopping triggered!")
            break

    end_time = time.time()  # End measuring training time
    training_time = end_time - start_time  # Calculate total training time
    print(f"Total training time: {training_time:.2f} seconds")

    # Evaluate on test set after training
    model.eval()
    all_pos_preds, all_pos_targets, all_ner_preds, all_ner_targets = [], [], [], []
    with torch.no_grad():
        for sentences, pos_tags, ner_tags in test_loader:
            sentences, pos_tags, ner_tags = sentences.to("cuda"), pos_tags.to("cuda"), ner_tags.to("cuda")
            pos_preds, ner_preds = model.decode(sentences)

            # Flatten the predictions and targets, excluding <PAD> tokens
            for i in range(len(sentences)):
                sentence_length = (sentences[i] != vocab["<PAD>"]).sum().item()  # Length of the actual sentence
                all_pos_preds.extend(pos_preds[i][:sentence_length])  # Truncate predictions to sentence length
                all_pos_targets.extend(pos_tags[i][:sentence_length].cpu().numpy())  # Truncate targets to sentence length
                all_ner_preds.extend(ner_preds[i][:sentence_length])  # Truncate predictions to sentence length
                all_ner_targets.extend(ner_tags[i][:sentence_length].cpu().numpy())  # Truncate targets to sentence length

    # Convert predictions and targets to tag names
    idx_to_pos = {v: k for k, v in pos_tag_to_ix.items()}
    idx_to_ner = {v: k for k, v in ner_tag_to_ix.items()}

    # Filter out padding tokens from predictions and targets
    all_pos_preds_filtered = [idx_to_pos[idx] for idx in all_pos_preds]
    all_pos_targets_filtered = [idx_to_pos[idx] for idx in all_pos_targets]
    all_ner_preds_filtered = [idx_to_ner[idx] for idx in all_ner_preds]
    all_ner_targets_filtered = [idx_to_ner[idx] for idx in all_ner_targets]

    # Generate classification reports with 4-digit precision
    print("POS Classification Report:")
    print(classification_report(all_pos_targets_filtered, all_pos_preds_filtered, zero_division=0, digits=4))

    print("NER Classification Report:")
    print(classification_report(all_ner_targets_filtered, all_ner_preds_filtered, zero_division=0, digits=4))

# Train the model and evaluate on the test set
train_model(model, train_loader, val_loader, test_loader, epochs=20)

Loaded 341179 words from FastText binary model.
Epoch 1/20: Train Loss = 141955.4076, Val Loss = 5151.7805
Epoch 2/20: Train Loss = 31547.4353, Val Loss = 3060.4965
Epoch 3/20: Train Loss = 19098.5350, Val Loss = 2635.6923
Epoch 4/20: Train Loss = 13992.9443, Val Loss = 2305.3816
Epoch 5/20: Train Loss = 11195.3423, Val Loss = 2190.1235
Epoch 6/20: Train Loss = 9333.1118, Val Loss = 2170.8830
Epoch 7/20: Train Loss = 7912.4088, Val Loss = 2151.3330
Epoch 8/20: Train Loss = 6958.7443, Val Loss = 2123.2058
Epoch 9/20: Train Loss = 6163.5799, Val Loss = 2221.5022
Epoch 10/20: Train Loss = 5518.7026, Val Loss = 2302.8981
Epoch 11/20: Train Loss = 4931.7257, Val Loss = 2208.8004
Early stopping triggered!
Total training time: 257.90 seconds
POS Classification Report:
              precision    recall  f1-score   support

         abb     1.0000    0.9444    0.9714        18
         adj     0.8950    0.8840    0.8895       569
         adv     0.9866    0.8258    0.8991       356
        con