In [1]:
!pip install git+https://github.com/kmkurn/pytorch-crf.git

Collecting git+https://github.com/kmkurn/pytorch-crf.git
  Cloning https://github.com/kmkurn/pytorch-crf.git to /tmp/pip-req-build-4zcb6d_t
  Running command git clone --filter=blob:none --quiet https://github.com/kmkurn/pytorch-crf.git /tmp/pip-req-build-4zcb6d_t
  Resolved https://github.com/kmkurn/pytorch-crf.git to commit 623e3402d00a2728e99d6e8486010d67c754267b
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [2]:
import torchcrf
print(torchcrf.__version__)

0.7.2


In [3]:
pip install gputil

Note: you may need to restart the kernel to use updated packages.


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report
import time

In [None]:
# Single NER Training
# Define Dataset Class
class CoNLLDataset(Dataset):
    def __init__(self, file_path):
        self.sentences, self.ner_tags = self.load_data(file_path)

    def load_data(self, file_path):
        sentences, ner_tags = [], []
        with open(file_path, "r", encoding="utf-8") as f:
            sentence, ner_tag = [], []
            for line in f:
                if line.strip():
                    word, _, ner = line.strip().split("\t")
                    sentence.append(word)
                    ner_tag.append(ner)
                else:
                    if sentence:
                        sentences.append(sentence)
                        ner_tags.append(ner_tag)
                    sentence, ner_tag = [], []
            if sentence:
                sentences.append(sentence)
                ner_tags.append(ner_tag)
        return sentences, ner_tags

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.ner_tags[idx]

# Collate function for dynamic padding
def collate_fn(batch):
    sentences, ner_tags = zip(*batch)
    max_len = max(len(s) for s in sentences)

    sentence_tensors = []
    ner_tensors = []

    for s, n in zip(sentences, ner_tags):
        padded_sentence = s + ["<PAD>"] * (max_len - len(s))
        padded_ner = n + ["<PAD>"] * (max_len - len(n))

        sentence_tensors.append(torch.tensor([vocab.get(word, vocab["<UNK>"]) for word in padded_sentence], dtype=torch.long))
        ner_tensors.append(torch.tensor([ner_tag_to_ix[tag] for tag in padded_ner], dtype=torch.long))

    return torch.stack(sentence_tensors), torch.stack(ner_tensors)

# Define BiLSTM Model with Softmax for NER
class BiLSTMSoftmax_NER(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_ner_tags):
        super(BiLSTMSoftmax_NER, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.ner_fc = nn.Linear(hidden_dim * 2, num_ner_tags)

    def forward(self, x):
        embeddings = self.embedding(x)
        lstm_out, _ = self.bilstm(embeddings)
        ner_logits = self.ner_fc(lstm_out)
        return ner_logits

    def compute_loss(self, x, ner_tags):
        ner_logits = self.forward(x)
        loss_fn = nn.CrossEntropyLoss(ignore_index=ner_tag_to_ix["<PAD>"])
        ner_loss = loss_fn(ner_logits.view(-1, ner_logits.size(-1)), ner_tags.view(-1))
        return ner_loss

    def decode(self, x):
        ner_logits = self.forward(x)
        ner_tags = torch.argmax(ner_logits, dim=-1)
        return ner_tags

# Load data
train_file_path = "/kaggle/input/split-fix-data/train_v5.conll"
val_file_path = "/kaggle/input/split-fix-data/val_v5.conll"
test_file_path = "/kaggle/input/split-fix-data/test_v5.conll"

train_dataset = CoNLLDataset(train_file_path)
val_dataset = CoNLLDataset(val_file_path)
test_dataset = CoNLLDataset(test_file_path)

vocab = {"<PAD>": 0, "<UNK>": 1}
ner_tag_to_ix = {"<PAD>": 0}

for dataset in [train_dataset, val_dataset, test_dataset]:
    for sentence, ner_tags in zip(dataset.sentences, dataset.ner_tags):
        for word in sentence:
            if word not in vocab:
                vocab[word] = len(vocab)
        for ner_tag in ner_tags:
            if ner_tag not in ner_tag_to_ix:
                ner_tag_to_ix[ner_tag] = len(ner_tag_to_ix)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

# Initialize model
embedding_dim = 128
hidden_dim = 256
vocab_size = len(vocab)
num_ner_tags = len(ner_tag_to_ix)

model = BiLSTMSoftmax_NER(vocab_size, embedding_dim, hidden_dim, num_ner_tags).to("cuda")
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training
def train_model(model, train_loader, val_loader, epochs):
    start_time = time.time()
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for sentences, ner_tags in train_loader:
            sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
            optimizer.zero_grad()
            loss = model.compute_loss(sentences, ner_tags)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss = 0
        model.eval()
        with torch.no_grad():
            for sentences, ner_tags in val_loader:
                sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
                val_loss += model.compute_loss(sentences, ner_tags).item()

        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    end_time = time.time()
    training_time = end_time - start_time
    print(f"Training completed in {training_time:.2f} seconds.")

# Train the model
train_model(model, train_loader, val_loader, epochs=10)

# Evaluation with timing
model.eval()
all_ner_preds, all_ner_targets = [], []
prediction_start = time.time()
with torch.no_grad():
    for sentences, ner_tags in test_loader:
        sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
        ner_preds = model.decode(sentences)

        for i in range(len(sentences)):
            length = (sentences[i] != vocab["<PAD>"]).sum().item()
            all_ner_preds.extend(ner_preds[i, :length].tolist())
            all_ner_targets.extend(ner_tags[i, :length].tolist())

prediction_end = time.time()
prediction_time = prediction_end - prediction_start
print(f"Prediction completed in {prediction_time:.2f} seconds.")

# Map indices to tags
idx_to_ner = {v: k for k, v in ner_tag_to_ix.items()}

def convert_indices_to_tags(indices, idx_to_tag):
    return [idx_to_tag[idx] for idx in indices if idx != ner_tag_to_ix["<PAD>"]]

ner_preds_tags = convert_indices_to_tags(all_ner_preds, idx_to_ner)
ner_targets_tags = convert_indices_to_tags(all_ner_targets, idx_to_ner)

# Print classification report
print("NER Classification Report:")
print(classification_report(ner_targets_tags, ner_preds_tags,zero_division=0, digits=4))

Epoch 1/10, Train Loss: 90.7267, Val Loss: 4.5394
Epoch 2/10, Train Loss: 33.6738, Val Loss: 3.2901
Epoch 3/10, Train Loss: 23.2366, Val Loss: 2.8356
Epoch 4/10, Train Loss: 16.6166, Val Loss: 2.6711
Epoch 5/10, Train Loss: 11.8033, Val Loss: 2.6069
Epoch 6/10, Train Loss: 8.0359, Val Loss: 2.7788
Epoch 7/10, Train Loss: 5.4195, Val Loss: 3.0423
Epoch 8/10, Train Loss: 3.7292, Val Loss: 2.9134
Epoch 9/10, Train Loss: 2.8159, Val Loss: 3.0479
Epoch 10/10, Train Loss: 2.2106, Val Loss: 3.1790
Training completed in 24.35 seconds.
Prediction completed in 0.30 seconds.
NER Classification Report:
              precision    recall  f1-score   support

      B-DATE     0.8095    0.7727    0.7907        66
       B-LOC     0.9802    0.9619    0.9710      1182
       B-NUM     0.3333    0.1333    0.1905        15
       B-ORG     0.5946    0.4583    0.5176        48
       B-PER     0.8824    0.8824    0.8824        34
      B-TIME     0.5556    0.5556    0.5556         9
      E-DATE     0.8571

In [None]:
# JOint Traing

# Define Dataset Class
class CoNLLDataset(Dataset):
    def __init__(self, file_path):
        self.sentences, self.pos_tags, self.ner_tags = self.load_data(file_path)

    def load_data(self, file_path):
        sentences, pos_tags, ner_tags = [], [], []
        with open(file_path, "r", encoding="utf-8") as f:
            sentence, pos_tag, ner_tag = [], [], []
            for line in f:
                if line.strip():
                    word, pos, ner = line.strip().split("\t")
                    sentence.append(word)
                    pos_tag.append(pos)
                    ner_tag.append(ner)
                else:
                    if sentence:
                        sentences.append(sentence)
                        pos_tags.append(pos_tag)
                        ner_tags.append(ner_tag)
                    sentence, pos_tag, ner_tag = [], [], []
            if sentence:
                sentences.append(sentence)
                pos_tags.append(pos_tag)
                ner_tags.append(ner_tag)
        return sentences, pos_tags, ner_tags

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.pos_tags[idx], self.ner_tags[idx]

# Collate function for dynamic padding
def collate_fn(batch):
    sentences, pos_tags, ner_tags = zip(*batch)
    max_len = max(len(s) for s in sentences)

    sentence_tensors = []
    pos_tensors = []
    ner_tensors = []

    for s, p, n in zip(sentences, pos_tags, ner_tags):
        padded_sentence = s + ["<PAD>"] * (max_len - len(s))
        padded_pos = p + ["<PAD>"] * (max_len - len(p))
        padded_ner = n + ["<PAD>"] * (max_len - len(n))

        sentence_tensors.append(torch.tensor([vocab.get(word, vocab["<UNK>"]) for word in padded_sentence], dtype=torch.long))
        pos_tensors.append(torch.tensor([pos_tag_to_ix[tag] for tag in padded_pos], dtype=torch.long))
        ner_tensors.append(torch.tensor([ner_tag_to_ix[tag] for tag in padded_ner], dtype=torch.long))

    return torch.stack(sentence_tensors), torch.stack(pos_tensors), torch.stack(ner_tensors)

# Define BiLSTM Model with Softmax
class BiLSTMSoftmax_Joint(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_pos_tags, num_ner_tags):
        super(BiLSTMSoftmax_Joint, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.pos_fc = nn.Linear(hidden_dim * 2, num_pos_tags)
        self.ner_fc = nn.Linear(hidden_dim * 2, num_ner_tags)

    def forward(self, x):
        embeddings = self.embedding(x)
        lstm_out, _ = self.bilstm(embeddings)
        pos_logits = self.pos_fc(lstm_out)
        ner_logits = self.ner_fc(lstm_out)
        return pos_logits, ner_logits

    def compute_loss(self, x, pos_tags, ner_tags, alpha=0.5):
        pos_logits, ner_logits = self.forward(x)
        pos_loss = nn.CrossEntropyLoss(ignore_index=pos_tag_to_ix["<PAD>"])(pos_logits.view(-1, pos_logits.size(-1)), pos_tags.view(-1))
        ner_loss = nn.CrossEntropyLoss(ignore_index=ner_tag_to_ix["<PAD>"])(ner_logits.view(-1, ner_logits.size(-1)), ner_tags.view(-1))
        return alpha * pos_loss + (1 - alpha) * ner_loss

    def decode(self, x):
        pos_logits, ner_logits = self.forward(x)
        pos_tags = torch.argmax(pos_logits, dim=-1)
        ner_tags = torch.argmax(ner_logits, dim=-1)
        return pos_tags, ner_tags

# Load data
train_file_path = "/kaggle/input/split-fix-data/train_v5.conll"
val_file_path = "/kaggle/input/split-fix-data/val_v5.conll"
test_file_path = "/kaggle/input/split-fix-data/test_v5.conll"

train_dataset = CoNLLDataset(train_file_path)
val_dataset = CoNLLDataset(val_file_path)
test_dataset = CoNLLDataset(test_file_path)

vocab = {"<PAD>": 0, "<UNK>": 1}
pos_tag_to_ix = {"<PAD>": 0}
ner_tag_to_ix = {"<PAD>": 0}

for dataset in [train_dataset, val_dataset, test_dataset]:
    for sentence, pos_tags, ner_tags in zip(dataset.sentences, dataset.pos_tags, dataset.ner_tags):
        for word in sentence:
            if word not in vocab:
                vocab[word] = len(vocab)
        for pos_tag in pos_tags:
            if pos_tag not in pos_tag_to_ix:
                pos_tag_to_ix[pos_tag] = len(pos_tag_to_ix)
        for ner_tag in ner_tags:
            if ner_tag not in ner_tag_to_ix:
                ner_tag_to_ix[ner_tag] = len(ner_tag_to_ix)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

# Initialize model
embedding_dim = 128
hidden_dim = 256
vocab_size = len(vocab)
num_pos_tags = len(pos_tag_to_ix)
num_ner_tags = len(ner_tag_to_ix)

model = BiLSTMSoftmax_Joint(vocab_size, embedding_dim, hidden_dim, num_pos_tags, num_ner_tags).to("cuda")
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training
def train_model(model, train_loader, val_loader, epochs):
    start_time = time.time()
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for sentences, pos_tags, ner_tags in train_loader:
            sentences, pos_tags, ner_tags = sentences.to("cuda"), pos_tags.to("cuda"), ner_tags.to("cuda")
            optimizer.zero_grad()
            loss = model.compute_loss(sentences, pos_tags, ner_tags, alpha=0.5)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss = 0
        model.eval()
        with torch.no_grad():
            for sentences, pos_tags, ner_tags in val_loader:
                sentences, pos_tags, ner_tags = sentences.to("cuda"), pos_tags.to("cuda"), ner_tags.to("cuda")
                val_loss += model.compute_loss(sentences, pos_tags, ner_tags).item()

        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    end_time = time.time()
    training_time = end_time - start_time
    print(f"Training completed in {training_time:.2f} seconds.")

# Train the model
train_model(model, train_loader, val_loader, epochs=10)

# Evaluation with timing
model.eval()
all_pos_preds, all_pos_targets, all_ner_preds, all_ner_targets = [], [], [], []
prediction_start = time.time()
with torch.no_grad():
    for sentences, pos_tags, ner_tags in test_loader:
        sentences, pos_tags, ner_tags = sentences.to("cuda"), pos_tags.to("cuda"), ner_tags.to("cuda")
        pos_preds, ner_preds = model.decode(sentences)

        for i in range(len(sentences)):
            length = (sentences[i] != vocab["<PAD>"]).sum().item()
            all_pos_preds.extend(pos_preds[i, :length].tolist())
            all_ner_preds.extend(ner_preds[i, :length].tolist())
            all_pos_targets.extend(pos_tags[i, :length].tolist())
            all_ner_targets.extend(ner_tags[i, :length].tolist())

prediction_end = time.time()
prediction_time = prediction_end - prediction_start
print(f"Prediction completed in {prediction_time:.2f} seconds.")

# Map indices to tags
idx_to_pos = {v: k for k, v in pos_tag_to_ix.items()}
idx_to_ner = {v: k for k, v in ner_tag_to_ix.items()}

def convert_indices_to_tags(indices, idx_to_tag):
    return [idx_to_tag[idx] for idx in indices if idx != 0]

pos_preds_tags = convert_indices_to_tags(all_pos_preds, idx_to_pos)
pos_targets_tags = convert_indices_to_tags(all_pos_targets, idx_to_pos)
ner_preds_tags = convert_indices_to_tags(all_ner_preds, idx_to_ner)
ner_targets_tags = convert_indices_to_tags(all_ner_targets, idx_to_ner)

# Print classification reports
print("POS Classification Report:")
print(classification_report(pos_targets_tags, pos_preds_tags,zero_division=0, digits=4))

print("NER Classification Report:")
print(classification_report(ner_targets_tags, ner_preds_tags,zero_division=0, digits=4))

Epoch 1/10, Train Loss: 125.2499, Val Loss: 6.9331
Epoch 2/10, Train Loss: 44.0055, Val Loss: 4.7021
Epoch 3/10, Train Loss: 30.0742, Val Loss: 3.8780
Epoch 4/10, Train Loss: 21.9689, Val Loss: 3.4411
Epoch 5/10, Train Loss: 16.2488, Val Loss: 3.2199
Epoch 6/10, Train Loss: 11.5921, Val Loss: 3.1307
Epoch 7/10, Train Loss: 8.0973, Val Loss: 3.2098
Epoch 8/10, Train Loss: 5.4384, Val Loss: 3.2843
Epoch 9/10, Train Loss: 3.6980, Val Loss: 3.3282
Epoch 10/10, Train Loss: 2.6161, Val Loss: 3.5715
Training completed in 26.84 seconds.
Prediction completed in 0.37 seconds.
POS Classification Report:
              precision    recall  f1-score   support

         abb     1.0000    0.8333    0.9091        18
         adj     0.8441    0.8278    0.8358       569
         adv     0.8390    0.7612    0.7982       356
        conj     0.9302    0.9553    0.9426       739
          fw     0.4762    0.3077    0.3738        65
         int     0.9375    0.8824    0.9091        17
           n     0.96

In [None]:
# Single NER with fastText
# Load FastText binary model
fasttext_bin_file = "/kaggle/input/glove-100d/cc.my.300.bin"
fasttext_model = load_facebook_model(fasttext_bin_file)
fasttext_vectors = fasttext_model.wv

# Define Dataset Class
class CoNLLDataset(Dataset):
    def __init__(self, file_path):
        self.sentences, self.pos_tags, self.ner_tags = self.load_data(file_path)

    def load_data(self, file_path):
        sentences, pos_tags, ner_tags = [], [], []
        with open(file_path, "r", encoding="utf-8") as f:
            sentence, pos_tag, ner_tag = [], [], []
            for line in f:
                if line.strip():
                    word, pos, ner = line.strip().split("\t")
                    sentence.append(word)
                    pos_tag.append(pos)
                    ner_tag.append(ner)
                else:
                    if sentence:
                        sentences.append(sentence)
                        pos_tags.append(pos_tag)
                        ner_tags.append(ner_tag)
                    sentence, pos_tag, ner_tag = [], [], []
            if sentence:
                sentences.append(sentence)
                pos_tags.append(pos_tag)
                ner_tags.append(ner_tag)
        return sentences, pos_tags, ner_tags

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.pos_tags[idx], self.ner_tags[idx]

# Collate function for dynamic padding
def collate_fn(batch):
    sentences, pos_tags, ner_tags = zip(*batch)
    max_len = max(len(s) for s in sentences)

    sentence_tensors = []
    pos_tensors = []
    ner_tensors = []

    for s, p, n in zip(sentences, pos_tags, ner_tags):
        padded_sentence = s + ["<PAD>"] * (max_len - len(s))
        padded_pos = p + ["<PAD>"] * (max_len - len(p))
        padded_ner = n + ["<PAD>"] * (max_len - len(n))

        sentence_tensors.append(torch.tensor([vocab.get(word, vocab["<UNK>"]) for word in padded_sentence], dtype=torch.long))
        pos_tensors.append(torch.tensor([pos_tag_to_ix[tag] for tag in padded_pos], dtype=torch.long))
        ner_tensors.append(torch.tensor([ner_tag_to_ix[tag] for tag in padded_ner], dtype=torch.long))

    return torch.stack(sentence_tensors), torch.stack(pos_tensors), torch.stack(ner_tensors)

# Create vocabulary and tag-to-index mappings
vocab = {"<PAD>": 0, "<UNK>": 1}
pos_tag_to_ix = {"<PAD>": 0}
ner_tag_to_ix = {"<PAD>": 0}

# Load datasets
train_dataset = CoNLLDataset(train_file_path)
val_dataset = CoNLLDataset(val_file_path)
test_dataset = CoNLLDataset(test_file_path)

# Build vocab and tag mappings
for dataset in [train_dataset, val_dataset, test_dataset]:
    for sentence, pos_tags, ner_tags in zip(dataset.sentences, dataset.pos_tags, dataset.ner_tags):
        for word in sentence:
            if word not in vocab:
                vocab[word] = len(vocab)
        for pos_tag in pos_tags:
            if pos_tag not in pos_tag_to_ix:
                pos_tag_to_ix[pos_tag] = len(pos_tag_to_ix)
        for ner_tag in ner_tags:
            if ner_tag not in ner_tag_to_ix:
                ner_tag_to_ix[ner_tag] = len(ner_tag_to_ix)

# Create embedding matrix using FastText (dimension: 300)
embedding_dim = 300
fasttext_embeddings = torch.zeros((len(vocab), embedding_dim))

for word, idx in vocab.items():
    if word in fasttext_vectors:
        fasttext_embeddings[idx] = torch.tensor(fasttext_vectors[word])
    elif word == "<PAD>":
        fasttext_embeddings[idx] = torch.zeros(embedding_dim)
    else:
        fasttext_embeddings[idx] = torch.randn(embedding_dim)

# BiLSTM Model with Pre-trained Embeddings
class BiLSTMSoftmax_NER(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_ner_tags, fasttext_embeddings):
        super(BiLSTMSoftmax_NER, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(fasttext_embeddings, freeze=False)  # Set freeze=False for fine-tuning
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(0.5)
        self.ner_fc = nn.Linear(hidden_dim * 2, num_ner_tags)

    def forward(self, x):
        embeddings = self.embedding(x)
        lstm_out, _ = self.bilstm(embeddings)
        lstm_out = self.dropout(lstm_out)
        ner_logits = self.ner_fc(lstm_out)
        return ner_logits

    def compute_loss(self, x, ner_tags):
        ner_logits = self.forward(x)
        loss_fn = nn.CrossEntropyLoss(ignore_index=ner_tag_to_ix["<PAD>"])
        ner_loss = loss_fn(ner_logits.view(-1, ner_logits.size(-1)), ner_tags.view(-1))
        return ner_loss

    def decode(self, x):
        ner_logits = self.forward(x)
        ner_tags = torch.argmax(ner_logits, dim=-1)
        return ner_tags

# Initialize Model
hidden_dim = 256
vocab_size = len(vocab)
num_ner_tags = len(ner_tag_to_ix)

model = BiLSTMSoftmax_NER(vocab_size, embedding_dim, hidden_dim, num_ner_tags, fasttext_embeddings).to("cuda")
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training
def train_model(model, train_loader, val_loader, epochs):
    start_time = time.time()
    best_val_loss = float('inf')
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for sentences, _, ner_tags in train_loader:
            sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
            optimizer.zero_grad()
            loss = model.compute_loss(sentences, ner_tags)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss = 0
        model.eval()
        with torch.no_grad():
            for sentences, _, ner_tags in val_loader:
                sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
                val_loss += model.compute_loss(sentences, ner_tags).item()

        avg_train_loss = train_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)

        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

        # Save the best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), "single_2.pth")

    end_time = time.time()
    training_time = end_time - start_time
    print(f"Training completed in {training_time:.2f} seconds.")

# Train the model
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

train_model(model, train_loader, val_loader, epochs=10)

# Evaluation with timing
model.load_state_dict(torch.load("single_2.pth"))
model.eval()
all_ner_preds, all_ner_targets = [], []
prediction_start = time.time()
with torch.no_grad():
    for sentences, _, ner_tags in test_loader:
        sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
        ner_preds = model.decode(sentences)

        for i in range(len(sentences)):
            length = (sentences[i] != vocab["<PAD>"]).sum().item()
            all_ner_preds.extend(ner_preds[i, :length].tolist())
            all_ner_targets.extend(ner_tags[i, :length].tolist())

prediction_end = time.time()
prediction_time = prediction_end - prediction_start
print(f"Prediction completed in {prediction_time:.2f} seconds.")

# Map indices to tags
idx_to_ner = {v: k for k, v in ner_tag_to_ix.items()}

def convert_indices_to_tags(indices, idx_to_tag):
    return [idx_to_tag[idx] for idx in indices if idx != ner_tag_to_ix["<PAD>"]]

ner_preds_tags = convert_indices_to_tags(all_ner_preds, idx_to_ner)
ner_targets_tags = convert_indices_to_tags(all_ner_targets, idx_to_ner)

# Print classification report
print("NER Classification Report:")
print(classification_report(ner_targets_tags, ner_preds_tags, zero_division=0, digits=4))

Epoch 1/10, Train Loss: 0.5005, Val Loss: 0.1390
Epoch 2/10, Train Loss: 0.1165, Val Loss: 0.1024
Epoch 3/10, Train Loss: 0.0699, Val Loss: 0.0864
Epoch 4/10, Train Loss: 0.0467, Val Loss: 0.0811
Epoch 5/10, Train Loss: 0.0349, Val Loss: 0.0804
Epoch 6/10, Train Loss: 0.0267, Val Loss: 0.0837
Epoch 7/10, Train Loss: 0.0222, Val Loss: 0.0903
Epoch 8/10, Train Loss: 0.0188, Val Loss: 0.0944
Epoch 9/10, Train Loss: 0.0160, Val Loss: 0.1002
Epoch 10/10, Train Loss: 0.0137, Val Loss: 0.0979
Training completed in 33.85 seconds.


  model.load_state_dict(torch.load("best_model.pth"))


Prediction completed in 0.34 seconds.
NER Classification Report:
              precision    recall  f1-score   support

      B-DATE     0.8871    0.8333    0.8594        66
       B-LOC     0.9803    0.9695    0.9749      1182
       B-NUM     0.2727    0.2000    0.2308        15
       B-ORG     0.5088    0.6042    0.5524        48
       B-PER     0.7949    0.9118    0.8493        34
      B-TIME     0.6667    0.6667    0.6667         9
      E-DATE     0.9483    0.8333    0.8871        66
       E-LOC     0.9754    0.9721    0.9737      1182
       E-NUM     0.4000    0.4000    0.4000        15
       E-ORG     0.6667    0.6250    0.6452        48
       E-PER     0.8824    0.8824    0.8824        34
      E-TIME     0.7778    0.7778    0.7778         9
      I-DATE     0.7500    0.8684    0.8049        38
       I-LOC     0.9721    0.9682    0.9701       503
       I-NUM     0.0000    0.0000    0.0000         0
       I-ORG     0.6087    0.3590    0.4516        39
      I-TIME    

In [None]:
# Joint Training fastText
# Load FastText binary model
fasttext_bin_file = "/kaggle/input/glove-100d/cc.my.300.bin"
fasttext_model = load_facebook_model(fasttext_bin_file)
fasttext_vectors = fasttext_model.wv

# Define Dataset Class
class CoNLLDataset(Dataset):
    def __init__(self, file_path):
        self.sentences, self.pos_tags, self.ner_tags = self.load_data(file_path)

    def load_data(self, file_path):
        sentences, pos_tags, ner_tags = [], [], []
        with open(file_path, "r", encoding="utf-8") as f:
            sentence, pos_tag, ner_tag = [], [], []
            for line in f:
                if line.strip():
                    word, pos, ner = line.strip().split("\t")
                    sentence.append(word)
                    pos_tag.append(pos)
                    ner_tag.append(ner)
                else:
                    if sentence:
                        sentences.append(sentence)
                        pos_tags.append(pos_tag)
                        ner_tags.append(ner_tag)
                    sentence, pos_tag, ner_tag = [], [], []
            if sentence:
                sentences.append(sentence)
                pos_tags.append(pos_tag)
                ner_tags.append(ner_tag)
        return sentences, pos_tags, ner_tags

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.pos_tags[idx], self.ner_tags[idx]

# Collate function for dynamic padding
def collate_fn(batch):
    sentences, pos_tags, ner_tags = zip(*batch)
    max_len = max(len(s) for s in sentences)

    sentence_tensors = []
    pos_tensors = []
    ner_tensors = []

    for s, p, n in zip(sentences, pos_tags, ner_tags):
        padded_sentence = s + ["<PAD>"] * (max_len - len(s))
        padded_pos = p + ["<PAD>"] * (max_len - len(p))
        padded_ner = n + ["<PAD>"] * (max_len - len(n))

        sentence_tensors.append(torch.tensor([vocab.get(word, vocab["<UNK>"]) for word in padded_sentence], dtype=torch.long))
        pos_tensors.append(torch.tensor([pos_tag_to_ix[tag] for tag in padded_pos], dtype=torch.long))
        ner_tensors.append(torch.tensor([ner_tag_to_ix[tag] for tag in padded_ner], dtype=torch.long))

    return torch.stack(sentence_tensors), torch.stack(pos_tensors), torch.stack(ner_tensors)

# Create vocabulary and tag-to-index mappings
vocab = {"<PAD>": 0, "<UNK>": 1}
pos_tag_to_ix = {"<PAD>": 0}
ner_tag_to_ix = {"<PAD>": 0}

# Load datasets
train_dataset = CoNLLDataset(train_file_path)
val_dataset = CoNLLDataset(val_file_path)
test_dataset = CoNLLDataset(test_file_path)

# Build vocab and tag mappings
for dataset in [train_dataset, val_dataset, test_dataset]:
    for sentence, pos_tags, ner_tags in zip(dataset.sentences, dataset.pos_tags, dataset.ner_tags):
        for word in sentence:
            if word not in vocab:
                vocab[word] = len(vocab)
        for pos_tag in pos_tags:
            if pos_tag not in pos_tag_to_ix:
                pos_tag_to_ix[pos_tag] = len(pos_tag_to_ix)
        for ner_tag in ner_tags:
            if ner_tag not in ner_tag_to_ix:
                ner_tag_to_ix[ner_tag] = len(ner_tag_to_ix)

# Create embedding matrix using FastText (dimension: 300)
embedding_dim = 300
fasttext_embeddings = torch.zeros((len(vocab), embedding_dim))

for word, idx in vocab.items():
    if word in fasttext_vectors:
        fasttext_embeddings[idx] = torch.tensor(fasttext_vectors[word])
    elif word == "<PAD>":
        fasttext_embeddings[idx] = torch.zeros(embedding_dim)
    else:
        fasttext_embeddings[idx] = torch.randn(embedding_dim)

# BiLSTM Model with Pre-trained Embeddings
class BiLSTMJointPOSNER(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_pos_tags, num_ner_tags, fasttext_embeddings):
        super(BiLSTMJointPOSNER, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(fasttext_embeddings, freeze=False)  # Set freeze=False for fine-tuning
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(0.5)
        self.pos_fc = nn.Linear(hidden_dim * 2, num_pos_tags)
        self.ner_fc = nn.Linear(hidden_dim * 2, num_ner_tags)

    def forward(self, x):
        embeddings = self.embedding(x)
        lstm_out, _ = self.bilstm(embeddings)
        lstm_out = self.dropout(lstm_out)
        pos_logits = self.pos_fc(lstm_out)
        ner_logits = self.ner_fc(lstm_out)
        return pos_logits, ner_logits

    def compute_loss(self, x, pos_tags, ner_tags):
        pos_logits, ner_logits = self.forward(x)
        pos_loss_fn = nn.CrossEntropyLoss(ignore_index=pos_tag_to_ix["<PAD>"])
        ner_loss_fn = nn.CrossEntropyLoss(ignore_index=ner_tag_to_ix["<PAD>"])
        pos_loss = pos_loss_fn(pos_logits.view(-1, pos_logits.size(-1)), pos_tags.view(-1))
        ner_loss = ner_loss_fn(ner_logits.view(-1, ner_logits.size(-1)), ner_tags.view(-1))
        return pos_loss + ner_loss

    def decode(self, x):
        pos_logits, ner_logits = self.forward(x)
        pos_tags = torch.argmax(pos_logits, dim=-1)
        ner_tags = torch.argmax(ner_logits, dim=-1)
        return pos_tags, ner_tags

# Initialize Model
hidden_dim = 256
vocab_size = len(vocab)
num_pos_tags = len(pos_tag_to_ix)
num_ner_tags = len(ner_tag_to_ix)

model = BiLSTMJointPOSNER(vocab_size, embedding_dim, hidden_dim, num_pos_tags, num_ner_tags, fasttext_embeddings).to("cuda")
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training
def train_model(model, train_loader, val_loader, epochs):
    start_time = time.time()
    best_val_loss = float('inf')
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for sentences, pos_tags, ner_tags in train_loader:
            sentences, pos_tags, ner_tags = sentences.to("cuda"), pos_tags.to("cuda"), ner_tags.to("cuda")
            optimizer.zero_grad()
            loss = model.compute_loss(sentences, pos_tags, ner_tags)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss = 0
        model.eval()
        with torch.no_grad():
            for sentences, pos_tags, ner_tags in val_loader:
                sentences, pos_tags, ner_tags = sentences.to("cuda"), pos_tags.to("cuda"), ner_tags.to("cuda")
                val_loss += model.compute_loss(sentences, pos_tags, ner_tags).item()

        avg_train_loss = train_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)

        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

        # Save the best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), "/kaggle/working/bilstm_softmax_joint.pth")

    end_time = time.time()
    training_time = end_time - start_time
    print(f"Training completed in {training_time:.2f} seconds.")

# Train the model
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

train_model(model, train_loader, val_loader, epochs=10)

# Evaluation with timing
model.load_state_dict(torch.load("/kaggle/working/bilstm_softmax_joint.pth"))
model.eval()
all_pos_preds, all_pos_targets = [], []
all_ner_preds, all_ner_targets = [], []

prediction_start = time.time()
with torch.no_grad():
    for sentences, pos_tags, ner_tags in test_loader:
        sentences, pos_tags, ner_tags = sentences.to("cuda"), pos_tags.to("cuda"), ner_tags.to("cuda")
        pos_preds, ner_preds = model.decode(sentences)

        for i in range(len(sentences)):
            length = (sentences[i] != vocab["<PAD>"]).sum().item()
            all_pos_preds.extend(pos_preds[i, :length].tolist())
            all_pos_targets.extend(pos_tags[i, :length].tolist())
            all_ner_preds.extend(ner_preds[i, :length].tolist())
            all_ner_targets.extend(ner_tags[i, :length].tolist())

prediction_end = time.time()
prediction_time = prediction_end - prediction_start
print(f"Prediction completed in {prediction_time:.2f} seconds.")

# Map indices to tags
idx_to_pos = {v: k for k, v in pos_tag_to_ix.items()}
idx_to_ner = {v: k for k, v in ner_tag_to_ix.items()}

def convert_indices_to_tags(indices, idx_to_tag):
    return [idx_to_tag[idx] for idx in indices if idx != pos_tag_to_ix["<PAD>"]]

pos_preds_tags = convert_indices_to_tags(all_pos_preds, idx_to_pos)
pos_targets_tags = convert_indices_to_tags(all_pos_targets, idx_to_pos)
ner_preds_tags = convert_indices_to_tags(all_ner_preds, idx_to_ner)
ner_targets_tags = convert_indices_to_tags(all_ner_targets, idx_to_ner)

# Print classification reports
print("POS Classification Report:")
print(classification_report(pos_targets_tags, pos_preds_tags,zero_division=0, digits=4))

print("NER Classification Report:")
print(classification_report(ner_targets_tags, ner_preds_tags,zero_division=0, digits=4))

Epoch 1/10, Train Loss: 1.3696, Val Loss: 0.3542
Epoch 2/10, Train Loss: 0.2602, Val Loss: 0.2144
Epoch 3/10, Train Loss: 0.1491, Val Loss: 0.1865
Epoch 4/10, Train Loss: 0.1066, Val Loss: 0.1726
Epoch 5/10, Train Loss: 0.0836, Val Loss: 0.1763
Epoch 6/10, Train Loss: 0.0689, Val Loss: 0.1764
Epoch 7/10, Train Loss: 0.0578, Val Loss: 0.1748
Epoch 8/10, Train Loss: 0.0490, Val Loss: 0.1812
Epoch 9/10, Train Loss: 0.0417, Val Loss: 0.1790
Epoch 10/10, Train Loss: 0.0369, Val Loss: 0.1918
Training completed in 34.58 seconds.


  model.load_state_dict(torch.load("/kaggle/working/bilstm_softmax_joint.pth"))


Prediction completed in 0.40 seconds.
POS Classification Report:
              precision    recall  f1-score   support

         abb     1.0000    0.4444    0.6154        18
         adj     0.8946    0.8805    0.8875       569
         adv     0.9764    0.8118    0.8865       356
        conj     0.9486    0.9486    0.9486       739
          fw     0.7826    0.5538    0.6486        65
         int     0.9286    0.7647    0.8387        17
           n     0.9737    0.9879    0.9808      7694
         num     0.9968    0.9626    0.9794       641
        part     0.9835    0.9753    0.9794      4461
         ppm     0.9880    0.9971    0.9925      4114
        pron     0.9655    0.9593    0.9624       467
        punc     0.9997    1.0000    0.9998      2919
          sb     1.0000    0.9231    0.9600        13
          tn     0.9573    0.9345    0.9458       168
           v     0.9608    0.9658    0.9633      3302

    accuracy                         0.9765     25543
   macro avg   