In [1]:
!pip install git+https://github.com/kmkurn/pytorch-crf.git

Collecting git+https://github.com/kmkurn/pytorch-crf.git
  Cloning https://github.com/kmkurn/pytorch-crf.git to /tmp/pip-req-build-zf5pge64
  Running command git clone --filter=blob:none --quiet https://github.com/kmkurn/pytorch-crf.git /tmp/pip-req-build-zf5pge64
  Resolved https://github.com/kmkurn/pytorch-crf.git to commit 623e3402d00a2728e99d6e8486010d67c754267b
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pytorch-crf
  Building wheel for pytorch-crf (setup.py) ... [?25l[?25hdone
  Created wheel for pytorch-crf: filename=pytorch_crf-0.7.2-py3-none-any.whl size=6410 sha256=47a8daa9f603dd1fd4d1dbf766aef7ad80f7369900515d5a943041cef424aab5
  Stored in directory: /tmp/pip-ephem-wheel-cache-7domka_u/wheels/39/5f/f6/4b48b35895d914f4f5fff5b600f87658c11693e37b6a4f118e
Successfully built pytorch-crf
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


In [2]:
import torchcrf
print(torchcrf.__version__)

0.7.2


In [3]:
pip install gputil

Collecting gputil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-py3-none-any.whl size=7392 sha256=d8bb3519f5dfdbcee301d61e8bdac78fda0808e11c7cd8da79632a7e7844c68e
  Stored in directory: /root/.cache/pip/wheels/a9/8a/bd/81082387151853ab8b6b3ef33426e98f5cbfebc3c397a9d4d0
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchcrf import CRF
from sklearn.metrics import classification_report
import time

In [5]:
#Emb dim 128, hidden dim 128, batch load 32, epoch 10
# Define Dataset Class
class CoNLLDataset(Dataset):
    def __init__(self, file_path):
        self.sentences, self.ner_tags = self.load_data(file_path)

    def load_data(self, file_path):
        sentences, ner_tags = [], []
        with open(file_path, "r", encoding="utf-8") as f:
            sentence, ner_tag = [], []
            for line in f:
                if line.strip():
                    word, _, ner = line.strip().split("\t")
                    sentence.append(word)
                    ner_tag.append(ner)
                else:
                    if sentence:
                        sentences.append(sentence)
                        ner_tags.append(ner_tag)
                    sentence, ner_tag = [], []
            if sentence:
                sentences.append(sentence)
                ner_tags.append(ner_tag)
        return sentences, ner_tags

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.ner_tags[idx]

# Collate function for dynamic padding
def collate_fn(batch):
    sentences, ner_tags = zip(*batch)
    max_len = max(len(s) for s in sentences)

    sentence_tensors = []
    ner_tensors = []

    for s, n in zip(sentences, ner_tags):
        padded_sentence = s + ["<PAD>"] * (max_len - len(s))
        padded_ner = n + ["<PAD>"] * (max_len - len(n))

        sentence_tensors.append(torch.tensor([vocab.get(word, vocab["<UNK>"]) for word in padded_sentence], dtype=torch.long))
        ner_tensors.append(torch.tensor([ner_tag_to_ix[tag] for tag in padded_ner], dtype=torch.long))

    return torch.stack(sentence_tensors), torch.stack(ner_tensors)

# Define BiLSTM-CRF Model
class BiLSTMCRF_NER(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_ner_tags):
        super(BiLSTMCRF_NER, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.ner_fc = nn.Linear(hidden_dim * 2, num_ner_tags)
        self.ner_crf = CRF(num_ner_tags, batch_first=True)

    def forward(self, x):
        embeddings = self.embedding(x)
        lstm_out, _ = self.bilstm(embeddings)
        ner_logits = self.ner_fc(lstm_out)
        return ner_logits

    def compute_loss(self, x, ner_tags):
        ner_logits = self.forward(x)
        ner_loss = -self.ner_crf(ner_logits, ner_tags, mask=(x != vocab["<PAD>"]))
        return ner_loss

    def decode(self, x):
        ner_logits = self.forward(x)
        ner_tags = self.ner_crf.decode(ner_logits)
        return ner_tags

# Paths to pre-split datasets
train_file_path = "/kaggle/input/split-fix-data/train_v5.conll"
val_file_path = "/kaggle/input/split-fix-data/val_v5.conll"
test_file_path = "/kaggle/input/split-fix-data/test_v5.conll"

train_dataset = CoNLLDataset(train_file_path)
val_dataset = CoNLLDataset(val_file_path)
test_dataset = CoNLLDataset(test_file_path)

vocab = {"<PAD>": 0, "<UNK>": 1}
ner_tag_to_ix = {"<PAD>": 0}

for dataset in [train_dataset, val_dataset, test_dataset]:
    for sentence, ner_tags in zip(dataset.sentences, dataset.ner_tags):
        for word in sentence:
            if word not in vocab:
                vocab[word] = len(vocab)
        for ner_tag in ner_tags:
            if ner_tag not in ner_tag_to_ix:
                ner_tag_to_ix[ner_tag] = len(ner_tag_to_ix)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# Initialize model
embedding_dim = 128
hidden_dim = 128
vocab_size = len(vocab)
num_ner_tags = len(ner_tag_to_ix)

model = BiLSTMCRF_NER(vocab_size, embedding_dim, hidden_dim, num_ner_tags).to("cuda")
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train_model(model, train_loader, val_loader, epochs):
    start_time = time.time()
    for epoch in range(epochs):
        epoch_start_time = time.time()
        model.train()
        train_loss = 0
        for sentences, ner_tags in train_loader:
            sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
            optimizer.zero_grad()
            loss = model.compute_loss(sentences, ner_tags)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss = 0
        model.eval()
        with torch.no_grad():
            for sentences, ner_tags in val_loader:
                sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
                val_loss += model.compute_loss(sentences, ner_tags).item()

        epoch_end_time = time.time()
        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Epoch Time: {epoch_end_time - epoch_start_time:.2f}s")

    total_training_time = time.time() - start_time
    print(f"Total Training Time: {total_training_time:.2f}s")

    # Save the model
    torch.save(model.state_dict(), "bilstm_crf_ner_model(no_emb).pth")
    print("Model saved as bilstm_crf_ner_model.pth")

train_model(model, train_loader, val_loader, epochs=10)

# Evaluate on test set
def evaluate_model(model, test_loader):
    model.eval()
    all_ner_preds, all_ner_targets = [], []
    start_time = time.time()
    with torch.no_grad():
        for sentences, ner_tags in test_loader:
            sentences, ner_tags = sentences.to("cuda"), ner_tags.to("cuda")
            ner_preds = model.decode(sentences)

            for i, ner_seq in enumerate(ner_preds):
                length = (sentences[i] != vocab["<PAD>"]).sum().item()
                all_ner_preds.extend(ner_seq[:length])
                all_ner_targets.extend(ner_tags[i, :length].tolist())
    end_time = time.time()

    total_prediction_time = end_time - start_time
    print(f"Total Prediction Time: {total_prediction_time:.2f}s")

    # Convert indices to tags
    idx_to_ner = {v: k for k, v in ner_tag_to_ix.items()}

    def convert_indices_to_tags(indices, idx_to_tag):
        return [idx_to_tag[idx] for idx in indices if idx != 0]

    ner_preds_tags = convert_indices_to_tags(all_ner_preds, idx_to_ner)
    ner_targets_tags = convert_indices_to_tags(all_ner_targets, idx_to_ner)

    print("NER Classification Report:")
    print(classification_report(ner_targets_tags, ner_preds_tags,zero_division=0, digits=4))

evaluate_model(model, test_loader)


Epoch 1/10, Train Loss: 74621.8118, Val Loss: 4297.6095, Epoch Time: 21.06s
Epoch 2/10, Train Loss: 27532.0286, Val Loss: 3069.1376, Epoch Time: 19.86s
Epoch 3/10, Train Loss: 17713.8225, Val Loss: 2596.5268, Epoch Time: 20.17s
Epoch 4/10, Train Loss: 11614.7879, Val Loss: 2445.8801, Epoch Time: 20.15s
Epoch 5/10, Train Loss: 7494.1585, Val Loss: 2400.1904, Epoch Time: 19.94s
Epoch 6/10, Train Loss: 4740.3627, Val Loss: 2570.8025, Epoch Time: 20.03s
Epoch 7/10, Train Loss: 3191.0941, Val Loss: 2714.5150, Epoch Time: 20.01s
Epoch 8/10, Train Loss: 2171.1015, Val Loss: 2917.0572, Epoch Time: 20.04s
Epoch 9/10, Train Loss: 1586.2564, Val Loss: 2945.3762, Epoch Time: 20.01s
Epoch 10/10, Train Loss: 1277.6119, Val Loss: 2991.6491, Epoch Time: 19.90s
Total Training Time: 201.17s
Model saved as bilstm_crf_ner_model.pth


  score = torch.where(mask[i].unsqueeze(1), next_score, score)


Total Prediction Time: 1.68s
NER Classification Report:
              precision    recall  f1-score   support

      B-DATE     0.9057    0.7273    0.8067        66
       B-LOC     0.9655    0.9695    0.9675      1182
       B-NUM     0.3333    0.1333    0.1905        15
       B-ORG     0.6364    0.5833    0.6087        48
       B-PER     0.7879    0.7647    0.7761        34
      B-TIME     0.7778    0.7778    0.7778         9
      E-DATE     0.9074    0.7424    0.8167        66
       E-LOC     0.9695    0.9679    0.9687      1182
       E-NUM     0.4000    0.1333    0.2000        15
       E-ORG     0.6042    0.6042    0.6042        48
       E-PER     0.8065    0.7353    0.7692        34
      E-TIME     0.7778    0.7778    0.7778         9
      I-DATE     0.8108    0.7895    0.8000        38
       I-LOC     0.9590    0.9761    0.9675       503
       I-ORG     0.8000    0.4103    0.5424        39
           O     0.9843    0.9902    0.9873     21324
      S-DATE     0.9467  

In [11]:
#Emb dim 128, hidden dim 128, batch load 32, epoch 10
# Define Dataset Class
class CoNLLDataset(Dataset):
    def __init__(self, file_path):
        self.sentences, self.pos_tags, self.ner_tags = self.load_data(file_path)

    def load_data(self, file_path):
        sentences, pos_tags, ner_tags = [], [], []
        with open(file_path, "r", encoding="utf-8") as f:
            sentence, pos_tag, ner_tag = [], [], []
            for line in f:
                if line.strip():
                    word, pos, ner = line.strip().split("\t")
                    sentence.append(word)
                    pos_tag.append(pos)
                    ner_tag.append(ner)
                else:
                    if sentence:
                        sentences.append(sentence)
                        pos_tags.append(pos_tag)
                        ner_tags.append(ner_tag)
                    sentence, pos_tag, ner_tag = [], [], []
            if sentence:
                sentences.append(sentence)
                pos_tags.append(pos_tag)
                ner_tags.append(ner_tag)
        return sentences, pos_tags, ner_tags

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.pos_tags[idx], self.ner_tags[idx]

# Collate function for dynamic padding
def collate_fn(batch):
    sentences, pos_tags, ner_tags = zip(*batch)
    max_len = max(len(s) for s in sentences)

    sentence_tensors = []
    pos_tensors = []
    ner_tensors = []

    for s, p, n in zip(sentences, pos_tags, ner_tags):
        padded_sentence = s + ["<PAD>"] * (max_len - len(s))
        padded_pos = p + ["<PAD>"] * (max_len - len(p))
        padded_ner = n + ["<PAD>"] * (max_len - len(n))

        sentence_tensors.append(torch.tensor([vocab.get(word, vocab["<UNK>"]) for word in padded_sentence], dtype=torch.long))
        pos_tensors.append(torch.tensor([pos_tag_to_ix[tag] for tag in padded_pos], dtype=torch.long))
        ner_tensors.append(torch.tensor([ner_tag_to_ix[tag] for tag in padded_ner], dtype=torch.long))

    return torch.stack(sentence_tensors), torch.stack(pos_tensors), torch.stack(ner_tensors)

# Define BiLSTM-CRF Model
class BiLSTMCRF_Joint(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_pos_tags, num_ner_tags):
        super(BiLSTMCRF_Joint, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.pos_fc = nn.Linear(hidden_dim * 2, num_pos_tags)
        self.ner_fc = nn.Linear(hidden_dim * 2, num_ner_tags)
        self.pos_crf = CRF(num_pos_tags, batch_first=True)
        self.ner_crf = CRF(num_ner_tags, batch_first=True)

    def forward(self, x):
        embeddings = self.embedding(x)
        lstm_out, _ = self.bilstm(embeddings)
        pos_logits = self.pos_fc(lstm_out)
        ner_logits = self.ner_fc(lstm_out)
        return pos_logits, ner_logits

    def compute_loss(self, x, pos_tags, ner_tags, alpha=0.5):
        pos_logits, ner_logits = self.forward(x)
        pos_loss = -self.pos_crf(pos_logits, pos_tags, mask=(x != vocab["<PAD>"]))
        ner_loss = -self.ner_crf(ner_logits, ner_tags, mask=(x != vocab["<PAD>"]))
        return alpha * pos_loss + (1 - alpha) * ner_loss

    def decode(self, x):
        pos_logits, ner_logits = self.forward(x)
        pos_tags = self.pos_crf.decode(pos_logits)
        ner_tags = self.ner_crf.decode(ner_logits)
        return pos_tags, ner_tags

train_file_path = "/kaggle/input/split-fix-data/train_v5.conll"
val_file_path = "/kaggle/input/split-fix-data/val_v5.conll"
test_file_path = "/kaggle/input/split-fix-data/test_v5.conll"

train_dataset = CoNLLDataset(train_file_path)
val_dataset = CoNLLDataset(val_file_path)
test_dataset = CoNLLDataset(test_file_path)

vocab = {"<PAD>": 0, "<UNK>": 1}
pos_tag_to_ix = {"<PAD>": 0}
ner_tag_to_ix = {"<PAD>": 0}

for dataset in [train_dataset, val_dataset, test_dataset]:
    for sentence, pos_tags, ner_tags in zip(dataset.sentences, dataset.pos_tags, dataset.ner_tags):
        for word in sentence:
            if word not in vocab:
                vocab[word] = len(vocab)
        for pos_tag in pos_tags:
            if pos_tag not in pos_tag_to_ix:
                pos_tag_to_ix[pos_tag] = len(pos_tag_to_ix)
        for ner_tag in ner_tags:
            if ner_tag not in ner_tag_to_ix:
                ner_tag_to_ix[ner_tag] = len(ner_tag_to_ix)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# Initialize model
embedding_dim = 128
hidden_dim = 128
vocab_size = len(vocab)
num_pos_tags = len(pos_tag_to_ix)
num_ner_tags = len(ner_tag_to_ix)

model = BiLSTMCRF_Joint(vocab_size, embedding_dim, hidden_dim, num_pos_tags, num_ner_tags).to("cuda")
optimizer = optim.Adam(model.parameters(), lr=0.001)

#training
def train_model(model, train_loader, val_loader, epochs, save_path):
    start_time = time.time()
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for sentences, pos_tags, ner_tags in train_loader:
            sentences, pos_tags, ner_tags = sentences.to("cuda"), pos_tags.to("cuda"), ner_tags.to("cuda")
            optimizer.zero_grad()
            loss = model.compute_loss(sentences, pos_tags, ner_tags, alpha=0.5)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss = 0
        model.eval()
        with torch.no_grad():
            for sentences, pos_tags, ner_tags in val_loader:
                sentences, pos_tags, ner_tags = sentences.to("cuda"), pos_tags.to("cuda"), ner_tags.to("cuda")
                val_loss += model.compute_loss(sentences, pos_tags, ner_tags).item()

        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
    
    end_time = time.time()
    training_time = end_time - start_time
    print(f"Training completed in {training_time:.2f} seconds.")
    
    # Save the trained model
    torch.save(model.state_dict(), save_path)
    print(f"Model saved to {save_path}")

# Define the path to save the model
save_model_path = "bilstm_crf_joint_model.pth"

# Train the model
train_model(model, train_loader, val_loader, epochs=10, save_path=save_model_path)

# Evaluation with timing
model.eval()
all_pos_preds, all_pos_targets, all_ner_preds, all_ner_targets = [], [], [], []
prediction_start = time.time()
with torch.no_grad():
    for sentences, pos_tags, ner_tags in test_loader:
        sentences, pos_tags, ner_tags = sentences.to("cuda"), pos_tags.to("cuda"), ner_tags.to("cuda")
        pos_preds, ner_preds = model.decode(sentences)

        for i, (pos_seq, ner_seq) in enumerate(zip(pos_preds, ner_preds)):
            length = (sentences[i] != vocab["<PAD>"]).sum().item()
            all_pos_preds.extend(pos_seq[:length])
            all_ner_preds.extend(ner_seq[:length])
            all_pos_targets.extend(pos_tags[i, :length].tolist())
            all_ner_targets.extend(ner_tags[i, :length].tolist())

prediction_end = time.time()
prediction_time = prediction_end - prediction_start
print(f"Prediction completed in {prediction_time:.2f} seconds.")

# Map indices to tags
idx_to_pos = {v: k for k, v in pos_tag_to_ix.items()}
idx_to_ner = {v: k for k, v in ner_tag_to_ix.items()}

def convert_indices_to_tags(indices, idx_to_tag):
    return [idx_to_tag[idx] for idx in indices if idx != 0]

pos_preds_tags = convert_indices_to_tags(all_pos_preds, idx_to_pos)
pos_targets_tags = convert_indices_to_tags(all_pos_targets, idx_to_pos)
ner_preds_tags = convert_indices_to_tags(all_ner_preds, idx_to_ner)
ner_targets_tags = convert_indices_to_tags(all_ner_targets, idx_to_ner)

# Print classification reports with precision formatting
print("POS Classification Report:")
print(classification_report(pos_targets_tags, pos_preds_tags, digits=4))

print("NER Classification Report:")
print(classification_report(ner_targets_tags, ner_preds_tags, digits=4))


Epoch 1/10, Train Loss: 104722.4262, Val Loss: 5820.4296
Epoch 2/10, Train Loss: 35709.2739, Val Loss: 3888.3945
Epoch 3/10, Train Loss: 23553.7400, Val Loss: 3206.9156
Epoch 4/10, Train Loss: 16536.2546, Val Loss: 2876.3075
Epoch 5/10, Train Loss: 11613.5927, Val Loss: 2751.7534
Epoch 6/10, Train Loss: 7988.1992, Val Loss: 2763.1847
Epoch 7/10, Train Loss: 5345.6169, Val Loss: 2736.0782
Epoch 8/10, Train Loss: 3577.6333, Val Loss: 2882.9834
Epoch 9/10, Train Loss: 2379.0356, Val Loss: 2963.6501
Epoch 10/10, Train Loss: 1622.9802, Val Loss: 3167.1150
Training completed in 383.54 seconds.
Model saved to bilstm_crf_joint_model.pth
Prediction completed in 3.15 seconds.
POS Classification Report:
              precision    recall  f1-score   support

         abb     0.9412    0.8889    0.9143        18
         adj     0.8559    0.8559    0.8559       569
         adv     0.8519    0.7753    0.8118       356
        conj     0.9362    0.9526    0.9443       739
          fw     0.3860    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
