In [1]:
!pip install -q datasets accelerate
!wget -q https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import datasets
from conlleval import evaluate
import itertools
from collections import Counter

load data

In [3]:
dataset_ori = datasets.load_dataset("conll2003")

word_frequency = Counter(itertools.chain(*dataset_ori['train']['tokens']))  # type: ignore

# Remove words below threshold 3
word_frequency = {
    word: frequency
    for word, frequency in word_frequency.items()
    if frequency >= 3
}

word2idx = {
    word: index
    for index, word in enumerate(word_frequency.keys(), start=2)
}

word2idx['[PAD]'] = 0
word2idx['[UNK]'] = 1

idx2tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}


dataset = (
    dataset_ori.map(lambda x: {
            'input_ids': [
                word2idx.get(word, word2idx['[UNK]'])
                for word in x['tokens']
            ]
        },
         remove_columns=['id', 'pos_tags', 'chunk_tags']
    )
)

dataset = dataset.rename_column('ner_tags', 'labels')


Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [96]:
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch

# Define custom dataset class for train set
class BiLSTM_dataloader(Dataset):
  def __init__(self, x, y):
    self.x = x
    self.y = y

  def __getitem__(self,idx):
    return torch.tensor(self.x[idx]), torch.tensor(self.y[idx])

  def __len__(self):
    return len(self.x)

def collate(data):
    tensors, targets = zip(*data)
    # padding
    features = pad_sequence(tensors, batch_first=True, padding_value=0)
    targets = pad_sequence(targets, batch_first=True, padding_value=9)
    return features, targets


trainset = BiLSTM_dataloader(dataset['train']['input_ids'], dataset['train']['labels'])
validset = BiLSTM_dataloader(dataset['validation']['input_ids'], dataset['validation']['labels'])
testset = BiLSTM_dataloader(dataset['test']['input_ids'], dataset['test']['labels'])

transformer model

In [121]:
# parameters
embedding_dim = 128
num_heads = 8
max_length = 128
ff_dim = 128

dropout = 0.1

In [152]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim.lr_scheduler as lr_scheduler
import math
from torch import Tensor

#  use gpu
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# position embedding
class PositionalEncoding(nn.Module):
    def __init__(self, embedding_dim, dropout, max_len):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, embedding_dim, 2)* math.log(10000) / embedding_dim)
        pos = torch.arange(0, max_len).reshape(max_len, 1)
        pos_embedding = torch.zeros((max_len, embedding_dim))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])
        # return (token_embedding + self.pos_embedding[:token_embedding.size(0), :])


# token embedding
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding_dim = embedding_dim

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.embedding_dim)

def create_mask(src, device):
    src_seq_len = src.shape[0]
    # tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len), device=device).type(torch.bool)
    src_padding_mask = (src == 0).transpose(0, 1)
    # tgt_padding_mask = (tgt == 9).transpose(0, 1)
    return src_mask, src_padding_mask

# transformer model
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_heads, ff_dim, num_tags, max_length, dropout):
        super().__init__()
        encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=n_heads, dim_feedforward=ff_dim)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=1)
        self.positional_encoding = PositionalEncoding(embedding_dim, dropout, max_length)
        self.src_tok_emb = TokenEmbedding(vocab_size, embedding_dim)
        # self.tgt_tok_emb = TokenEmbedding(num_tags, embedding_dim)
        self.layer_norm = nn.LayerNorm(embedding_dim)
        self.classifier = nn.Linear(embedding_dim, num_tags)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, src_mask, src_key_padding_mask, labels=None):
        src_emb = self.positional_encoding(self.src_tok_emb(x))
        # tgt_emb = self.positional_encoding(self.tgt_tok_emb(labels))
        out = self.transformer_encoder(src_emb, src_mask, src_key_padding_mask)
        out = self.dropout(out)
        # out = self.layer_norm(out)
        out = self.classifier(out)

        loss = None
        if labels is not None:
            loss = nn.functional.cross_entropy(out.permute(0, 2, 1), labels, ignore_index=9)
            # loss = nn.functional.cross_entropy(out, labels, ignore_index=9)

        return out, loss

    # def encode(self, src, src_mask):
    #     return self.transformer.encoder(self.positional_encoding(
    #                         self.src_tok_emb(src)), src_mask)

    # def decode(self, tgt, memory, tgt_mask):
    #     return self.transformer.decoder(self.positional_encoding(
    #                       self.tgt_tok_emb(tgt)), memory,
    #                       tgt_mask)


In [153]:
Transformer_model = TransformerModel(vocab_size=len(word2idx),
                                     embedding_dim=embedding_dim,
                                     n_heads=num_heads,
                                     ff_dim=ff_dim,
                                     num_tags=len(idx2tag),
                                     max_length=max_length,
                                     dropout=dropout,
                                     ).to(device)

# print(Transformer_model)

#DataLoader
batch_size = 4
trainloader = DataLoader(trainset, collate_fn=collate, batch_size=batch_size, shuffle=True)
validloader = DataLoader(validset, collate_fn=collate, batch_size=1, shuffle=False)
testloader = DataLoader(testset, collate_fn=collate, batch_size=1, shuffle=False)

# optimization
lr = 1e-3
# optimizer = torch.optim.Adam(BiLSTM_model.parameters(), lr=lr)
# optimizer = torch.optim.SGD(Transformer_model.parameters(), lr=lr, momentum=0.9, weight_decay=1e-2)
optimizer = torch.optim.AdamW(Transformer_model.parameters(), lr=lr)
# optimizer = torch.optim.Adam(Transformer_model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-9)

# learning rate scheduler
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',factor=0.2, patience=2,min_lr=1e-5)

# training the network
# number of epochs to train the model
n_epochs = 15

for epoch in range(n_epochs):
    Transformer_model.train()
    total_loss = 0
    # for data, label, len_data, len_label in trainloader:
    for data, label in trainloader:
        data, label = data.to(device), label.to(device)
        src_mask, src_padding_mask = create_mask(data, device)
        optimizer.zero_grad()
        output, loss = Transformer_model(data, src_mask, src_padding_mask, label)
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(Transformer_model.parameters(), max_norm=1.0)
        optimizer.step()
        # scheduler.step()
        total_loss += loss.item() * data.size(1)

    Transformer_model.eval()
    with torch.no_grad():
      total_val_loss = 0
      for data, label in validloader:
          data, label = data.to(device), label.to(device)
          src_mask, src_padding_mask = create_mask(data, device)
          output, loss = Transformer_model(data, src_mask, src_padding_mask, label)
          # scheduler.step()
          total_val_loss += loss.item() * data.size(1)


    scheduler.step(total_val_loss)
    # Print loss for every epoch
    average_loss = total_loss / len(trainloader)
    avg_val_loss = total_val_loss / len(validloader)
    print(f"Epoch [{epoch + 1}/{n_epochs}], Training Loss: {average_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

Epoch [1/15], Training Loss: 15.7185, Validation Loss: 6.6690
Epoch [2/15], Training Loss: 10.8848, Validation Loss: 5.7069
Epoch [3/15], Training Loss: 8.8246, Validation Loss: 5.1165
Epoch [4/15], Training Loss: 7.6934, Validation Loss: 5.0484
Epoch [5/15], Training Loss: 7.0975, Validation Loss: 4.9634
Epoch [6/15], Training Loss: 6.7508, Validation Loss: 4.7975
Epoch [7/15], Training Loss: 6.4512, Validation Loss: 4.8078
Epoch [8/15], Training Loss: 6.3358, Validation Loss: 4.8281
Epoch [9/15], Training Loss: 6.1770, Validation Loss: 4.9480
Epoch [10/15], Training Loss: 5.8179, Validation Loss: 4.7201
Epoch [11/15], Training Loss: 5.6776, Validation Loss: 4.7457
Epoch [12/15], Training Loss: 5.5924, Validation Loss: 4.7476
Epoch [13/15], Training Loss: 5.6070, Validation Loss: 4.7032
Epoch [14/15], Training Loss: 5.5756, Validation Loss: 4.7350
Epoch [15/15], Training Loss: 5.4975, Validation Loss: 4.6841


evaluate

In [155]:
import numpy as np

# evaluate on the validation set
valid_pred = []

Transformer_model.eval()
with torch.no_grad():
  for data, label in validloader:
      data = data.to(device)
      src_mask, src_padding_mask = create_mask(data, device)
      pred, loss = Transformer_model(data, src_mask, src_padding_mask)

      pred = pred.cpu()
      pred = pred.detach().numpy()
      label = label.detach().numpy()
      pred = np.argmax(pred, axis=2)
      pred = pred.reshape((len(label), -1))[0]
      valid_pred.append(pred.tolist())

# transform labels into NER tags
valid_true = [
  list(map(idx2tag.get, labels))
  for labels in dataset['validation']['labels']
]


valid_pred = [
    list(map(idx2tag.get, pred))
    for pred in valid_pred
]

print("Evaluation on the validation set:")
precision, recall, f1 = evaluate(
  itertools.chain(*valid_true),
  itertools.chain(*valid_pred))


Evaluation on the validation set:
processed 51362 tokens with 5942 phrases; found: 4957 phrases; correct: 3036.
accuracy:  52.13%; (non-O)
accuracy:  91.82%; precision:  61.25%; recall:  51.09%; FB1:  55.71
              LOC: precision:  79.16%; recall:  71.75%; FB1:  75.27  1665
             MISC: precision:  69.12%; recall:  65.29%; FB1:  67.15  871
              ORG: precision:  52.24%; recall:  46.01%; FB1:  48.93  1181
              PER: precision:  40.24%; recall:  27.09%; FB1:  32.38  1240


In [156]:
# evaluate on the test set
test_pred = []

Transformer_model.eval()
with torch.no_grad():
  for data, label in testloader:
      data = data.to(device)
      src_mask, src_padding_mask = create_mask(data, device)
      pred, loss = Transformer_model(data, src_mask, src_padding_mask)

      pred = pred.cpu()
      pred = pred.detach().numpy()
      label = label.detach().numpy()
      pred = np.argmax(pred, axis=2)
      pred = pred.reshape((len(label), -1))[0]
      test_pred.append(pred.tolist())

# transform labels into NER tags
test_true = [
  list(map(idx2tag.get, labels))
  for labels in dataset['test']['labels']
]


test_pred = [
    list(map(idx2tag.get, pred))
    for pred in test_pred
]

print("Evaluation on the test set:")
precision, recall, f1 = evaluate(
  itertools.chain(*test_true),
  itertools.chain(*test_pred))


Evaluation on the test set:
processed 46435 tokens with 5648 phrases; found: 4191 phrases; correct: 2341.
accuracy:  43.81%; (non-O)
accuracy:  89.76%; precision:  55.86%; recall:  41.45%; FB1:  47.59
              LOC: precision:  74.79%; recall:  70.44%; FB1:  72.55  1571
             MISC: precision:  57.73%; recall:  56.41%; FB1:  57.06  686
              ORG: precision:  48.83%; recall:  35.16%; FB1:  40.88  1196
              PER: precision:  25.20%; recall:  11.50%; FB1:  15.80  738


In [157]:
# torch.save(Transformer_model.state_dict(), 'Transformer_model.pt')