In [1]:
# !pip install -q datasets
!pip install -q datasets accelerate
!wget -q https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/493.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/493.7 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import datasets
from conlleval import evaluate
import itertools
from collections import Counter

# Data Preprocessing

In [3]:
dataset_ori = datasets.load_dataset("conll2003")

word_frequency = Counter(itertools.chain(*dataset_ori['train']['tokens']))  # type: ignore

# Remove words below threshold 3
word_frequency = {
    word: frequency
    for word, frequency in word_frequency.items()
    if frequency >= 3
}

word2idx = {
    word: index
    for index, word in enumerate(word_frequency.keys(), start=2)
}

word2idx['[PAD]'] = 0
word2idx['[UNK]'] = 1

idx2tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}


dataset = (
    dataset_ori.map(lambda x: {
            'input_ids': [
                word2idx.get(word, word2idx['[UNK]'])
                for word in x['tokens']
            ]
        },
         remove_columns=['id', 'pos_tags', 'chunk_tags']
    )
)

dataset = dataset.rename_column('ner_tags', 'labels')


Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [4]:
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch

# Define custom dataset class for train set
class BiLSTM_dataloader(Dataset):
  def __init__(self, x, y):
    self.x = x
    self.y = y

  def __getitem__(self,idx):
    return torch.tensor(self.x[idx]), torch.tensor(self.y[idx])

  def __len__(self):
    return len(self.x)

def collate(data):
    tensors, targets = zip(*data)
    len_tensors = [len(x) for x in tensors]
    len_targets = [len(y) for y in targets]
    # padding
    features = pad_sequence(tensors, batch_first=True, padding_value=0)
    targets = pad_sequence(targets, batch_first=True, padding_value=9)
    # return features, targets
    return features, targets, len_tensors, len_targets


trainset = BiLSTM_dataloader(dataset['train']['input_ids'], dataset['train']['labels'])
validset = BiLSTM_dataloader(dataset['validation']['input_ids'], dataset['validation']['labels'])
testset = BiLSTM_dataloader(dataset['test']['input_ids'], dataset['test']['labels'])


# Task 1: Bidirectional LSTM model

In [5]:
# parameters
embedding_dim = 100
lstm_layers = 1
hidden_dim = 256
dropout = 0.33
output_dim = 128

In [6]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim.lr_scheduler as lr_scheduler

#  use gpu
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout, lstm_layers, num_tags):
        super(BiLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm_layers = lstm_layers
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=lstm_layers, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.elu = nn.ELU(alpha=0.01)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(output_dim, num_tags)

    def forward(self, x, labels=None):
        x = self.embedding(x)
        out, _ = self.lstm(x)

        out = self.dropout(out)
        out = self.fc(out)
        out = self.elu(out)
        out = self.classifier(out)

        loss = None
        if labels is not None:
            loss = nn.functional.cross_entropy(out.permute(0, 2, 1), labels, ignore_index=9)

        return out, loss



In [16]:
# create the model
BiLSTM_model = BiLSTM(
    vocab_size=len(word2idx),
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    dropout=dropout,
    lstm_layers=lstm_layers,
    num_tags=9
).to(device)
print(BiLSTM_model)

#DataLoader
batch_size = 8
trainloader = DataLoader(trainset, collate_fn=collate, batch_size=batch_size, shuffle=True)
validloader = DataLoader(validset, collate_fn=collate, batch_size=1, shuffle=False)
testloader = DataLoader(testset, collate_fn=collate, batch_size=1, shuffle=False)


# optimization
lr = 1e-3
# optimizer = torch.optim.Adam(BiLSTM_model.parameters(), lr=lr)
optimizer = torch.optim.AdamW(BiLSTM_model.parameters(), lr=lr)

# learning rate scheduler
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.7)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',factor=0.2, patience=2,min_lr=1e-5)

# training the network
# number of epochs to train the model
n_epochs = 20

for epoch in range(n_epochs):
    BiLSTM_model.train()
    total_loss = 0
    for data, label, len_data, len_label in trainloader:
    # for data, label in trainloader:
        data, label = data.to(device), label.to(device)
        optimizer.zero_grad()
        output, loss = BiLSTM_model(data, label)
        loss.backward()
        optimizer.step()
        # scheduler.step()
        total_loss += loss.item() * data.size(1)

    scheduler.step(total_loss)
    # Print loss for every epoch
    average_loss = total_loss / len(trainloader)
    print(f"Epoch [{epoch + 1}/{n_epochs}], Loss: {average_loss:.4f}")

BiLSTM(
  (embedding): Embedding(8128, 100, padding_idx=0)
  (lstm): LSTM(100, 256, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=128, bias=True)
  (elu): ELU(alpha=0.01)
  (dropout): Dropout(p=0.33, inplace=False)
  (classifier): Linear(in_features=128, out_features=9, bias=True)
)
Epoch [1/20], Loss: 12.8010
Epoch [2/20], Loss: 4.7442
Epoch [3/20], Loss: 2.7945
Epoch [4/20], Loss: 1.8889
Epoch [5/20], Loss: 1.4203
Epoch [6/20], Loss: 1.0280
Epoch [7/20], Loss: 0.8116
Epoch [8/20], Loss: 0.6778
Epoch [9/20], Loss: 0.5696
Epoch [10/20], Loss: 0.4867
Epoch [11/20], Loss: 0.4624
Epoch [12/20], Loss: 0.4026
Epoch [13/20], Loss: 0.3571
Epoch [14/20], Loss: 0.3467
Epoch [15/20], Loss: 0.2999
Epoch [16/20], Loss: 0.2846
Epoch [17/20], Loss: 0.2725
Epoch [18/20], Loss: 0.2813
Epoch [19/20], Loss: 0.2558
Epoch [20/20], Loss: 0.2585


evaluate

In [17]:
import numpy as np

# evaluate on the validation set
valid_pred = []

BiLSTM_model.eval()
with torch.no_grad():
  for data, label, len_data, len_label in validloader:
      pred, loss = BiLSTM_model(data.to(device))

      pred = pred.cpu()
      pred = pred.detach().numpy()
      label = label.detach().numpy()
      pred = np.argmax(pred, axis=2)
      pred = pred.reshape((len(label), -1))[0]
      valid_pred.append(pred.tolist())

# transform labels into NER tags
valid_true = [
  list(map(idx2tag.get, labels))
  for labels in dataset['validation']['labels']
]


valid_pred = [
    list(map(idx2tag.get, pred))
    for pred in valid_pred
]

print("Evaluation on the validation set:")
precision, recall, f1 = evaluate(
  itertools.chain(*valid_true),
  itertools.chain(*valid_pred))


Evaluation on the validation set:
processed 51362 tokens with 5942 phrases; found: 5712 phrases; correct: 4690.
accuracy:  81.27%; (non-O)
accuracy:  96.18%; precision:  82.11%; recall:  78.93%; FB1:  80.49
              LOC: precision:  91.43%; recall:  85.90%; FB1:  88.58  1726
             MISC: precision:  77.07%; recall:  75.81%; FB1:  76.44  907
              ORG: precision:  75.67%; recall:  72.11%; FB1:  73.84  1278
              PER: precision:  80.29%; recall:  78.50%; FB1:  79.39  1801


In [18]:
# evaluate on the test set
test_pred = []

BiLSTM_model.eval()
with torch.no_grad():
  for data, label, len_data, len_label in testloader:
      pred, loss = BiLSTM_model(data.to(device))

      pred = pred.cpu()
      pred = pred.detach().numpy()
      label = label.detach().numpy()
      pred = np.argmax(pred, axis=2)
      pred = pred.reshape((len(label), -1))[0]
      test_pred.append(pred.tolist())

# transform labels into NER tags
test_true = [
  list(map(idx2tag.get, labels))
  for labels in dataset['test']['labels']
]


test_pred = [
    list(map(idx2tag.get, pred))
    for pred in test_pred
]

print("Evaluation on the test set:")
precision, recall, f1 = evaluate(
  itertools.chain(*test_true),
  itertools.chain(*test_pred))

Evaluation on the test set:
processed 46435 tokens with 5648 phrases; found: 5316 phrases; correct: 3991.
accuracy:  74.47%; (non-O)
accuracy:  94.33%; precision:  75.08%; recall:  70.66%; FB1:  72.80
              LOC: precision:  85.88%; recall:  78.78%; FB1:  82.18  1530
             MISC: precision:  65.07%; recall:  65.81%; FB1:  65.44  710
              ORG: precision:  71.01%; recall:  66.35%; FB1:  68.60  1552
              PER: precision:  73.03%; recall:  68.83%; FB1:  70.87  1524


save the model

In [19]:
# torch.save(BiLSTM_model.state_dict(), 'BiLSTM_model.pt')

# Task 2: Using GloVe word embeddings

download the glove embeddings

In [14]:
!wget -q http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip
# !jar xvf glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [227]:
import numpy as np

vocab,embeddings = [],[]
embeddings_dict = {}
word2idx_glove = {}
index = 2
with open('glove.6B.100d.txt','rt') as f:
    full_content = f.read().strip().split('\n')
for i in range(len(full_content)):
    i_word = full_content[i].split(' ')[0]
    i_embeddings = [float(val) for val in full_content[i].split(' ')[1:]]
    embeddings_dict[i_word] = i_embeddings
    word2idx_glove[i_word] = index
    index += 1
    vocab.append(i_word)
    embeddings.append(i_embeddings)


vocab_npa = np.array(vocab)
embs_npa = np.array(embeddings)
word2idx_glove['<pad>'] = 0
word2idx_glove['<unk>'] = 1
print(len(word2idx_glove))

n_char_feature = 3

pad_emb_npa = np.zeros((1,embs_npa.shape[1]))   #embedding for '<pad>' token.
unk_emb_npa = np.mean(embs_npa,axis=0,keepdims=True)    #embedding for '<unk>' token.

#insert embeddings for pad and unk tokens at top of embs_npa.
embs_npa = np.vstack((pad_emb_npa,unk_emb_npa,embs_npa))
char_embs_npa = np.hstack((embs_npa, np.ones((len(embs_npa), 1)), np.zeros((len(embs_npa), 2)))) # all lowercase
print(char_embs_npa.shape)
case_features = []
unique_words = []
for word, idx in word2idx.items():
    if idx <= 1 or word.islower() or word in word2idx_glove:
        continue
    word_lower = word.lower()
    if word_lower in embeddings_dict:
        case_feature = np.array([0, 0, 0])
        if word.isupper(): # all upper case
            case_feature[1] = 1  # [0, 1, 0]
        elif word.istitle(): # start with capital
            case_feature[2] = 1  #[0, 0, 1]

        case_features.append(case_feature)
        unique_words.append(word)
        word2idx_glove[word] = index
        index += 1

chosen_embs = [embeddings_dict.get(w.lower()) for w in unique_words]
char_embs_npa = np.vstack((char_embs_npa, np.hstack([chosen_embs, case_features])))
print(char_embs_npa.shape)

400002
(400002, 103)
(403315, 103)


In [228]:
print(len(word2idx))
print(len(word2idx_glove))

8128
403315


In [233]:
# use glove vocab to
dataset_lower = (
    dataset_ori.map(lambda x: {
            'input_ids': [
                word2idx_glove.get(word.lower(), word2idx_glove['<unk>'])
                for word in x['tokens']
            ]
        },
         remove_columns=['id', 'pos_tags', 'chunk_tags']
    )
)

dataset_lower = dataset_lower.rename_column('ner_tags', 'labels')

# dataset['train']['labels'][:2]

trainset = BiLSTM_dataloader(dataset_lower['train']['input_ids'], dataset_lower['train']['labels'])
validset = BiLSTM_dataloader(dataset_lower['validation']['input_ids'], dataset_lower['validation']['labels'])
testset = BiLSTM_dataloader(dataset_lower['test']['input_ids'], dataset_lower['test']['labels'])

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [230]:
class BiLSTM_GloVe(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim, dropout, lstm_layers, num_tags, embs_matrix):
        super(BiLSTM_GloVe, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(embs_matrix).float())
        self.lstm_layers = lstm_layers
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=lstm_layers, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.elu = nn.ELU(alpha=0.01)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(output_dim, num_tags)

    def forward(self, x, labels=None):
        x = self.embedding(x)
        out, _ = self.lstm(x)

        out = self.dropout(out)
        out = self.fc(out)
        out = self.elu(out)
        out = self.classifier(out)

        loss = None
        if labels is not None:
            loss = nn.functional.cross_entropy(out.permute(0, 2, 1), labels, ignore_index=9)

        return out, loss

In [238]:
BiLSTM_GloVe_model = BiLSTM_GloVe(
    embedding_dim=embedding_dim + n_char_feature,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    dropout=dropout,
    lstm_layers=lstm_layers,
    num_tags=9,
    embs_matrix=char_embs_npa
).to(device)
print(BiLSTM_GloVe_model)

batch_size = 32
trainloader = DataLoader(trainset, collate_fn=collate, batch_size=batch_size, shuffle=True, drop_last=True)
validloader = DataLoader(validset, collate_fn=collate, batch_size=1, drop_last=True, shuffle=False)
testloader = DataLoader(testset, collate_fn=collate, batch_size=1, drop_last=True, shuffle=False)


# optimization
lr = 1e-2
# optimizer = torch.optim.Adam(BiLSTM_GloVe_model.parameters(), lr=lr)
optimizer = torch.optim.AdamW(BiLSTM_GloVe_model.parameters(), lr=lr, weight_decay=1e-3)
# optimizer = torch.optim.SGD(BiLSTM_GloVe_model.parameters(), lr=0.1, momentum=0.9)

# training the network
# number of epochs
n_epochs = 5

# freeze embeddings
BiLSTM_GloVe_model.embedding.weight.requires_grad = unfrozen = False

for epoch in range(n_epochs):
    BiLSTM_GloVe_model.train()
    total_loss = 0
    for data, label, len_data, len_label in trainloader:
    # for data, label in trainloader:
        data, label = data.to(device), label.to(device)
        optimizer.zero_grad()
        output, loss = BiLSTM_GloVe_model(data, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * data.size(1)

    # scheduler.step(total_loss)
    # Print loss for every epoch
    average_loss = total_loss / len(trainloader)
    print(f"Epoch [{epoch + 1}/{n_epochs}], Loss: {average_loss:.4f}")


BiLSTM_GloVe(
  (embedding): Embedding(403315, 103)
  (lstm): LSTM(103, 256, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=128, bias=True)
  (elu): ELU(alpha=0.01)
  (dropout): Dropout(p=0.33, inplace=False)
  (classifier): Linear(in_features=128, out_features=9, bias=True)
)
Epoch [1/5], Loss: 8.7624
Epoch [2/5], Loss: 4.1453
Epoch [3/5], Loss: 3.3002
Epoch [4/5], Loss: 2.7177
Epoch [5/5], Loss: 2.5076


In [239]:
# evaluate on the validation set
valid_pred = []

BiLSTM_GloVe_model.eval()
with torch.no_grad():
  for data, label, len_data, len_label in validloader:
      pred, loss = BiLSTM_GloVe_model(data.to(device))

      pred = pred.cpu()
      pred = pred.detach().numpy()
      label = label.detach().numpy()
      pred = np.argmax(pred, axis=2)
      pred = pred.reshape((len(label), -1))[0]
      valid_pred.append(pred.tolist())

# transform labels into NER tags
valid_true = [
  list(map(idx2tag.get, labels))
  for labels in dataset['validation']['labels']
]


valid_pred = [
    list(map(idx2tag.get, pred))
    for pred in valid_pred
]

print("Evaluation on the validation set:")
precision, recall, f1 = evaluate(
  itertools.chain(*valid_true),
  itertools.chain(*valid_pred))


Evaluation on the validation set:
processed 51362 tokens with 5942 phrases; found: 5917 phrases; correct: 5231.
accuracy:  87.25%; (non-O)
accuracy:  97.53%; precision:  88.41%; recall:  88.03%; FB1:  88.22
              LOC: precision:  89.83%; recall:  94.72%; FB1:  92.21  1937
             MISC: precision:  85.89%; recall:  77.87%; FB1:  81.68  836
              ORG: precision:  80.54%; recall:  76.21%; FB1:  78.31  1269
              PER: precision:  93.39%; recall:  95.06%; FB1:  94.22  1875


In [240]:
# evaluate on the test set
test_pred = []

BiLSTM_GloVe_model.eval()
with torch.no_grad():
  for data, label, len_data, len_label in testloader:
      pred, loss = BiLSTM_GloVe_model(data.to(device))

      pred = pred.cpu()
      pred = pred.detach().numpy()
      label = label.detach().numpy()
      pred = np.argmax(pred, axis=2)
      pred = pred.reshape((len(label), -1))[0]
      test_pred.append(pred.tolist())

# transform labels into NER tags
test_true = [
  list(map(idx2tag.get, labels))
  for labels in dataset['test']['labels']
]


test_pred = [
    list(map(idx2tag.get, pred))
    for pred in test_pred
]

print("Evaluation on the test set:")
precision, recall, f1 = evaluate(
  itertools.chain(*test_true),
  itertools.chain(*test_pred))

Evaluation on the test set:
processed 46435 tokens with 5648 phrases; found: 5652 phrases; correct: 4711.
accuracy:  84.60%; (non-O)
accuracy:  96.62%; precision:  83.35%; recall:  83.41%; FB1:  83.38
              LOC: precision:  84.08%; recall:  89.93%; FB1:  86.91  1784
             MISC: precision:  73.04%; recall:  69.09%; FB1:  71.01  664
              ORG: precision:  79.73%; recall:  75.56%; FB1:  77.59  1574
              PER: precision:  90.25%; recall:  90.97%; FB1:  90.61  1630


In [242]:
# torch.save(BiLSTM_GloVe_model.state_dict(), 'BiLSTM_GloVe_model.pt')