## 0- Import Packages

In [12]:
! pip install wget



In [45]:
import os
import re
import math
import time
import pickle
from collections import Counter

import wget
import nltk
from typing import List
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

In [46]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [14]:
# check available device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Running on", device)

Running on cuda


In [15]:
# define constants
MENTION_OPEN_TOKEN = '<men>'
MENTION_CLOSE_TOKEN = '</men>'
UNK_TOKEN = '<unk>'
PAD_TOKEN = '<pad>'

BATCH_SIZE = 32
TEST_BATCH_SIZE = 2

SEED = 42

RETRAIN = True
MODEL_NAME = 'entity_parser'

In [16]:
# fix seed for reproduceability
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

## 1- Read data files & Preprocess

In [17]:
def read_data(path):
    with open(path,'r',encoding='UTF-8') as f:
        raw_data = f.readlines()
    return raw_data

In [18]:
def identify_mention_in_sent(entity: List[str], sentence: List[str]) -> List[str]:
    start_index = sentence.index(entity[0])
    sentence[start_index:start_index] = [MENTION_OPEN_TOKEN]
    end_index = sentence.index(entity[-1])+1
    sentence[end_index:end_index] = [MENTION_CLOSE_TOKEN]
    return sentence

In [19]:
def identify_mention_test(entity: List[str], sentence: List[str]) -> List[str]:
    index = [sentence.index(token) for token in sentence if entity[0] in token]
    start_index, end_index = index[0], index[0] + len(entity) + 1
    sentence[start_index:start_index] = [MENTION_OPEN_TOKEN]
    sentence[end_index:end_index] = [MENTION_CLOSE_TOKEN]
    return sentence

In [20]:
def preprocess_data(dataset: List[str], mode: str):
    X = []
    Y = []
    excluded_training_samples_count = 0
    if mode == 'train':
        for example in dataset:   
            try:
                entity, types, sentence =  example.split('\t')
                entity = re.sub(u'[^A-Za-z0-9]+',' ', entity.lower()).split()
                types = re.sub(u'[^A-Za-z,\s]+','', types.lower()).split(', ')
                sentence = re.sub(u'[^A-Za-z0-9]+',' ', sentence.lower()).split()

                # add special tokens to sentence
                sentence = identify_mention_in_sent(entity, sentence)
                X.append(sentence)
                Y.append(types)
            except Exception as e:
                excluded_training_samples_count += 1
        print('Failed to identify named entity in train data:', excluded_training_samples_count/(len(dataset)))
        return X, Y
    else:
        for example in dataset:  
            _, entity, sentence =  example.split('\t')
            entity = re.sub(u'[^A-Za-z0-9]+',' ', entity.lower()).split()
            sentence = re.sub(u'[^A-Za-z0-9]+',' ', sentence.lower()).split()

            # add special tokens to sentence
            sentence = identify_mention_test(entity, sentence)
            X.append(sentence)
        return X

In [21]:
git_path = 'https://raw.githubusercontent.com/tsimafeip/LCT-master-course/main/Knowledge_Bases/Lab_03/'

# read train file
train_file = 'train.tsv'
if not os.path.isfile(train_file):
    wget.download(git_path+train_file, train_file)

train_data = read_data(train_file)
train_sents, train_types = preprocess_data(train_data, 'train')

# read test file
test_file = 'test.tsv'
if not os.path.isfile(test_file):
    wget.download(git_path+test_file, test_file)

test_data = read_data(test_file)
test_sents = preprocess_data(test_data, 'test')

Failed to identify named entity in train data: 0.002


## 2- Build source Vocab and target classes

In [22]:
class Vocab():
    def __init__(self, sentences: List[str]):
        self.vocab = self.build_vocab(sentences)
        self.unk_index = self.vocab.index(UNK_TOKEN)
        self.vocab_idx = self.build_tokens_to_ids_dict()
        self.idx_vocab = self.build_ids_to_tokens_dict()
        
    def build_vocab(self, sents: List[str]) -> List[str]:
        all_tokens = [token for sent in sents for token in sent]
        vocab = [PAD_TOKEN, UNK_TOKEN, MENTION_OPEN_TOKEN, MENTION_CLOSE_TOKEN]
        # add all unique tokens
        vocab.extend([token for token in Counter(all_tokens) if token not in vocab])

        return vocab
    
    def build_tokens_to_ids_dict(self):
        vocab_idx = {token: i for i, token in enumerate(self.vocab)}
        return vocab_idx
    
    def build_ids_to_tokens_dict(self):
        idx_vocab = {i: self.vocab[i] for i in range(len(self.vocab))}
        return idx_vocab
    
    def look_up_indices(self, sentence: List[str]):
        return [self.vocab_idx.get(token, self.unk_index) for token in sentence]
        
    def look_up_tokens(self, indices: List[int]):
        return [self.idx_vocab.get(i, UNK_TOKEN) for i in indices]
    
    def __len__(self):
        return len(self.vocab)

In [23]:
input_vocab = Vocab(train_sents)
len(input_vocab)

51739

In [24]:
# with open('vocab','wb') as ff:
#     pickle.dump(input_vocab, ff)

In [25]:
flat_types = [ty for train_type in train_types for ty in train_type]
unique_types = Counter(flat_types)

labels = dict(zip(unique_types, range(len(unique_types))))
inv_labels = {index: label for label, index in labels.items()}

In [26]:
sample_index = 0

print(labels[inv_labels[sample_index]], inv_labels[sample_index], len(labels))

0 secondary school 3188


In [27]:
# with open('inv_labels', 'wb') as ff:
#     pickle.dump(inv_labels, ff)

In [28]:
for i in range(len(train_types)):
    train_types[i] = [labels[ty] for ty in train_types[i]]

In [29]:
train_types[1000]

[304, 474, 475]

## 3- Create Datasets & DataLoaders

In [30]:
class EntityTypingTrainSet(Dataset):
    def __init__(self, sents: List[List[str]], types: List[List[int]]):
        self.src = sents
        self.trgt = types
        
    def __len__(self):
        return len(self.src)
    
    def __getitem__(self, index: int):
        return self.src[index], self.trgt[index]

class EntityTypingTestSet(Dataset):
    def __init__(self, sents: List[List[str]]):
        self.src = sents
        
    def __len__(self):
        return len(self.src)
    
    def __getitem__(self, index: int):
        return self.src[index]

In [31]:
# pad examples in the batch to be the same length 
# NB: COLLATE FUNCTIONS HAVE ACCESS TO INPUT_VOCAB AND LABELS
def entity_train_collate_fn(train_batch):
    x, y = zip(*train_batch)
    src_seqs_len = [len(x_item) for x_item in x]
    # get max length for both source and targets 
    src_max_len = max(src_seqs_len)
    
    examples = torch.zeros((len(train_batch), src_max_len))
    targets = torch.zeros((len(train_batch), len(labels)))

    # pad shorter examples to max length for both source and target
    for i in range(len(train_batch)):
        src = torch.tensor(input_vocab.look_up_indices(train_batch[i][0]))
        j = src.size(0)
        examples[i] = torch.cat([src, torch.zeros((src_max_len - j))])
        targets[i][y[i]] = 1

    return examples.long(), targets.long()
        

def entity_test_collate_fn(test_batch):
    src_seqs_len = [len(x_item) for x_item in test_batch]
    
    # get max length for both source and targets 
    src_max_len = max(src_seqs_len)
    
    examples = torch.zeros((len(test_batch), src_max_len))

    # pad shorter examples to max length for both source and target
    for i in range(len(test_batch)):
        src = torch.tensor(input_vocab.look_up_indices(test_batch[i]))
        j = src.size(0)
        examples[i] = torch.cat([src, torch.zeros((src_max_len - j))])

    return examples.long()


def build_dataLoader(src_sentences, trgt_labels, mode):
    if mode == 'train':
        # create dataset and data loader
        types_dataset = EntityTypingTrainSet(src_sentences, trgt_labels)
        types_dataLoader = DataLoader(types_dataset, batch_size=BATCH_SIZE, collate_fn=entity_train_collate_fn, shuffle=True)
        return types_dataLoader
    else:
        # create dataset and data loader
        types_dataset = EntityTypingTestSet(src_sentences)
        types_dataLoader = DataLoader(types_dataset, batch_size=TEST_BATCH_SIZE, collate_fn=entity_test_collate_fn)
        return types_dataLoader
    

def all_dataloaders(src_train, trgt_train, src_test):    
    train_data_loader = build_dataLoader(src_train, trgt_train, 'train')
    test_data_loader = build_dataLoader(src_test, [], 'test')
    
    return train_data_loader, test_data_loader 

In [32]:
train_data_loader, test_data_loader = all_dataloaders(train_sents, train_types, test_sents)

In [33]:
for batch in train_data_loader:
    #x, y, _ = batch
    print(len(batch), batch[0].size(), batch[1].size())
    break

for batch in test_data_loader:
    #x, y, _ = batch
    print(len(batch))
    break
    

2 torch.Size([32, 39]) torch.Size([32, 3188])
2


## Model

In [34]:
class PositionalEncoding(nn.Module):
    '''source: https://pytorch.org/tutorials/beginner/transformer_tutorial.html'''
    def __init__(self, d_model, dropout, max_len: int = 500):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [35]:
class EntityTyper(nn.Module):
    '''
    Class definition has mostly taken from https://pytorch.org/tutorials/beginner/transformer_tutorial.html.
    Transformer decoder was replaced with a single linear classification layer.
    '''
    def __init__(self, vocab_size: int, num_classes: int, model_dim: int,
                 num_heads: int, ff_hid_dim: int, num_layers: int,
                 dropout_rate: float):
      
        super().__init__()
        self.d_model = model_dim
        self.pos_encoder = PositionalEncoding(model_dim, dropout_rate)
        self.embedding = nn.Embedding(vocab_size, model_dim)
        encoder_layers = nn.TransformerEncoderLayer(model_dim, num_heads, ff_hid_dim, dropout_rate, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.linear = nn.Linear(model_dim, num_classes)

        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, x, pad_mask):
        # x and x_mask shape: [BATCH_SIZE, PADDED_SEQ_LEN]
        assert x.shape == pad_mask.shape

        # [BATCH_SIZE, PADDED_SEQ_LEN, MODEL_DIM]
        embedded_input = self.embedding(x) * math.sqrt(self.d_model)

        # [BATCH_SIZE, PADDED_SEQ_LEN, MODEL_DIM]
        embedded_input_with_positions = self.pos_encoder(embedded_input)

        # [BATCH_SIZE, PADDED_SEQ_LEN, MODEL_DIM]
        transformer_encoder_output = self.transformer_encoder(embedded_input_with_positions, None, pad_mask)

        # find average vector for all prediction vectors to create a single vector for sentence
        # [BATCH_SIZE, MODEL_DIM]
        transformer_output = torch.div(torch.sum(transformer_encoder_output, dim=1), transformer_encoder_output.size(1))
        
        # transform model_dim to num_classes
        # [BATCH_SIZE, NUM_CLASSES]
        predictions = self.linear(transformer_output)
        return predictions
    
    def train_model(self, dataloader, criterion, optimizer, epoch) -> float:
        self.train()
        total_loss = 0
        print('==================================================')
        print('Epoch: {} started'.format(epoch+1))
        for batch in tqdm(dataloader):
            x, y = batch
            
            # move batches to device
            x = x.to(device)
            y = y.to(device)

            # create masks & move masks to device
            x_pad_mask = (x == 0).to(device)

            # training
            self.zero_grad()
            out = model(x, x_pad_mask)
            loss = criterion(out, y.float())
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print('Epoch: {} - Train Loss: {:.6f}'.format(epoch+1, total_loss))
        return total_loss
      
    def predict_types(self, dataloader):
        with torch.no_grad():
            model.eval()
            for batch in tqdm(dataloader):
                x = batch

                # move batches to device
                x = x.to(device)

                # create masks & move masks to device
                x_pad_mask = (x == 0).to(device)

                # inference
                predictions = model(x, x_pad_mask) 
                for seq, pred_for_seq in zip(x, predictions):
                    max_value, max_index = torch.max(pred_for_seq, -1) # get max value in the logit
                    predicted_y = max_index.item()
                    # test batch size is 1
                    yield seq, predicted_y

In [36]:
# Hyperparameters
VOCAB_SIZE = len(input_vocab)
NUM_CLASSES = len(labels)

MODEL_DIM = 512
FF_HID_DIM = MODEL_DIM * 2
NUM_HEADS = 4 
NUM_LAYERS = 6
DROPOUT_RATE = 0.5
LEARNING_RATE = 0.0001
EPOCHS = 25

CRITERION = nn.BCEWithLogitsLoss()
OPTIMIZER_TYPE = torch.optim.Adam

In [37]:
model = EntityTyper(vocab_size=VOCAB_SIZE,
                    num_classes=NUM_CLASSES,
                    model_dim=MODEL_DIM,
                    num_heads=NUM_HEADS,
                    ff_hid_dim=FF_HID_DIM,
                    num_layers=NUM_LAYERS,
                    dropout_rate=DROPOUT_RATE).to(device)

In [38]:
# example batch and model forward
for batch in train_data_loader:
    x, y = batch
    x_pad_mask = (x == 0).to(device)
    
    print(input_vocab.look_up_tokens(x[0].tolist()))
    print([index for index in range(len(y[0])) if y[0][index] == 1])

    x = x.to(device)
    y = y.to(device)
    out = model(x, x_pad_mask)

    print(out.size())
    print(y.size())
    loss = CRITERION(out, y.float())
    break

['<men>', 'shahrak', 'e', 'saqaveh', '</men>', 'persian', 'also', 'romanized', 'as', 'shahrak', 'e', 'saq', 'veh', 'also', 'known', 'as', 'sagawah', 'sagaweh', 'saq', 'veh', 'seh', 'g', 'veh', 'and', 'seq', 'veh', 'is', 'a', 'village', 'in', 'margown', 'rural', 'district', 'margown', 'district', 'boyer', 'ahmad', 'county', 'kohgiluyeh', 'and', 'boyer', 'ahmad', 'province', 'iran', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
[2]
torch.Size([32, 3188])
torch.Size([32, 3188])


## Model Training

In [None]:
if RETRAIN:
    optimizer = OPTIMIZER_TYPE(model.parameters(), lr=LEARNING_RATE)

    start = time.time()
    best_loss = float('inf')
    
    for epoch in range(EPOCHS):
        epoch_loss = model.train_model(dataloader=train_data_loader, 
                                       criterion=CRITERION, 
                                       optimizer=optimizer, 
                                       epoch=epoch)
        if epoch_loss < best_loss:
            torch.save(model, MODEL_NAME)
            best_loss = epoch_loss

    end = time.time()
    print(f'Training time = {end - start} seconds.')
else:
    model = torch.load(MODEL_NAME, map_location=torch.device('cpu'))

## Inference

In [47]:
preds = []

with open('results.tsv', 'w') as f:
    for i, (token_indices, pred_label_index) in enumerate(model.predict_types(test_data_loader)):
        test_sample_index = i+1
        # we need to output it as list, since multiple type prediction is possible
        predicted_types = [inv_labels[pred_label_index]]

        preds.append((input_vocab.look_up_tokens(token_indices.tolist()), inv_labels[pred_label_index]))

        f.write(f'{test_sample_index}\t{predicted_types}\n')

100%|██████████| 1000/1000 [00:09<00:00, 109.03it/s]


In [41]:
for i in range(5):
   print(preds[i])

(['<men>', 'jan', '<unk>', '</men>', '1877', '1961', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], 'composer')
(['the', '<men>', 'eleventh', 'air', 'force', '</men>', '11', 'af', 'is', 'a', 'numbered', 'air', 'force', 'of', 'the', 'united', 'states', 'air', 'force', 'pacific', 'air', 'forces', '<unk>'], 'military unit')
(['<men>', '<unk>', '<unk>', '</men>', '<unk>', 'february', '4', '1916', 'december', '28', '1992', 'was', 'a', 'canadian', 'inuit', 'artist', 'whose', 'preferred', 'medium', 'was', 'a', 'combination', 'of', '<unk>', 'wash', 'and', 'coloured', '<unk>'], 'artist')
(['<men>', 'taken', 'the', 'search', 'for', 'sophie', 'parker', '</men>', 'is', 'a', '2013', 'american', 'made', 'for', 'television', 'film', 'directed', 'by', 'don', 'michael', 'paul', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], 'film')
(['<men>', 'francisco', '<unk>', '</men>', '29', 

In [48]:
# read train file
test_groundtruth_file = 'test-groundtruth.tsv'
if not os.path.isfile(test_groundtruth_file):
    wget.download(git_path+test_groundtruth_file, test_groundtruth_file)

evaluation_code = 'evaluate.py'
if not os.path.isfile(evaluation_code):
    wget.download(git_path+evaluation_code, evaluation_code)

! python evaluate.py results.tsv test-groundtruth.tsv

Strict: Using exact matching:
	Macro Precision, Recall and F1:	0.606	0.5484484126984132	0.5757896748601857
	Micro Precision, Recall and F1:	0.606	0.48635634028892455	0.5396260017809439
Loose: Using exact matching on the lemma of the head-word of the type:
	Macro Precision, Recall and F1:	0.663	0.6098755952380952	0.6353292044494304
	Micro Precision, Recall and F1:	0.663	0.5515806988352745	0.6021798365122616
