In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.optim import Adam

In [3]:
DATA_PATH = './data/'
DATA_TYPE = 'word'
PADDING_TOKEN = 0
UNKNOWN_TOKEN = 1
START_SENTENCE = 2
END_SENTENCE = 3
BATCH_SIZE = 64
LR = 0.01
DROPOUT = 0.1
EMBEDDING_SIZE = 200
HIDDEN_SIZE = 100

In [4]:
def read_data(data_path):
    sentences = []
    entities = []
    with open(data_path,'r',encoding='utf-8') as f:
        lines = f.readlines()
        sent = []
        tok = []
        for line in lines:
            tokens = line.strip().split()
            if len(tokens)==0:
                sentences.append(sent)
                entities.append(tok)
                sent=[]
                tok=[]
                continue
            if len(tokens)!=2:
                print(tokens)
                continue
            sent.append(tokens[0])
            tok.append(tokens[1])
    return sentences, entities

In [5]:
train_sentences, train_tokens = read_data(os.path.join(DATA_PATH, DATA_TYPE+'/train_'+DATA_TYPE+'.conll'))
val_sentences, val_tokens = read_data(os.path.join(DATA_PATH, DATA_TYPE+'/dev_'+DATA_TYPE+'.conll'))
test_sentences, test_tokens = read_data(os.path.join(DATA_PATH, DATA_TYPE+'/test_'+DATA_TYPE+'.conll'))

['bệnh']


In [18]:
class Vocab:
    def __init__(self):
        self.word2idx = {'<pad>': PADDING_TOKEN, '<unk>': UNKNOWN_TOKEN, '<s>': START_SENTENCE, '</s>': END_SENTENCE}
        self.idx2word = {value: key for key,value in self.word2idx.items()}
        self.length = 4
    def _add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.length
            self.idx2word[self.length] = word
            self.length+=1
    def _encode_sentence(self, sentence):
        return [self.word2idx.get(word, UNKNOWN_TOKEN) for word in sentence]
    def _decode_sentence(self, tokens):
        return [self.idx2word[token] for token in tokens]
    def build_vocab(self, sentences):
        for sent in sentences:
            for token in sent:
                self._add_word(token)
    def encode(self, sentences):
        return [self._encode_sentence(sent) for sent in sentences]
    def __len__(self):
        return self.length
    def __call__(self, word):
        return self.word2idx[word]

In [19]:
vocab = Vocab()
vocab.build_vocab(train_sentences)

In [15]:
class NER_Dataset(Dataset):
    def __init__(self, sentences, tokens):
        self.sentences = sentences
        self.tokens = tokens
    def __len__(self):
        return len(self.tokens)
    def __getitem__(self, index):
        return self.sentences[index], self.tokens[index]

In [32]:
class EntityVocab:
    def __init__(self):
        self.token2idx = {}
        self.idx2token = {}
        self.length = 0
    def _add_token(self, token):
        if token not in self.token2idx:
            self.token2idx[token] = self.length
            self.idx2token[self.length] = token
            self.length+=1
    def _encode_token(self, tokens):
        return [self.token2idx.get(token, 0) for token in tokens]
    def _decode_token(self, tokens):
        return [self.idx2token[token] for token in tokens]
    def encode(self, tokens):
        return [self._encode_token(token) for token in tokens]
    def build(self,tokens):
        for tok in tokens:
            for token in tok:
                self._add_token(token)
    def __len__(self):
        return self.length
    def __call__(self, token):
        return self.token2idx[token] 

In [33]:
entity_vocab = EntityVocab()
entity_vocab.build(train_tokens)

In [48]:
train_sentences_encoded, train_entities_encoded = vocab.encode(train_sentences), entity_vocab.encode(train_tokens)
val_sentences_encoded, val_entities_encoded = vocab.encode(val_sentences), entity_vocab.encode(val_tokens)
test_sentences_encoded, test_entities_encoded = vocab.encode(test_sentences), entity_vocab.encode(test_tokens)

In [49]:
trainset = NER_Dataset(train_sentences_encoded, train_entities_encoded)
valset = NER_Dataset(val_sentences_encoded, val_entities_encoded)
testset = NER_Dataset(test_sentences_encoded, test_entities_encoded)

In [45]:
def _collate_fn(batch):
    sentences, entities = zip(*batch)
    max_length = max([len(sentence) for sentence in sentences])
    padding_sentences = []
    padding_entities = []
    for sentence, entity in zip(sentences, entities):
        padding_sentences.append(sentence+[PADDING_TOKEN for j in range(max_length-len(sentence))])
        padding_entities.append(entity+[PADDING_TOKEN for j in range(max_length-len(sentence))])
    sentences = torch.LongTensor(padding_sentences)
    entities = torch.LongTensor(padding_entities)
    masks = sentences!=PADDING_TOKEN
    return sentences, masks, entities    

In [50]:
sentence, entity = trainset[0]

In [51]:
sentence, entity

([4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0])

In [52]:
train_loader = DataLoader(trainset, batch_size=BATCH_SIZE, collate_fn=_collate_fn)
val_loader = DataLoader(valset, batch_size=BATCH_SIZE, collate_fn=_collate_fn)
test_loader = DataLoader(testset, batch_size=BATCH_SIZE, collate_fn=_collate_fn)

In [83]:
class NER_BiLSTM(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, output_size, dropout=0.1, n_layers=1) -> None:
        super().__init__()
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        self.hidden_size = hidden_size

        self.embeddings = nn.Embedding(vocab_size, emb_size, padding_idx=PADDING_TOKEN)
        self.lstm = nn.LSTM(input_size = emb_size, hidden_size = hidden_size//2, num_layers = n_layers, batch_first = True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.hidden2tag = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=2)
    def forward(self, data, masks):
        emb = self.embeddings(data)
        output, hidden = self.lstm(emb) 
        output = self.dropout(output)
        output = self.hidden2tag(output)
        output = self.softmax(output)
        return output.permute(0, 2, 1)

In [57]:
from collections import Counter
counter = Counter(token for tokens in train_entities_encoded for token in tokens)

Counter({0: 104750,
         1: 1137,
         2: 2545,
         3: 1439,
         4: 1552,
         5: 5398,
         6: 2549,
         7: 3240,
         8: 682,
         9: 349,
         10: 2500,
         11: 205,
         12: 5242,
         13: 226,
         14: 542,
         15: 67,
         16: 62,
         17: 13,
         18: 2,
         19: 11})

In [61]:
def create_class_weight(labels_dict):
  total = np.sum(list(labels_dict.values()))
  keys  = labels_dict.keys()
  class_weight = dict()
  num_classes = len(labels_dict)
  for key in keys:
      score = round(total / (num_classes * labels_dict[key]+total/10), 2)
      class_weight[key] = score
  return class_weight

In [109]:
class_weight = create_class_weight(counter)
#criterion = nn.CrossEntropyLoss(torch.FloatTensor(list(class_weight.values())), reduction='none')
criterion = nn.CrossEntropyLoss(reduction='none')
model = NER_BiLSTM(len(vocab), EMBEDDING_SIZE, HIDDEN_SIZE, len(entity_vocab), dropout=0.15, n_layers=1)
optimizer = Adam(model.parameters(), lr=LR)

In [71]:
def reset_logger(logger):
  for handler in logger.handlers[:]:
    logger.removeHandler(handler)

  for f in logger.filters[:]:
    logger.removeFilters(f)

In [72]:
import logging
logFormatter = logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]  %(message)s")
rootLogger = logging.getLogger()
rootLogger.setLevel(logging.INFO)

fileHandler = logging.FileHandler("{0}/{1}.log".format('./', f'experiments_{type(model).__name__}'), mode = 'w')
fileHandler.setFormatter(logFormatter)
rootLogger.addHandler(fileHandler)

consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
rootLogger.addHandler(consoleHandler)

rootLogger.info("Experiment start")

2022-01-24 23:06:38,081 [MainThread  ] [INFO ]  Experiment start


In [95]:
from tqdm import tqdm
def train_epoch(model, criterion, optimizer, dataset, epoch):
    model.train()
    rootLogger.info(f"-----------------------Epoch {epoch}--------------------")
    epoch_loss = []
    for batch in tqdm(dataset, total=len(dataset), desc = 'Train epoch %s'%epoch):
        optimizer.zero_grad()
        data, masks, labels = batch
        out = model(data, masks)
        length = torch.sum(masks)
        loss = torch.sum(criterion(out, labels)*masks)/length
        epoch_loss.append(loss.item())
        loss.backward()
        optimizer.step()
    return sum(epoch_loss)/len(epoch_loss)

In [103]:
def eval(model, criterion, dataset):
    model.eval()
    val_loss = []
    true_pred =  []
    ground_truth = []
    for batch in tqdm(dataset, total=len(dataset), desc= "Validation"):
        data, masks, labels = batch
        out = model(data, masks)
        length = torch.sum(masks)
        loss = torch.sum(criterion(out, labels)*masks)/length
        val_loss.append(loss.item())
        prediction = torch.argmax(out, dim=1)
        true_pred.append(torch.sum((prediction==labels)*masks))
        ground_truth.append(length)
    return sum(val_loss)/len(val_loss), sum(true_pred)/sum(ground_truth)

In [98]:
def save_model(model, optimizer):
    save_point = {
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict()
    }
    save_file = f'{type(model).__name__}.pt'
    torch.save(save_point, save_file)

In [99]:
def train(model, criterion, optimizer, trainset, valset, epochs):
    rootLogger.info("###Start training####")
    best_acc = 0
    for epoch in range(epochs):
        train_loss = train_epoch(model, criterion, optimizer, trainset, epoch)
        val_loss, val_acc = eval(model, criterion, valset)
        rootLogger.info(f"Train loss: {train_loss:.3f}\t Val loss: {val_loss:.3f}\t Val acc: {val_acc:.3f}")
        if val_acc>best_acc:
            best_acc = val_acc
            save_model(model, optimizer)

In [111]:
train(model, criterion, optimizer, train_loader, val_loader, 5)

2022-01-24 23:53:19,562 [MainThread  ] [INFO ]  ###Start training####
2022-01-24 23:53:19,564 [MainThread  ] [INFO ]  -----------------------Epoch 0--------------------
Train epoch 0: 100%|██████████| 79/79 [00:08<00:00,  9.52it/s]
Validation: 100%|██████████| 32/32 [00:01<00:00, 21.92it/s]
2022-01-24 23:53:29,332 [MainThread  ] [INFO ]  Train loss: 0.018	 Val loss: 0.164	 Val acc: 0.961
2022-01-24 23:53:29,349 [MainThread  ] [INFO ]  -----------------------Epoch 1--------------------
Train epoch 1: 100%|██████████| 79/79 [00:08<00:00,  9.28it/s]
Validation: 100%|██████████| 32/32 [00:01<00:00, 22.80it/s]
2022-01-24 23:53:39,279 [MainThread  ] [INFO ]  Train loss: 0.014	 Val loss: 0.169	 Val acc: 0.961
2022-01-24 23:53:39,295 [MainThread  ] [INFO ]  -----------------------Epoch 2--------------------
Train epoch 2: 100%|██████████| 79/79 [00:08<00:00,  9.24it/s]
Validation: 100%|██████████| 32/32 [00:01<00:00, 21.71it/s]
2022-01-24 23:53:49,325 [MainThread  ] [INFO ]  Train loss: 0.011	

In [112]:
checkpoint = torch.load('NER_BiLSTM.pt')
model.load_state_dict(checkpoint['model'])
eval(model, criterion, val_loader)

Validation: 100%|██████████| 32/32 [00:01<00:00, 24.42it/s]


(0.16862980474252254, tensor(0.9612))

In [None]:
class NER_BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, output_size, dropout=0.1, n_layers=1) -> None:
        super().__init__()
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        self.hidden_size = hidden_size

        self.embeddings = nn.Embedding(vocab_size, emb_size, padding_idx=PADDING_TOKEN)
        self.lstm = nn.LSTM(input_size = emb_size, hidden_size = hidden_size, num_layers = n_layers, batch_first = True)
        self.dropout = nn.Dropout(dropout)
        self.hidden2tag = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=2)
    def forward(self, data, masks):
        emb = self.embeddings(data)
        output, hidden = self.lstm(emb) 
        output = self.dropout(output)
        output = self.hidden2tag(output)
        output = self.softmax(output)
        return output.permute(0, 2, 1)