<a href="https://colab.research.google.com/github/vincentjordan27/Named-Entity-Recognition-BILSTM-CRF/blob/main/NER_Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/gdrive (Colab only); the notebook's input
# files are read from a folder below this mount point.
from google.colab import drive
drive.mount("/content/gdrive")

# Suppress FutureWarning messages so they do not clutter the cell outputs.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

Mounted at /content/gdrive


In [6]:
!pip install torchtext==0.6.0

import os
import time
import gensim
from collections import Counter
import torch
from torch import nn
from torch.optim import Adam
from torchtext.data import Field, BucketIterator
from torchtext.datasets import SequenceTaggingDataset
from torchtext.vocab import Vocab
from spacy.lang.id import Indonesian

# Base input folder on the mounted Drive. The corpus cell reads train.tsv,
# val.tsv and test.tsv from here and the word2vec file from embeddings/.
DRIVE_ROOT = "/content/gdrive/My Drive/Dataset/Input"



## Corpus

Untuk corpus preparation, kami menggunakan torchtext dan word embedding dengan Word2Vec.

In [3]:
class Corpus(object):
    """Sequence-tagging corpus with optional pretrained word2vec embeddings.

    Reads ``train.tsv``, ``val.tsv`` and ``test.tsv`` from ``input_folder``
    with torchtext, builds the word and tag vocabularies and exposes one
    ``BucketIterator`` per split.

    Args:
        input_folder: Folder that contains the three TSV split files.
        min_word_freq: Minimum frequency a word needs to enter the word vocab.
        batch_size: Batch size used by all three iterators.
        wv_file: Optional path to a full gensim ``Word2Vec`` model (not only
            key-vectors). When given, the word vocab and its vectors are built
            from that model instead of from the training split.

    Attributes:
        wv_model: The loaded gensim model, or ``None`` without ``wv_file``.
        embedding_dim: Vector size of ``wv_model``, or ``None`` without it.
        word_field, tag_field: torchtext fields holding the vocabularies.
        train_dataset, val_dataset, test_dataset: The parsed splits.
        train_iter, val_iter, test_iter: Batch iterators over the splits.
        word_pad_idx, tag_pad_idx: Vocabulary indices of the padding tokens.
    """

    def __init__(self, input_folder, min_word_freq, batch_size, wv_file=None):
        # Always define these attributes so that callers can safely test
        # `corpus.wv_model` even when no embedding file was supplied.
        self.wv_model = None
        self.embedding_dim = None
        # list all the fields
        self.word_field = Field(lower=True)
        self.tag_field = Field(unk_token=None)
        # create dataset using built-in parser from torchtext
        self.train_dataset, self.val_dataset, self.test_dataset = SequenceTaggingDataset.splits(
            path=input_folder,
            train="train.tsv",
            validation="val.tsv",
            test="test.tsv",
            fields=(("word", self.word_field), ("tag", self.tag_field))
        )
        ### BEGIN MODIFIED SECTION: WORD EMBEDDING ###
        if wv_file:
            # retrieve word2vec model from gensim library
            # the file contains full word2vec model, not only key-vectors
            self.wv_model = gensim.models.word2vec.Word2Vec.load(wv_file)
            self.embedding_dim = self.wv_model.vector_size
            keyed_vectors = self.wv_model.wv
            # cannot create vocab with build_vocab(),
            # initiate vocab by building custom Counter based on word2vec model
            if hasattr(keyed_vectors, "key_to_index"):
                # gensim >= 4: `wv.vocab` was replaced by key_to_index / get_vecattr
                known_words = keyed_vectors.key_to_index
                word_freq = {
                    word: keyed_vectors.get_vecattr(word, "count") for word in known_words
                }
            else:
                # gensim < 4: per-word statistics live in `wv.vocab`
                known_words = keyed_vectors.vocab
                word_freq = {word: known_words[word].count for word in known_words}
            self.word_field.vocab = Vocab(Counter(word_freq), min_freq=min_word_freq)
            # one vector per vocab entry, in index (itos) order; entries that
            # the word2vec model does not know (the special tokens) get zeros
            vectors = [
                torch.as_tensor(keyed_vectors[word].tolist())
                if word in known_words
                else torch.zeros(self.embedding_dim)
                for word in self.word_field.vocab.itos
            ]
            self.word_field.vocab.set_vectors(
                stoi=self.word_field.vocab.stoi,
                # list of vector embeddings, ordered according to word_field.vocab
                vectors=vectors,
                dim=self.embedding_dim
            )
        else:
            self.word_field.build_vocab(self.train_dataset.word, min_freq=min_word_freq)
        ### END MODIFIED SECTION ###
        # build vocab for tag
        self.tag_field.build_vocab(self.train_dataset.tag)
        # create iterator for batch input
        self.train_iter, self.val_iter, self.test_iter = BucketIterator.splits(
            datasets=(self.train_dataset, self.val_dataset, self.test_dataset),
            batch_size=batch_size
        )
        # prepare padding index to be ignored during model training/evaluation
        self.word_pad_idx = self.word_field.vocab.stoi[self.word_field.pad_token]
        self.tag_pad_idx = self.tag_field.vocab.stoi[self.tag_field.pad_token]

In [9]:
# Build the corpus from the Drive folder, using the pretrained embedding file.
corpus = Corpus(
    input_folder=f"{DRIVE_ROOT}/",
    min_word_freq=3,
    batch_size=64,
    wv_file=f"{DRIVE_ROOT}/embeddings/id_ft.bin",
)

# Report how many sentences each split contains.
n_train_sentences = len(corpus.train_dataset)
n_val_sentences = len(corpus.val_dataset)
n_test_sentences = len(corpus.test_dataset)
print(f"Train set: {n_train_sentences} sentences")
print(f"Val set: {n_val_sentences} sentences")
print(f"Test set: {n_test_sentences} sentences")

Train set: 3535 sentences
Val set: 470 sentences
Test set: 468 sentences


Let's take a look at the loaded word2vec model.

In [10]:
# Inspect the loaded word2vec model: size, nearest neighbours and one raw vector.
wv_shape = corpus.wv_model.wv.vectors.shape
# `corpus_count` is gensim's number of training sentences/documents, not words.
print(f"The model was trained on {corpus.wv_model.corpus_count:,} sentences.")
print(f"The embedding represents {wv_shape[0]:,} unique words with vectors of size {wv_shape[1]}.")
# most_similar() yields (word, similarity score) pairs; only the words are shown
similar_words = [word for word, _score in corpus.wv_model.wv.most_similar("pemberhentian", topn=5)]
print("The 5 most similar words to 'pemberhentian': " + ", ".join(similar_words) + ".")
print("An example of embeddings for the word 'pemberhentian' (which does not exist in the training set):")
print(corpus.wv_model.wv["pemberhentian"])

The model was trained on 2,988,507 words.
The embedding represents 30,048 unique words with vectors of size 300.
The 5 most similar words to 'pemberhentian': pengangkatan, pemecatan, penunjukkan, memberhentikan, diberhentikan.
An example of embeddings for the word 'pemberhentian' (which does not exist in the training set):
[ 8.51707399e-01 -4.16559912e-02  6.00995362e-01  5.37340283e-01
  9.33561563e-01 -4.98508543e-01  1.66677666e+00 -4.50234622e-01
 -1.04996562e+00 -1.16242014e-01 -9.65069592e-01 -5.75939476e-01
  1.31526738e-01 -9.09028769e-01  1.10143505e-01 -9.74609315e-01
 -5.14446139e-01 -7.96002030e-01  9.62427914e-01 -1.68779945e+00
 -2.55186737e-01  1.85162282e+00 -1.72700420e-01 -5.72440922e-01
  2.40831465e-01  3.36628050e-01 -2.71441191e-01 -1.88790828e-01
  1.02529240e+00  3.33210170e-01  2.38483861e-01 -5.34359038e-01
  2.23073304e-01 -1.43152475e+00  9.19080257e-01  6.59672499e-01
 -6.07612550e-01 -2.71360338e-01  1.97121471e-01 -1.79148242e-01
  2.39485770e-01  1.60026

In [11]:
# Fetch the vocabulary row of one word and compare it with the word2vec model
id_pemberhentian = corpus.word_field.vocab.stoi["pemberhentian"]
print(f"Index for 'pemberhentian': {id_pemberhentian}")
vector_pemberhentian = corpus.word_field.vocab.vectors[id_pemberhentian]
w2v_vector_pemberhentian = torch.as_tensor(corpus.wv_model.wv["pemberhentian"])
vectors_match = torch.equal(vector_pemberhentian, w2v_vector_pemberhentian)
print(
    "Vector for 'pemberhentian' should be identical with the vector from the word2vec model: "
    + str(vectors_match)
)

Index for 'pemberhentian': 8782
Vector for 'pemberhentian' should be identical with the vector from the word2vec model: True


  import sys


In [12]:
# Vocab's initializer adds the special tokens ('<pad>', '<unk>') on its own;
# show where they ended up and what vector sits at index 0.
word_vocab = corpus.word_field.vocab
pad_token_index = word_vocab.stoi[corpus.word_field.pad_token]
unk_token_index = word_vocab.stoi[corpus.word_field.unk_token]
print(f"index for pad token: {pad_token_index}")
print(f"index for unk token: {unk_token_index}")
print("vector for index 0:")
print(word_vocab.vectors[0])

index for pad token: 1
index for unk token: 0
vector for index 0:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

## Model

In [13]:
class BiLSTM(nn.Module):
    """Embedding -> bidirectional LSTM -> linear layer for per-token tagging.

    Args:
        input_dim: Number of rows of the embedding table (vocabulary size).
        embedding_dim: Size of each word vector.
        hidden_dim: Hidden size of the LSTM, per direction.
        output_dim: Number of output scores per token (tag vocabulary size).
        lstm_layers: Number of stacked LSTM layers.
        emb_dropout: Dropout probability applied to the embedding output.
        lstm_dropout: Dropout between LSTM layers (only used with > 1 layer).
        fc_dropout: Dropout probability applied before the linear layer.
        word_pad_idx: Index of the padding token in the word vocabulary.
    """

    def __init__(self,
                 input_dim,
                 embedding_dim,
                 hidden_dim,
                 output_dim,
                 lstm_layers,
                 emb_dropout,
                 lstm_dropout,
                 fc_dropout,
                 word_pad_idx):
        super().__init__()
        self.embedding_dim = embedding_dim
        # LAYER 1: Embedding
        self.embedding = nn.Embedding(
            num_embeddings=input_dim,
            embedding_dim=embedding_dim,
            padding_idx=word_pad_idx
        )
        self.emb_dropout = nn.Dropout(emb_dropout)
        # LAYER 2: BiLSTM
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=lstm_layers,
            bidirectional=True,
            # nn.LSTM only applies dropout between stacked layers
            dropout=lstm_dropout if lstm_layers > 1 else 0
        )
        # LAYER 3: Fully-connected
        self.fc_dropout = nn.Dropout(fc_dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # times 2 for bidirectional

    def forward(self, sentence):
        """Return tag scores of shape [sentence length, batch size, output dim].

        Args:
            sentence: Word indices of shape [sentence length, batch size].
        """
        # embedding_out = [sentence length, batch size, embedding dim]
        embedding_out = self.emb_dropout(self.embedding(sentence))
        # lstm_out = [sentence length, batch size, hidden dim * 2]
        lstm_out, _ = self.lstm(embedding_out)
        # ner_out = [sentence length, batch size, output dim]
        ner_out = self.fc(self.fc_dropout(lstm_out))
        return ner_out

    def init_weights(self):
        """Initialize every parameter from a normal distribution N(0, 0.1)."""
        # helps with converging during training
        for _name, param in self.named_parameters():
            nn.init.normal_(param.data, mean=0, std=0.1)

    ### BEGIN MODIFIED SECTION: WORD EMBEDDING ###
    def init_embeddings(self, word_pad_idx, pretrained=None, freeze=True):
        """Optionally load pretrained vectors, then zero the padding row.

        Args:
            word_pad_idx: Index of the padding token in the embedding table.
            pretrained: Optional 2-D array/tensor of shape
                [vocab size, embedding dim]. When ``None`` the current
                embedding layer is kept.
            freeze: Whether the pretrained embedding is kept fixed during
                training. Ignored when ``pretrained`` is ``None``.

        Raises:
            ValueError: If ``pretrained`` is not 2-D or its vector size differs
                from the ``embedding_dim`` the LSTM was built for.
        """
        if pretrained is not None:
            weights = torch.as_tensor(pretrained)
            if weights.dim() != 2 or weights.size(1) != self.embedding_dim:
                raise ValueError(
                    "pretrained embeddings must have shape [vocab size, "
                    f"{self.embedding_dim}], got {tuple(weights.shape)}"
                )
            # from_pretrained() uses the given tensor as the layer weight, so
            # copy it first: zeroing the pad row or fine-tuning must never
            # modify the caller's tensor. Keep the layer on the model's device.
            weights = weights.detach().clone().to(self.embedding.weight.device)
            # use built in function: from pretrained
            # specify if embedding layer is trainable with the `freeze` param
            self.embedding = nn.Embedding.from_pretrained(
                embeddings=weights,
                padding_idx=word_pad_idx,
                freeze=freeze
            )
        # Zero the padding row of whichever embedding is now in use; doing it
        # last guarantees it also holds for pretrained weights.
        with torch.no_grad():
            self.embedding.weight[word_pad_idx].zero_()
    ### END MODIFIED SECTION ###

    def count_parameters(self):
        """Return the number of trainable (requires_grad) scalar parameters."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)


In [14]:
# Pretrained vectors are only available when the corpus loaded a word2vec
# model; getattr() keeps this cell working for a corpus built without one.
has_pretrained = getattr(corpus, "wv_model", None) is not None
bilstm = BiLSTM(
    input_dim=len(corpus.word_field.vocab),
    # follow the loaded model's vector size; 300 is the fallback without one
    embedding_dim=getattr(corpus, "embedding_dim", None) or 300,
    hidden_dim=64,
    output_dim=len(corpus.tag_field.vocab),
    lstm_layers=2,
    emb_dropout=0.5,
    lstm_dropout=0.1,
    fc_dropout=0.25,
    word_pad_idx=corpus.word_pad_idx
)
bilstm.init_weights()
### BEGIN MODIFIED SECTION: WORD EMBEDDING ###
bilstm.init_embeddings(
    word_pad_idx=corpus.word_pad_idx,
    pretrained=corpus.word_field.vocab.vectors if has_pretrained else None,
    freeze=True
)
### END MODIFIED SECTION ###
print(f"The model has {bilstm.count_parameters():,} trainable parameters.")
print(bilstm)

The model has 289,558 trainable parameters.
BiLSTM(
  (embedding): Embedding(30050, 300, padding_idx=1)
  (emb_dropout): Dropout(p=0.5, inplace=False)
  (lstm): LSTM(300, 64, num_layers=2, dropout=0.1, bidirectional=True)
  (fc_dropout): Dropout(p=0.25, inplace=False)
  (fc): Linear(in_features=128, out_features=22, bias=True)
)


## Trainer

In [16]:
class Trainer(object):
    """Training, evaluation and inference loop for a token-tagging model.

    Args:
        model: Module mapping word indices [sent len, batch size] to tag
            scores [sent len, batch size, output dim].
        data: Corpus-like object providing ``train_iter``, ``val_iter``,
            ``test_iter``, ``tag_pad_idx``, ``word_field`` and ``tag_field``.
        optimizer_cls: Optimizer class, called with ``model.parameters()``.
        loss_fn_cls: Loss class, called with ``ignore_index=tag_pad_idx``.
    """

    def __init__(self, model, data, optimizer_cls, loss_fn_cls):
        self.model = model
        self.data = data
        self.optimizer = optimizer_cls(model.parameters())
        # padded positions must not contribute to the loss
        self.loss_fn = loss_fn_cls(ignore_index=self.data.tag_pad_idx)

    @staticmethod
    def epoch_time(start_time, end_time):
        """Split an elapsed time in seconds into whole (minutes, seconds)."""
        elapsed_time = end_time - start_time
        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
        return elapsed_mins, elapsed_secs

    def accuracy(self, preds, y):
        """Return the share of non-padding tokens whose top tag equals ``y``.

        Args:
            preds: Flattened tag scores [sent len * batch size, output dim].
            y: Flattened true tag indices [sent len * batch size].

        Returns:
            A float tensor of shape [1] on the device of ``preds``; 0.0 when
            every position is padding (instead of NaN from 0 / 0).
        """
        non_pad_mask = y != self.data.tag_pad_idx  # masking for paddings
        n_tokens = int(non_pad_mask.sum().item())
        if n_tokens == 0:
            return torch.zeros(1, device=preds.device)
        predicted = preds.argmax(dim=1)  # index of the highest score per token
        n_correct = predicted[non_pad_mask].eq(y[non_pad_mask]).sum()
        # build the denominator on the same device so this also works on GPU
        return n_correct.float() / torch.tensor([float(n_tokens)], device=preds.device)

    def _score_batch(self, batch):
        """Run the model on one batch and return (loss tensor, accuracy tensor)."""
        # batch.word / batch.tag = [sent len, batch size]
        pred_tags = self.model(batch.word)
        # to calculate the loss and accuracy, we flatten both prediction and true tags
        # flatten pred_tags to [sent len * batch size, output dim]
        pred_tags = pred_tags.view(-1, pred_tags.shape[-1])
        # flatten true_tags to [sent len * batch size]
        true_tags = batch.tag.view(-1)
        return self.loss_fn(pred_tags, true_tags), self.accuracy(pred_tags, true_tags)

    def epoch(self):
        """Train for one pass over ``train_iter``; return (mean loss, mean acc).

        Both values are averaged over batches, not over tokens.
        """
        epoch_loss = 0
        epoch_acc = 0
        self.model.train()
        for batch in self.data.train_iter:
            self.optimizer.zero_grad()
            batch_loss, batch_acc = self._score_batch(batch)
            batch_loss.backward()
            self.optimizer.step()
            epoch_loss += batch_loss.item()
            epoch_acc += batch_acc.item()
        return epoch_loss / len(self.data.train_iter), epoch_acc / len(self.data.train_iter)

    def evaluate(self, iterator):
        """Return (mean loss, mean accuracy) over ``iterator`` without training."""
        epoch_loss = 0
        epoch_acc = 0
        self.model.eval()
        with torch.no_grad():
            # similar to epoch() but model is in evaluation mode and no backprop
            for batch in iterator:
                batch_loss, batch_acc = self._score_batch(batch)
                epoch_loss += batch_loss.item()
                epoch_acc += batch_acc.item()
        return epoch_loss / len(iterator), epoch_acc / len(iterator)

    def train(self, n_epochs):
        """Train for ``n_epochs``, printing train/val metrics per epoch and
        the test metrics once at the end."""
        for epoch in range(n_epochs):
            start_time = time.time()
            train_loss, train_acc = self.epoch()
            end_time = time.time()
            epoch_mins, epoch_secs = Trainer.epoch_time(start_time, end_time)
            print(f"Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
            print(f"\tTrn Loss: {train_loss:.3f} | Trn Acc: {train_acc * 100:.2f}%")
            val_loss, val_acc = self.evaluate(self.data.val_iter)
            print(f"\tVal Loss: {val_loss:.3f} | Val Acc: {val_acc * 100:.2f}%")
        test_loss, test_acc = self.evaluate(self.data.test_iter)
        print(f"Test Loss: {test_loss:.3f} |  Test Acc: {test_acc * 100:.2f}%")

    def infer(self, sentence, true_tags=None):
        """Tag a raw sentence and print a word / unk / predicted-tag table.

        Args:
            sentence: Raw text; tokenized with spaCy's Indonesian tokenizer
                and lower-cased.
            true_tags: Optional list of expected tags, printed in an extra
                column. Rows beyond its length get an empty cell.

        Returns:
            ``(tokens, predicted_tags, unks)`` where ``unks`` lists the tokens
            that mapped to the unknown-word index.
        """
        self.model.eval()
        # tokenize sentence
        nlp = Indonesian()
        tokens = [token.text.lower() for token in nlp(sentence)]
        word_vocab = self.data.word_field.vocab
        unk_idx = word_vocab.stoi[self.data.word_field.unk_token]
        # transform to indices based on corpus vocab; .get() avoids inserting
        # unseen tokens into the defaultdict-backed vocab as a side effect
        numericalized_tokens = [word_vocab.stoi.get(t, unk_idx) for t in tokens]
        # find unknown words
        unks = [t for t, n in zip(tokens, numericalized_tokens) if n == unk_idx]
        predicted_tags = []
        if tokens:
            # run on the model's device and without tracking gradients
            first_param = next(self.model.parameters(), None)
            device = first_param.device if first_param is not None else torch.device("cpu")
            # token_tensor = [sent len, 1], i.e. a batch holding one sentence
            token_tensor = torch.LongTensor(numericalized_tokens).unsqueeze(-1).to(device)
            with torch.no_grad():
                predictions = self.model(token_tensor)
            # convert results to tags
            top_predictions = predictions.argmax(-1).squeeze(-1).tolist()
            predicted_tags = [self.data.tag_field.vocab.itos[t] for t in top_predictions]
        # print inferred tags
        max_len_token = max([len(token) for token in tokens] + [len("word")])
        max_len_tag = max([len(tag) for tag in predicted_tags] + [len("pred tag")])
        print(
            f"{'word'.ljust(max_len_token)}\t{'unk'.ljust(max_len_token)}\t{'pred tag'.ljust(max_len_tag)}"
            + ("\ttrue tag" if true_tags else "")
        )
        unk_set = set(unks)
        for i, token in enumerate(tokens):
            is_unk = "✓" if token in unk_set else ""
            # tolerate a true_tags list shorter than the tokenizer output
            true_tag = true_tags[i] if true_tags and i < len(true_tags) else ""
            print(
                f"{token.ljust(max_len_token)}\t{is_unk.ljust(max_len_token)}\t{predicted_tags[i].ljust(max_len_tag)}"
                + (f"\t{true_tag}" if true_tags else "")
            )
        return tokens, predicted_tags, unks

In [18]:
# Number of passes over the training set for this run.
N_EPOCHS = 15

# Adam with its default settings; cross-entropy loss over the tag scores.
trainer = Trainer(model=bilstm, data=corpus, optimizer_cls=Adam, loss_fn_cls=nn.CrossEntropyLoss)
trainer.train(N_EPOCHS)

Epoch: 01 | Epoch Time: 0m 10s
	Trn Loss: 1.289 | Trn Acc: 78.75%
	Val Loss: 0.683 | Val Acc: 85.67%
Epoch: 02 | Epoch Time: 0m 10s
	Trn Loss: 0.758 | Trn Acc: 83.13%
	Val Loss: 0.543 | Val Acc: 86.33%
Epoch: 03 | Epoch Time: 0m 10s
	Trn Loss: 0.599 | Trn Acc: 84.67%
	Val Loss: 0.421 | Val Acc: 87.82%
Epoch: 04 | Epoch Time: 0m 10s
	Trn Loss: 0.485 | Trn Acc: 86.47%
	Val Loss: 0.359 | Val Acc: 88.80%
Epoch: 05 | Epoch Time: 0m 10s
	Trn Loss: 0.422 | Trn Acc: 87.71%
	Val Loss: 0.324 | Val Acc: 89.44%
Epoch: 06 | Epoch Time: 0m 10s
	Trn Loss: 0.383 | Trn Acc: 88.56%
	Val Loss: 0.305 | Val Acc: 89.84%
Epoch: 07 | Epoch Time: 0m 10s
	Trn Loss: 0.352 | Trn Acc: 89.22%
	Val Loss: 0.285 | Val Acc: 90.53%
Epoch: 08 | Epoch Time: 0m 10s
	Trn Loss: 0.328 | Trn Acc: 89.72%
	Val Loss: 0.274 | Val Acc: 91.00%
Epoch: 09 | Epoch Time: 0m 10s
	Trn Loss: 0.307 | Trn Acc: 90.28%
	Val Loss: 0.276 | Val Acc: 90.86%
Epoch: 10 | Epoch Time: 0m 10s
	Trn Loss: 0.293 | Trn Acc: 90.63%
	Val Loss: 0.264 | Val Ac

In [19]:
# Inference example 1. Source article:
# https://regional.kompas.com/read/2020/07/12/15554711/diduga-terlibat-perselingkuhan-ketua-kpu-sumba-barat-diberhentikan
sentence = "\"Menjatuhkan sanksi pemberhentian tetap kepada teradu Sophia Marlinda Djami selaku Ketua KPU Kabupaten Sumba Barat, sejak dibacakannya putusan ini\", ucap Alfitra dalam sidang putusan, Rabu (8/7/2020)."
# Expected tags, printed next to the predictions by Trainer.infer (B-/I-/L-/U- prefixes plus O).
# NOTE(review): assumes one tag per token produced by the tokenizer — confirm the lengths match.
tags = ["O", "O", "O", "O", "O", "O", "O", "B-PERSON", "I-PERSON", "L-PERSON", "O", "O", "B-ORGANIZATION", "I-ORGANIZATION", "I-ORGANIZATION", "L-ORGANIZATION", "O", "O", "O", "O", "O", "O", "O", "O", "U-PERSON", "O", "O", "O", "O", "B-TIME", "I-TIME", "I-TIME", "I-TIME", "I-TIME", "I-TIME", "I-TIME", "L-TIME", "O"]
# infer() prints the comparison table and returns the tokens, predicted tags and unknown tokens.
words, infer_tags, unknown_tokens = trainer.infer(sentence=sentence, true_tags=tags)

word         	unk          	pred tag      	true tag
"            	✓            	U-PERSON      	O
menjatuhkan  	             	O             	O
sanksi       	             	O             	O
pemberhentian	             	O             	O
tetap        	             	O             	O
kepada       	             	O             	O
teradu       	✓            	O             	O
sophia       	             	B-PERSON      	B-PERSON
marlinda     	✓            	L-PERSON      	I-PERSON
djami        	✓            	O             	L-PERSON
selaku       	             	O             	O
ketua        	             	O             	O
kpu          	             	U-ORGANIZATION	B-ORGANIZATION
kabupaten    	             	O             	I-ORGANIZATION
sumba        	             	B-LOCATION    	I-ORGANIZATION
barat        	             	L-LOCATION    	L-ORGANIZATION
,            	✓            	O             	O
sejak        	             	O             	O
dibacakannya 	✓            	O             	O
putusan      	      

In [20]:
# Inference example 2. Source article:
# https://regional.kompas.com/read/2020/07/15/16583081/banjir-bandang-di-masamba-19-korban-meninggal-23-hilang-15000-mengungsi
sentence = "Sementara itu, Kepala Pelaksana BPBD Luwu Utara Muslim Muchtar mengatakan, terdapat 15.000 jiwa mengungsi akibat banjir bandang."
# Expected tags, printed next to the predictions by Trainer.infer (B-/I-/L-/U- prefixes plus O).
# NOTE(review): assumes one tag per token produced by the tokenizer — confirm the lengths match.
tags = ["O", "O", "O", "O", "O", "B-ORGANIZATION", "I-ORGANIZATION", "L-ORGANIZATION", "B-PERSON", "L-PERSON", "O", "O", "O", "U-QUANTITY", "O", "O", "O", "O", "O", "O"]
# infer() prints the comparison table and returns the tokens, predicted tags and unknown tokens.
words, infer_tags, unknown_tokens = trainer.infer(sentence=sentence, true_tags=tags)

word      	unk       	pred tag      	true tag
sementara 	          	O             	O
itu       	          	O             	O
,         	✓         	O             	O
kepala    	          	O             	O
pelaksana 	          	O             	O
bpbd      	✓         	O             	B-ORGANIZATION
luwu      	          	B-LOCATION    	I-ORGANIZATION
utara     	          	L-LOCATION    	L-ORGANIZATION
muslim    	          	I-ORGANIZATION	B-PERSON
muchtar   	          	U-PERSON      	L-PERSON
mengatakan	          	O             	O
,         	✓         	O             	O
terdapat  	          	O             	O
15.000    	✓         	B-QUANTITY    	U-QUANTITY
jiwa      	          	I-QUANTITY    	O
mengungsi 	          	L-QUANTITY    	O
akibat    	          	O             	O
banjir    	          	O             	O
bandang   	          	U-LOCATION    	O
.         	          	O             	O
