<a href="https://colab.research.google.com/github/vincentjordan27/Named-Entity-Recognition-BILSTM-CRF/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import sys
import csv
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader




In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the corpus cell
# further down reads its TSV files from a folder under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install torchtext==0.6.0

import time
import torch
from torch import nn
from torch.optim import Adam
from torchtext.data import Field, BucketIterator
from torchtext.datasets import SequenceTaggingDataset
from spacy.lang.id import Indonesian

Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[?25l[K     |█████                           | 10 kB 25.2 MB/s eta 0:00:01[K     |██████████▏                     | 20 kB 29.4 MB/s eta 0:00:01[K     |███████████████▎                | 30 kB 30.8 MB/s eta 0:00:01[K     |████████████████████▍           | 40 kB 33.9 MB/s eta 0:00:01[K     |█████████████████████████▌      | 51 kB 36.7 MB/s eta 0:00:01[K     |██████████████████████████████▋ | 61 kB 35.6 MB/s eta 0:00:01[K     |████████████████████████████████| 64 kB 2.7 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 42.5 MB/s 
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.11.0
    Uninstalling torchtext-0.11.0:
      Successfully uninstalled torchtext-0.11.0
Successfully i

In [None]:
class Corpus(object):
  """Loads the train/val/test tagging splits and prepares vocabularies and iterators.

  Attributes set by the constructor:
    word_field, tag_field: torchtext ``Field`` objects for the two columns.
    train_dataset, val_dataset, test_dataset: the three dataset splits.
    train_iter, val_iter, test_iter: ``BucketIterator`` objects over the splits.
    word_pad_idx, tag_pad_idx: vocabulary index of each field's padding token.
  """

  def __init__(self, input_folder, min_word_freq, batch_size):
    """Read the TSV splits, build both vocabularies and create the iterators.

    Args:
      input_folder: directory that contains train.tsv, val.tsv and test.tsv.
      min_word_freq: forwarded as ``min_freq`` when building the word vocabulary.
      batch_size: batch size forwarded to ``BucketIterator.splits``.
    """
    # Words are lower-cased; the tag field is created without an unknown token.
    words = Field(lower=True)
    tags = Field(unk_token=None)
    self.word_field = words
    self.tag_field = tags

    column_spec = (("word", words), ("tag", tags))
    loaded_splits = SequenceTaggingDataset.splits(
        path=input_folder,
        train="train.tsv",
        validation="val.tsv",
        test="test.tsv",
        fields=column_spec,
    )
    self.train_dataset, self.val_dataset, self.test_dataset = loaded_splits

    # Both vocabularies are built from the training split only.
    words.build_vocab(self.train_dataset.word, min_freq=min_word_freq)
    tags.build_vocab(self.train_dataset.tag)

    all_datasets = (self.train_dataset, self.val_dataset, self.test_dataset)
    batch_iterators = BucketIterator.splits(
        datasets=all_datasets,
        batch_size=batch_size,
    )
    self.train_iter, self.val_iter, self.test_iter = batch_iterators

    # Padding indices are needed later by the embedding layer and the loss.
    self.word_pad_idx = words.vocab.stoi[words.pad_token]
    self.tag_pad_idx = tags.vocab.stoi[tags.pad_token]
    

In [None]:

# Build the corpus from the TSV splits stored in the mounted Drive folder.
# min_word_freq=1 is forwarded as min_freq to the word vocabulary build.
corpus = Corpus(
    input_folder="/content/drive/My Drive/Colab Notebooks/",
    min_word_freq=1, 
    batch_size=64,
)
# Report the size of each split.
# NOTE(review): the labels say "kata" (words), but len() of a dataset is
# presumably its number of examples (sentences) - the word vocabulary printed
# further down is larger than the train count reported here. Confirm and
# consider relabelling.
print(f"Train set: {len(corpus.train_dataset)} kata")
print(f"Val set: {len(corpus.val_dataset)} kata")
print(f"Test set: {len(corpus.test_dataset)} kata")

Train set: 3535 kata
Val set: 470 kata
Test set: 468 kata


In [None]:
class BiLSTM(nn.Module):
  """Embedding -> bidirectional LSTM -> linear layer producing per-token tag scores."""

  def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, lstm_layers,
               emb_dropout, lstm_dropout, fc_dropout, word_pad_idx):
    """Create the layers of the tagger.

    Args:
      input_dim: number of rows in the embedding table (vocabulary size).
      embedding_dim: size of each word embedding.
      hidden_dim: hidden size of the LSTM, per direction.
      output_dim: number of output scores per token.
      lstm_layers: number of stacked LSTM layers.
      emb_dropout: dropout probability applied to the embeddings.
      lstm_dropout: dropout probability between LSTM layers; only used when
        lstm_layers > 1.
      fc_dropout: dropout probability applied to the LSTM output.
      word_pad_idx: index of the padding word, used as the embedding padding_idx.
    """
    super().__init__()
    self.embedding_dim = embedding_dim
    self.embedding = nn.Embedding(
        num_embeddings=input_dim,
        embedding_dim=embedding_dim,
        padding_idx=word_pad_idx,
    )
    self.emb_dropout = nn.Dropout(emb_dropout)

    # Inter-layer dropout is only requested when the LSTM is actually stacked.
    if lstm_layers > 1:
      inter_layer_dropout = lstm_dropout
    else:
      inter_layer_dropout = 0
    self.lstm = nn.LSTM(
        input_size=embedding_dim,
        hidden_size=hidden_dim,
        num_layers=lstm_layers,
        bidirectional=True,
        dropout=inter_layer_dropout,
    )

    self.fc_dropout = nn.Dropout(fc_dropout)
    # The LSTM is bidirectional, so its output feature size is hidden_dim * 2.
    self.fc = nn.Linear(hidden_dim * 2, output_dim)

  def forward(self, sentence):
    """Return tag scores for every position of the index tensor `sentence`."""
    embedded = self.embedding(sentence)
    embedded = self.emb_dropout(embedded)
    hidden_states, _ = self.lstm(embedded)
    return self.fc(self.fc_dropout(hidden_states))

  def init_weights(self):
    """Re-initialise every parameter from a normal distribution (mean 0, std 0.1)."""
    for param in self.parameters():
      nn.init.normal_(param.data, mean=0, std=0.1)

  def init_embeddings(self, word_pad_idx):
    """Overwrite the embedding row at `word_pad_idx` with zeros."""
    pad_vector = torch.zeros(self.embedding_dim)
    self.embedding.weight.data[word_pad_idx] = pad_vector

  def count_parameters(self):
    """Return the total number of elements across parameters that require grad."""
    total = 0
    for param in self.parameters():
      if param.requires_grad:
        total += param.numel()
    return total

In [None]:
# Model dimensions are derived from the vocabularies built by `corpus`.
vocab_size = len(corpus.word_field.vocab)
n_tags = len(corpus.tag_field.vocab)

# NOTE: with lstm_layers=1 the BiLSTM constructor passes dropout=0 to nn.LSTM,
# so lstm_dropout=0.1 has no effect in this configuration.
bilstm = BiLSTM(
    input_dim=vocab_size,
    embedding_dim=10,
    hidden_dim=8,
    output_dim=n_tags,
    lstm_layers=1,
    emb_dropout=0.5,
    lstm_dropout=0.1,
    fc_dropout=0.25,
    word_pad_idx=corpus.word_pad_idx,
)

# Re-draw all parameters (mean 0, std 0.1), then zero the padding embedding row
# again because the re-draw overwrites it.
bilstm.init_weights()
bilstm.init_embeddings(word_pad_idx=corpus.word_pad_idx)

print(f"The model has {bilstm.count_parameters():,} trainable parameters.")
print(bilstm)
print(vocab_size)

The model has 105,624 trainable parameters.
BiLSTM(
  (embedding): Embedding(10397, 10, padding_idx=1)
  (emb_dropout): Dropout(p=0.5, inplace=False)
  (lstm): LSTM(10, 8, bidirectional=True)
  (fc_dropout): Dropout(p=0.25, inplace=False)
  (fc): Linear(in_features=16, out_features=22, bias=True)
)
10397


In [None]:
class NER(object):
  """Training and evaluation harness for a token-level tagging model.

  `data` must expose `tag_pad_idx`, `train_iter`, `val_iter` and `test_iter`;
  every batch yielded by those iterators must expose `.word` and `.tag` tensors.
  """

  def __init__(self, model, data, optimizer_cls, loss_fn_cls):
    """Store the model and data and instantiate the optimizer and the loss.

    Args:
      model: module called as `model(batch.word)`; returns per-token tag scores.
      data: corpus-like object (see class docstring).
      optimizer_cls: optimizer class, instantiated with `model.parameters()` only.
      loss_fn_cls: loss class, instantiated with `ignore_index=data.tag_pad_idx`
        so padded positions do not contribute to the loss.
    """
    self.model = model
    self.data = data
    self.optimizer = optimizer_cls(model.parameters())
    self.loss_fn = loss_fn_cls(ignore_index=self.data.tag_pad_idx)

  @staticmethod
  def epoch_time(start_time, end_time):
    """Split the elapsed time between two timestamps into whole (minutes, seconds)."""
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

  def accuracy(self, preds, y):
    """Fraction of non-padding positions whose arg-max score equals the gold tag.

    Args:
      preds: 2-D tensor of scores, one row per token and one column per tag.
      y: 1-D tensor of gold tag indices, aligned with the rows of `preds`.

    Returns:
      A float tensor of shape (1,). The value is NaN when `y` holds only padding.
    """
    predicted = preds.argmax(dim=1)
    keep = y != self.data.tag_pad_idx
    correct = predicted[keep].eq(y[keep])
    # Divide by a plain Python int instead of a CPU-only FloatTensor so the
    # computation stays on whatever device `preds` and `y` live on.
    return correct.sum().float().reshape(1) / correct.shape[0]

  def _score_batch(self, batch):
    """Run the model on one batch and return (loss, accuracy) over flattened tokens."""
    scores = self.model(batch.word)
    # Collapse every leading dimension so each row is one token position.
    scores = scores.view(-1, scores.shape[-1])
    gold = batch.tag.view(-1)
    return self.loss_fn(scores, gold), self.accuracy(scores, gold)

  def epoch(self):
    """Train for one pass over `data.train_iter`.

    Returns:
      (mean batch loss, mean batch accuracy) over the training iterator.
    """
    epoch_loss = 0
    epoch_acc = 0
    self.model.train()
    for batch in self.data.train_iter:
      self.optimizer.zero_grad()
      batch_loss, batch_acc = self._score_batch(batch)
      batch_loss.backward()
      self.optimizer.step()
      epoch_loss += batch_loss.item()
      epoch_acc += batch_acc.item()
    n_batches = len(self.data.train_iter)
    return epoch_loss / n_batches, epoch_acc / n_batches

  def evaluate(self, iterator):
    """Score the model on `iterator` in eval mode without tracking gradients.

    Returns:
      (mean batch loss, mean batch accuracy) over the iterator.
    """
    epoch_loss = 0
    epoch_acc = 0
    self.model.eval()
    with torch.no_grad():
      for batch in iterator:
        batch_loss, batch_acc = self._score_batch(batch)
        epoch_loss += batch_loss.item()
        epoch_acc += batch_acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

  def train(self, n_epochs):
    """Train for `n_epochs`, printing train and validation metrics after each
    epoch and the test metrics once training has finished."""
    for epoch in range(n_epochs):
      start_time = time.time()
      train_loss, train_acc = self.epoch()
      end_time = time.time()
      # Only the training pass is timed; validation runs after the timer stops.
      epoch_mins, epoch_secs = NER.epoch_time(start_time, end_time)
      print(f"Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
      print(f"\tTrn Loss: {train_loss:.3f} | Trn Acc: {train_acc * 100:.2f}%")
      val_loss, val_acc = self.evaluate(self.data.val_iter)
      print(f"\tVal Loss: {val_loss:.3f} | Val Acc: {val_acc * 100:.2f}%")
    test_loss, test_acc = self.evaluate(self.data.test_iter)
    print(f"Test Loss: {test_loss:.3f} |  Test Acc: {test_acc * 100:.2f}%")

  

In [None]:
# Wire the model and the corpus into the training harness. Adam and
# CrossEntropyLoss are passed as classes; NER instantiates them itself.
ner = NER(model=bilstm, data=corpus, optimizer_cls=Adam, loss_fn_cls=nn.CrossEntropyLoss)

# Train for 10 epochs; per-epoch train/validation metrics and the final test
# metrics are printed by NER.train.
ner.train(n_epochs=10)

Epoch: 01 | Epoch Time: 0m 2s
	Trn Loss: 2.946 | Trn Acc: 38.52%
	Val Loss: 2.695 | Val Acc: 85.67%
Epoch: 02 | Epoch Time: 0m 2s
	Trn Loss: 1.955 | Trn Acc: 83.03%
	Val Loss: 1.195 | Val Acc: 85.67%
Epoch: 03 | Epoch Time: 0m 2s
	Trn Loss: 1.072 | Trn Acc: 83.08%
	Val Loss: 0.867 | Val Acc: 85.67%
Epoch: 04 | Epoch Time: 0m 2s
	Trn Loss: 0.975 | Trn Acc: 83.11%
	Val Loss: 0.822 | Val Acc: 85.67%
Epoch: 05 | Epoch Time: 0m 1s
	Trn Loss: 0.939 | Trn Acc: 83.21%
	Val Loss: 0.791 | Val Acc: 85.67%
Epoch: 06 | Epoch Time: 0m 1s
	Trn Loss: 0.902 | Trn Acc: 83.09%
	Val Loss: 0.746 | Val Acc: 85.67%
Epoch: 07 | Epoch Time: 0m 1s
	Trn Loss: 0.845 | Trn Acc: 83.02%
	Val Loss: 0.687 | Val Acc: 85.67%
Epoch: 08 | Epoch Time: 0m 1s
	Trn Loss: 0.774 | Trn Acc: 83.07%
	Val Loss: 0.639 | Val Acc: 85.67%
Epoch: 09 | Epoch Time: 0m 1s
	Trn Loss: 0.720 | Trn Acc: 83.07%
	Val Loss: 0.615 | Val Acc: 85.72%
Epoch: 10 | Epoch Time: 0m 1s
	Trn Loss: 0.682 | Trn Acc: 83.19%
	Val Loss: 0.601 | Val Acc: 85.55%
