# Project 3: BiLSTM-based NER

## Install the package

In [None]:
!pip install -r requirements.txt

## Data

In [13]:
from torch.utils.data import Dataset


class CHisIECDataset(Dataset):
    """Sentence-level NER dataset parsed from a two-column text file.

    Every non-blank line of the file holds one token and its tag separated by
    whitespace; blank lines separate sentences. Each item is a
    ``(tokens, label_ids)`` tuple made of two lists of equal length.
    """

    # Tag string -> integer class id ("O" plus B/I/E/S tags for the
    # PER, LOC, OFI and BOOK entity types).
    label_label_id_mapping = {
        "O": 0,
        "B-PER": 1,
        "I-PER": 2,
        "E-PER": 3,
        "S-PER": 4,
        "B-LOC": 5,
        "I-LOC": 6,
        "E-LOC": 7,
        "S-LOC": 8,
        "B-OFI": 9,
        "I-OFI": 10,
        "E-OFI": 11,
        "S-OFI": 12,
        "B-BOOK": 13,
        "I-BOOK": 14,
        "E-BOOK": 15,
        "S-BOOK": 16,
    }

    def __init__(self, path) -> None:
        """Read *path* (UTF-8) and split it into sentences.

        Args:
            path: location of the annotated text file.

        Raises:
            ValueError: if a non-blank line does not split into exactly two
                whitespace-separated fields.
            KeyError: if a tag is missing from ``label_label_id_mapping``.
        """
        super().__init__()
        self.data = []
        tokens, label_ids = [], []
        with open(path, "r", encoding="utf-8") as f:
            for raw_line in f:
                line = raw_line.strip()
                if line:
                    word, label = line.split()
                    tokens.append(word)
                    label_ids.append(self.label_label_id_mapping[label])
                elif tokens:
                    # A blank line closes the sentence collected so far.
                    self.data.append((tokens, label_ids))
                    tokens, label_ids = [], []
        # Flush the trailing sentence: a file that does not end with a blank
        # line would otherwise lose its last sentence.
        if tokens:
            self.data.append((tokens, label_ids))

    def __len__(self):
        """Return the number of sentences."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``(tokens, label_ids)`` tuple stored at *index*."""
        return self.data[index]

In [14]:
import torch
from torch.utils.data import DataLoader
from torch.nn.functional import one_hot


def get_dataloader(dataset, shuffle=True, num_classes=17):
    """Wrap *dataset* in a ``DataLoader`` that yields one sentence per step.

    Args:
        dataset: indexable collection whose items are
            ``(tokens, label_ids)`` pairs.
        shuffle: whether the loader reshuffles the sentences.
        num_classes: width of the one-hot label encoding. The default of 17
            is the value that used to be hard-coded here.

    Returns:
        A ``DataLoader`` whose batches are ``(tokens, labels)``: ``tokens`` is
        the sentence's token sequence passed through untouched, ``labels`` is
        a float tensor of shape ``(len(label_ids), num_classes)``.
    """

    def collate_single(batch):
        # batch_size is fixed to 1 below, so the batch always holds exactly
        # one sentence and no padding is required.
        sample = batch[0]
        ids = torch.tensor(sample[1], dtype=torch.int64)
        return sample[0], one_hot(ids, num_classes).float()

    return DataLoader(
        dataset,
        shuffle=shuffle,
        batch_size=1,
        collate_fn=collate_single,
    )


# Parse the three CHisIEC splits from disk.
train_set = CHisIECDataset(path="./CHisIEC/train.txt")
dev_set = CHisIECDataset(path="./CHisIEC/dev.txt")
test_set = CHisIECDataset(path="./CHisIEC/test.txt")

# Only the training split is shuffled; dev and test keep the file order.
train_loader = get_dataloader(dataset=train_set, shuffle=True)
val_loader = get_dataloader(dataset=dev_set, shuffle=False)
test_loader = get_dataloader(dataset=test_set, shuffle=False)

## Model

In [15]:
import torchtext

torchtext.disable_torchtext_deprecation_warning()
from torch import nn
from torchtext.vocab import Vectors


from torch.nn import LSTM
import torch


class MyAwesomeModel(nn.Module):
    """Bidirectional LSTM tagger on top of pretrained token vectors.

    Tokens are mapped to vectors through a torchtext ``Vectors`` table, passed
    through one bidirectional LSTM layer, and projected to per-token class
    logits by a linear layer.
    """

    def __init__(self, embed_dim=50, hidden_dim=50, num_classes=17) -> None:
        """Build the tagger.

        Args:
            embed_dim: input size of the LSTM; it has to equal the dimension
                of the pretrained vectors (presumably 50 for the file loaded
                below -- confirm against the vector file).
            hidden_dim: hidden size of each LSTM direction.
            num_classes: number of output tags. The default of 17 is the value
                that used to be hard-coded here.
        """
        super().__init__()
        # Pretrained vectors, looked up in the current working directory.
        self.vectors = Vectors(
            name="gigaword_chn.all.a2b.uni.ite50.vec",
            cache=".",
        )
        self.lstm = LSTM(
            embed_dim,
            hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        # The two LSTM directions are concatenated, hence ``hidden_dim * 2``.
        self.classifier = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x: "list[str]"):
        """Return the class logits for one sentence.

        Args:
            x: the tokens of a single sentence.

        Returns:
            The classifier output for the sentence, one row of logits per
            token.
        """
        # Follow whatever device the model's weights live on instead of
        # assuming CUDA, so the model also runs on CPU-only machines.
        device = self.classifier.weight.device
        embedded = self.vectors.get_vecs_by_tokens(x).to(device)
        # Add a batch dimension of size 1 for the LSTM and drop it afterwards.
        output, _ = self.lstm(embedded.unsqueeze(0))
        return self.classifier(output[0])

## Training

In [16]:
from torch.optim import Adam
from sklearn.metrics import accuracy_score, f1_score

# Use the GPU when one is present and fall back to the CPU otherwise, instead
# of failing outright on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MyAwesomeModel().to(device)
optimizer = Adam(model.parameters(), lr=1e-3)
# The data loaders produce one-hot float targets, which CrossEntropyLoss
# accepts as per-class probabilities.
loss_fn = nn.CrossEntropyLoss()


def train(loader):
    """Run one training epoch over *loader* and report the mean loss.

    Relies on the module-level ``model``, ``optimizer`` and ``loss_fn``.

    Args:
        loader: iterable of ``(x, y)`` pairs, where ``x`` is fed to the model
            and ``y`` is the target tensor handed to ``loss_fn``.

    Returns:
        ``{"loss": mean_loss}``; the mean is NaN when *loader* yields nothing.

    Raises:
        Exception: whatever ``loss_fn`` raises is propagated after the
            prediction and label shapes have been printed.
    """
    model.train()
    epoch_loss = []
    for x, y in loader:
        optimizer.zero_grad()
        pred = model(x)
        # Put the target on the same device as the prediction rather than
        # assuming CUDA.
        label = y.to(pred.device)
        try:
            loss = loss_fn(pred, label)
        except Exception:
            # Print the offending shapes for debugging, then re-raise:
            # carrying on would reuse a stale or undefined ``loss``.
            print(pred.shape, label.shape)
            raise
        epoch_loss.append(loss.item())
        loss.backward()
        optimizer.step()
    if not epoch_loss:
        # Nothing was processed; avoid dividing by zero.
        return {"loss": float("nan")}
    return {"loss": sum(epoch_loss) / len(epoch_loss)}


def eval(loader):
    """Score the module-level ``model`` on *loader*.

    The name shadows the ``eval`` builtin; it is kept because other cells call
    the function under this name.

    Args:
        loader: iterable of ``(x, y)`` pairs, where ``y`` holds one row of
            class scores per token (its argmax is taken as the gold class).

    Returns:
        A dict with the token-level ``"accuracy"`` and macro-averaged
        ``"f1_macro"``, computed over the tokens of all sentences together.
    """
    model.eval()
    pred = []
    target = []
    # Gradients are never needed for scoring; disabling them here keeps the
    # function safe to call outside of a caller-provided no_grad block.
    with torch.no_grad():
        for x, y in loader:
            pred += model(x).argmax(-1).tolist()
            target += y.argmax(-1).tolist()
    return {
        "accuracy": accuracy_score(target, pred),
        "f1_macro": f1_score(target, pred, average="macro"),
    }

In [None]:
from tqdm import trange
# Train for five epochs; after each one, score the dev split with gradient
# tracking switched off and print the combined metrics.
for epoch in trange(5, desc="Epoch"):
    train_metrics = train(train_loader)
    with torch.no_grad():
        val_metrics = eval(val_loader)
    # Validation scores come first, the training loss last.
    metrics = {**val_metrics, **train_metrics}
    print(metrics)

## Evaluation

In [None]:
# Score the test split with gradient tracking disabled, matching how the
# validation split is scored during training.
with torch.no_grad():
    print(eval(test_loader))

## Conclusion

Write your conclusion in this section.