In [0]:
# Install the Python packages this notebook imports (torch, torchtext, spaCy) via shell commands.
!pip -qq install torch
!pip -qq install torchtext
!pip -qq install spacy
# Download the English spaCy models; `en_core_web_sm` is the package imported in the imports cell.
!python -m spacy download en
!python -m spacy download en_core_web_sm

In [0]:
# Fetch the train/dev JSONL files into the working directory; they are read later as
# "train.jsonl" and "dev.jsonl" by `loader.build(...)`.
!wget https://raw.githubusercontent.com/svinkapeppa/boolq/master/train.jsonl
!wget https://raw.githubusercontent.com/svinkapeppa/boolq/master/dev.jsonl

In [0]:
import os
import warnings
warnings.filterwarnings('ignore')

import en_core_web_sm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import numpy as np
import pandas as pd

from tqdm import tqdm

from torchtext.vocab import Vectors
from torchtext.data import Example, Field, Dataset, NestedField, BucketIterator

In [0]:
class FastText(Vectors):
    """`Vectors` subclass whose vector-file name is derived from the download URL.

    All keyword arguments are forwarded unchanged to `Vectors.__init__`; the
    `url` keyword is mandatory because its last path component is used as the
    `name` argument (a missing `url` raises `KeyError`).
    """

    def __init__(self, **kwargs):
        # e.g. ".../vectors-wiki/wiki.en.vec" -> "wiki.en.vec"
        file_name = os.path.basename(kwargs["url"])
        super().__init__(file_name, **kwargs)

In [0]:
# Pre-trained English fastText word vectors; `max_vectors=30000` is forwarded to torchtext's
# `Vectors` (presumably limiting how many vectors are loaded — see the torchtext docs).
vectors = FastText(url="https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.vec", max_vectors=30000)

In [0]:
# Module-level spaCy pipeline; `DataLoader.build` calls it to obtain POS tags and named entities.
nlp = en_core_web_sm.load()

In [0]:
class DataLoader:
    """Builds torchtext fields, datasets and vocabularies for the question/passage data.

    Attributes set by `__init__`:
        char_field: nested (word -> characters) field shared by question and context.
        word_field: word-level field shared by question and context.
        target_field: non-sequential, vocabulary-free field holding the answer label.
        fields / dict_fields: field layouts passed to `Dataset` / `Example.fromdict`.

    Attributes set by `build`:
        train, dev: the two `Dataset` objects.
        pos_vocab, ner_vocab: tag string -> integer id (``ner_vocab["<UNK>"] == 0``).
        ind2pos, ind2ner: word-vocabulary index -> POS / NER tag id.
    """

    def __init__(self):
        self.char_field = NestedField(
            Field(batch_first=True, tokenize=list, lower=True),
            init_token="<SOS>",
            eos_token="<EOS>",
            tokenize="spacy",
        )
        self.word_field = Field(
            init_token="<SOS>",
            eos_token="<EOS>",
            lower=True,
            tokenize="spacy",
        )
        self.target_field = Field(
            is_target=True,
            sequential=False,
            use_vocab=False,
        )

        # Attribute name -> field, in the order handed to `Dataset`.
        self.fields = [
            ("question_char", self.char_field),
            ("context_char", self.char_field),
            ("question", self.word_field),
            ("context", self.word_field),
            ("answer", self.target_field),
        ]
        # Each raw text column feeds both a character-level and a word-level attribute.
        self.dict_fields = {
            "context": [("context_char", self.char_field), ("context", self.word_field)],
            "question": [("question_char", self.char_field), ("question", self.word_field)],
            "answer": ("answer", self.target_field),
        }

    def create_dataset(self, path: str = None) -> Dataset:
        """Read a JSON-lines file and convert it into a torchtext `Dataset`.

        The file must provide the columns ``title``, ``passage``, ``question``
        and ``answer``; the context is the title and the passage joined by a space.

        Args:
            path: location of the JSON-lines file.

        Returns:
            A `Dataset` with one `Example` per input record.
        """
        df = pd.read_json(path, lines=True, orient="records")

        data = pd.DataFrame()
        data["context"] = df["title"] + " " + df["passage"]
        data["question"] = df["question"]
        data["answer"] = df["answer"]

        examples = [
            Example.fromdict(item, fields=self.dict_fields)
            for item in data.to_dict("records")
        ]
        return Dataset(examples, self.fields)

    def build(self, train_path: str = None, dev_path: str = None, vectors: Vectors = None) -> None:
        """Create the datasets, the vocabularies and the POS / NER lookup tables.

        Vocabularies and tag tables are built from the training split only.

        Args:
            train_path: JSON-lines file with the training examples.
            dev_path: JSON-lines file with the development examples.
            vectors: pre-trained vectors attached to the word vocabulary.
        """
        self.train = self.create_dataset(path=train_path)
        self.dev = self.create_dataset(path=dev_path)

        self.char_field.build_vocab(self.train)
        self.word_field.build_vocab(self.train, vectors=vectors)

        stoi = self.word_field.vocab.stoi
        pos_tags, ner_tags = set(), set()
        # word index -> tag string; a later occurrence overwrites an earlier one,
        # which is the same "last one wins" rule the final dictionaries always had.
        index_to_pos_tag, index_to_ner_tag = {}, {}

        for example in self.train:
            doc = nlp(" ".join(example.question) + " " + " ".join(example.context))

            for token in doc:
                pos_tags.add(token.pos_)
                index_to_pos_tag[stoi[str(token)]] = token.pos_

            for entity in doc.ents:
                ner_tags.add(entity.label_)
                # An entity span may cover several tokens while the vocabulary is
                # keyed by single tokens: looking up the whole span text would send
                # every multi-token entity to the unknown-word index. Map each token
                # of the span to the entity label instead.
                for token in entity:
                    index_to_ner_tag[stoi[str(token)]] = entity.label_

        # Sort the tag sets so that tag ids are reproducible between runs
        # (iteration order of a set of strings is not stable across processes).
        self.pos_vocab = {tag: i for i, tag in enumerate(sorted(pos_tags))}
        self.ner_vocab = {tag: i + 1 for i, tag in enumerate(sorted(ner_tags))}
        self.ner_vocab["<UNK>"] = 0

        self.ind2pos = {index: self.pos_vocab[tag] for index, tag in index_to_pos_tag.items()}
        self.ind2ner = {index: self.ner_vocab[tag] for index, tag in index_to_ner_tag.items()}

In [0]:
# Build datasets, vocabularies and tag tables from the downloaded files; the word vocabulary
# receives the pre-trained `vectors` loaded above.
loader = DataLoader()
loader.build(train_path="train.jsonl", dev_path="dev.jsonl", vectors=vectors)

In [0]:
# Batch iterators over the train / dev datasets (batch sizes 32 and 128). The sort key is the
# number of context tokens; presumably BucketIterator uses it to group examples of similar
# length into the same batch — confirm against the torchtext docs.
train_iter = BucketIterator(loader.train, batch_size=32, shuffle=True, sort_key=lambda x: len(x.context))
dev_iter = BucketIterator(loader.dev, batch_size=128, shuffle=True, sort_key=lambda x: len(x.context))

In [0]:
class Model(nn.Module):
    """Two-class classifier over a (context, question) pair.

    Pipeline implemented by `forward`:
      1. character-CNN word embeddings concatenated with frozen pre-trained word embeddings,
      2. a shared bidirectional LSTM encoder for context and question,
      3. context-to-question and question-to-context attention,
      4. two stacked bidirectional "modeling" LSTMs,
      5. a final bidirectional LSTM whose last hidden and cell states feed a linear layer.

    Args:
        weights: pre-trained word-embedding matrix of shape (vocab, emb_dim); kept frozen.
        char_vocab_size: number of entries in the character vocabulary.
        char_emb_dim: size of each character embedding.
        char_hidden_size: number of output channels of the character convolution.
        char_kernel_size: convolution width, measured in characters.
        emb_dim: word-embedding size; must equal ``weights.size(1)``.
        hidden_size: hidden size of every LSTM (per direction).
        dropout: dropout probability.
    """

    def __init__(
        self,
        weights,
        char_vocab_size: int = None,
        char_emb_dim: int = None,
        char_hidden_size: int = None,
        char_kernel_size: int = None,
        emb_dim: int = None,
        hidden_size: int = None,
        dropout: float = None,
    ):
        super().__init__()

        self.char_vocab_size = char_vocab_size
        self.char_emb_dim = char_emb_dim
        self.char_hidden_size = char_hidden_size
        self.char_kernel_size = char_kernel_size

        self.emb_dim = emb_dim
        self.hidden_size = hidden_size

        self.char_emb = nn.Embedding(self.char_vocab_size, self.char_emb_dim)
        self.word_emb = nn.Embedding.from_pretrained(weights, freeze=True)

        # The kernel spans the full character-embedding height, so the convolution
        # only slides along the character axis of a word.
        self.char_conv = nn.Conv2d(1, self.char_hidden_size, (self.char_emb_dim, self.char_kernel_size))

        # Scores one (context word, question word) pair from [c; q; c * q].
        self.alpha = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(6 * self.hidden_size, 1)
        )

        # NOTE: every LSTM below keeps the default num_layers=1; per the PyTorch docs the
        # `dropout` argument of nn.LSTM only acts between stacked layers, so it is
        # effectively inert here. It is kept so the constructed modules stay unchanged.
        self.contextual_lstm = nn.LSTM(
            input_size=self.emb_dim + self.char_hidden_size,
            hidden_size=self.hidden_size,
            bidirectional=True,
            batch_first=True,
            dropout=dropout
        )
        self.modeling_lstm_first = nn.LSTM(
            input_size=8 * self.hidden_size,
            hidden_size=self.hidden_size,
            bidirectional=True,
            batch_first=True,
            dropout=dropout
        )
        self.modeling_lstm_second = nn.LSTM(
            input_size=2 * self.hidden_size,
            hidden_size=self.hidden_size,
            bidirectional=True,
            batch_first=True,
            dropout=dropout
        )
        self.lstm = nn.LSTM(
            input_size=10 * hidden_size,
            hidden_size=self.hidden_size,
            bidirectional=True,
            batch_first=True,
            dropout=dropout
        )

        self.out = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(4 * hidden_size, 2)
        )

        self.dropout = nn.Dropout(p=dropout)

    def embed(self, batch):
        """Compute character-CNN word embeddings.

        Args:
            batch: integer tensor of character ids, shape (batch, words, chars).
                ``chars`` must be at least ``char_kernel_size``.

        Returns:
            Tensor of shape (batch, words, char_hidden_size).
        """
        batch_size = batch.size(0)

        emb = self.char_emb(batch)                      # (batch, words, chars, char_emb_dim)
        emb = self.dropout(emb)

        emb = emb.transpose(2, 3)                       # (batch, words, char_emb_dim, chars)
        emb = emb.reshape(-1, self.char_emb_dim, emb.size(3)).unsqueeze(1)

        # Conv output is (batch * words, char_hidden_size, 1, positions). Squeeze only the
        # collapsed height axis: an argument-less squeeze() would also drop `positions`
        # when a word is exactly one kernel wide, or the leading axis when it has size 1.
        emb = self.char_conv(emb).squeeze(2)
        # Max over all kernel positions -> (batch * words, char_hidden_size).
        emb = F.max_pool1d(emb, emb.size(2)).squeeze(2)

        emb = emb.view(batch_size, -1, self.char_hidden_size)

        return emb

    def attention(self, context, question):
        """Combine the context with its two attention summaries.

        Args:
            context: encoded context, shape (batch, context_len, dim).
            question: encoded question, shape (batch, question_len, dim).

        Returns:
            Tensor of shape (batch, context_len, 4 * dim) holding
            ``[c; c2q; c * c2q; c * q2c]`` for every context position.
        """
        tensor = torch.cat([
            context.unsqueeze(2).expand(context.size(0), context.size(1), question.size(1), -1),
            question.unsqueeze(1).expand(context.size(0), context.size(1), question.size(1), -1),
            context.unsqueeze(2) * question.unsqueeze(1)
        ], dim=-1)
        # Drop only the trailing singleton produced by the linear layer so that batches,
        # contexts or questions of length 1 keep their axes: (batch, context_len, question_len).
        s = self.alpha(tensor).squeeze(-1)

        # Context-to-question: every context word attends over the question words.
        a = F.softmax(s, dim=2)
        context_question_attention = torch.bmm(a, question)

        # Question-to-context: one weight per context word, taken from its best-matching
        # question word; the single resulting vector is repeated for every context position.
        b = F.softmax(torch.max(s, dim=2)[0], dim=1).unsqueeze(1)
        question_context_attention = torch.bmm(b, context)          # (batch, 1, dim)
        question_context_attention = question_context_attention.expand(-1, context.size(1), -1)

        result = torch.cat([
            context,
            context_question_attention,
            context * context_question_attention,
            context * question_context_attention
        ], dim=-1)

        return result

    def forward(self, batch):
        """Return class logits for a batch.

        Args:
            batch: object exposing ``context_char`` / ``question_char`` as
                (batch, words, chars) character ids and ``context`` / ``question``
                as (words, batch) word ids.

        Returns:
            Logits of shape (batch, 2).
        """
        context_char_emb = self.embed(batch.context_char)
        question_char_emb = self.embed(batch.question_char)

        # Word ids arrive sequence-first; the LSTMs are batch-first.
        context_word_emb = self.word_emb(batch.context.transpose(0, 1))
        question_word_emb = self.word_emb(batch.question.transpose(0, 1))

        context = torch.cat([context_char_emb, context_word_emb], dim=-1)
        question = torch.cat([question_char_emb, question_word_emb], dim=-1)

        # The same encoder is applied to both sequences.
        context, _ = self.contextual_lstm(context)
        question, _ = self.contextual_lstm(question)

        g = self.attention(context, question)

        features, _ = self.modeling_lstm_first(g)
        features, _ = self.modeling_lstm_second(features)

        # Keep only the final (hidden, cell) states, each of shape (2, batch, hidden_size),
        # and flatten the two directions into one vector per example.
        _, features = self.lstm(torch.cat([g, features], dim=-1))
        features = torch.cat((
            features[0].permute(1, 0, 2).reshape(batch.context.size(1), 2 * self.hidden_size),
            features[1].permute(1, 0, 2).reshape(batch.context.size(1), 2 * self.hidden_size)
        ), dim=1)

        out = self.out(features)

        return out

In [0]:
class ModelTrainer:
    """Runs single batches through a model and accumulates per-epoch loss / accuracy.

    Args:
        model: module called as ``model(batch)`` and returning class logits.
        criterion: loss called as ``criterion(logits, batch.answer)``.
        optimizer: optimizer stepping the model parameters during training epochs.
    """

    def __init__(self, model, criterion, optimizer):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer

    def on_epoch_begin(self, is_train, name, batches_count) -> None:
        """Reset the running statistics and switch the model to train or eval mode.

        Args:
            is_train: True for a training epoch, False for evaluation.
            name: label printed in front of the progress strings.
            batches_count: number of batches in the epoch; divisor for the mean loss.
        """
        self.epoch_loss = 0
        self.correct_count, self.total_count = 0, 0
        self.is_train = is_train
        self.name = name
        self.batches_count = batches_count
        self.model.train(is_train)

    def on_epoch_end(self) -> str:
        """Return the epoch summary: mean of the batch losses and overall accuracy.

        An epoch without batches or examples reports ``nan`` instead of raising
        ``ZeroDivisionError``.
        """
        nan = float('nan')
        mean_loss = self.epoch_loss / self.batches_count if self.batches_count else nan
        accuracy = self.correct_count / self.total_count if self.total_count else nan

        return '{:>5s} Loss = {:.5f}, Accuracy = {:.2%}'.format(self.name, mean_loss, accuracy)

    def on_batch(self, batch) -> str:
        """Process one batch and return its progress string.

        In a training epoch this also back-propagates, clips the gradient norm to 0.5
        and performs one optimizer step.

        Args:
            batch: object accepted by the model that also exposes ``answer`` (class ids).

        Returns:
            Formatted loss and accuracy of this batch.
        """
        logits = self.model(batch)
        target = batch.answer
        prediction = logits.argmax(dim=1)

        loss = self.criterion(logits, target)

        batch_size = prediction.size(0)
        batch_correct = torch.sum(prediction == target).item()

        self.total_count += batch_size
        self.correct_count += batch_correct

        if self.is_train:
            # Clear gradients *before* back-propagating: anything left over from an
            # interrupted or externally executed backward pass would otherwise be
            # accumulated into this update.
            self.optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(self.model.parameters(), 0.5)
            self.optimizer.step()

        batch_loss = loss.item()
        self.epoch_loss += batch_loss

        return '{:>5s} Loss = {:.5f}, Accuracy = {:.2%}'.format(
            self.name, batch_loss, batch_correct / batch_size
        )

In [0]:
# Replace the `locks` list on tqdm's shared lock object with an empty list.
# NOTE(review): presumably a workaround for progress-bar locking problems in the hosted
# notebook environment — confirm it is still needed with the installed tqdm version.
tqdm.get_lock().locks = []

def do_epoch(
    trainer: ModelTrainer = None,
    data_iter: BucketIterator = None,
    is_train: bool = None,
    name: str = None
) -> None:
    """Run one full pass over `data_iter`, showing progress in a tqdm bar.

    Args:
        trainer: trainer whose epoch / batch hooks are invoked.
        data_iter: iterable of batches; its length sets the bar total.
        is_train: enables gradient tracking and training behaviour when True.
        name: label shown in front of the loss / accuracy read-outs.
    """
    trainer.on_epoch_begin(is_train=is_train, name=name, batches_count=len(data_iter))

    with torch.autograd.set_grad_enabled(is_train), tqdm(total=trainer.batches_count) as bar:
        for batch in data_iter:
            batch_status = trainer.on_batch(batch=batch)

            bar.update()
            bar.set_description(batch_status)

        # Once all batches are done, swap the last batch read-out for the epoch summary.
        epoch_status = trainer.on_epoch_end()

        bar.set_description(epoch_status)
        bar.refresh()

def fit(
    trainer: ModelTrainer = None,
    train_iter: BucketIterator = None,
    epochs_count: int = None,
    dev_iter: BucketIterator = None
) -> None:
    """Train for `epochs_count` epochs, evaluating after each one when `dev_iter` is given.

    A `KeyboardInterrupt` raised during an epoch prints "Early stopping" and ends
    training without propagating the exception.

    Args:
        trainer: trainer passed on to `do_epoch`.
        train_iter: batches used for the training pass.
        epochs_count: number of epochs to run.
        dev_iter: optional batches used for the evaluation pass.
    """
    for epoch_index in range(epochs_count):
        try:
            prefix = '[{} / {}] '.format(epoch_index + 1, epochs_count)
            do_epoch(trainer=trainer, data_iter=train_iter, is_train=True, name=prefix + 'Train:')

            if dev_iter is not None:
                do_epoch(trainer=trainer, data_iter=dev_iter, is_train=False, name=prefix + '  Val:')
        except KeyboardInterrupt:
            print("Early stopping")
            return

In [26]:
# Pre-trained vectors aligned with the word vocabulary; the model loads them as a frozen embedding.
weights = loader.word_field.vocab.vectors
model = Model(
    weights=weights,
    char_vocab_size=len(loader.char_field.vocab),
    char_emb_dim=15,
    char_hidden_size=15,
    char_kernel_size=5,
    emb_dim=300,  # must match the second dimension of `weights`
    hidden_size=64,
    dropout=0.3
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())  # default Adam hyper-parameters
trainer = ModelTrainer(model=model, criterion=criterion, optimizer=optimizer)
# Five epochs of training, each followed by an evaluation pass on the dev iterator.
fit(trainer=trainer, train_iter=train_iter, epochs_count=5, dev_iter=dev_iter)

[1 / 5] Train: Loss = 0.66095, Accuracy = 62.03%: 100%|██████████| 295/295 [1:06:55<00:00, 13.61s/it]
[1 / 5]   Val: Loss = 0.64521, Accuracy = 62.26%: 100%|██████████| 26/26 [01:54<00:00,  4.39s/it]
[2 / 5] Train: Loss = 0.63368, Accuracy = 64.48%: 100%|██████████| 295/295 [1:06:58<00:00, 13.62s/it]
[2 / 5]   Val: Loss = 0.68701, Accuracy = 62.35%: 100%|██████████| 26/26 [01:52<00:00,  4.33s/it]
[3 / 5] Train: Loss = 0.60286, Accuracy = 67.36%: 100%|██████████| 295/295 [1:06:55<00:00, 13.61s/it]
[3 / 5]   Val: Loss = 0.63903, Accuracy = 64.25%: 100%|██████████| 26/26 [01:50<00:00,  4.24s/it]
[4 / 5] Train: Loss = 0.56484, Accuracy = 70.60%: 100%|██████████| 295/295 [1:08:35<00:00, 13.95s/it]
[4 / 5]   Val: Loss = 0.65187, Accuracy = 64.62%: 100%|██████████| 26/26 [01:58<00:00,  4.57s/it]
[5 / 5] Train: Loss = 0.51873, Accuracy = 74.32%: 100%|██████████| 295/295 [1:06:45<00:00, 13.58s/it]
[5 / 5]   Val: Loss = 0.74313, Accuracy = 61.38%: 100%|██████████| 26/26 [01:55<00:00,  4.44s/it]
