## **AllenNLP demo**

This is a demo for prediction venue based on title and abstract of the paper

reference: https://github.com/allenai/allennlp-as-a-library-example

In [1]:
import json
from typing import Iterator, List, Dict, Optional
import torch
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

# for dataset reader
from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField, LabelField
from allennlp.data.dataset_readers import DatasetReader
from allennlp.common.file_utils import cached_path
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer
from allennlp.data.vocabulary import Vocabulary

# read pretrained embedding from AWS S3
from allennlp.modules.token_embedders.embedding import _read_embeddings_from_text_file

# for building model
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.modules import FeedForward
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.nn import InitializerApplicator, RegularizerApplicator
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.data.iterators import BucketIterator
from allennlp.training.trainer import Trainer

## **Create classes for the model**

- `DatasetReader`: to read dataset and return `Instance` class
- `Model`: input `Instance` class and return output prediction

In [2]:
class PublicationDatasetReader(DatasetReader):
    """
    DatasetReader for publication and venue dataaet
    """
    def __init__(self, 
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=False)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    def _read(self, file_path: str) -> Iterator[Instance]:
        """
        Read publication and venue dataset in JSON format
        
        Data is in the following format:
            {"title": ..., "paperAbstract": ..., "venue": ...}
        """
        with open(cached_path(file_path), "r") as data_file:
            for line in data_file:
                line = line.strip("\n")
                if not line:
                    continue
                paper_json = json.loads(line)
                title = paper_json['title']
                abstract = paper_json['paperAbstract']
                venue = paper_json['venue']
                yield self.text_to_instance(title, abstract, venue)
        
    def text_to_instance(self, 
                         title: str, 
                         abstract: str, 
                         venue: str=None) -> Instance:
        """
        Turn title, abstract, and venue to instance
        """
        tokenized_title = self._tokenizer.tokenize(title)
        tokenized_abstract = self._tokenizer.tokenize(abstract)
        title_field = TextField(tokenized_title, self._token_indexers)
        abstract_field = TextField(tokenized_abstract, self._token_indexers)
        fields = {'title': title_field, 
                  'abstract': abstract_field}
        if venue is not None:
            fields['label'] = LabelField(venue)
        return Instance(fields)

In [3]:
class AcademicPaperClassifier(Model):
    """
    Model to classify venue based on input title and abstract
    """
    def __init__(self, 
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 title_encoder: Seq2VecEncoder,
                 abstract_encoder: Seq2VecEncoder,
                 classifier_feedforward: FeedForward,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(AcademicPaperClassifier, self).__init__(vocab, regularizer)
        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size("labels")
        self.title_encoder = title_encoder
        self.abstract_encoder = abstract_encoder
        self.classifier_feedforward = classifier_feedforward
        self.metrics = {
                "accuracy": CategoricalAccuracy(),
                "accuracy3": CategoricalAccuracy(top_k=3)
        }
        self.loss = torch.nn.CrossEntropyLoss()
        initializer(self)
    
    def forward(self, 
                title: Dict[str, torch.LongTensor],
                abstract: Dict[str, torch.LongTensor],
                label: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
        
        embedded_title = self.text_field_embedder(title)
        title_mask = get_text_field_mask(title)
        encoded_title = self.title_encoder(embedded_title, title_mask)

        embedded_abstract = self.text_field_embedder(abstract)
        abstract_mask = get_text_field_mask(abstract)
        encoded_abstract = self.abstract_encoder(embedded_abstract, abstract_mask)

        logits = self.classifier_feedforward(torch.cat([encoded_title, encoded_abstract], dim=-1))
        class_probabilities = F.softmax(logits, dim=-1)
        argmax_indices = np.argmax(class_probabilities.cpu().data.numpy(), axis=-1)
        labels = [self.vocab.get_token_from_index(x, namespace="labels") for x in argmax_indices]
        output_dict = {
            'logits': logits, 
            'class_probabilities': class_probabilities,
            'label': labels
        }
        if label is not None:
            loss = self.loss(logits, label)
            for metric in self.metrics.values():
                metric(logits, label)
            output_dict["loss"] = loss

        return output_dict

## **Read dataset**

- `cached_path`: can cache the file locally
- `BasicTextFieldEmbedder` takes a mapping from index names to embeddings

In [4]:
train_data_path = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/academic-papers-example/train.jsonl"
validation_data_path = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/academic-papers-example/dev.jsonl"
pretrained_file = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz"

In [5]:
reader = PublicationDatasetReader()

In [6]:
instance = reader.text_to_instance("This is a great paper.", 
                                   "Indeed, this is a great paper of all time", 
                                   "Nature")

In [7]:
train_dataset = reader.read(cached_path(train_data_path))
validation_dataset = reader.read(cached_path(validation_data_path))

15000it [01:15, 199.35it/s]
2000it [00:09, 205.04it/s]


In [8]:
# building vocabulary
vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

100%|██████████| 17000/17000 [00:05<00:00, 3017.33it/s]


In [9]:
# load pre-trained embedding
embedding_matrix = _read_embeddings_from_text_file(file_uri=pretrained_file, 
                                                   embedding_dim=100, 
                                                   vocab=vocab)

400000it [00:04, 97637.21it/s]


In [10]:
print(embedding_matrix.size()) 

torch.Size([64714, 100])


In [11]:
EMBEDDING_DIM = 100
HIDDEN_DIM = 100
num_classes = len(vocab.get_index_to_token_vocabulary('labels'))

In [12]:
# embedding
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), 
                            embedding_dim=100,
                            weight=embedding_matrix)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

In [13]:
lstm_title = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, 
                                                 batch_first=True, bidirectional=True))
lstm_abstract = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, 
                                                    batch_first=True, bidirectional=True))
feed_forward = torch.nn.Linear(2 * 2 * HIDDEN_DIM, num_classes)

In [14]:
model = AcademicPaperClassifier(vocab,
                                word_embeddings, 
                                lstm_title, 
                                lstm_abstract, 
                                feed_forward)

In [15]:
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [16]:
iterator = BucketIterator(batch_size=64, 
                          sorting_keys=[("abstract", "num_tokens"), 
                                        ("title", "num_tokens")])
iterator.index_with(vocab) # index with the created vocabulary

In [17]:
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=20)

In [18]:
trainer.train()

loss: 1.0729 ||: 100%|██████████| 235/235 [04:31<00:00,  1.15s/it]
loss: 1.0474 ||: 100%|██████████| 32/32 [00:04<00:00,  7.00it/s]
loss: 1.0157 ||: 100%|██████████| 235/235 [04:26<00:00,  1.14s/it]
loss: 0.9638 ||: 100%|██████████| 32/32 [00:03<00:00,  9.33it/s]
loss: 0.9372 ||: 100%|██████████| 235/235 [04:14<00:00,  1.08s/it]
loss: 0.9429 ||: 100%|██████████| 32/32 [00:03<00:00, 10.42it/s]
loss: 0.8471 ||: 100%|██████████| 235/235 [04:08<00:00,  1.06s/it]
loss: 0.7931 ||: 100%|██████████| 32/32 [00:03<00:00,  9.41it/s]
loss: 0.7361 ||: 100%|██████████| 235/235 [04:14<00:00,  1.08s/it]
loss: 0.6456 ||: 100%|██████████| 32/32 [00:03<00:00,  9.93it/s]
loss: 0.6335 ||: 100%|██████████| 235/235 [03:59<00:00,  1.02s/it]
loss: 0.6040 ||: 100%|██████████| 32/32 [00:03<00:00, 10.60it/s]
loss: 0.5870 ||: 100%|██████████| 235/235 [03:17<00:00,  1.19it/s]
loss: 0.5956 ||: 100%|██████████| 32/32 [00:03<00:00,  9.11it/s]
loss: 0.5564 ||: 100%|██████████| 235/235 [03:28<00:00,  1.13it/s]
loss: 0.5

{'training_duration': '01:18:26',
 'training_start_epoch': 0,
 'training_epochs': 19,
 'epoch': 19,
 'training_loss': 0.3964861498868212,
 'validation_loss': 0.478450040332973,
 'best_epoch': 18,
 'best_validation_loss': 0.4765144931152463}

## **Prediction**

Lastly, we can also write the prediction class which take input any `json_dict` and return the `Instance`. 

In [22]:
from allennlp.common.util import JsonDict
from allennlp.predictors.predictor import Predictor

In [23]:
class PaperClassifierPredictor(Predictor):
    """"
    Predictor wrapper for the AcademicPaperClassifier
    """
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        title = json_dict['title']
        abstract = json_dict['paperAbstract']
        instance = self._dataset_reader.text_to_instance(title=title, abstract=abstract)

        # label_dict will be like {0: "ACL", 1: "AI", ...}
        label_dict = self._model.vocab.get_index_to_token_vocabulary('labels')
        # Convert it to list ["ACL", "AI", ...]
        all_labels = [label_dict[i] for i in range(len(label_dict))]

        return instance

In [24]:
predictor = PaperClassifierPredictor(model, dataset_reader=reader)

In [26]:
prediction_output = predictor.predict_json(
    {
        "title": "Know What You Don't Know: Unanswerable Questions for SQuAD", 
        "paperAbstract": "Extractive reading comprehension systems can often locate the correct answer to a question in a context document, but they also tend to make unreliable guesses on questions for which the correct answer is not stated in the context. Existing datasets either focus exclusively on answerable questions, or use automatically generated unanswerable questions that are easy to identify. To address these weaknesses, we present SQuAD 2.0, the latest version of the Stanford Question Answering Dataset (SQuAD). SQuAD 2.0 combines existing SQuAD data with over 50,000 unanswerable questions written adversarially by crowdworkers to look similar to answerable ones. To do well on SQuAD 2.0, systems must not only answer questions when possible, but also determine when no answer is supported by the paragraph and abstain from answering. SQuAD 2.0 is a challenging natural language understanding task for existing models: a strong neural system that gets 86% F1 on SQuAD 1.1 achieves only 66% F1 on SQuAD 2.0."
    }
)

In [27]:
print(prediction_output)

{'logits': [2.2441415786743164, -0.1285620778799057, -2.3133716583251953], 'class_probabilities': [0.9060297608375549, 0.08446764945983887, 0.009502537548542023], 'label': 'ACL'}
