## **AllenNLP demo**

This is a demo for prediction venue based on title and abstract of the paper

reference: https://github.com/allenai/allennlp-as-a-library-example

In [5]:
import json
from typing import Iterator, List, Dict, Optional
import torch
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

# for dataset reader
from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField, LabelField
from allennlp.data.dataset_readers import DatasetReader
from allennlp.common.file_utils import cached_path
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer
from allennlp.data.vocabulary import Vocabulary

# read pretrained embedding from AWS S3
from allennlp.modules.token_embedders.embedding import _read_embeddings_from_text_file

# for building model
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.modules import FeedForward
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.nn import InitializerApplicator, RegularizerApplicator
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.data.iterators import BucketIterator
from allennlp.training.trainer import Trainer

ModuleNotFoundError: No module named 'allennlp'

## **Create classes for the model**

Generally, we need to implement 2 classes for AllenNLP including

- `DatasetReader`: to read dataset and return `Instance` class
- `Model`: input `Instance` class and return output prediction

`Model` consists of the Sequence to Vector model (`Seq2Vec`)

<img src="figures/bilstm.png" width="300"/>


and we use the combination of vectors to predict venue

<img src="figures/venue_prediction.png" width="300"/>

In [2]:
class PublicationDatasetReader(DatasetReader):
    """
    DatasetReader for publication and venue dataaet
    """
    def __init__(self, 
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None, 
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    def _read(self, file_path: str) -> Iterator[Instance]:
        """
        Read publication and venue dataset in JSON format
        
        Data is in the following format:
            {"title": ..., "paperAbstract": ..., "venue": ...}
        """
        with open(cached_path(file_path), "r") as data_file:
            for line in data_file:
                line = line.strip("\n")
                if not line:
                    continue
                paper_json = json.loads(line)
                title = paper_json['title']
                abstract = paper_json['paperAbstract']
                venue = paper_json['venue']
                yield self.text_to_instance(title, abstract, venue)
        
    def text_to_instance(self, 
                         title: str, 
                         abstract: str, 
                         venue: str=None) -> Instance:
        """
        Turn title, abstract, and venue to instance
        """
        tokenized_title = self._tokenizer.tokenize(title)
        tokenized_abstract = self._tokenizer.tokenize(abstract)
        title_field = TextField(tokenized_title, self._token_indexers)
        abstract_field = TextField(tokenized_abstract, self._token_indexers)
        fields = {'title': title_field, 
                  'abstract': abstract_field}
        if venue is not None:
            fields['label'] = LabelField(venue)
        return Instance(fields)

In [3]:
class AcademicPaperClassifier(Model):
    """
    Model to classify venue based on input title and abstract
    """
    def __init__(self, 
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 title_encoder: Seq2VecEncoder,
                 abstract_encoder: Seq2VecEncoder,
                 classifier_feedforward: FeedForward,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(AcademicPaperClassifier, self).__init__(vocab, regularizer)
        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size("labels")
        self.title_encoder = title_encoder
        self.abstract_encoder = abstract_encoder
        self.classifier_feedforward = classifier_feedforward
        self.metrics = {
                "accuracy": CategoricalAccuracy(),
                "accuracy3": CategoricalAccuracy(top_k=3)
        }
        self.loss = torch.nn.CrossEntropyLoss()
        initializer(self)
    
    def forward(self, 
                title: Dict[str, torch.LongTensor],
                abstract: Dict[str, torch.LongTensor],
                label: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
        
        embedded_title = self.text_field_embedder(title)
        title_mask = get_text_field_mask(title)
        encoded_title = self.title_encoder(embedded_title, title_mask)

        embedded_abstract = self.text_field_embedder(abstract)
        abstract_mask = get_text_field_mask(abstract)
        encoded_abstract = self.abstract_encoder(embedded_abstract, abstract_mask)

        logits = self.classifier_feedforward(torch.cat([encoded_title, encoded_abstract], dim=-1))
        class_probabilities = F.softmax(logits, dim=-1)
        argmax_indices = np.argmax(class_probabilities.cpu().data.numpy(), axis=-1)
        labels = [self.vocab.get_token_from_index(x, namespace="labels") for x in argmax_indices]
        output_dict = {
            'logits': logits, 
            'class_probabilities': class_probabilities,
            'predicted_label': labels
        }
        if label is not None:
            loss = self.loss(logits, label)
            for metric in self.metrics.values():
                metric(logits, label)
            output_dict["loss"] = loss

        return output_dict

## **Read dataset**

- `cached_path`: can cache the file locally
- `BasicTextFieldEmbedder` takes a mapping from index names to embeddings

In [4]:
train_data_path = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/academic-papers-example/train.jsonl"
validation_data_path = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/academic-papers-example/dev.jsonl"
pretrained_file = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz"

In [5]:
reader = PublicationDatasetReader()

In [6]:
instance = reader.text_to_instance("This is a great paper.", 
                                   "Indeed, this is a great paper of all time", 
                                   "Nature")

In [7]:
train_dataset = reader.read(cached_path(train_data_path))
validation_dataset = reader.read(cached_path(validation_data_path))

15000it [00:35, 428.37it/s]
2000it [00:04, 419.87it/s]


In [8]:
# building vocabulary
vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

100%|██████████| 17000/17000 [00:02<00:00, 6340.02it/s]


In [9]:
# load pre-trained embedding
embedding_matrix = _read_embeddings_from_text_file(file_uri=pretrained_file, 
                                                   embedding_dim=100, 
                                                   vocab=vocab)

400000it [00:04, 80881.50it/s] 


In [10]:
print(embedding_matrix.size()) 

torch.Size([64247, 100])


In [11]:
EMBEDDING_DIM = 100
HIDDEN_DIM = 100
num_classes = len(vocab.get_index_to_token_vocabulary('labels'))

In [12]:
# embedding
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), 
                            embedding_dim=EMBEDDING_DIM,
                            weight=embedding_matrix)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

In [13]:
lstm_title = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, 
                                                 batch_first=True, bidirectional=True))
lstm_abstract = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, 
                                                    batch_first=True, bidirectional=True))
feed_forward = torch.nn.Linear(2 * 2 * HIDDEN_DIM, num_classes)

In [14]:
model = AcademicPaperClassifier(vocab,
                                word_embeddings, 
                                lstm_title, 
                                lstm_abstract, 
                                feed_forward)

In [15]:
optimizer = optim.SGD(model.parameters(), lr=0.005)

In [16]:
iterator = BucketIterator(batch_size=64, 
                          sorting_keys=[("abstract", "num_tokens"), 
                                        ("title", "num_tokens")])
iterator.index_with(vocab) # index with the created vocabulary

In [17]:
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_dataset,
    validation_dataset=validation_dataset,
    patience=2,
    num_epochs=5,
    serialization_dir='output'
)

In [18]:
trainer.train()

loss: 1.0962 ||: 100%|██████████| 235/235 [40:56<00:00, 10.45s/it]  
loss: 1.0874 ||: 100%|██████████| 32/32 [00:07<00:00,  4.16it/s]
loss: 1.0858 ||: 100%|██████████| 235/235 [39:35<00:00, 10.11s/it]  
loss: 1.0808 ||: 100%|██████████| 32/32 [00:05<00:00,  5.42it/s]
loss: 1.0815 ||: 100%|██████████| 235/235 [39:27<00:00, 10.07s/it] 
loss: 1.0766 ||: 100%|██████████| 32/32 [00:05<00:00,  5.62it/s]
loss: 1.0779 ||: 100%|██████████| 235/235 [45:42<00:00, 11.67s/it]  
loss: 1.0725 ||: 100%|██████████| 32/32 [00:05<00:00,  5.63it/s]
loss: 1.0742 ||: 100%|██████████| 235/235 [40:22<00:00, 10.31s/it]  
loss: 1.0688 ||: 100%|██████████| 32/32 [00:05<00:00,  5.44it/s]


{'best_epoch': 4,
 'peak_cpu_memory_MB': 1412.108288,
 'training_duration': '3:26:36.009762',
 'training_start_epoch': 0,
 'training_epochs': 4,
 'epoch': 4,
 'training_loss': 1.0742415935435194,
 'training_cpu_memory_MB': 1412.108288,
 'validation_loss': 1.0688360258936882,
 'best_validation_loss': 1.0688360258936882}

## **Prediction**

Lastly, we can also write the prediction class `PaperClassifierPredictor` which take input any `json_dict` and return the `Instance`. The AllenNLP will take care of the prediction.

In [19]:
from allennlp.common.util import JsonDict
from allennlp.predictors.predictor import Predictor

In [20]:
class PaperClassifierPredictor(Predictor):
    """"
    Predictor wrapper for the AcademicPaperClassifier
    """
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        title = json_dict['title']
        abstract = json_dict['paperAbstract']
        instance = self._dataset_reader.text_to_instance(title=title, abstract=abstract)
        return instance

In [21]:
predictor = PaperClassifierPredictor(model, dataset_reader=reader)

In [22]:
prediction_output = predictor.predict_json(
    {
        "title": "Know What You Don't Know: Unanswerable Questions for SQuAD", 
        "paperAbstract": "Extractive reading comprehension systems can often locate the correct answer to a question in a context document, but they also tend to make unreliable guesses on questions for which the correct answer is not stated in the context. Existing datasets either focus exclusively on answerable questions, or use automatically generated unanswerable questions that are easy to identify. To address these weaknesses, we present SQuAD 2.0, the latest version of the Stanford Question Answering Dataset (SQuAD). SQuAD 2.0 combines existing SQuAD data with over 50,000 unanswerable questions written adversarially by crowdworkers to look similar to answerable ones. To do well on SQuAD 2.0, systems must not only answer questions when possible, but also determine when no answer is supported by the paragraph and abstain from answering. SQuAD 2.0 is a challenging natural language understanding task for existing models: a strong neural system that gets 86% F1 on SQuAD 1.1 achieves only 66% F1 on SQuAD 2.0."
    }
)

print(prediction_output)

{'logits': [0.07398353517055511, 0.08393894135951996, -0.1481025218963623], 'class_probabilities': [0.35576409101486206, 0.35932356119155884, 0.2849124073982239], 'predicted_label': 'AI'}


## **Load model for prediction**

Here, we trained the model and save it in `output` folder using 

```
allennlp train example_training.json -s output --include-package venue
```

`venue` is a library that we created where we make AllenNLP as a library. We can load trained model (`model.tar.gz`) from `serialization_dir` (`output`) and use it to predict the classes.

In [23]:
from allennlp.models.archival import load_archive
from allennlp.predictors.predictor import Predictor
from venue.venue_predictor import PaperClassifierPredictor
from venue.venue_reader import PublicationDatasetReader
from venue.venue_classifier import AcademicPaperClassifier

archive = load_archive('output/model.tar.gz')
venue_predictor = Predictor.from_archive(archive, 'venue_predictor')

  "num_layers={}".format(dropout, num_layers))


In [24]:
prediction_output = venue_predictor.predict_json(
    {
        "title": "Know What You Don't Know: Unanswerable Questions for SQuAD", 
        "paperAbstract": "Extractive reading comprehension systems can often locate the correct answer to a question in a context document, but they also tend to make unreliable guesses on questions for which the correct answer is not stated in the context. Existing datasets either focus exclusively on answerable questions, or use automatically generated unanswerable questions that are easy to identify. To address these weaknesses, we present SQuAD 2.0, the latest version of the Stanford Question Answering Dataset (SQuAD). SQuAD 2.0 combines existing SQuAD data with over 50,000 unanswerable questions written adversarially by crowdworkers to look similar to answerable ones. To do well on SQuAD 2.0, systems must not only answer questions when possible, but also determine when no answer is supported by the paragraph and abstain from answering. SQuAD 2.0 is a challenging natural language understanding task for existing models: a strong neural system that gets 86% F1 on SQuAD 1.1 achieves only 66% F1 on SQuAD 2.0."
    }
)

print(prediction_output)

{'logits': [1.3005162477493286, 0.6157347559928894, -2.2390246391296387], 'class_probabilities': [0.6522191166877747, 0.3288491368293762, 0.018931739032268524], 'predicted_label': 'ACL'}


In [25]:
venue_predictor._model.vocab.get_index_to_token_vocabulary('labels') # all classes

{0: 'ACL', 1: 'AI', 2: 'ML'}

## **Predict which journals to submit from Medline**

We  do a fun experiment where we train the same model to classify 
publications from sample 110 journals from MEDLINE. We got accuracy of 64.9 percent on the validation dataset.

In [26]:
import torch
from allennlp.models.archival import load_archive
from allennlp.predictors.predictor import Predictor
from venue.venue_predictor import PaperClassifierPredictor
from venue.venue_reader import PublicationDatasetReader
from venue.venue_classifier import AcademicPaperClassifier
from allennlp.common.file_utils import cached_path
import torch.nn.functional as F

In [None]:
archive = load_archive(cached_path('https://s3-us-west-2.amazonaws.com/allennlp-tutorial/model.tar.gz'))
venue_predictor = Predictor.from_archive(archive, 'venue_predictor')

  0%|          | 1549312/791998690 [00:29<4:38:12, 47354.63B/s]

In [None]:
title = """
Modeling peripheral visual acuity enables discovery of gaze strategies 
at multiple time scales during natural scene search
"""
abstract = """
Like humans, monkeys make saccades nearly three times a second. 
To understand the factors guiding this frequent decision, computational models of vision 
attempt to predict fixation locations using bottom-up visual features and top-down goals. 
How do the relative influences of these factors evolve over multiple time scales? 
Here we analyzed visual features at fixations using a retinal transform that provides realistic 
visual acuity by suitably degrading visual information in the periphery. 
In a task in which monkeys searched for a Gabor target in natural scenes, we characterized 
the relative importance of bottom-up and task-relevant influences by decoding fixated from 
nonfixated image patches based on visual features. At fast time scales, we found that search 
strategies can vary over the course of a single trial, with locations of higher saliency, target-similarity, 
edge–energy, and orientedness looked at later on in the trial. At slow time scales, we found that 
search strategies can be refined over several weeks of practice, and the influence of target orientation 
was significant only in the latter of two search tasks. Critically, these results were not observed without 
applying the retinal transform. Our results suggest that saccade-guidance strategies become apparent only 
when models take into account degraded visual representation in the periphery.'
"""
prediction_output = venue_predictor.predict_json(
    {
        "title": title, 
        "paperAbstract": abstract
    }
)

In [None]:
venues = venue_predictor._model.vocab.get_index_to_token_vocabulary('labels') # all classes
venues = [venues[i] for i in range(len(venues))]

In [None]:
# rank top 5 which journal to submit
sorted(list(zip(venues, prediction_output['class_probabilities'])), 
       key=lambda x: x[1], reverse=True)[0:5]