# AllenNLP 
## Must implement the following classes:
1. DatasetReader
2. Model

## Step 1.  DatasetReader

* Must implement __init__
  * must pass TokenIndexer(s) as argument to initialize
* Must implement _read method
  * read files, http_pages etc
  * pre-process the text 
* Must implement text_to_instance 
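  * convert tokens (and, optionally, their tags) into an Instance, i.e. a dict of fields such as { "sentence": TextField, "labels": SequenceLabelField }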


In [None]:
from typing import Iterator, List, Dict

import torch
import torch.optim as optim
import numpy as np

from allennlp.data import Instance
from allennlp.data import Token
from allennlp.data import DatasetReader

from allennlp.data.fields import TextField, SequenceLabelField

from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer

from allennlp.common.file_utils import cached_path


class PosDatasetReader(DatasetReader):
    """Dataset reader for part-of-speech data stored one sentence per line.

    Every non-blank line is a whitespace-separated sequence of
    ``word###tag`` pairs.  Each line becomes one ``Instance`` holding a
    ``"sentence"`` text field and, when tags are available, a ``"labels"``
    sequence-label field.
    """

    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        """Store the token indexers used to build every ``TextField``.

        Args:
            token_indexers: Mapping from indexer name to ``TokenIndexer``.
                When omitted (or empty), a single ``SingleIdTokenIndexer``
                registered under ``"tokens"`` is used.
        """
        super().__init__(lazy=False)

        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    def _read(self, file_path: str) -> Iterator[Instance]:
        """Yield one ``Instance`` per non-blank line of ``file_path``.

        Args:
            file_path: Path of a local text file in ``word###tag`` format.
        """
        with open(file_path) as f:
            for line in f:
                pairs = line.strip().split()

                # A blank line (e.g. a trailing newline at the end of the
                # file) has no pairs; unpacking ``zip()`` of nothing would
                # raise ValueError, so skip it.
                if not pairs:
                    continue

                sentence, tags = zip(*(pair.split("###") for pair in pairs))

                # zip() produces tuples; text_to_instance is declared to
                # take a list of tags.
                yield self.text_to_instance([Token(word) for word in sentence],
                                            list(tags))

    def text_to_instance(self, tokens: List[Token], tags: List[str] = None) -> Instance:
        """Build an ``Instance`` from tokens and, optionally, their tags.

        Args:
            tokens: Tokens of one sentence.
            tags: One tag per token.  When omitted or empty (as at
                prediction time) no ``"labels"`` field is created.

        Returns:
            An ``Instance`` with a ``"sentence"`` field and, if tags were
            given, a ``"labels"`` field aligned with it.
        """
        sentence_field = TextField(tokens, self.token_indexers)
        fields = {"sentence": sentence_field}

        # Labels only exist for annotated data.
        if tags:
            fields["labels"] = SequenceLabelField(labels=tags,
                                                  sequence_field=sentence_field)

        return Instance(fields)
    

    

In [None]:
# Build the training and validation datasets.

# 1. Create the reader; with no arguments it falls back to its default
#    token indexers.
reader = PosDatasetReader()

# 2. Resolve each URL through cached_path, then let the reader turn the
#    resulting file into a dataset.
train_path = cached_path(
    'https://raw.githubusercontent.com/allenai/allennlp'
    '/master/tutorials/tagger/training.txt')
train_dataset = reader.read(train_path)

validation_path = cached_path(
    'https://raw.githubusercontent.com/allenai/allennlp'
    '/master/tutorials/tagger/validation.txt')
validation_dataset = reader.read(validation_path)

## Step 2. Implement the Model

* Sub-class of torch.nn.Module
* Needs a forward method

  * Must return a dict of tensor outputs
  * Must include a `loss` entry in the returned dict for training

* In our model, we'll use : 
  * embedding layer
  * sequence embedder (lstm)
  * feedforward net
  

In [None]:
from allennlp.models import Model

from allennlp.data import Vocabulary

from allennlp.modules import TextFieldEmbedder, Seq2SeqEncoder

from allennlp.training.metrics import CategoricalAccuracy

from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits

class LstmTagger(Model):
    """Sequence tagger: embed tokens, encode the sequence, project to tags.

    The model embeds the ``sentence`` field, runs the embeddings through a
    ``Seq2SeqEncoder`` and maps every encoder output to one logit per entry
    of the ``'labels'`` vocabulary namespace.
    """

    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 vocab: Vocabulary) -> None:
        """Assemble the tagger.

        Args:
            word_embeddings: Embedder applied to the ``sentence`` field.
            encoder: Sequence encoder applied to the embeddings; its output
                dimension sizes the final projection.
            vocab: Vocabulary; the size of its ``'labels'`` namespace is the
                number of output classes.
        """
        super().__init__(vocab)

        self.word_embeddings = word_embeddings
        self.encoder = encoder

        # Per-token projection from encoder features to tag logits.
        self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size('labels'))

        self.accuracy = CategoricalAccuracy()

    def forward(self,
                sentence: Dict[str, torch.Tensor],
                labels: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        """Compute tag logits and, when labels are given, the loss.

        Args:
            sentence: Indexed text field as produced for the ``"sentence"``
                field of an instance.
            labels: Gold tag indices aligned with ``sentence``; optional.

        Returns:
            A dict with ``"tag_logits"`` and, if ``labels`` was supplied,
            ``"loss"``.
        """
        # The mask is passed to the encoder, the metric and the loss.
        mask = get_text_field_mask(sentence)

        embeddings = self.word_embeddings(sentence)
        encoder_out = self.encoder(embeddings, mask)
        tag_logits = self.hidden2tag(encoder_out)

        output = {"tag_logits": tag_logits}

        if labels is not None:
            self.accuracy(tag_logits, labels, mask)
            output['loss'] = sequence_cross_entropy_with_logits(tag_logits, labels, mask)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return the accumulated tagging accuracy.

        This is the hook name AllenNLP's ``Model`` defines and its trainer
        calls; under the previous spelling ``get_metric`` the accuracy was
        never reported during training.

        Args:
            reset: Forwarded to the accuracy metric's ``get_metric``.
        """
        return {"accuracy": self.accuracy.get_metric(reset)}

    def get_metric(self,
                   reset: bool = False) -> Dict[str, float]:
        """Backward-compatible alias for :meth:`get_metrics`."""
        return self.get_metrics(reset)

## Step 3. Instantiate the model

 * must pass to the model 
   * Vocabulary of dataset
     * to get the size of label vocabulary (== output feature dimension)    
   * type of Embedder to use
   * type of Encoder to use


In [None]:
from allennlp.data import Vocabulary
# 1. Build the vocabulary from the training instances and record how many
#    entries its 'tokens' namespace holds.
vocab = Vocabulary.from_instances(instances=train_dataset)
vocabulary_size = vocab.get_vocab_size(namespace='tokens')

In [None]:
# 2. Define embedders
from allennlp.modules import Embedding

from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

# Size the embedding table from the 'tokens' namespace of the vocabulary.
# (The original referenced an undefined name, `input_vocab`, which raised
# NameError; the vocabulary object in this notebook is called `vocab`.)
vocabulary_size = vocab.get_vocab_size('tokens')

token_embedding = Embedding(num_embeddings=vocabulary_size,
                            embedding_dim=6)

# The key 'tokens' is the name under which the token embedding is
# registered with the text-field embedder.
word_embeddings = BasicTextFieldEmbedder({'tokens': token_embedding})

In [None]:
# 3. Define encoder 

from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper

# A 6 -> 6 LSTM over batch-first input, wrapped so it can be used as an
# AllenNLP sequence encoder.
module = torch.nn.LSTM(input_size=6, hidden_size=6, batch_first=True)

lstm_encoder = PytorchSeq2SeqWrapper(module)

In [None]:
# Finally, wire the embedder, the encoder and the vocabulary into the tagger.
model = LstmTagger(word_embeddings=word_embeddings,
                   encoder=lstm_encoder,
                   vocab=vocab)

## Step 4. Train the Model

In [None]:
from allennlp.training import Trainer
from allennlp.data.iterators import BucketIterator



In [None]:
# Batches of two instances, sorted on the token count of the "sentence"
# field; the iterator needs the vocabulary to index the instances.
iterator = BucketIterator(sorting_keys=[("sentence", "num_tokens")],
                          batch_size=2)
iterator.index_with(vocab)

# Plain SGD over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=0.1)

trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_dataset,
    validation_dataset=validation_dataset,
    patience=10,
    num_epochs=1000,
)

trainer.train()

In [None]:
# NOTE: the iterator can also be called directly to look at the batches it
# produces. The author notes each batch as Dict[str, torch.Tensor].
raw_train_generator = iterator(instances=train_dataset,
                               num_epochs=10,
                               shuffle=True)

for batch in raw_train_generator:
    print(batch)

## Step 5.  Prediction (Verification)

In [None]:
from allennlp.predictors import SentenceTaggerPredictor


In [None]:
# Tag one example sentence with the model and print the predicted tags.
example_sentence = "The dog ate the apple"

predictor = SentenceTaggerPredictor(model, dataset_reader=reader)

prediction = predictor.predict(example_sentence)
tag_logits = prediction['tag_logits']

# Highest-scoring label index for every token.
tag_ids = np.argmax(tag_logits, axis=-1)

print(example_sentence)

# Map label indices back to tag strings through the 'labels' namespace.
predicted_tags = [
    model.vocab.get_token_from_index(tag_id, 'labels') for tag_id in tag_ids
]
print(predicted_tags)