# Data Pipeline

   ##  Hierarchical system of data structures 
   * Easy padding 
   * Easy batching
   * Easy iteration

## Steps to feed the dataset into PyTorch model 
 * Create a vocabulary from the dataset
     * Use Vocabulary.from_instances
 * Collect instances into a Batch
     * Provides methods for indexing and converting to PyTorch Tensors
 * Index the words and labels in Fields
     * In order to use the integer indices specified by the Vocabulary
 * Pad the instances to the same length
 * Convert into PyTorch Tensors
 

### Vocabulary Creation

In [None]:
from allennlp.data import Vocabulary

# Build the vocabulary from the dataset's instances.
vocab = Vocabulary.from_instances(instances=instances)

###  word_id --> token(word) mapping
   * get_index_to_token_vocabulary

In [None]:
# "token_ids" namespace: show the index -> token mapping held by the vocabulary.

print('id -> word mapping for the "token_ids" namespace: ')
print(
    vocab.get_index_to_token_vocabulary(namespace="token_ids"),
    "\n",
)

In [None]:
# "tags" namespace: show the index -> label mapping held by the vocabulary.

print('id -> word mapping for the "tags" namespace: ')
print(
    vocab.get_index_to_token_vocabulary(namespace='tags'),
    '\n',
)

###  Token(word) -> id 

In [None]:
# Use the public accessor rather than the private `_token_to_index` attribute,
# which is an implementation detail of Vocabulary.
# NOTE(review): only the "token_ids" and "tags" namespaces used in this
# notebook are shown -- add any other namespace you want to inspect.
print(
    'Token to Index dictionary: \n',
    {
        namespace: vocab.get_token_to_index_vocabulary(namespace)
        for namespace in ("token_ids", "tags")
    },
    '\n',
)

### Collect Instances(dataset) into Batch and Index them
  * Must perform this step before generating Tensors

In [None]:
from allennlp.data.dataset import Batch

# Collect the instances into a Batch, then index it with the vocabulary
# (this has to happen before any tensors are generated).
batch = Batch(instances=instances)
batch.index_instances(vocab=vocab)

### Pad the instances to the same length

In [None]:
# Get the padding lengths for the batch.
padding_lengths = batch.get_padding_lengths()
print(
    "Lengths used for padding : ",
    padding_lengths,
    "\n",
)

# Pad the instances and return PyTorch tensors.
tensor_dict = batch.as_tensor_dict(padding_lengths=padding_lengths)
print(
    "Look how tensors are padded!!! \n",
    tensor_dict,
)

# The role of TokenIndexer

* Conventional pre-processing flow
 * token --> indexing --> embedding

* AllenNLP pre-processing flow
 * token --> TokenIndexer --> TokenEmbedder --> TextFieldEmbedder
 
* What if we want to use multiple indexers for the same tokens?
 * e.g. TokenCharacterIndexer --> generates indices for each character in a token

In [None]:
# for large batches --> Iterator
# fixed batch size, bucketing, stochastic sorting

# Normal flow  : tokenization -> indexing -> embedding pipeline
# Allennlp     : tokenization -> TokenIndexers -> TokenEmbedders -> TextFieldEmbedders

# ex ) TokenCharacterIndexer --> takes the word in a TextField 
#                               and generates indices for the character in the word

In [None]:
from allennlp.data.token_indexers import TokenCharactersIndexer

# Wrap each raw word in a Token.
tokens = [Token(word) for word in ['here', 'are', 'some', 'longer', 'words', '.']]

# Two indexers over the same tokens, each writing to its own vocabulary
# namespace. The dictionary keys ('tokens', 'chars') are independent of
# the namespaces ('token_ids', 'token_chars').
token_indexers = {
    'tokens': SingleIdTokenIndexer(namespace='token_ids'),
    'chars': TokenCharactersIndexer(namespace='token_chars'),
}

word_and_character_text_field = TextField(tokens, token_indexers)

# A one-instance Batch whose only field is the TextField above.
mini_dataset = Batch([Instance({"sentence": word_and_character_text_field})])

# Build the vocabulary from the batch, then index the batch with it.
word_and_char_vocab = Vocabulary.from_instances(mini_dataset)

mini_dataset.index_instances(word_and_char_vocab)

print("this is the id -> word mapping for the 'token_ids' namespace: ")
print(word_and_char_vocab.get_index_to_token_vocabulary("token_ids"), "\n")
print("this is the id -> word mapping for the 'token_chars' namespace: ")
# The trailing separator was '\m' (an invalid escape printed literally);
# it is meant to be a newline like the print calls above.
print(word_and_char_vocab.get_index_to_token_vocabulary("token_chars"), "\n")

In [None]:
# Get the padding lengths for the word-and-character batch.
padding_lengths = mini_dataset.get_padding_lengths()
# The message names the padding key as "num_token_characters"
# (previously misspelled "num_tokens_characters").
print("Lengths used for padding (note that we now have a new\n"
      "padding key num_token_characters from the TokenCharactersIndexer):")
print(padding_lengths, "\n")

# Pad the batch to those lengths and convert it to PyTorch tensors.
tensor_dict = mini_dataset.as_tensor_dict(padding_lengths)

print("The resulting PyTorch Tensor is : \n", tensor_dict)

In [None]:
# Note that the keys of the token_indexers dictionary given to the TextField
# are different from the namespaces.
# This is because a namespace can be re-used by different TokenIndexers.
token_indexers