# Vocabulary

* Given a dataset, `Vocabulary` decides which tokens are in the vocabulary.
* Alternatively, it can be loaded directly from a static vocabulary file.

In [1]:
from allennlp.data import Vocabulary

## Empty vocabulary creation

In [3]:
# Start from an empty vocabulary: no pre-computed counts, a minimum count of 1,
# and an upper bound of 100,000 entries.
vocab = Vocabulary(
    counter=None,
    min_count=1,
    max_vocab_size=100000,
)

* Param1 : A counter (Dict[str,Dict[str,int]])

  * nested dictionary 
    * first level     :  namespace (e.g. textual input, or tags such as NP, VP, etc.)
    * second level    :  Dict[token] -> count (e.g. NP -> 1)
* Param2 : A minimum count
    * Tokens with smaller counts than this value are excluded from vocabulary
* Param3 : A maximum vocab size
    * Words are dropped if the vocabulary would exceed this size
* Param4 : Non-padded namespaces
    * Namespaces matching `*labels` and `*tags` (e.g. `sequence_labels`) get no padding or unknown token

## Vocabulary manipulation
* Adding words (tokens)
    * `add_token_to_namespace`
* Retrieving the token (word) for a given word_id
    * `get_token_from_index`
* Retrieving the word_id for a given token (word)
    * `get_token_index`

In [4]:
# Register a few entries by hand, grouped by the namespace they belong to.
for ns, entries in (("tokens", ("Barack", "Obama")), ("tags", ("PERSON", "PLACE"))):
    for entry in entries:
        vocab.add_token_to_namespace(entry, namespace=ns)

# Show the resulting index -> token mapping of both namespaces.
for ns in ("tokens", "tags"):
    print(vocab.get_index_to_token_vocabulary(ns))

{0: '@@PADDING@@', 1: '@@UNKNOWN@@', 2: 'Barack', 3: 'Obama'}
{0: 'PERSON', 1: 'PLACE'}


In [5]:
# Look up which token is stored under a given word_id.
token_at_index_2 = vocab.get_token_from_index(2, namespace='tokens')
print("Index 2 has token: ", token_at_index_2)

Index 2 has token:  Barack


In [6]:
# Reverse lookup: from a token (word) to its word_id.
barack_index = vocab.get_token_index("Barack", namespace='tokens')
print("Word 'Barack' has index : ", barack_index)

Word 'Barack' has index :  2


## Notice the different ways of dealing with missing words

* Textual inputs : simply returns the index of "@@UNKNOWN@@" (index 1)

* Label inputs : raises a KeyError

In [9]:
# Text Fields
# A word that was never added to the 'tokens' namespace is mapped to the
# index of the unknown token instead of raising an error.

print("The index of 'pernacious' in the 'tokens' namespace should be 1 "
      "(The @@UNKNOWN@@ token): ", vocab.get_token_index("pernacious", namespace="tokens"))

The index of 'pernacious' in the 'tokens' namespace should be 1(The @@UNKONWN@@ token):  1


In [10]:
# Label Fields
# Looking up a label that was never added to 'tags' raises a KeyError; it is
# caught here so the reason can be printed and the notebook keeps running.

missing_label = "pernacious"
try:
    vocab.get_token_index(missing_label, namespace="tags")
except KeyError:
    print("As 'tags' doesn't have an unknown token, getting non_existent tags will throw a KeyError")


Namespace: tags
Token: pernacious


As 'tags' doesn't have an unknown token, getting non_existent tags will throw a KeyError


# Generating vocabulary from Instances (Dataset)

In [18]:
# generating vocabulary from instances ( dataset)

from allennlp.data.fields import TextField,SequenceLabelField
from allennlp.data import Instance, Token
from allennlp.data.token_indexers import SingleIdTokenIndexer

# Build one tagged sentence: a TextField of tokens plus one label per token.
sentence_words = ["Barack", "Obama", "is", "a", "great", "guy", "."]
tokens = [Token(word) for word in sentence_words]
token_indexers = {"tokens": SingleIdTokenIndexer()}
sentence = TextField(tokens, token_indexers)

# Two PERSON labels for the name, then "O" for each of the remaining five tokens.
tag_labels = ["PERSON"] * 2 + ["O"] * 5
tags = SequenceLabelField(tag_labels, sentence, label_namespace='tags')

# The dataset here is a list holding this single Instance.
instances = [Instance({'sentence': sentence, 'tags': tags})]

In [20]:
# Let the Vocabulary collect its entries from the instances themselves,
# then show the index -> token mapping of both namespaces.
vocab = Vocabulary.from_instances(instances)
for ns in ("tokens", "tags"):
    print(vocab.get_index_to_token_vocabulary(ns))

100%|██████████| 1/1 [00:00<00:00, 9576.04it/s]

{0: '@@PADDING@@', 1: '@@UNKNOWN@@', 2: 'Barack', 3: 'Obama', 4: 'is', 5: 'a', 6: 'great', 7: 'guy', 8: '.'}
{0: 'O', 1: 'PERSON'}



