## constants.py
https://github.com/joeynmt/joeynmt/blob/master/joeynmt/constants.py

In [1]:
"""
Defining global constants
"""

# Special vocabulary symbols. Vocabulary.specials lists them in exactly this
# order (UNK, PAD, BOS, EOS), so they receive the lowest indices when a
# vocabulary is built from a token list or a file.
UNK_TOKEN = '<unk>'
PAD_TOKEN = '<pad>'
BOS_TOKEN = '<s>'
EOS_TOKEN = '</s>'

# Zero-argument callable returning index 0. It is used as the default factory
# of Vocabulary.stoi, so looking up a token that is not in the mapping yields 0.
DEFAULT_UNK_ID = lambda: 0

In [4]:
DEFAULT_UNK_ID()

0

## vocabulary.py
https://github.com/joeynmt/joeynmt/blob/master/joeynmt/vocabulary.py

In [5]:
from collections import defaultdict,Counter
from typing import List
import numpy as np
from torchtext.data import Dataset

In [249]:
# coding: utf-8

"""
Vocabulary module
"""

class Vocabulary:
    """Mapping between tokens and integer indices.

    ``itos`` is the list of tokens ordered by index; ``stoi`` maps a token to
    its index. ``stoi`` is a ``defaultdict`` whose factory is
    ``DEFAULT_UNK_ID``, so subscripting it with an unseen token returns the
    unknown-token index (and stores that token as a side effect).
    """

    def __init__(self, tokens: List[str] = None, file: str = None,
                 encoding='utf-8', lower=False) -> None:
        """
        Create a vocabulary from a list of tokens and/or a file.

        The special tokens are placed first whenever ``tokens`` or ``file``
        is given. With neither argument the vocabulary starts out completely
        empty (no special tokens). If both are given, ``tokens`` is loaded
        first and the file's tokens are added afterwards.
        File format: one token per line.

        :param tokens: list of tokens
        :param file: path of the file to load the vocabulary from
        :param encoding: text encoding used to read ``file``
        :param lower: lower-case every token before adding it
        """
        # don't rename stoi and itos since needed for torchtext
        # warning: stoi grows with unknown tokens, don't use for saving or size

        # special symbols
        self.specials = [UNK_TOKEN, PAD_TOKEN, BOS_TOKEN, EOS_TOKEN]
        self.stoi = defaultdict(DEFAULT_UNK_ID)
        self.itos = []
        if tokens is not None:
            # forward `lower` so both construction paths normalise identically
            self._from_list(tokens, lower=lower)
        if file is not None:
            self._from_file(file, encoding=encoding, lower=lower)

    def add_tokens(self, tokens: List[str], lower=False) -> None:
        """
        Add a list of tokens to the vocabulary, skipping those already present.

        Each token is stripped of surrounding whitespace (and lower-cased if
        requested) *before* the duplicate check, so e.g. "Make" and "make"
        collapse into a single entry when ``lower`` is true.

        :param tokens: list of tokens to add to the vocabulary
        :param lower: lower-case every token before adding it
        """
        for token in tokens:
            token = token.strip()
            if lower:
                token = token.lower()
            # `.get` neither inserts into the defaultdict nor scans `itos`.
            # stoi may hold entries created by earlier unknown-token lookups
            # (they point at the UNK index), so confirm via itos that the
            # token really lives at the recorded index.
            known_index = self.stoi.get(token)
            if (known_index is not None
                    and known_index < len(self.itos)
                    and self.itos[known_index] == token):
                continue
            self.stoi[token] = len(self.itos)
            self.itos.append(token)

    def _from_list(self, tokens: List[str] = None, lower=False) -> None:
        """
        Make vocabulary from list of tokens.

        Special symbols are added first if they are not already present.

        :param tokens: list of tokens; ``None`` adds only the special symbols
        :param lower: lower-case every token before adding it
        """
        extra = [] if tokens is None else list(tokens)
        self.add_tokens(tokens=self.specials + extra, lower=lower)
        # Sanity check: every token in itos maps back to its own index.
        # (Comparing lengths would fail spuriously once stoi has grown
        # through unknown-token lookups.)
        assert all(self.stoi.get(t) == i for i, t in enumerate(self.itos))

    def _from_file(self, file: str, encoding='utf-8', lower=False) -> None:
        """
        Make vocabulary from contents of file.

        File format: one token per line.

        :param file: path to file where the vocabulary is loaded from
        :param encoding: text encoding used to read the file
        :param lower: lower-case every token before adding it
        """
        with open(file, "r", encoding=encoding) as open_file:
            tokens = [line.strip("\n") for line in open_file]
        self._from_list(tokens, lower=lower)

    def __str__(self) -> str:
        """Return the string form of the token-to-index mapping."""
        return str(self.stoi)

    def to_file(self, file: str, encoding='utf-8') -> None:
        """
        Save the vocabulary to a file, by writing token with index i in line i.

        :param file: path to file where the vocabulary is written
        :param encoding: text encoding used to write the file
        """
        with open(file, "w", encoding=encoding) as open_file:
            open_file.writelines(f"{t}\n" for t in self.itos)

    def is_unk(self, token: str) -> bool:
        """
        Check whether a token maps to the unknown-token index.

        :param token: token to look up
        :return: True if the token is NOT covered by the vocabulary (or is the
            token stored at the unknown index itself), False if it is covered
        """
        # `.get` avoids inserting the token into the defaultdict.
        return self.stoi.get(token, DEFAULT_UNK_ID()) == DEFAULT_UNK_ID()

    def __len__(self) -> int:
        """Return the number of tokens in the vocabulary."""
        return len(self.itos)

    def _array_to_sentence(self, array: np.ndarray, cut_at_eos=True,
                           skip_pad=True) -> List[str]:
        """
        1D array --> sentence

        Converts an array of IDs to a sentence, optionally cutting the result
        off at the end-of-sequence token.

        :param array: 1D array containing indices
        :param cut_at_eos: cut the decoded sentence at the first EOS token
        :param skip_pad: skip generated PAD tokens
        :return: list of strings (tokens)
        """
        sentence = []
        for index in array:
            token = self.itos[index]
            if cut_at_eos and token == EOS_TOKEN:
                break
            if skip_pad and token == PAD_TOKEN:
                continue
            sentence.append(token)
        return sentence

    def _arrays_to_sentences(self, arrays: np.ndarray, cut_at_eos=True,
                             skip_pad=True) -> List[List[str]]:
        """
        2D array --> sentences

        Convert multiple arrays containing sequences of token IDs to their
        sentences, optionally cutting them off at the end-of-sequence token.

        :param arrays: 2D array containing indices
        :param cut_at_eos: cut the decoded sentences at the first EOS token
        :param skip_pad: skip generated PAD tokens
        :return: list of list of strings (tokens)
        """
        return [
            self._array_to_sentence(array=row, cut_at_eos=cut_at_eos,
                                    skip_pad=skip_pad)
            for row in arrays
        ]

    def array_to_sentence(self, array: np.ndarray, cut_at_eos=True,
                          skip_pad=True) -> List:
        """
        Convert a 1D or 2D array of token IDs into tokens.

        :param array: array of indices; its ``shape`` decides the result type
        :param cut_at_eos: cut each decoded sentence at the first EOS token
        :param skip_pad: skip generated PAD tokens
        :return: a list of tokens for 1D input, a list of token lists for
            2D input
        :raises ValueError: if ``array`` is neither 1D nor 2D
        """
        dim = len(array.shape)
        if dim == 1:
            return self._array_to_sentence(array, cut_at_eos, skip_pad)
        if dim == 2:
            return self._arrays_to_sentences(array, cut_at_eos, skip_pad)
        raise ValueError(f"要求参数array必须是1D或2D的，但现在输入的参数array是{dim}D的！")

    def arrays_to_sentences(self, array: np.ndarray, cut_at_eos=True,
                            skip_pad=True) -> List:
        """Alias of :meth:`array_to_sentence`; accepts the same 1D/2D input."""
        return self.array_to_sentence(array, cut_at_eos, skip_pad)

    # NOTE: this function lives in the class body but takes no `self`. When it
    # is invoked on an instance, the instance is bound to `field`. The
    # annotations are strings so that nothing is looked up while the class
    # body is still executing (the name `Vocabulary` is not bound until the
    # class statement has finished).
    def build_vocab(field: str, max_size: int, min_freq: int = 0,
                    dataset: "Dataset" = None, vocab_file: str = None,
                    encoding='utf-8', lower=False) -> "Vocabulary":
        """
        Builds vocabulary for a torchtext `field` from given `dataset` or
        `vocab_file`.

        :param field: attribute name; only "src" and "trg" are read from the
            dataset examples
        :param max_size: maximum number of dataset tokens to keep (special
            tokens come on top); not applied when loading from `vocab_file`
        :param min_freq: minimum frequency for an item to be included;
            only applied when greater than 0
        :param dataset: dataset whose ``examples`` supply the tokens
        :param vocab_file: if not None, the vocabulary is loaded from this
            file and `dataset` is ignored
        :param encoding: text encoding used to read `vocab_file`
        :param lower: lower-case tokens loaded from `vocab_file`
        :return: Vocabulary created from either `dataset` or `vocab_file`
        :raises ValueError: if both `dataset` and `vocab_file` are None
        """
        if vocab_file is not None:
            # load it from file
            vocab = Vocabulary(file=vocab_file, encoding=encoding, lower=lower)
        elif dataset is not None:
            # create newly
            tokens = []
            for example in dataset.examples:
                if field == "src":
                    tokens.extend(example.src)
                elif field == "trg":
                    tokens.extend(example.trg)

            counter = Counter(tokens)
            if min_freq > 0:
                counter = Counter(
                    {t: c for t, c in counter.items() if c >= min_freq})

            # most frequent first; ties broken alphabetically
            ranked = sorted(counter.items(), key=lambda tup: (-tup[1], tup[0]))
            vocab_tokens = [token for token, _ in ranked[:max_size]]
            assert len(vocab_tokens) <= max_size

            vocab = Vocabulary(tokens=vocab_tokens)
            assert len(vocab) <= max_size + len(vocab.specials)
            assert vocab.itos[DEFAULT_UNK_ID()] == UNK_TOKEN

        else:
            raise ValueError(f"要求参数dataset或者vocab_file 至少有一个不为空！")

        # check for all except for UNK token whether they are OOVs
        for s in vocab.specials[1:]:
            assert not vocab.is_unk(s)

        return vocab
        

In [250]:
vocab=Vocabulary()
vocab2=Vocabulary()

In [251]:
# Fill the two (so far empty) vocabularies from the same file;
# vocab2 additionally lower-cases every token it loads.
vocab._from_file('untitled.txt')
vocab2._from_file('untitled.txt',lower=True)

In [252]:
# build_vocab has no `self` parameter, so calling it on an instance binds
# `vocab` to its first parameter (`field`). With neither `dataset` nor
# `vocab_file` supplied, the call raises ValueError.
vocab.build_vocab(max_size=3)

ValueError: 要求参数dataset或者vocab_file 至少有一个不为空！

In [253]:
v=vocab.build_vocab(max_size=3,vocab_file='untitled.txt', encoding='utf-8',lower=False)
v.itos

['<unk>',
 '<pad>',
 '<s>',
 '</s>',
 'Make',
 'vocabulary',
 'from',
 'list',
 'of',
 'tokens']

In [254]:
v=vocab.build_vocab(max_size=3,vocab_file='untitled.txt')
v.itos

['<unk>',
 '<pad>',
 '<s>',
 '</s>',
 'Make',
 'vocabulary',
 'from',
 'list',
 'of',
 'tokens']

In [255]:
# Sample index arrays of different ranks for exercising array_to_sentence /
# arrays_to_sentences.
array=np.array([1,2,3])
arrays=np.array([[1,2,3],[1,2,0]])
array_3d=np.array([[[1,2,3],[1,2,0]],[[1,2,3],[1,2,0]]])
# NOTE: despite its name, np.array([]) is an empty 1-D array (shape (0,)),
# not a 0-D array, so it is handled by the 1-D code path.
array_0d=np.array([])

In [256]:
array_3d.shape,array_0d.shape

((2, 2, 3), (0,))

In [257]:
array.shape,arrays.shape,array_3d.shape

((3,), (2, 3), (2, 2, 3))

In [258]:
len(array.shape),len(arrays.shape)

(1, 2)

In [259]:
s1=vocab.array_to_sentence(array)
s2=vocab.arrays_to_sentences(array)
s1,s2

(['<s>'], ['<s>'])

In [260]:
s1=vocab.array_to_sentence(arrays)
s2=vocab.arrays_to_sentences(arrays)
s1,s2

([['<s>'], ['<s>', '<unk>']], [['<s>'], ['<s>', '<unk>']])

In [262]:
vocab.arrays_to_sentences(array, cut_at_eos=False,skip_pad=False)

['<pad>', '<s>', '</s>']

In [263]:
vocab.array_to_sentence(array, cut_at_eos=False,skip_pad=False)

['<pad>', '<s>', '</s>']

In [264]:
ss=vocab.arrays_to_sentences(arrays, cut_at_eos=False,skip_pad=False)
ss

[['<pad>', '<s>', '</s>'], ['<pad>', '<s>', '<unk>']]

In [266]:
vocab.array_to_sentence(arrays, cut_at_eos=False,skip_pad=False)

[['<pad>', '<s>', '</s>'], ['<pad>', '<s>', '<unk>']]

In [267]:
vocab.array_to_sentence(array_3d, cut_at_eos=False,skip_pad=False)

ValueError: 要求参数array必须是1D或2D的，但现在输入的参数array是3D的！

In [268]:
vocab.arrays_to_sentences(array_3d, cut_at_eos=False,skip_pad=False)

ValueError: 要求参数array必须是1D或2D的，但现在输入的参数array是3D的！

In [269]:
vocab.arrays_to_sentences(array_0d, cut_at_eos=False,skip_pad=False)

[]

In [270]:
vocab.array_to_sentence(array_0d, cut_at_eos=False,skip_pad=False)

[]

In [208]:
str(vocab2)

"defaultdict(<function <lambda> at 0x7fb54d0fadd0>, {'<unk>': 0, '<pad>': 1, '<s>': 2, '</s>': 3, 'make': 4, 'vocabulary': 5, 'from': 6, 'list': 7, 'of': 8, 'tokens': 9})"

In [209]:
vocab.stoi

defaultdict(<function __main__.<lambda>()>,
            {'<unk>': 0,
             '<pad>': 1,
             '<s>': 2,
             '</s>': 3,
             'Make': 4,
             'vocabulary': 5,
             'from': 6,
             'list': 7,
             'of': 8,
             'tokens': 9})

In [210]:
vocab.itos,vocab2.itos

(['<unk>',
  '<pad>',
  '<s>',
  '</s>',
  'Make',
  'vocabulary',
  'from',
  'list',
  'of',
  'tokens'],
 ['<unk>',
  '<pad>',
  '<s>',
  '</s>',
  'make',
  'vocabulary',
  'from',
  'list',
  'of',
  'tokens'])

In [211]:
# The special tokens are already in the vocabulary, so add_tokens skips them
# and the vocabulary stays unchanged.
vocab.add_tokens(vocab.specials)

In [212]:
vocab.itos,

(['<unk>',
  '<pad>',
  '<s>',
  '</s>',
  'Make',
  'vocabulary',
  'from',
  'list',
  'of',
  'tokens'],)

In [213]:
vocab.stoi['<unk>'],vocab.itos[2]

(0, '<s>')

In [214]:
vocab.stoi

defaultdict(<function __main__.<lambda>()>,
            {'<unk>': 0,
             '<pad>': 1,
             '<s>': 2,
             '</s>': 3,
             'Make': 4,
             'vocabulary': 5,
             'from': 6,
             'list': 7,
             'of': 8,
             'tokens': 9})

In [215]:
specials = [UNK_TOKEN, PAD_TOKEN, BOS_TOKEN, EOS_TOKEN]
stoi = defaultdict(DEFAULT_UNK_ID)

In [216]:
specials

['<unk>', '<pad>', '<s>', '</s>']

In [217]:
stoi['3']

0

In [218]:
# from collections import defaultdict

# A defaultdict creates the value for a missing key by calling its factory
# with no arguments and stores the result under that key.
dict1 = defaultdict(int)
dict2 = defaultdict(set)
dict3 = defaultdict(str)
dict4 = defaultdict(list)
dict1[2] ='two'  # explicit assignment; the factory is not involved

print(dict1[2])  # existing key -> the stored value 'two'
print(dict1[1])  # missing key -> int() == 0 (key 1 is now stored)
print(dict2[1])  # missing key -> set()
print(dict3[1])  # missing key -> '' (prints an empty line)
print(dict4[1])  # missing key -> []

two
0
set()

[]


In [219]:
dict1

defaultdict(int, {2: 'two', 1: 0})