In [2]:
import spacy
# Load the spaCy pipelines used for tokenization. If a model package is not
# installed, download it once and retry.
import importlib
import subprocess
import sys


def _load_spacy_model(name):
    """Return the spaCy pipeline `name`, downloading the model package if it is missing.

    The 'parser', 'tagger' and 'ner' components are disabled when loading.

    Raises:
        subprocess.CalledProcessError: if the download command exits with a
            non-zero status.
    """
    disabled = ['parser', 'tagger', 'ner']
    try:
        return spacy.load(name, disable=disabled)
    except OSError:
        # Run the download with the interpreter that is executing this kernel
        # (sys.executable); a bare `python` on PATH may belong to a different
        # environment. check=True surfaces a failed download immediately
        # instead of silently ignoring the exit status.
        subprocess.run([sys.executable, '-m', 'spacy', 'download', name], check=True)
        # Let the import system notice the freshly installed package.
        importlib.invalidate_caches()
        return spacy.load(name, disable=disabled)


en_nlp = _load_spacy_model('en_core_web_sm')
de_nlp = _load_spacy_model('de_core_news_sm')

In [3]:
# Example data: a small English/German parallel corpus. Every entry holds the
# raw sentence pair plus lower-cased token lists wrapped in <sos>/<eos>.
data = [
    dict(
        en='The cat and dog are playing.',
        de='Die Katze und der Hund spielen.',
        en_tokens=['<sos>', 'the', 'cat', 'and', 'dog', 'are', 'playing', '.', '<eos>'],
        de_tokens=['<sos>', 'die', 'katze', 'und', 'der', 'hund', 'spielen', '.', '<eos>'],
    ),
    dict(
        en='The cat and cat are sleeping.',
        de='Die Katze und Katze schlafen.',
        en_tokens=['<sos>', 'the', 'cat', 'and', 'cat', 'are', 'sleeping', '.', '<eos>'],
        de_tokens=['<sos>', 'die', 'katze', 'und', 'katze', 'schlafen', '.', '<eos>'],
    ),
    dict(
        en='The dog is running.',
        de='Der Hund läuft.',
        en_tokens=['<sos>', 'the', 'dog', 'is', 'running', '.', '<eos>'],
        de_tokens=['<sos>', 'der', 'hund', 'läuft', '.', '<eos>'],
    ),
    dict(
        en='Two young, White males are outside near many bushes.',
        de='Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
        en_tokens=['<sos>', 'two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.', '<eos>'],
        de_tokens=['<sos>', 'zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.', '<eos>'],
    ),
    dict(
        en='Two young people are walking near bushes.',
        de='Zwei junge Leute gehen in der Nähe von Büschen.',
        en_tokens=['<sos>', 'two', 'young', 'people', 'are', 'walking', 'near', 'bushes', '.', '<eos>'],
        de_tokens=['<sos>', 'zwei', 'junge', 'leute', 'gehen', 'in', 'der', 'nähe', 'von', 'büschen', '.', '<eos>'],
    ),
    dict(
        en='The young students are outside.',
        de='Die junge Studenten sind im Freien.',
        en_tokens=['<sos>', 'the', 'young', 'students', 'are', 'outside', '.', '<eos>'],
        de_tokens=['<sos>', 'die', 'junge', 'studenten', 'sind', 'im', 'freien', '.', '<eos>'],
    ),
]

In [4]:
from collections import Counter
from typing import List, Dict

def build_vocab_from_parallel_data(data_iterator: List[Dict],
                                 min_freq: int = 2,
                                 specials: List[str] = ["<unk>", "<pad>", "<sos>", "<eos>"]):
    """Build English and German token-to-index dictionaries from sentence pairs.

    The raw ``'en'`` and ``'de'`` sentences of every item are tokenized with the
    module-level ``en_nlp`` / ``de_nlp`` pipelines and lower-cased before being
    counted. Other keys of an item are not read.

    Args:
        data_iterator: items with ``'en'`` and ``'de'`` sentence strings.
        min_freq: minimum count a token needs to enter a vocabulary.
        specials: tokens placed first, at indices ``0 .. len(specials) - 1``.

    Returns:
        ``(en_vocab, de_vocab, en_counter, de_counter)``: the two token -> index
        dicts followed by the two token frequency counters.
    """
    en_counter, de_counter = Counter(), Counter()

    # Single pass over the pairs: tokenize each side and tally lower-cased tokens.
    for pair in data_iterator:
        english = [tok.text.lower() for tok in en_nlp(pair['en'])]
        german = [tok.text.lower() for tok in de_nlp(pair['de'])]
        en_counter.update(english)
        de_counter.update(german)

    def _index_tokens(frequencies):
        # Special tokens occupy the leading indices.
        mapping = {special: position for position, special in enumerate(specials)}

        # Remaining tokens follow in descending-frequency order.
        next_index = len(specials)
        for word, freq in frequencies.most_common():
            if freq < min_freq or word in mapping:
                continue
            mapping[word] = next_index
            next_index += 1
        return mapping

    return _index_tokens(en_counter), _index_tokens(de_counter), en_counter, de_counter

# Build both vocabularies from the example corpus and report their sizes.
(en_vocab, de_vocab,
 en_counter, de_counter) = build_vocab_from_parallel_data(data)
print(f"English vocab size: {len(en_vocab)}")
print(f"German vocab size: {len(de_vocab)}")

English vocab size: 15
German vocab size: 17




In [5]:
# Display the English vocabulary (token -> index)
print("English vocab:", en_vocab)

# Display the German vocabulary (token -> index); the label previously said
# "English vocab:" for this line as well.
print("German vocab:", de_vocab)

English vocab: {'<unk>': 0, '<pad>': 1, '<sos>': 2, '<eos>': 3, '.': 4, 'are': 5, 'the': 6, 'cat': 7, 'young': 8, 'and': 9, 'dog': 10, 'two': 11, 'outside': 12, 'near': 13, 'bushes': 14}
English vocab: {'<unk>': 0, '<pad>': 1, '<sos>': 2, '<eos>': 3, '.': 4, 'der': 5, 'die': 6, 'katze': 7, 'junge': 8, 'und': 9, 'hund': 10, 'zwei': 11, 'sind': 12, 'im': 13, 'freien': 14, 'in': 15, 'nähe': 16}


- Building on the function above, we wrap the mapping in a `Vocabulary` class. `set_default_index` sets the index that is returned when a given word is unknown (here, the index of `<unk>`).
- `lookup_indices` returns the list of corresponding indices for an input list of strings.
- `lookup_tokens` returns the list of corresponding vocabulary words for an input list of indices.

In [6]:
from collections import Counter
from typing import List, Dict

class Vocabulary:
    """Two-way mapping between tokens and integer indices.

    Unknown tokens (and unknown indices) resolve to a configurable default
    index, which is unset (``None``) until ``set_default_index`` is called.
    """

    def __init__(self, tokens_to_index):
        """Store the token -> index mapping and derive the index -> token mapping."""
        self.tokens_to_index = tokens_to_index
        self.index_to_tokens = {v: k for k, v in tokens_to_index.items()}
        # Index returned for unknown tokens; None until set_default_index is called.
        self.default_index = None
    
    def __getitem__(self, token):
        """Retrieve the index of `token`, or the default index if it is unknown."""
        return self.tokens_to_index.get(token, self.default_index)
    
    # We can run `"the" in en_vocab` to check if a token is in the vocabulary
    def __contains__(self, token):
        """Enable membership testing with 'in' operator"""
        return (token in self.tokens_to_index)
    
    def set_default_index(self, index):
        """Set the index used for tokens and indices that are not in the vocabulary."""
        self.default_index = index
        
    def lookup_indices(self, tokens):
        """Convert a list of tokens to indices (unknown tokens map to the default index)."""
        return [self[token] for token in tokens]
    
    def lookup_tokens(self, indices):
        """Convert a list of indices back to tokens.

        An index that is not in the vocabulary maps to the token of the default
        index.

        Raises:
            KeyError: if an index is unknown and the default index is unset or
                is itself not in the vocabulary.
        """
        tokens = []
        for idx in indices:
            if idx in self.index_to_tokens:
                tokens.append(self.index_to_tokens[idx])
            elif self.default_index in self.index_to_tokens:
                # Resolve the fallback only when it is needed, so valid indices
                # still work while no default index has been set.
                tokens.append(self.index_to_tokens[self.default_index])
            else:
                raise KeyError(
                    f"index {idx!r} is not in the vocabulary and no valid default index is set"
                )
        return tokens
    
    def __len__(self):
        """Return the number of tokens in the vocabulary."""
        return len(self.tokens_to_index)

def build_vocab_from_parallel_data(data_iterator: List[Dict],
                                 min_freq: int = 2,
                                 specials: List[str] = ["<unk>", "<pad>", "<sos>", "<eos>"]):
    """Build English and German ``Vocabulary`` objects from sentence pairs.

    The raw ``'en'`` and ``'de'`` sentences of every item are tokenized with the
    module-level ``en_nlp`` / ``de_nlp`` pipelines and lower-cased before being
    counted. Other keys of an item are not read.

    Args:
        data_iterator: items with ``'en'`` and ``'de'`` sentence strings.
        min_freq: minimum count a token needs to enter a vocabulary.
        specials: tokens placed first, at indices ``0 .. len(specials) - 1``.
            When ``"<unk>"`` is among them, its index becomes the default
            index of each vocabulary; otherwise no default index is set.

    Returns:
        ``(en_vocab, de_vocab, en_counter, de_counter)``: the two ``Vocabulary``
        objects followed by the two token frequency counters.
    """
    en_counter = Counter()
    de_counter = Counter()

    for item in data_iterator:
        en_tokens = [token.text.lower() for token in en_nlp(item['en'])]
        de_tokens = [token.text.lower() for token in de_nlp(item['de'])]
        en_counter.update(en_tokens)
        de_counter.update(de_tokens)

    def create_vocab(counter):
        # Special tokens occupy the leading indices.
        tokens_to_index = {token: i for i, token in enumerate(specials)}

        # Frequent tokens follow in descending-frequency order.
        idx = len(specials)
        for token, count in counter.most_common():
            if count >= min_freq and token not in tokens_to_index:
                tokens_to_index[token] = idx
                idx += 1

        vocab = Vocabulary(tokens_to_index)
        # Only register a default index when "<unk>" is actually present;
        # indexing it unconditionally raised KeyError for custom `specials`.
        if "<unk>" in tokens_to_index:
            vocab.set_default_index(tokens_to_index["<unk>"])
        return vocab

    en_vocab = create_vocab(en_counter)
    de_vocab = create_vocab(de_counter)

    return en_vocab, de_vocab, en_counter, de_counter

# Build vocabularies
en_vocab, de_vocab, en_counter, de_counter = build_vocab_from_parallel_data(data)

# Test vocabulary: show the index -> token mapping of each language.
# The German label previously read "German vocab size:" although the line
# prints the mapping itself, not a size.
print("English vocab :", en_vocab.index_to_tokens)
print("German vocab :", de_vocab.index_to_tokens)

English vocab : {0: '<unk>', 1: '<pad>', 2: '<sos>', 3: '<eos>', 4: '.', 5: 'are', 6: 'the', 7: 'cat', 8: 'young', 9: 'and', 10: 'dog', 11: 'two', 12: 'outside', 13: 'near', 14: 'bushes'}
German vocab size: {0: '<unk>', 1: '<pad>', 2: '<sos>', 3: '<eos>', 4: '.', 5: 'der', 6: 'die', 7: 'katze', 8: 'junge', 9: 'und', 10: 'hund', 11: 'zwei', 12: 'sind', 13: 'im', 14: 'freien', 15: 'in', 16: 'nähe'}


In [7]:
# Look up the index of a token that is in the vocabulary (uses Vocabulary.__getitem__)
en_vocab["two"]

11

In [8]:
# A token that is not in the vocabulary falls back to the default index (the index of '<unk>')
en_vocab["unknown word"]

0

In [9]:
# Testing lookup_indices: upper-case 'Two' is treated as an unknown word, because the
# vocabulary was built from lower-cased tokens and lookup_indices does no case folding.
# We could lower-case inside the lookup, but we assume the input is already lower-cased.
indices = en_vocab.lookup_indices(['Two', 'young', 'unknownword'])
indices

[0, 8, 0]

In [10]:
# Testing lookup_tokens: map the indices from the previous cell back to tokens.
# The two unknown inputs come back as '<unk>', not as their original text.
tokens = en_vocab.lookup_tokens(indices)
tokens

['<unk>', 'young', '<unk>']

In [11]:
# Testing __contains__: the `in` operator checks whether a token is in the vocabulary
"the" in en_vocab

True