In [1]:
import collections
import re

In [6]:
# toy training corpus: every word is repeated a fixed number of times, each followed by a space
example_corpus = "".join(
    f"{word} " * count
    for word, count in [("low", 5), ("lowest", 2), ("newer", 6), ("wider", 3), ("new", 2)]
)
example_corpus

'low low low low low lowest lowest newer newer newer newer newer newer wider wider wider new new '

### Byte-Pair Encoding

This is an algorithm for subword tokenization. It has two parts: a `token learner` and a `token segmenter`. The learner generates a vocabulary of subword types, and the segmenter converts a test sentence into a sequence of subword tokens using the learned vocabulary.

To learn the vocabulary, we perform the following steps:

1) Given the input corpus of text (which is a single string), split it on whitespace to get a list of strings. Then append a special end-of-word character "_" to each string. We define each of these strings to be a "word".
2) Count the frequency of all the unique words. Split each word into a list of its individual characters, which are the tokens.
3) Initialize the vocabulary as the set of all unique characters across these words, including the "_" character.
4) Find the pair of adjacent tokens that appears most frequently (in case of ties, just break them in some consistent way). Merge the pair of tokens into a single new token, add this new token to the vocabulary, and replace all occurrences of this pair across all words with this new token.
5) Repeat step 4 until the vocabulary reaches some specified size (equivalently, for a fixed number of merges `k`, since each merge adds one token).

In addition to the learned vocabulary of all merged tokens, we also keep track of all the pairs that have been merged, in the order they were merged. Then we can segment a test corpus by first splitting it into individual characters (replacing all whitespace with the "_" end-of-word character). Then we replace all occurrences of adjacent tokens from the first merged pair with the merged pair token. Then we repeat this for the second pair, and so on.


In [126]:
""" 
    Define some helper functions that will be used by the token learner
"""

def init_vocab_words(corpus, eow_token="_"):
    """Build the initial character vocabulary and the word-frequency table.

    The corpus is lower-cased and split on whitespace. Every word gets
    ``eow_token`` appended and is stored as a string of tokens separated by
    single spaces (e.g. ``"l o w _"``).

    Args:
        corpus: Training text as a single string.
        eow_token: End-of-word marker appended to every word.

    Returns:
        A ``(vocab, word_freq)`` tuple. ``vocab`` is the sorted list of unique
        characters of the lower-cased corpus followed by ``eow_token``.
        ``word_freq`` is a ``defaultdict(int)`` mapping each space-separated
        word key to the number of times the word occurs.
    """
    # split corpus across whitespace and make all characters lower case
    # (split() with no argument already ignores leading/trailing whitespace)
    words = corpus.lower().split()

    # The vocabulary must come from the same lower-cased words that are counted
    # below; otherwise upper-case characters that never occur in any word key
    # leak into it and their lower-case forms may be missing.
    chars = set("".join(words))
    # avoid listing the end-of-word marker twice if it also occurs in the corpus
    chars.discard(eow_token)
    # sorted() makes the vocabulary order reproducible across runs
    vocab = sorted(chars) + [eow_token]

    # get word frequencies
    word_freq = collections.defaultdict(int)
    for word in words:
        # store the word key as a string containing tokens separated by whitespace;
        # the marker is added as one token so it stays intact even if it is longer
        # than a single character
        word_key = " ".join(list(word) + [eow_token])
        word_freq[word_key] += 1

    return vocab, word_freq


def get_pairs(word_freq):
    """Tally every pair of adjacent tokens, weighted by word frequency.

    Args:
        word_freq: Mapping from a word key (tokens separated by whitespace)
            to the number of times that word occurs.

    Returns:
        A ``collections.Counter`` keyed by ``(left_token, right_token)`` tuples.
        A word with fewer than two tokens contributes nothing.
    """
    pair_counts = collections.Counter()
    for word_key, count in word_freq.items():
        symbols = word_key.split()
        # walk over neighbouring positions; an empty range for 0 or 1 symbols
        for idx in range(len(symbols) - 1):
            pair_counts[(symbols[idx], symbols[idx + 1])] += count
    return pair_counts


def merge_pair_learner(pair, word_freq):
    """Merge every occurrence of ``pair`` in all words (token learner).

    Args:
        pair: Tuple ``(left_token, right_token)`` to be merged.
        word_freq: Mapping from a word key (tokens separated by single spaces)
            to its frequency.

    Returns:
        A new ``defaultdict(int)`` with the same frequencies, where each key has
        every adjacent ``left_token right_token`` replaced by the concatenated
        token. The input mapping is not modified.
    """
    # Match the pair only where both symbols are complete tokens: the lookbehind
    # and lookahead require whitespace (or the string edge) on either side, so
    # ("o", "w") does not fire inside "lo w". re.escape keeps tokens such as "."
    # or "(" from being interpreted as regex syntax.
    pattern = re.compile(r"(?<!\S)" + re.escape(" ".join(pair)) + r"(?!\S)")
    merged = "".join(pair)

    word_freq_new = collections.defaultdict(int)
    for word, freq in word_freq.items():
        # a callable replacement inserts the merged token literally, so
        # backslashes in a token are not treated as group references
        word_new = pattern.sub(lambda _match: merged, word)
        # accumulate rather than overwrite, so no count is lost if two keys
        # ever map to the same merged form
        word_freq_new[word_new] += freq
    return word_freq_new

"""
    BPE token learner 
"""
def bpe_learn(corpus, k=100):
    """Learn a byte-pair-encoding vocabulary from ``corpus``.

    Starting from single characters, the most frequent pair of adjacent tokens
    is merged repeatedly, for at most ``k`` merges. Learning stops early once no
    adjacent pairs remain.

    Args:
        corpus: Training text as a single string.
        k: Maximum number of merges to perform.

    Returns:
        A ``(vocab, merged_pairs, word_tokens)`` tuple. ``vocab`` is the list of
        initial characters followed by the merged tokens, ``merged_pairs`` is the
        list of merged ``(left, right)`` pairs in the order they were learned, and
        ``word_tokens`` is a ``defaultdict(list)`` mapping every lower-cased
        training word to its tokenization under the learned merges.
    """
    # get initial vocab and word frequencies
    vocab, word_freq = init_vocab_words(corpus)
    # set mirror of vocab for cheap membership checks
    known_tokens = set(vocab)

    # perform mergers to learn the vocabulary
    merged_pairs = []
    for _ in range(k):
        # get counts of all adjacent token pairs
        pairs = get_pairs(word_freq)
        if not pairs:
            # every word has collapsed into a single token; nothing left to merge
            break
        # get most frequent pair
        most_freq_pair = pairs.most_common(1)[0][0]

        # apply merger
        word_freq = merge_pair_learner(most_freq_pair, word_freq)

        # Different pairs can concatenate to the same string, e.g. ("a", "bc")
        # and ("ab", "c"); only add the token to the vocab the first time.
        new_token = "".join(most_freq_pair)
        if new_token not in known_tokens:
            known_tokens.add(new_token)
            vocab.append(new_token)
        merged_pairs.append(most_freq_pair)

    # precompute tokenized words; dict.fromkeys keeps first-occurrence order while
    # dropping repeats, so each distinct word is tokenized only once
    word_tokens = collections.defaultdict(list)
    for word in dict.fromkeys(corpus.lower().split()):
        word_tokens[word] = tokenize_word(word, vocab, merged_pairs)

    return vocab, merged_pairs, word_tokens
    

def merge_pair_segmenter(pair, corpus_tokens):
    """Merge every occurrence of ``pair`` in a token string (token segmenter).

    Args:
        pair: Tuple ``(left_token, right_token)`` to be merged.
        corpus_tokens: String of tokens separated by single spaces.

    Returns:
        A new string in which every adjacent ``left_token right_token`` has been
        replaced by the concatenated token.
    """
    # Require whitespace (or the string edge) on both sides so only complete
    # tokens are merged, and escape the tokens so characters such as "." or "("
    # are matched literally rather than as regex syntax.
    pattern = r"(?<!\S)" + re.escape(" ".join(pair)) + r"(?!\S)"
    merged = "".join(pair)
    # a callable replacement inserts the merged token literally (no backslash
    # processing of the replacement text)
    return re.sub(pattern, lambda _match: merged, corpus_tokens)


def tokenize_word(word, vocab, pairs):
    """Encode a single word as a list of BPE tokens.

    The word is split into characters, the "_" end-of-word marker is added as
    the last symbol, and the merges in ``pairs`` are applied in order.

    Args:
        word: The word to encode (without end-of-word marker).
        vocab: Learned vocabulary; not consulted by this function.
        pairs: Merged ``(left, right)`` pairs in the order they were learned.

    Returns:
        The list of tokens for ``word``.
    """
    # characters separated by spaces, followed by the end-of-word symbol
    segmented = " ".join(word) + " _"
    for merge in pairs:
        segmented = merge_pair_segmenter(merge, segmented)
        pieces = segmented.split()
        if len(pieces) == 1:
            # the whole word is one token; later merges cannot change it
            return pieces
    return segmented.split()

""" 
    BPE token segmenter
"""    
def bpe_segment(test_corpus, vocab, pairs, precomputed_word_tokens=None):
    """Segment ``test_corpus`` into BPE tokens, printing each word's tokens.

    Words are looked up exactly as they appear in ``test_corpus`` (no case
    folding is applied here).

    Args:
        test_corpus: Text to segment, as a single string.
        vocab: Learned vocabulary, passed through to ``tokenize_word``.
        pairs: Merged ``(left, right)`` pairs in the order they were learned.
        precomputed_word_tokens: Optional mapping from a word to its token
            list. Words found here are not re-tokenized.

    Returns:
        A flat list with the tokens of every word, in corpus order.
    """
    if precomputed_word_tokens is None:
        precomputed_word_tokens = {}

    corpus_tokens = []
    # split corpus into words
    for word in test_corpus.split():
        if word in precomputed_word_tokens:
            word_tokens = precomputed_word_tokens[word]
        else:
            # unseen word: apply the learned merges from scratch
            word_tokens = tokenize_word(word, vocab, pairs)
        print(f"{word} ---> {word_tokens}")
        corpus_tokens.extend(word_tokens)

    return corpus_tokens

In [127]:
# learn the vocabulary, the ordered merge list and the per-word tokenizations from the toy corpus
vocab, merged_pairs, word_tokens  = bpe_learn(example_corpus)

In [128]:
# Preview the character-level form of a test sentence: each whitespace run becomes
# the "_" end-of-word marker and all symbols are separated by single spaces.
s = "Yes, they said we need to lower it."
" ".join("_".join(s.split()) + "_")

'Y e s , _ t h e y _ s a i d _ w e _ n e e d _ t o _ l o w e r _ i t . _'

In [129]:
# segment a single word by applying the learned merges in order
tokenize_word("lower", vocab, merged_pairs)

['low', 'er_']

In [130]:
# segment the test sentence; words present in word_tokens reuse their precomputed tokens
bpe_segment(s, vocab, merged_pairs, word_tokens)

Yes, ---> ['Y', 'e', 's', ',', '_']
they ---> ['t', 'h', 'e', 'y', '_']
said ---> ['s', 'a', 'i', 'd', '_']
we ---> ['w', 'e', '_']
need ---> ['ne', 'e', 'd', '_']
to ---> ['t', 'o', '_']
lower ---> ['low', 'er_']
it. ---> ['i', 't', '.', '_']
