# [chapter-13] Analyzing and understanding BERT tokenizer output

In [31]:
from tokenizers import BertWordPieceTokenizer
# Build a WordPiece tokenizer from the vocabulary file ./bert-vocab.txt.
# `tokenizer` is a notebook-level name that later cells rely on.
tokenizer = BertWordPieceTokenizer(vocab_file="./bert-vocab.txt")
# Encode one sample sentence and print its token strings to inspect how the
# words are split into pieces.
enc = tokenizer.encode("The canoes nearly capsized when we were overtaken by the motorboat's wake.")
print(enc.tokens)

['[CLS]', 'the', 'canoe', '##s', 'nearly', 'caps', '##ized', 'when', 'we', 'were', 'over', '##taken', 'by', 'the', 'motor', '##boat', "'", 's', 'wake', '.', '[SEP]']


In [32]:
import pickle
# Load the pickled labels object from data/outdoors_labels.pickle.
# get_wordpieces later calls labels.items(), so a mapping is expected here.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only run this against a file from a trusted source.
with open('data/outdoors_labels.pickle','rb') as fd:
    labels = pickle.load(fd)

In [33]:
import re
def get_wordpieces(labels, encoder=None):
    """Count the lowercase-ASCII ``##`` word pieces produced from ``labels``.

    Each value of ``labels`` is encoded, and every resulting token that starts
    with ``##`` and consists only of the characters ``a``-``z`` and ``#`` is
    tallied.

    Args:
        labels: Mapping whose values are the texts to encode. The keys are
            not used.
        encoder: Optional object exposing ``encode(text)`` whose result has a
            ``tokens`` sequence of strings. When omitted, the notebook-level
            ``tokenizer`` is used, matching the original behaviour.

    Returns:
        dict mapping each matching token to its occurrence count, inserted
        from most to least frequent. Tokens with equal counts keep the order
        in which they were first encountered.
    """
    if encoder is None:
        # Fall back to the tokenizer created earlier in the notebook.
        encoder = tokenizer

    # Raw string on purpose: the previous "[a-z\#]+" literal relied on an
    # invalid string escape, which newer Python versions warn about.
    # '#' needs no escaping inside a character class.
    pattern = re.compile(r"[a-z#]+")  # ascii only

    pieces = {}
    for text in labels.values():
        for tok in encoder.encode(text).tokens:
            if tok.startswith('##') and pattern.fullmatch(tok):
                pieces[tok] = pieces.get(tok, 0) + 1

    # sorted() is stable (also with reverse=True), so ties stay in first-seen order.
    return dict(sorted(pieces.items(), key=lambda item: item[1], reverse=True))

In [34]:
# Tally the '##' word pieces across every label text, using the notebook-level tokenizer.
suffixes = get_wordpieces(labels)

In [35]:
# Display the counts; get_wordpieces returns them ordered from most to least frequent.
suffixes

{'##xx': 8352,
 '##s': 5667,
 '##r': 2086,
 '##ing': 2044,
 '##t': 1750,
 '##e': 1566,
 '##er': 1562,
 '##n': 1320,
 '##en': 876,
 '##a': 852,
 '##x': 812,
 '##g': 777,
 '##p': 735,
 '##es': 722,
 '##y': 721,
 '##m': 718,
 '##l': 711,
 '##d': 683,
 '##o': 623,
 '##i': 607,
 '##ed': 529,
 '##ch': 489,
 '##ers': 475,
 '##co': 411,
 '##k': 360,
 '##h': 356,
 '##us': 352,
 '##de': 351,
 '##f': 348,
 '##or': 342,
 '##on': 335,
 '##c': 331,
 '##ne': 324,
 '##rs': 319,
 '##el': 316,
 '##te': 302,
 '##na': 295,
 '##ry': 286,
 '##ke': 279,
 '##ck': 271,
 '##ra': 271,
 '##eering': 265,
 '##ta': 264,
 '##re': 256,
 '##ser': 253,
 '##ung': 253,
 '##ness': 247,
 '##ling': 247,
 '##ei': 246,
 '##og': 241,
 '##unt': 240,
 '##ise': 240,
 '##per': 238,
 '##ere': 234,
 '##ter': 232,
 '##b': 228,
 '##ak': 227,
 '##ce': 226,
 '##mo': 225,
 '##tion': 219,
 '##le': 216,
 '##se': 216,
 '##st': 216,
 '##z': 212,
 '##v': 210,
 '##je': 209,
 '##lge': 208,
 '##ts': 207,
 '##lt': 204,
 '##ur': 203,
 '##king': 200