# 分词

基本思路：构建一个词表，通过词表一一映射，进行分词。

## WordPiece

$$
{\rm score} = \frac{N_{{\rm pair}\langle {\rm c}_1, {\rm c}_2 \rangle}}{N_{{\rm c}_1} \times N_{{\rm c}_2}}
$$

In [2]:
# Toy corpus used by the cells below: single Chinese words plus a few
# space-separated English sentences. Each entry is later split on whitespace,
# so a Chinese entry counts as one token.
sentences = [
    "我",
    "喜欢",
    "吃",
    "苹果",
    "他",
    "不",
    "喜欢",
    "吃",
    "苹果派",
    "I like to eat apples",
    "She has a cute cat",
    "you are very cute",
    "give you a hug"
]


def get_stats(sentences: list[str]) -> dict:
    """Count how often each whitespace-separated token occurs.

    Args:
        sentences: Strings to scan; each one is split on runs of whitespace.

    Returns:
        A dict mapping token -> number of occurrences, keyed in the order
        the tokens were first seen.
    """
    counts = {}
    for line in sentences:
        for token in line.split():
            # A token that has not been seen yet starts from zero.
            counts[token] = counts.get(token, 0) + 1
    return counts

# Token -> frequency table for the corpus; reused by the later cells.
stats = get_stats(sentences)
print(f"stats: {stats}")


def get_alphabet(stats: dict) -> list[str]:
    """Return the distinct characters used by the keys of *stats*, sorted.

    Args:
        stats: Mapping whose keys are the words to scan; the values are
            not used.

    Returns:
        A list of single-character strings in ascending code-point order.
        An empty mapping yields an empty list.
    """
    # A set deduplicates in constant time per character, avoiding a linear
    # membership scan of a growing list for every letter.
    return sorted({letter for word in stats for letter in word})

# Show the sorted, de-duplicated characters that occur in the counted words.
print(f"alphabet: {get_alphabet(stats)}")

def get_splits(stats: dict) -> dict:
    """Break every word into per-character pieces.

    The first character of a word is kept as-is; each following character
    is prefixed with "##" to mark it as a continuation piece.

    Args:
        stats: Mapping whose keys are the words to split; values are unused.

    Returns:
        A dict mapping each word to its list of pieces.
    """
    splits = {}
    for word in stats:
        pieces = []
        for ch in word:
            # Only the very first piece goes in without the "##" marker.
            pieces.append(f"##{ch}" if pieces else ch)
        splits[word] = pieces
    return splits

# Character-level split of every counted word (continuations carry "##").
splits = get_splits(stats) 
print(f"splits: {splits}")


stats: {'我': 1, '喜欢': 2, '吃': 2, '苹果': 1, '他': 1, '不': 1, '苹果派': 1, 'I': 1, 'like': 1, 'to': 1, 'eat': 1, 'apples': 1, 'She': 1, 'has': 1, 'a': 2, 'cute': 2, 'cat': 1, 'you': 2, 'are': 1, 'very': 1, 'give': 1, 'hug': 1}
alphabet: ['I', 'S', 'a', 'c', 'e', 'g', 'h', 'i', 'k', 'l', 'o', 'p', 'r', 's', 't', 'u', 'v', 'y', '不', '他', '吃', '喜', '我', '果', '欢', '派', '苹']
splits: {'我': ['我'], '喜欢': ['喜', '##欢'], '吃': ['吃'], '苹果': ['苹', '##果'], '他': ['他'], '不': ['不'], '苹果派': ['苹', '##果', '##派'], 'I': ['I'], 'like': ['l', '##i', '##k', '##e'], 'to': ['t', '##o'], 'eat': ['e', '##a', '##t'], 'apples': ['a', '##p', '##p', '##l', '##e', '##s'], 'She': ['S', '##h', '##e'], 'has': ['h', '##a', '##s'], 'a': ['a'], 'cute': ['c', '##u', '##t', '##e'], 'cat': ['c', '##a', '##t'], 'you': ['y', '##o', '##u'], 'are': ['a', '##r', '##e'], 'very': ['v', '##e', '##r', '##y'], 'give': ['g', '##i', '##v', '##e'], 'hug': ['h', '##u', '##g']}


In [4]:
# Score every adjacent character pair:
#   score = N_pair / (N_first * N_second)
# Each count is weighted by the frequency of the word it occurs in.
# NOTE(review): counts are taken over raw characters, so a word-initial
# character and its "##"-prefixed continuation form share one count.
char_freq = {}
pair_freq = {}

for word, freq in stats.items():
    # Per-character frequencies.
    for ch in word:
        char_freq[ch] = char_freq.get(ch, 0) + freq

    # Adjacent pairs; zip yields nothing for words shorter than two characters.
    for pair in zip(word, word[1:]):
        pair_freq[pair] = pair_freq.get(pair, 0) + freq

# Replace the raw pair counts with their scores, in place.
for pair, count in pair_freq.items():
    pair_freq[pair] = count / (char_freq[pair[0]] * char_freq[pair[1]])

print(char_freq)
print(pair_freq)

{'我': 1, '喜': 2, '欢': 2, '吃': 2, '苹': 2, '果': 2, '他': 1, '不': 1, '派': 1, 'I': 1, 'l': 2, 'i': 2, 'k': 1, 'e': 9, 't': 5, 'o': 3, 'a': 7, 'p': 2, 's': 2, 'S': 1, 'h': 3, 'c': 3, 'u': 5, 'y': 3, 'r': 2, 'v': 2, 'g': 2}
{('喜', '欢'): 0.5, ('苹', '果'): 0.5, ('果', '派'): 0.5, ('l', 'i'): 0.25, ('i', 'k'): 0.5, ('k', 'e'): 0.1111111111111111, ('t', 'o'): 0.06666666666666667, ('e', 'a'): 0.015873015873015872, ('a', 't'): 0.05714285714285714, ('a', 'p'): 0.07142857142857142, ('p', 'p'): 0.25, ('p', 'l'): 0.25, ('l', 'e'): 0.05555555555555555, ('e', 's'): 0.05555555555555555, ('S', 'h'): 0.3333333333333333, ('h', 'e'): 0.037037037037037035, ('h', 'a'): 0.047619047619047616, ('a', 's'): 0.07142857142857142, ('c', 'u'): 0.13333333333333333, ('u', 't'): 0.08, ('t', 'e'): 0.044444444444444446, ('c', 'a'): 0.047619047619047616, ('y', 'o'): 0.2222222222222222, ('o', 'u'): 0.13333333333333333, ('a', 'r'): 0.07142857142857142, ('r', 'e'): 0.05555555555555555, ('v', 'e'): 0.1111111111111111, ('e', 'r'): 0.