In [59]:
import collections
import re
# SPACE_TOKEN = "_"       # (disabled) would represent a space character with an underscore
END_TOKEN   = r'</w>'   # marker appended to every word to denote the end of the word

In [60]:
# Define a small toy corpus: a list of sentences used for the statistics below.
corpus = ["I am obama.", "I am banana!", "I like banana?"]


In [61]:
# Collect word statistics from the corpus.
# Each whitespace-separated word is stored as its characters joined by single
# spaces, followed by a space and END_TOKEN; the value is how many times that
# word occurs across all sentences.
word_freq = collections.defaultdict(int)
for sentence in corpus:
    for token in sentence.split():
        spaced_symbols = " ".join(token) + " " + END_TOKEN
        word_freq[spaced_symbols] += 1
print(f"word_freq:{word_freq}")

word_freq:defaultdict(<class 'int'>, {'I </w>': 3, 'a m </w>': 2, 'o b a m a . </w>': 1, 'b a n a n a ! </w>': 1, 'l i k e </w>': 1, 'b a n a n a ? </w>': 1})


In [62]:

# Build the initial symbol vocabulary and the adjacent-pair statistics.
# Both counters are weighted by the frequency of the word a symbol (or pair)
# appears in.
vocab = collections.defaultdict(int)
pair_freq = collections.defaultdict(int)
for word, freq in word_freq.items():
    symbols = word.split()
    # Symbol counts: every occurrence inside the word contributes `freq`.
    for symbol in symbols:
        vocab[symbol] += freq
    # Pair counts: walk over each symbol together with its right neighbour.
    for left, right in zip(symbols, symbols[1:]):
        pair_freq[(left, right)] += freq
print(f"vocab:{vocab}")
print(f"pair_freq:{pair_freq}")

vocab:defaultdict(<class 'int'>, {'I': 1, '</w>': 6, 'a': 9, 'm': 2, 'o': 1, 'b': 3, '.': 1, 'n': 4, '!': 1, 'l': 1, 'i': 1, 'k': 1, 'e': 1, '?': 1})
pair_freq:defaultdict(<class 'int'>, {('I', '</w>'): 3, ('a', 'm'): 3, ('m', '</w>'): 2, ('o', 'b'): 1, ('b', 'a'): 3, ('m', 'a'): 1, ('a', '.'): 1, ('.', '</w>'): 1, ('a', 'n'): 4, ('n', 'a'): 4, ('a', '!'): 1, ('!', '</w>'): 1, ('l', 'i'): 1, ('i', 'k'): 1, ('k', 'e'): 1, ('e', '</w>'): 1, ('a', '?'): 1, ('?', '</w>'): 1})


In [63]:

# Pick the most frequent adjacent symbol pair. On ties, max() returns the first
# maximal key it encounters, i.e. the pair inserted into pair_freq earliest.
pair_with_max_freq = max(pair_freq, key=pair_freq.get)
print(f"pair_with_max_freq:{pair_with_max_freq}")

# Regex-escaped, space-separated form of the chosen pair, i.e. the way the pair
# currently appears inside the keys of word_freq (before it is fused).
merged_pair = re.escape(' '.join(pair_with_max_freq))
print(f"merged_pair:{merged_pair}")

# Match the pair only when it is delimited by whitespace or the string
# boundaries on both sides, so it cannot match inside a longer symbol.
match_re = re.compile(r'(?<!\S)' + merged_pair + r'(?!\S)')

# Rewrite every key of word_freq with the chosen pair fused into one symbol.
# The replacement is given as a function rather than a string: re.sub()
# processes backslash escapes in a replacement string, so a symbol containing
# "\" would otherwise be altered or raise re.error.
merged_symbol = ''.join(pair_with_max_freq)
new_word_freq = collections.defaultdict(int)
for key in word_freq:
    new_key = match_re.sub(lambda _match: merged_symbol, key)  # fuse where matched
    new_word_freq[new_key] = word_freq[key]
print(new_word_freq)


pair_with_max_freq:('a', 'n')
merged_pair:a\ n
defaultdict(<class 'int'>, {'I </w>': 3, 'a m </w>': 2, 'o b a m a . </w>': 1, 'b an an a ! </w>': 1, 'l i k e </w>': 1, 'b an an a ? </w>': 1})


In [64]:
# Update the vocabulary for the merge: register the fused symbol with the
# frequency of the chosen pair and subtract that frequency from each of the
# two symbols it was built from.
merged_count = pair_freq[pair_with_max_freq]
first_symbol, second_symbol = pair_with_max_freq
vocab[first_symbol + second_symbol] = merged_count
vocab[first_symbol] -= merged_count
vocab[second_symbol] -= merged_count
print(vocab)

defaultdict(<class 'int'>, {'I': 1, '</w>': 6, 'a': 5, 'm': 2, 'o': 1, 'b': 3, '.': 1, 'n': 0, '!': 1, 'l': 1, 'i': 1, 'k': 1, 'e': 1, '?': 1, 'an': 4})
