In [85]:
# 将符号词表初始化为所有英文小写字符、特殊的词尾符号'_'和特殊的未知符号'[UNK]

import collections
symbols = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
           'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
           '_', '[UNK]']

# 字典raw_token_freqs将词映射到数据集中的频率（出现次数）
raw_token_freqs = {'fast_':4,'faster_':3,'tall_':5,'taller_':4}
token_freqs = {}
for token, freq in raw_token_freqs.items():
    token_freqs[' '.join(list(token))] = raw_token_freqs[token]
token_freqs

{'f a s t _': 4, 'f a s t e r _': 3, 't a l l _': 5, 't a l l e r _': 4}

In [86]:
def get_max_freq_pair(token_freqs):
    ''' 
    返回词内最频繁的连续符号对，其中词来自输入词典token_freqs的键
    '''
    # 类似dictionary对象，values都是int的实例，
    #不存在的key也有一个默认值，不会报错，为int()的默认值0.
    pairs = collections.defaultdict(int)
    #for token, freq in token_freqs.items():
    for token,freq in token_freqs.items():
        symbols = token.split()
        for i in range(len(symbols)-1):
            # “pairs”的键是两个连续符号的元组
            pairs[symbols[i], symbols[i + 1]] += freq
    # 具有最大值的“pairs”键
    return max(pairs,key=pairs.get)#get返回指定键的值,
get_max_freq_pair(token_freqs)

('t', 'a')

In [87]:
def merge_symbols(max_freq_pair, token_freqs,symbols):
    '''
    合并最频繁的连续符号对以产生新符号。
    '''
    symbols.append(''.join(max_freq_pair))#将max_freq_pair拼接连续字符串
    new_token_freqs = dict()
    for token,freq in token_freqs.items():
        #用‘ta’取代't a'
        new_token = token.replace(' '.join(max_freq_pair),
                                 ''.join(max_freq_pair))
        new_token_freqs[new_token] = freq
    return new_token_freqs

num_merges = 10
for i in range(num_merges):
    max_freq_pair = get_max_freq_pair(token_freqs)
    token_freqs = merge_symbols(max_freq_pair,token_freqs,symbols)
    print(f'合并#{i+1}:',max_freq_pair)

合并#1: ('t', 'a')
合并#2: ('ta', 'l')
合并#3: ('tal', 'l')
合并#4: ('f', 'a')
合并#5: ('fa', 's')
合并#6: ('fas', 't')
合并#7: ('e', 'r')
合并#8: ('er', '_')
合并#9: ('tall', '_')
合并#10: ('fast', '_')


In [95]:

print(symbols)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '_', '[UNK]', 'ta', 'tal', 'tall', 'fa', 'fas', 'fast', 'er', 'er_', 'tall_', 'fast_']


In [89]:
# 数据集中的每个词现在被子词“fast_”“fast”“er_”“tall_”和“tall”分割
print(list(token_freqs.keys()))

['fast_', 'fast er_', 'tall_', 'tall er_']


In [90]:
token_freqs

{'fast_': 4, 'fast er_': 3, 'tall_': 5, 'tall er_': 4}

In [100]:
def segment_BPE(tokens, symbols):
    '''
    将单词从输入参数symbols分成可能最长的子词。
    '''
    outputs = []
    for token in tokens:
        start, end = 0, len(token)
        cur_output = []
        # 具有symbols中可能最长子字的词元段
        while start < len(token) and start < end:
            if token[start:end] in symbols:
                cur_output.append(token[start:end])
                #不断裁剪，当前面部分满足时，对后半部分进行裁剪
                start = end
                end = len(token)
            else:
                end -= 1
        if start < len(token):
            cur_output.append('[UNK]')
        outputs.append(' '.join(cur_output))    
    return outputs

In [101]:
tokens = ['tallest_', 'fatter_']
print(segment_BPE(tokens, symbols))

['tall e s t _', 'fa t t er_']


# 加载预训练词向量

In [104]:
import os
import torch
from torch import nn
from d2l import torch as d2l

In [105]:
#@save
d2l.DATA_HUB['glove.6b.50d'] = (d2l.DATA_URL + 'glove.6B.50d.zip',
                                '0b8703943ccdb6eb788e6f091b8946e82231bc4d')

#@save
d2l.DATA_HUB['glove.6b.100d'] = (d2l.DATA_URL + 'glove.6B.100d.zip',
                                 'cd43bfb07e44e6f27cbcc7bc9ae3d80284fdaf5a')

#@save
d2l.DATA_HUB['glove.42b.300d'] = (d2l.DATA_URL + 'glove.42B.300d.zip',
                                  'b5116e234e9eb9076672cfeabf5469f3eec904fa')

#@save
d2l.DATA_HUB['wiki.en'] = (d2l.DATA_URL + 'wiki.en.zip',
                           'c1816da3821ae9f43899be655002f6c723e91b88')

In [110]:
class TokenEmbedding:
    """GloVe嵌入"""
    def __init__(self, embedding_name):
        self.idx_to_token,self.idx_to_vec = self._load_embedding(
            embedding_name)
        self.unknown_idx = 0
        self.token_to_idx = {token:idx for idx, token 
                             in enumerate(self.idx_to_token)}
        
    def _load_embedding(self,embedding_name):
        idx_to_token, idx_to_vec = ['<unk>'], []
        data_dir = d2l.download_extract(embedding_name)#..\data\glove.6B.50d
        # GloVe网站：https://nlp.stanford.edu/projects/glove/
        # fastText网站：https://fasttext.cc/
        with open(os.path.join(data_dir,'vec.txt'),'r',encoding='utf-8') as f:
            for line in f:
                elems = line.rstrip().split(' ')#rstrip字符串末尾的指定字符，返回值为word,vec
                token,elems = elems[0], [float(elem) for elem in elems[1:]]
                # 跳过标题信息，例如fastText中的首行
                if len(elems) > 1:
                    idx_to_token.append(token)
                    idx_to_vec.append(elems)
        # idx_to_token中包含'<unk>'，令其对应的vec全为0
        idx_to_vec = [[0]*len(idx_to_vec[0])] + idx_to_vec
        return idx_to_token, torch.tensor(idx_to_vec)
    
    def __getitem__(self,tokens):
        '''
        根据tokens返回对于的词向量vec
        tokens为单词列表，如果传入单个单词，则获取的每个字母的idx
        '''
        indices = [self.token_to_idx.get(token,self.unknown_idx)
                   for token in tokens]
        vecs = self.idx_to_vec[torch.tensor(indices)]
        return vecs
    
    def __len__(self):
        return len(self.idx_to_token)

In [111]:
glove_6b50d = TokenEmbedding('glove.6b.50d')

In [124]:
idx_to_token,idx_to_vec = glove_6b50d._load_embedding('glove.6b.50d')
len(idx_to_token),len(idx_to_vec)

(400001, 400001)

In [129]:
idx = glove_6b50d.token_to_idx['month']
word = glove_6b50d.idx_to_token[idx]
print(f'idx：{idx}对应的word为{word}')

idx：230对应的word为month


# 应用预训练词向量
## 1.词相似度

In [197]:
def knn(W, x, k):
    '''
    根据词向量之间的余弦相似性为输入词查找语义相似的词
    '''
    # 增加1e-9以获得数值稳定性
    cos = torch.mm(W[1:,:], x.reshape(-1,1))/(torch.sqrt(torch.sum(W[1:,:]*W[1:,:],axis=1)+1e-9)*torch.norm(x))
    _,topk = torch.topk(cos,k=k)#对数据进行降序排列，返回前k个值以及索引
    return topk, [cos[int(i)] for i in topk]

def get_similar_tokens(query_token, k ,embed):
    '''
    使用TokenEmbedding的实例embed中预训练好的词向量来搜索相似的词
    '''
    topk, cos = knn(embed.idx_to_vec, embed[[query_token]],k+1)
    for i,c in zpi(topk[1:], cos[1:]):
        print(f'{embed.idx_to_token[int(i)]}:cosine相似度={float(c):.3f}') 

In [199]:
get_similar_tokens('beatiful',3,glove_6b50d)

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:75] data. DefaultCPUAllocator: not enough memory: you tried to allocate 640000000000 bytes. Buy new RAM!

In [198]:
embed = glove_6b50d
W = embed.idx_to_vec

x = embed[['beautiful']].reshape(-1,1)
torch.mm(W,x)/torch.sqrt(torch.sum(W*W,dim=1)*torch.sum(x*x)+1e-9)

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:75] data. DefaultCPUAllocator: not enough memory: you tried to allocate 640003200004 bytes. Buy new RAM!

In [200]:
cos = torch.mm(W[1:,:], x.reshape(-1,1))

In [202]:
a = (torch.sqrt(torch.sum(W[1:,:]*W[1:,:],axis=1)+1e-9)*torch.norm(x))

In [204]:
cos.shape

torch.Size([400000, 1])

In [205]:
a.shape

torch.Size([400000])