In [None]:
# learning notes of 深度学习进阶：自然语言处理
# ch4： word2vec的加速

In [11]:
# Embedding layer
# cW_in = h
import numpy as np

class Embedding:
    """Lookup layer that selects rows of a weight matrix by word ID.

    Multiplying a one-hot context vector by W_in is the same as reading out
    the matching row of W_in, so the forward pass is a plain row lookup; the
    selected rows are the hidden-layer activations h.
    """

    def __init__(self, W):
        """Store the weight matrix and allocate a same-shaped gradient buffer.

        Args:
            W: weight matrix whose rows are looked up by index.
        """
        # params and grads are lists with one entry per weight.
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        # Indices used by the latest forward pass; backward() scatters into
        # exactly these rows.
        self.idx = None

    def forward(self, idx):
        """Return the rows of W selected by ``idx`` and remember ``idx``.

        Args:
            idx: array of row indices (word IDs) for the mini-batch.

        Returns:
            ``W[idx]``.
        """
        W, = self.params
        self.idx = idx
        return W[idx]

    def backward(self, dout):
        """Accumulate ``dout`` into the gradient rows used in forward().

        Args:
            dout: upstream gradient; ``dout[i]`` belongs to ``self.idx[i]``.

        Returns:
            None. The result is written in place into ``self.grads[0]``.
        """
        dW, = self.grads

        # Reset every element to zero while keeping the buffer (and therefore
        # the reference held in self.grads) intact.
        dW[...] = 0

        # A plain assignment ``dW[self.idx] = dout`` would overwrite earlier
        # rows when the same word ID occurs more than once, so the rows are
        # added one by one instead.
        # Equivalent vectorised form: np.add.at(dW, self.idx, dout)
        for i, word_id in enumerate(self.idx):
            dW[word_id] += dout[i]
        return None

In [None]:
# Embedding dot 层：h->(embedding dot)-> W_out -> sigmoid with loss (不是softmax with loss)
# 看书上p147图4-14
class EmbeddingDot:
    """Embedding lookup on the output weights followed by a row-wise dot product.

    Only the output-side rows that belong to the requested word IDs take part
    in the computation, which is what makes negative sampling cheap.
    """

    def __init__(self, W):
        """Wrap ``W`` (the output-side weight matrix W_out) in an Embedding layer.

        Args:
            W: output-side weight matrix.
        """
        embed = Embedding(W)
        self.embed = embed
        # Expose the inner layer's weight/gradient lists as this layer's own.
        self.params = embed.params
        self.grads = embed.grads
        # Filled by forward() with the values backward() needs.
        self.cache = None

    def forward(self, h, idx):
        """Compute the dot product of each hidden vector with its target row.

        Args:
            h: hidden-layer activations for the mini-batch.
            idx: array of word IDs, one per sample in the mini-batch.

        Returns:
            The element-wise product of the looked-up rows and ``h``, summed
            over axis 1.
        """
        selected_rows = self.embed.forward(idx)
        # Keep the forward-pass operands for the backward pass.
        self.cache = (h, selected_rows)
        products = selected_rows * h
        return np.sum(products, axis=1)

    def backward(self, dout):
        """Propagate ``dout`` to the looked-up weight rows and to ``h``.

        Args:
            dout: upstream gradient with one value per sample.

        Returns:
            Gradient with respect to ``h``. (The inner Embedding layer returns
            nothing from its backward pass; the dot product is what produces
            a gradient for ``h``.)
        """
        h, selected_rows = self.cache
        n_samples = dout.shape[0]
        # Column vector so that it broadcasts across the hidden dimension.
        dout_col = dout.reshape(n_samples, 1)

        # Gradient for the output-side rows goes into the Embedding layer.
        self.embed.backward(dout_col * h)

        return dout_col * selected_rows

In [26]:
# 基于概率分布的负采样：出现频率越高的词越有可能被采样
GPU = False # 在config中设定：from common.config import GPU
import collections
class UnigramSampler:
    """Draw negative samples from a smoothed unigram distribution.

    Words that occur more often in the corpus are more likely to be drawn.
    """

    def __init__(self, corpus, power, sample_size):
        """Build the sampling distribution from word frequencies.

        Args:
            corpus: sequence of word IDs. The probability table is indexed by
                ``range(number of distinct IDs)``, so the IDs are expected to
                be the contiguous integers 0..V-1.
            power: exponent applied to the raw counts; a value below 1
                flattens the distribution so rare words are drawn a little
                more often.
            sample_size: number of negative samples to draw per target.
        """
        self.sample_size = sample_size

        # Counter tallies the whole corpus in one call.
        word_counts = collections.Counter(corpus)
        self.vocab_size = len(word_counts)

        frequencies = np.array(
            [word_counts[word_id] for word_id in range(self.vocab_size)],
            dtype=np.float64,
        )
        smoothed = np.power(frequencies, power)
        self.word_p = smoothed / np.sum(smoothed)

    def get_negative_sample(self, target):
        """Sample ``sample_size`` negative word IDs for every target.

        Args:
            target: array of target word IDs, one per sample in the batch.

        Returns:
            Integer array of shape ``(batch_size, sample_size)``.
        """
        batch_size = target.shape[0]

        if GPU:
            # Fast path: favour speed over exactness, so the target word
            # itself may occasionally show up among the negatives.
            return np.random.choice(self.vocab_size, size=(batch_size, self.sample_size),
                                    replace=True, p=self.word_p)

        negative_sample = np.zeros((batch_size, self.sample_size), dtype=np.int32)
        for row, target_idx in enumerate(target):
            # Zero out the target's probability and renormalise so the
            # target can never be picked as its own negative example.
            probs = self.word_p.copy()
            probs[target_idx] = 0
            probs /= probs.sum()
            negative_sample[row, :] = np.random.choice(
                self.vocab_size, size=self.sample_size, replace=False, p=probs)

        return negative_sample

In [31]:
# Try out the negative sampler on a toy corpus.
corpus = np.array([0, 1, 2, 3, 4, 1, 2, 3])
power, sample_size = 0.75, 2
target = np.array([1, 3, 0])

# Each output row holds the negative word IDs drawn for the matching target.
sampler = UnigramSampler(corpus, power, sample_size)
negative_sample = sampler.get_negative_sample(target)
print(negative_sample)



[[2 0]
 [1 0]
 [2 3]]


In [32]:
# 因为这里没用define by run的架构，所以所有涉及到参数矩阵的计算都需要依托层来实现，因为需要定义反向传播的方法

def cross_entropy_error(y, t):
    """Return the mean cross-entropy of predictions ``y`` against labels ``t``.

    Args:
        y: predicted probabilities, either one sample (1-D) or a batch (2-D,
            one row per sample).
        t: supervised labels, given either as class indices or as one-hot
            vectors with the same number of elements as ``y``.

    Returns:
        Negative log-probability of the correct class, averaged over the batch.
    """
    # Promote a single sample to a batch of one.
    if y.ndim == 1:
        y = y.reshape(1, -1)
        t = t.reshape(1, -1)

    # One-hot labels are converted to the index of the correct class.
    if t.size == y.size:
        t = t.argmax(axis=1)

    batch_size = y.shape[0]
    rows = np.arange(batch_size)
    correct_prob = y[rows, t]

    # The small constant keeps log() away from zero.
    return -np.sum(np.log(correct_prob + 1e-7)) / batch_size

class SigmoidWithLoss:
    """Sigmoid activation combined with a binary cross-entropy loss."""

    def __init__(self):
        # This layer has no trainable weights.
        self.params = []
        self.grads = []
        self.loss = None
        self.y = None  # output of the sigmoid
        self.t = None  # supervised labels

    def forward(self, x, t):
        """Apply the sigmoid to ``x`` and return the loss against ``t``.

        Args:
            x: input scores.
            t: supervised labels (0 or 1 per sample).

        Returns:
            The cross-entropy loss, also stored in ``self.loss``.
        """
        self.t = t
        self.y = 1 / (1 + np.exp(-x))

        # Two-column table [P(label=0), P(label=1)] so the generic
        # cross-entropy helper can pick the column named by each label.
        class_probs = np.c_[1 - self.y, self.y]
        self.loss = cross_entropy_error(class_probs, self.t)

        return self.loss

    def backward(self, dout=1):
        """Return the gradient of the loss with respect to the forward input.

        Args:
            dout: upstream gradient (defaults to 1).

        Returns:
            ``(y - t) * dout / batch_size``.
        """
        batch_size = self.t.shape[0]
        residual = self.y - self.t
        return residual * dout / batch_size

In [33]:
class NegativeSamplingLoss:
    """Output layer plus loss for word2vec trained with negative sampling.

    Index 0 of ``embed_dot_layers`` / ``loss_layers`` handles the positive
    (target) example; the remaining ``sample_size`` entries handle the
    negative examples.
    """

    def __init__(self, W, corpus, power=0.75, sample_size=5):
        """Create the sampler and one (EmbeddingDot, SigmoidWithLoss) pair per example.

        Args:
            W: output-side weight matrix handed to every EmbeddingDot layer.
            corpus: sequence of word IDs used to build the sampler.
            power: smoothing exponent passed to the sampler.
            sample_size: number of negative examples drawn per target.
        """
        self.sample_size = sample_size
        self.sampler = UnigramSampler(corpus, power, sample_size)

        # sample_size negative examples plus one positive example.
        layer_count = sample_size + 1
        self.loss_layers = [SigmoidWithLoss() for _ in range(layer_count)]
        self.embed_dot_layers = [EmbeddingDot(W) for _ in range(layer_count)]

        self.params = []
        self.grads = []
        for dot_layer in self.embed_dot_layers:
            self.params.extend(dot_layer.params)
            self.grads.extend(dot_layer.grads)

    def forward(self, h, target):
        """Return the summed loss of the positive and all negative examples.

        Args:
            h: hidden-layer activations for the mini-batch.
            target: array of target word IDs; every other word counts as a
                negative example.

        Returns:
            Positive-example loss plus the loss of each negative example.
        """
        batch_size = target.shape[0]
        negative_sample = self.sampler.get_negative_sample(target)

        # Positive example: label 1.
        positive_label = np.ones(batch_size, dtype=np.int32)
        positive_score = self.embed_dot_layers[0].forward(h, target)
        loss = self.loss_layers[0].forward(positive_score, positive_label)

        # Negative examples: there are batch_size * sample_size of them, but
        # all share label 0, so one label vector of length batch_size is
        # reused for every column of the sample matrix.
        negative_label = np.zeros(batch_size, dtype=np.int32)
        for k in range(1, self.sample_size + 1):
            negative_target = negative_sample[:, k - 1]
            negative_score = self.embed_dot_layers[k].forward(h, negative_target)
            loss += self.loss_layers[k].forward(negative_score, negative_label)

        return loss

    def backward(self, dout=1):
        """Back-propagate through every layer pair and sum the gradients for ``h``.

        Args:
            dout: upstream gradient (defaults to 1).

        Returns:
            Gradient with respect to ``h``, summed over all examples.
        """
        dh = 0
        for loss_layer, dot_layer in zip(self.loss_layers, self.embed_dot_layers):
            # Reverse of the forward order: loss layer first, then the dot layer.
            dh += dot_layer.backward(loss_layer.backward(dout))

        return dh


In [None]:
# CBoW的实现
class CBOW:
    """Continuous bag-of-words model trained with negative sampling."""

    def __init__(self, vocab_size, hidden_size, window_size, corpus):
        """Create the weights, the input Embedding layers and the loss layer.

        Args:
            vocab_size: number of distinct words (rows of both weight matrices).
            hidden_size: dimensionality of the hidden layer / word vectors.
            window_size: number of context words on each side of the target;
                ``2 * window_size`` input layers are created.
            corpus: sequence of word IDs, forwarded to the negative sampler.
        """
        V, H = vocab_size, hidden_size

        # Weight initialisation: small random values stored as float32.
        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(V, H).astype('f')

        # One Embedding layer per context position; all of them share W_in.
        self.in_layers = [Embedding(W_in) for _ in range(2 * window_size)]

        # There is no separate output layer: the W_out product and the loss
        # are merged into the single negative-sampling loss layer.
        self.ns_loss = NegativeSamplingLoss(W_out, corpus, power=0.75, sample_size=5)

        # Gather weights and gradients of every layer into flat lists.
        # ``+=`` extends the lists here; it is not arithmetic. Because the
        # input layers share W_in, the same array appears several times in
        # self.params.
        layers = self.in_layers + [self.ns_loss]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # W_in holds the distributed representations of the words.
        self.word_vecs = W_in

    def forward(self, contexts, target):
        """Average the context embeddings and return the negative-sampling loss.

        Args:
            contexts: 2-D array of word IDs; column ``i`` feeds input layer ``i``.
            target: array of target word IDs, one per sample.

        Returns:
            The loss computed by the negative-sampling loss layer.
        """
        h = 0
        for i, layer in enumerate(self.in_layers):
            h += layer.forward(contexts[:, i])
        # Scale, compute the loss and return only after ALL context layers
        # have contributed, so that h is the mean over every context word.
        h *= 1 / len(self.in_layers)
        loss = self.ns_loss.forward(h, target)
        return loss

    def backward(self, dout=1):
        """Back-propagate from the loss layer to every input Embedding layer.

        Args:
            dout: upstream gradient (defaults to 1).

        Returns:
            None. Gradients are stored inside the individual layers.
        """
        dout = self.ns_loss.backward(dout)
        # Gradient of the averaging step in forward().
        dout *= 1 / len(self.in_layers)

        # Every input layer receives the same upstream gradient because the
        # forward pass averaged their outputs with equal weight.
        for layer in self.in_layers:
            layer.backward(dout)
        return None








