In [64]:
import numpy as np

In [65]:
class Affine:
    '''
    一个带有偏置的全链接层，偏置默认是零
    '''
    def __init__(self,W,b=0):
        
        self.params = {}
        self.params['W'] = W
        self.params['b'] = b
        self.grads = {}
        self.grads['W'] = np.zeros_like(self.params['W'])
        self.grads['b'] = np.zeros_like(self.params['b'])
        
    def forward(self,X):
        
        self.params['X'] = X
        
        return np.dot(X,self.params['W'])+self.params['b']
    
    def backward(self,dout):
        
        self.grads['W'] = np.dot(dout,self.params['X'].T)
        self.grads['b'] = np.sum(dout,axis=0)
        self.grads['X'] = np.dot(dout,self.params['W'].T)
        
        return self.grads['X']

In [66]:
class Embedding():
    '''
    根据单词在语义库中的id，返回权重列表中对应的权重数据，不带偏置
    '''
    def __init__(self,W):
        
        self.params = {}
        self.params['W'] = W
        self.grads = {}
        self.grads['W'] = np.zeros_like(self.params['W'])
        
    def forward(self,X):
        
        #为了计算方便，X代表的数据集需要以单词的id形式表达，而不是one hot形式，如[0,3,2,4,2,1]，而不是[[0,0,0,0,1],[0,0,1,0,0]……]。
        
        if type(X) == list :
            self.params['X'] = X
            return self.params['W'][X]
        
        else:
            print('X代表的数据集需要以单词的id形式表达')

    def backward(self,dout):
        
        #dout中的第n行数据，乘以1之后，是W中第data_id行的导数
        for n,data_id in enumerate(self.params['X']):
            self.grads['W'][data_id] += dout[n]*1
        

In [None]:
#测试FeatEmbedding()类
if __name__ == '__main__':
    W =np.random.randn(5,5)
    fe1 = Embedding(W)
    X = [0,3,1]
    dout = np.random.rand(3,5)
    print('fe1.forward(X):\n',fe1.forward(X),'\n')
    fe1.backward(dout)
    print('fe1.params:\n',fe1.params,'\n')
    print('dout:\n',dout,'\n')
    print('fe1.grads:\n',fe1.grads)

In [67]:
class Sigmoid():
    
    '''
    一个sigmoid层，为了防止正向传播数值太小结果为零的情况，默认添加一个1e-8的微小数
    '''
    
    def __init__(self,eps=1e-8):
        
        self.eps = eps
        self.params = {}
        self.grads = {}
    
    def forward(self,X):
        
        y = 1/(1+np.exp(-1*X)) + self.eps
        
        self.y = y
                
        return y
    
    def backward(self,dout):
        
        self.grads['X'] = dout*self.y*(1-self.y)
        
        return self.grads['X']
            

In [None]:
#测试sigmoid类
if __name__ == '__main__':
    s= Sigmoid()
    X = np.random.randn(3,5)
    dout = np.random.randn(3,5)
    print('X:\n',X,'\n')
    print('dout:\n',dout,'\n')
    print('s.forward(X):\n',s.forward(X),'\n')
    print('s.backward(dout):\n',s.backward(dout),'\n')

In [68]:
class CrossEntropyError():
    
    '''
    交叉熵损失函数
    '''
    
    def __init__(self):
        
        self.params = {}
        self.grads = {}
        
    def forward(self,X,y):
        
        total_loss = np.sum(-y*np.log(X)) 
        self.y = y
        self.params['X'] = X
        
        return total_loss
    
    def backward(self,dout=1):
        
        self.grads['X'] = (-1/self.params['X'])*self.y*dout
        
        return self.grads['X']
        

In [None]:
#交叉熵损失测试
if __name__ == '__main__':
    tl = CrossEntropyError()
    a=[0,1]
    X = 0.5*np.random.choice(a,size=(10))+0.1
    y = np.random.choice(a,size=(10))
    print('X:\n',X,'\n')
    print('y:\n',y,'\n')
    print('tl.forward(X,y):\n',tl.forward(X,y),'\n')
    print('tl.backward():\n',tl.backward(),'\n')

In [69]:
class EmbeddingDot():
    
    '''
    根据目标在语义库中的id，返回权重列表中对应的数据列，然后与对应的中间层神经元相乘后得到结果
    '''
    
    def __init__(self,W):
        
        #W表示的是输出层权重矩阵
        
        self.params = {}
        self.params['W'] = W
        self.grads = {}
        self.grads['W'] = np.zeros_like(self.params['W'])
        
    def forward(self,X,y):
        
        #X代表的是CBOW模型中间层神经元的数值，y表示的是对应目标值的id，数据集需要以id形式表达成list，而不是one hot形式，如[0,3,2,4,2,1]，而不是[[0,0,0,0,1],[0,0,1,0,0]……]。
        
        if type(y) == list :
            
            self.params['X'] = X
            self.y = y
            
            return np.sum(W.T[y]*X,axis=1)
        
        else:
            print('目标数据集需要以列表的格式填入目标单词的id')

    def backward(self,dout):
        
        #X的导数把
        self.grads['X'] = dout.reshape(-1,1).repeat(len(self.params['W']),axis=1)*self.params['W'].T[self.y]
        
        #W的导数绝大部分是0，所以先把输入数据对应的权重列先找出来，然后再更新对应的权重，跟输入数据无权的权重不更新，还是0
        for n,target_id in enumerate(self.y):
            self.grads['W'][:,target_id] += dout[n]*X[n].T
        
        return self.grads['X']
        

In [None]:
#测试EmbeddingDot()类
if __name__ == '__main__':
    W = np.random.randn(3,5)
    X = np.random.randn(2,3)
    y = [3,1]
    dout = np.random.randn(2)
    
    print('W\n',W,'\n')
    print('X\n',X,'\n')
    print('y\n',y,'\n')
    print('dout\n',dout,'\n')
    
    ed = EmbeddingDot(W)
    print('ed.forward(X,y)\n',ed.forward(X,y),'\n')
    print('ed.backward(dout)\n',ed.backward(dout),'\n')
    print('dw:\n',ed.grads['W'],'\n')

In [70]:
def  negative_sampling(corpus,co_matrix,word_to_id,id_to_word,postive_target,negative_sampling_size=2):
    
    '''
    根据语料库、共现矩阵、已确定的正采样样本，需要负采样的个数，来生成包含正负采样的训练目标字典
    '''
    
    p = np.sum(co_matrix,axis=0)/np.sum(co_matrix)  #根据共现矩阵计算每个单词的分布概率
    p = np.power(p,0.75)/np.sum(np.power(p,0.75))   #将每个单词的概率通过0.75指数转换，以增大概率比较低的值的概率
    max_id = max(corpus)+1  #找到语料库中单词的id的最大值+1，作为随机选取的对象
    
    negative_sampling = np.zeros((len(postive_target),negative_sampling_size))
        
    #分别对每一个正采样进行循环，随机选出一定数量的负采样样本id，且满足正负采样id不相同
    for n,target_id in enumerate(postive_target):
        go_on = False
        while not go_on:
            negative_id = np.random.choice(max_id,replace=False,size=negative_sampling_size,p=p)
            if target_id not in negative_id:
                go_on = True
        negative_sampling[n] = negative_id
    
    return negative_sampling

In [None]:
#negative_sampling()负采样功能测试
if __name__ == '__main__':
    from chapter2 import s2c,c2m
    input_str = 'you say goodbye and i say hello.'
    corpus = s2c(input_str)
    word_to_id = corpus['word_to_id']
    id_to_word = corpus['id_to_word']
    co_matrix = c2m(corpus)
    corpus = corpus['corpus']
    postive_target = [1,2,3,4,5]
    sampling_result = negative_sampling(corpus,co_matrix,word_to_id,id_to_word,postive_target,negative_sampling_size=2)
    print(sampling_result.items())

In [71]:
def creat_CBOW_train_data(corpus,word_to_id,id_to_word,window_size=1,out_format = 'one_hot'):
    
    '''
    根据语料库，单词和id对应字典，id和单词对应字典，窗口大小，以CBOW模型，转换成包含feat和target两部分的train data
    输出的格式可以选择'one-hot'和'id'
    '''
    
    corpus_len = len(corpus)
    word_to_id_len = len(word_to_id.items())
    
    if out_format == 'one_hot':
        
        train_feat = np.zeros((int(corpus_len-2*window_size),int(2*window_size),int(word_to_id_len)))
        train_target = np.zeros((int(corpus_len-2*window_size),int(word_to_id_len)))
        
        for n,data_id in enumerate(corpus):                           #对于每一个语料库中的单词进行循环

            if n-window_size >= 0 and n+window_size <= corpus_len-1:  #控制循环不超过语料库的边界

                target_data = np.zeros(word_to_id_len)                  #生成train_target数据
                target_data[data_id] =1
                train_target[n-window_size] = target_data

                for distance in range(1,window_size+1):                 #生成目标左边和右边的train_feat，一个train_feat包含多个数据,一个train_feat包含的数据数量是窗口大小的两倍

                    left_data_index = n-distance
                    right_data_index = n+distance

                    left_data_id = corpus[left_data_index]
                    right_data_id = corpus[right_data_index]

                    left_data = np.zeros(word_to_id_len)
                    left_data[left_data_id] = 1

                    right_data = np.zeros(word_to_id_len)
                    right_data[right_data_id] =1

                    train_feat[n-window_size,window_size-distance] = left_data
                    train_feat[n-window_size,window_size+distance-1] = right_data

        return train_feat,train_target
    
    if out_format == 'id':
        
        train_feat = np.zeros((int(corpus_len-2*window_size),int(2*window_size)))
        train_target = np.zeros(int(corpus_len-2*window_size))
        
        for n,data_id in enumerate(corpus):                           #对于每一个语料库中的单词进行循环

            if n-window_size >= 0 and n+window_size <= corpus_len-1:  #控制循环不超过语料库的边界

                train_target[n-window_size] = int(data_id)            #生成train_target数据

                for distance in range(1,window_size+1):                 #生成目标左边和右边的train_feat，一个train_feat包含多个数据,一个train_feat包含的数据数量是窗口大小的两倍

                    left_data_index = n-distance
                    right_data_index = n+distance

                    left_data_id = corpus[left_data_index]
                    right_data_id = corpus[right_data_index]

                    train_feat[n-window_size,window_size-distance] = left_data_id
                    train_feat[n-window_size,window_size+distance-1] = right_data_id
        
        return train_feat,train_target


In [72]:
def c2m(corpus,word_to_id,id_to_word,window_size=1):

    '''
    corpus to co-matrix
    '''

    co_matrix_dims = (len(word_to_id),len(word_to_id))
    co_matrix = np.zeros(co_matrix_dims)
    
    for word_index,word_id in enumerate(corpus):
        for applied_window_size in range(1,window_size+1):
            
            if word_index-applied_window_size >= 0:
                left_word_index = word_index-applied_window_size
                left_word_id = corpus[left_word_index]
                co_matrix[word_id,left_word_id] +=1
            
            if word_index+applied_window_size <= len(corpus)-1:
                right_word_index = word_index+applied_window_size
                right_word_id = corpus[right_word_index]
                co_matrix[word_id,right_word_id] +=1
                
    return np.array(co_matrix)

In [75]:
def get_ptb_data(window_size=5,data_format = 'id',negative_sampling_size=4,batch_size=100):
    import sys
    import os
    sys.path.append('..')
    import pickle

    dataset_dir = os.path.abspath('')
    vocab_path = dataset_dir + '\\ptb' + '\\ptb.vocab.pkl'
    save_path = dataset_dir + '\\ptb' + '\\ptb.train.npy'

    with open(vocab_path, 'rb') as f:
        word_to_id, id_to_word = pickle.load(f)
        corpus = np.load(save_path)

    co_matrix = c2m(corpus,word_to_id,id_to_word,window_size=window_size)
    train_feat,train_postive_target = creat_CBOW_train_data(corpus,word_to_id,id_to_word,window_size=window_size,out_format = data_format)
    train_negative_target = negative_sampling(corpus,co_matrix,word_to_id,id_to_word,train_postive_target,negative_sampling_size=negative_sampling_size)
    
    for i in range(int(len(train_postive_target)/batch_size)+1):
        yield train_feat[batch_size*i:min(batch_size*(i+1),len(train_postive_target))],train_postive_target[batch_size*i:min(batch_size*(i+1),len(train_postive_target))],train_negative_target[batch_size*i:min(batch_size*(i+1),len(train_postive_target))]


In [76]:
#测试get_ptb_data
if __name__ == '__main__':
    batch_size_ptb_data = get_ptb_data()
    train_feat,train_postive_target,train_negative_targe = next(batch_size_ptb_data)
    print('down')

down


In [77]:
for i in range(10):
    train_feat[0,i]

array([[ 0.,  1.,  2.,  3.,  4.,  6.,  7.,  8.,  9., 10.],
       [ 1.,  2.,  3.,  4.,  5.,  7.,  8.,  9., 10., 11.],
       [ 2.,  3.,  4.,  5.,  6.,  8.,  9., 10., 11., 12.],
       [ 3.,  4.,  5.,  6.,  7.,  9., 10., 11., 12., 13.],
       [ 4.,  5.,  6.,  7.,  8., 10., 11., 12., 13., 14.],
       [ 5.,  6.,  7.,  8.,  9., 11., 12., 13., 14., 15.],
       [ 6.,  7.,  8.,  9., 10., 12., 13., 14., 15., 16.],
       [ 7.,  8.,  9., 10., 11., 13., 14., 15., 16., 17.],
       [ 8.,  9., 10., 11., 12., 14., 15., 16., 17., 18.],
       [ 9., 10., 11., 12., 13., 15., 16., 17., 18., 19.],
       [10., 11., 12., 13., 14., 16., 17., 18., 19., 20.],
       [11., 12., 13., 14., 15., 17., 18., 19., 20., 21.],
       [12., 13., 14., 15., 16., 18., 19., 20., 21., 22.],
       [13., 14., 15., 16., 17., 19., 20., 21., 22., 23.],
       [14., 15., 16., 17., 18., 20., 21., 22., 23., 24.],
       [15., 16., 17., 18., 19., 21., 22., 23., 24., 25.],
       [16., 17., 18., 19., 20., 22., 23., 24., 25., 26.

In [78]:
train_postive_target

array([ 5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14., 15., 16., 17.,
       18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30.,
       31., 32., 33., 34., 35., 36., 37., 38., 27., 24., 39., 26., 40.,
       41., 42., 26., 43., 32., 44., 45., 46., 24., 47., 26., 27., 28.,
       29., 48., 49., 41., 42., 50., 51., 52., 53., 54., 55., 35., 36.,
       37., 42., 56., 57., 58., 59., 24., 35., 60., 42., 61., 62., 63.,
       64., 65., 66., 67., 68., 69., 70., 35., 71., 72., 42., 73., 74.,
       75., 35., 46., 42., 76., 77., 64., 78., 79.])

In [79]:
train_negative_targe

array([[4703., 8878., 1332., 2017.],
       [6652., 1159., 2936., 1705.],
       [9993., 5072., 2393.,  590.],
       [ 152.,   42., 2109., 9307.],
       [  32.,  871.,   64.,  449.],
       [ 493.,   27.,   26., 5608.],
       [  42., 3397., 3012.,  241.],
       [  24., 2978.,   80.,  566.],
       [ 109., 1435., 2411., 3745.],
       [  64., 4683.,   40.,  874.],
       [8606., 2152., 1436., 3316.],
       [  41.,  154.,   64.,  703.],
       [ 100., 4098., 1442.,  532.],
       [ 935.,  885.,   40.,  988.],
       [ 468., 1722.,   26., 7639.],
       [3181.,   32., 6091.,   34.],
       [ 157., 6528., 4399., 2652.],
       [ 154.,   24., 4434., 1154.],
       [5871., 5027., 3495., 3680.],
       [1887., 6219., 9281.,  948.],
       [3910., 1763., 1200.,   35.],
       [2866.,  200.,   37., 2629.],
       [ 718.,  101., 1378.,   48.],
       [ 181.,  806., 4794., 2303.],
       [  24.,  225.,   32.,   95.],
       [4626., 1085., 4228.,   98.],
       [1217., 3433., 1106., 4278.],
 