# SimpleRNNLM

### Embedding
X(batch_size,T),We(len(co_matrix),hyperparam1)    
            ↓    
y(batch_size,T,hyperparam1)
### TimeRNN
X(batch_size,T,hyperparam1),Wx(hyperparam1,hyperparam2),Wh(hyperparam2,hyperparam2),Br(1,hyperparam2)    
            ↓     
y(batch_size,T,hyperparam2)
### TimeAffine
X(batch_size,T,hyperparam2),Wa(hyperparam2,hyperparam3),Ba(1,hyperparam3)    
            ↓     
y(batch_size,T,hyperparam3)
### TimeSoftmax
x(batch_size,T,hyperparam3)     
            ↓
y(batch_size,T,hyperparam3)
### TimePerPlexityLoss
x(batch_size,T,hyperparam3)     
            ↓     
y(1)

In [38]:
import numpy as np
from collections import OrderedDict
import matplotlib.pyplot as plt

In [39]:
class SimpleRnnlm():
    """Simple RNN language model.

    The model is an ordered pipeline of five layers:
    Embedding -> TimeRNN -> TimeAffine -> TimeSoftmax -> TimePerPlexityLoss.
    """
    
    def __init__(self,init_W,init_b,time_T,batch_size,lr=0.01):
        """Build the layer pipeline.

        init_W     -- sequence (We, Wx, Wh, Wa) of initial weight arrays
        init_b     -- sequence (Br, Ba) of initial bias arrays
        time_T     -- number of RNN cells created inside TimeRNN; train()
                      feeds sequences whose length is len(input_corpus)
        batch_size -- number of rows generated by train()
        lr         -- learning rate used by renew_params()
        """
        
        We,Wx,Wh,Wa = init_W
        Br,Ba = init_b
        self.batch_size = batch_size
        self.lr = lr
        
        self.layers = OrderedDict()
        
        self.layers['embedding'] = Embedding(We)
        self.layers['TimeRNN'] = TimeRNN(Wx,Wh,Br,time_T)
        self.layers['TimeAffine'] = TimeAffine(Wa,Ba)
        self.layers['TimeSoftmax'] = TimeSoftmax()
        self.layers['TimePerPlexityLoss'] = TimePerPlexityLoss()
        
    def forward(self,X,T_id):
        """Run X through every layer and return the loss against T_id."""
        
        # dict views cannot be sliced, so materialise them first.
        # Every layer except the last takes one input; the loss layer
        # additionally needs the target ids.
        for layer in list(self.layers.values())[:-1]:
            X = layer.forward(X)
        
        total_loss = self.layers['TimePerPlexityLoss'].forward(X,T_id)
        
        return total_loss
    
    def backward(self,dout=1):
        """Back-propagate dout through the layers in reverse order."""
        
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)      
        
        return
    
    def renew_params(self):
        """Apply one gradient-descent step to the three trainable layers."""
        
        layer_list = [self.layers['embedding'],
              self.layers['TimeRNN'],
              self.layers['TimeAffine']]
        
        for layer in layer_list:
            # Drive the update from grads: a layer may keep non-trainable
            # entries in params, and those have no gradient.
            for key,grad in layer.grads.items():
                if key in layer.params:
                    # In-place update, so every object that holds the same
                    # array sees the new values.
                    layer.params[key] -= self.lr*grad
                
        return
    
    def _build_batches(self,input_corpus):
        """Return (X, T_id) integer arrays built from a 1-D id sequence.

        Row i of X is the corpus rotated left by i*batch_dist; row i of T_id
        is the same sequence rotated one further step, i.e. the next id for
        every position. X has shape (N, T) and T_id has shape (N, T, 1).
        """
        
        corpus = np.asarray(input_corpus,dtype=np.int64).ravel()
        N = self.batch_size
        T = corpus.shape[0]
        if T == 0:
            raise ValueError('input_corpus must not be empty')
        
        # Rows start at least one position apart.
        batch_dist = max(T//N,1)
        X = np.empty((N,T),dtype=np.int64)
        T_id = np.empty((N,T,1),dtype=np.int64)
        
        for i in range(N):
            offset = i*batch_dist
            X[i] = np.roll(corpus,-offset)
            T_id[i,:,0] = np.roll(corpus,-(offset+1))
        
        return X,T_id
    
    def train(self,input_corpus,iter_num=100):
        """Train on input_corpus for iter_num iterations and plot the loss.

        The loss is printed and recorded every 10th iteration.
        """
        
        loss_list = []
        iter_index_list = []
        
        X,T_id = self._build_batches(input_corpus)
        
        for n in range(iter_num):
            
            # One forward and one backward pass
            loss = self.forward(X,T_id)
            self.backward(dout=1)
            
            # Update the parameters from the gradients just computed
            self.renew_params()
            
            if n%10 == 0:
                print(f'第{n}次的损失为：',loss)
                loss_list.append(loss)
                iter_index_list.append(n)              
        
        # Plot how the loss evolved during training
        fig = plt.figure(figsize=(10,8))
        plt.plot(iter_index_list,loss_list)
        fig.show()
        
        return

In [40]:
class Embedding():
    """Lookup layer that maps word ids to rows of the weight matrix W."""
    
    def __init__(self,W):
        
        self.params = {}
        self.params['W'] = W
        
        self.grads = {}
        
        # Ids seen by the last forward pass. Kept outside params so that
        # params only ever contains trainable arrays.
        self.cache_X = None
    
    def forward(self,X):
        
        '''
        X has shape (N, T) and W has shape (vocabulary size, WW).
        X holds word ids, not one-hot vectors.
        N is the batch size.
        T is the sequence length.
        WW is the dimension of the distributed word vectors.
        Returns an array of shape (N, T, WW).
        '''
        W = self.params['W']
        
        # Ids must be integers to be usable as indices; accept float arrays
        # that hold whole numbers.
        ids = np.asarray(X)
        if not np.issubdtype(ids.dtype,np.integer):
            ids = ids.astype(np.int64)
        
        self.cache_X = ids
        
        # Integer-array indexing returns a new (N, T, WW) array.
        y = W[ids]
                
        return y
    
    def backward(self,dout):
        """Accumulate dout (N, T, WW) into the rows of W that were looked up.

        Stores the result in grads['W'] and returns it.
        """
        
        W = self.params['W']
        ids = self.cache_X
        
        # Start from zero on every backward pass because rows are accumulated.
        dW = np.zeros_like(W)
        
        # np.add.at accumulates correctly when the same id occurs more than
        # once; a fancy-index "+=" would keep only one contribution.
        np.add.at(dW,ids,dout)
        
        self.grads['W'] = dW

        return self.grads['W']

In [41]:
class RNN():
    """One recurrent cell: h_next = tanh(X.Wx + h_prev.Wh + Br)."""
    
    def __init__(self,Wx,Wh,Br):
        
        self.params = {}
        self.params['Wx'] = Wx
        self.params['Wh'] = Wh
        self.params['Br'] = Br
        
        self.grads = {}        
        
        # [h_next, X, h_prev] from the last forward pass
        self.cache = None
    
    def forward(self,X,h_prev):
        """Compute the next hidden state.

        X is (N, D) and h_prev is (N, H); a scalar h_prev (for example the
        initial state 0) is expanded to an (N, H) array of that value.
        Returns h_next with shape (N, H).
        """
        
        Wx,Wh,Br = self.params['Wx'],self.params['Wh'],self.params['Br']
        
        if np.ndim(h_prev) == 0:
            # np.dot(scalar, Wh) would yield an (H, H) matrix, so give the
            # state its real shape first.
            h_prev = np.full((X.shape[0],Wh.shape[0]),h_prev,dtype=float)
        
        h_next = np.tanh(np.dot(X,Wx) + np.dot(h_prev,Wh) + Br)
        
        self.cache = [h_next,X,h_prev]
        
        return h_next
    
    def backward(self,dout):
        
        '''
        dout is the sum of the gradients of the two outputs of this cell:
        the hidden state passed to the next time step and the value passed
        up to the next layer.
        Stores the parameter gradients in grads and returns (dx, dh_prev).
        '''
        
        Wx,Wh,Br = self.params['Wx'],self.params['Wh'],self.params['Br']
        h_next,X,h_prev = self.cache
        
        # d tanh(z)/dz = 1 - tanh(z)**2
        dtanh = dout * (1 - h_next**2)
        
        dx = np.dot(dtanh,Wx.T)
        dh_prev = np.dot(dtanh,Wh.T)
        
        # Weight gradients take the transposed input on the left so the
        # result has the shape of the weight itself.
        self.grads['Wx'] = np.dot(X.T,dtanh)
        self.grads['Wh'] = np.dot(h_prev.T,dtanh)
        self.grads['Br'] = np.sum(dtanh,axis=0).reshape(np.shape(Br))
        
        return dx,dh_prev

In [42]:
class TimeRNN():
    """A chain of T RNN cells, built from the same weight arrays, unrolled over time."""
    
    def __init__(self,Wx,Wh,Br,T,statful=False,truncate_size=10):
        
        '''
        T is the number of RNN cells, i.e. the length of the time series.
        statful switches truncation of the back-propagated hidden-state
        gradient on or off.
        truncate_size is the distance, in time steps, between truncations.
        '''
        
        # Parameters of the whole TimeRNN
        self.params = {}
        self.params['Wx'] = Wx
        self.params['Wh'] = Wh
        self.params['Br'] = Br
        
        # Gradients of the whole TimeRNN
        self.grads = {}
        
        # Hidden state remembered by the forward pass
        self.h = 0
        
        # Hidden-state gradient carried between cells by the backward pass
        self.dh_prev = 0
        
        # Truncation settings
        self.statful = statful
        self.truncate_size = truncate_size
        
        # One RNN cell per time step; all of them are handed the same arrays
        self.layers = []
        for i in range(T):
            layer = RNN(Wx,Wh,Br)
            self.layers.append(layer)
            
    def forward(self,Xs):
        """Run every cell in time order.

        Xs has shape (N, T, D). Returns Hs with shape (N, T, H) holding the
        hidden state of each time step. The last hidden state is kept in
        self.h and used as the initial state of the next call when its shape
        still matches (N, H); otherwise the state restarts from zeros.
        """
        
        Wx = self.params['Wx']
        N,T,D = Xs.shape
        D,H = Wx.shape
        Hs = np.empty((N,T,H))
        
        # A scalar or mis-shaped state cannot be multiplied by Wh.
        if not (isinstance(self.h,np.ndarray) and self.h.shape == (N,H)):
            self.h = np.zeros((N,H))
        
        # Compute h for every time step and store it at its position in Hs
        for t,layer in enumerate(self.layers):
            
            self.h = layer.forward(Xs[:,t,:],self.h)
            
            Hs[:,t,:] = self.h
        
        return Hs
    
    # Original (misspelled) method name, kept so existing callers still work.
    fowrward = forward
    
    def backward(self,dhs):
        """Back-propagate through time.

        dhs has shape (N, T, H). The summed parameter gradients of all cells
        are stored in grads; the returned dxs has shape (N, T, D).
        """
        
        # The gradients are accumulated with +=, so start from zero each time
        Wx,Wh,Br = self.params['Wx'],self.params['Wh'],self.params['Br']
        self.grads['Wx'] = np.zeros_like(Wx)
        self.grads['Wh'] = np.zeros_like(Wh)
        self.grads['Br'] = np.zeros_like(Br)
        
        N,T,H = dhs.shape
        D = Wx.shape[0]
        dxs = np.empty((N,T,D))
        
        # Nothing flows in from beyond the last time step; without this reset
        # the value left by the previous backward call would be reused.
        self.dh_prev = 0
         
        len_of_layers = len(self.layers)
        
        # Walk the cells from the last one to the first; t is the time index
        # that belongs to the cell currently visited.
        for reversed_t,layer in enumerate(reversed(self.layers)):
            
            t = len_of_layers - reversed_t -1

            dh_out = dhs[:,t,:]

            # Truncated BPTT: cut the hidden-state gradient at the boundary
            if (t+1)%self.truncate_size == 0 and self.statful == True:
                self.dh_prev = 0

            dout = dh_out + self.dh_prev

            dx,self.dh_prev = layer.backward(dout)
            dxs[:,t,:] = dx
            
            # Every cell uses the same weights and bias, so sum the per-cell
            # gradients and let the optimiser apply a single update.
            self.grads['Wx'] += layer.grads['Wx']
            self.grads['Wh'] += layer.grads['Wh']
            self.grads['Br'] += layer.grads['Br']
        
        return dxs

In [43]:
class TimeAffine():
    """Fully connected layer applied independently to every time step."""
    
    def __init__(self,Wa,Ba):
        
        self.params = {}
        self.params['Wa'] = Wa
        self.params['Ba'] = Ba
        
        self.grads = {}
        self.grads['Wa'] = np.zeros_like(Wa)
        self.grads['Ba'] = np.zeros_like(Ba)
        
        # Flattened input of the last forward pass. Kept outside params so
        # that params only ever contains trainable arrays.
        self.cache_X = None
    
    def forward(self,X):
        """Map X (N, T, H) to y (N, T, D) with y = X.Wa + Ba."""
        
        Wa,Ba = self.params['Wa'],self.params['Ba']
        N,T,H = X.shape
        H,D = Wa.shape

        # Wa is two-dimensional while X is three-dimensional, so merge the
        # batch and time axes for the product and split them again after.
        X_flat = X.reshape((N*T,H))
        self.cache_X = X_flat

        y_flat = np.dot(X_flat,Wa) + Ba
        
        return y_flat.reshape((N,T,D))
    
    def backward(self,dout):
        """Store the gradients of Wa and Ba and return dx (N, T, H)."""
        
        Wa,Ba = self.params['Wa'],self.params['Ba']
        X_flat = self.cache_X
        N,T,D = dout.shape
        H,D = Wa.shape
        
        # Same flattening as in forward, for the same reason
        dout_flat = dout.reshape((N*T,D))
        
        dx = np.dot(dout_flat,Wa.T).reshape((N,T,H))
        
        self.grads['Wa'] = np.dot(X_flat.T,dout_flat)
        # Give the bias gradient the shape of the bias itself, matching the
        # zeros_like(Ba) placeholder created in __init__.
        self.grads['Ba'] = np.sum(dout_flat,axis=0).reshape(np.shape(Ba))
        
        return dx

In [44]:
class TimeSoftmax():
    """Softmax over the last axis, applied to every (batch, time) position."""
    
    def __init__(self):
        
        self.params = {}
        self.grads = {}
        
    def forward(self,X):
        """Return softmax(X) along the last axis; the output has X's shape.

        The input and the output are stored in params['X'] and params['y'].
        """
        
        self.params['X'] = X
        
        # Subtracting the row maximum leaves the result unchanged but keeps
        # exp() from overflowing on large inputs.
        shifted = X - np.max(X,axis=-1,keepdims=True)
        exp_x = np.exp(shifted)
        y = exp_x/np.sum(exp_x,axis=-1,keepdims=True)
        
        # y is needed again by the backward pass
        self.params['y'] = y
        
        return y
    
    # The method was originally declared as "__forward"; inside the class
    # body that name is mangled, so this alias keeps the old spelling alive.
    __forward = forward
    
    def backward(self,dout):
        """Return the gradient with respect to the softmax input.

        Each output depends on every input of its row, so the full
        Jacobian-vector product is dx_j = y_j * (dout_j - sum_k dout_k*y_k).
        The result is also stored in grads['X'].
        """
        
        y = self.params['y']
        
        dx = y*(dout - np.sum(dout*y,axis=-1,keepdims=True))
        
        self.grads['X'] = dx 
        
        return dx

In [45]:
class TimePerPlexityLoss():
    """Loss equal to the mean reciprocal probability of the target ids.

    Note: this is the arithmetic mean of 1/p over all positions, as in the
    original design; it is not the exp-of-mean-log form of perplexity.
    """
    
    def __init__(self,eps = 1e-8):
        
        # eps is added to every probability so the reciprocal never divides
        # by zero.
        self.params ={}
        self.grads = {}
        self.eps = eps
        
    def forward(self,X,T_id):
        """Return the mean of 1/(p_target + eps) over all N*T positions.

        X has shape (N, T, D) and holds the predicted distribution of every
        position. T_id holds the target id of every position, with shape
        (N, T, 1) or (N, T). With a perfect fit the loss approaches its
        minimum of 1.
        """
        
        N,T,D = X.shape
        
        # Ids are used as indices, so they must be integers.
        ids = np.asarray(T_id).reshape(N,T).astype(np.int64)
        
        self.params['X'] = X
        self.params['T_id'] = ids.reshape(N,T,1)
        
        # Pick the probability assigned to the target id at every position
        rows = np.arange(N)[:,None]
        cols = np.arange(T)[None,:]
        picked = X[rows,cols,ids]
        
        loss_t = np.sum(1/(picked + self.eps))/(N*T)
        
        return loss_t
    
    def backward(self,dout=1):
        """Return d(loss)/dX, also stored in grads['X'].

        Only the target entry of each position has a non-zero gradient:
        -dout / ((p_target + eps)**2 * N*T), the exact derivative of the
        value computed by forward.
        """
        
        X,T_id = self.params['X'],self.params['T_id']
        N,T,D = X.shape
        
        ids = T_id.reshape(N,T)
        rows = np.arange(N)[:,None]
        cols = np.arange(T)[None,:]
        
        dx = np.zeros(X.shape,dtype=float)
        dx[rows,cols,ids] = dout*(-1/((X[rows,cols,ids] + self.eps)**2))/(N*T)
        
        self.grads['X'] =dx
        
        return dx