In [1]:
import collections
import torch
import re

In [2]:
with open('朱自清.txt',encoding='utf-8') as f:
    corpus_chars = f.read()
corpus_chars[:100] 

'\u3000\u3000燕子去了，有再来的时候；杨柳枯了，有再青的时候；桃花谢了，有再开的时候。但是，聪明的，你告诉我，我们的日子为什么一去不复返呢？——是有人偷了他们罢：那是谁？又藏在何处呢？是他们自己逃走了罢：现在又'

In [3]:
corpus_chars = corpus_chars.replace('\n',' ').replace('\u3000',' ').replace('\r',' ').replace('  ',' ').replace('  ',' ')
corpus_chars=corpus_chars[:50000] 
len(corpus_chars)

50000

In [5]:
# 建立字符索引，把每个字符映射成一个从0开始的连续整数
idx_to_char = list(set(corpus_chars)) # 所有不同的字符的列表
char_to_idx = dict([(char,i) for i,char in enumerate(idx_to_char)]) # ｛char:idx}
vocab_size = len(char_to_idx)
vocab_size # 词典中不同字符的个数
char_to_idx.get('汇')

In [32]:
corpus_indices = [char_to_idx[char] for char in corpus_chars] # 将每个字符转换成索引
sample = corpus_indices[:20]
print('chars:',''.join([idx_to_char[idx] for idx in sample]))
print('indices:',sample) # 空格为1467

chars:   燕子 去 了 ， 有 再 来 的 时
indices: [1312, 1312, 1296, 1192, 1312, 1083, 1312, 173, 1312, 98, 1312, 597, 1312, 453, 1312, 311, 1312, 614, 1312, 291]


In [33]:
# 获取特征词向量，用word2vec训练
import gensim
def train_word2vec_model(text,save_path):
    model = gensim.models.Word2Vec(text,vector_size=100,min_count=1,window=5)
    model.save(save_path)
    return model

In [34]:
word2vec_model = train_word2vec_model(corpus_chars,'word2vec.model').wv
char_to_vec= dict() # 字与向量的映射
# char_to_idx
for i in idx_to_char:
    char_to_vec[i] = word2vec_model[i]
char_to_vec['风']

array([-2.8666472e-03, -5.8289622e-03,  6.4622974e-03,  1.1263156e-03,
       -2.8119159e-03,  6.1453986e-03, -9.2387199e-06,  1.0511898e-03,
        9.0559367e-03, -9.3491720e-03, -5.9714173e-03, -9.5812250e-03,
        6.6310167e-03, -8.9056566e-03, -4.4304682e-03, -1.3209343e-03,
        1.2212896e-03,  1.8017649e-03, -1.5733480e-04, -1.7512107e-03,
       -2.1083809e-03,  6.9443081e-03, -1.7246294e-03,  6.5303803e-03,
        2.9004954e-03, -4.7146892e-03, -7.1290992e-03,  3.3541871e-03,
        4.1413306e-05, -2.6964045e-03,  8.8155838e-03, -2.3798679e-03,
        4.9240352e-04,  5.4495572e-03,  3.4132313e-03, -9.1690710e-03,
       -9.5094442e-03, -5.3740237e-03, -9.3492772e-03, -5.7461503e-04,
        2.3370313e-03,  7.2864629e-03, -1.6360736e-03,  3.7358522e-03,
        1.3703585e-04,  5.6899977e-03, -7.5556971e-03,  8.1030102e-03,
        4.2802882e-03,  5.4254029e-03,  8.3587049e-03, -4.6507598e-04,
        9.0548424e-03, -5.0911712e-03,  3.4983135e-03, -7.7332091e-03,
      

In [35]:
num_steps = 35
batch_size = 2
state = None


In [36]:
# 对时序数据进行采样
# 每次随机读取小批量的样本
# 相邻采样
# 其中batch_size指每个小批量的样本数，num_steps为每个样本所包含的时间步数
def data_iter_consecutive(corpus_indices,batch_size,num_steps):
    corpus_indices = torch.tensor(corpus_indices,dtype=torch.float32)
    data_len = len(corpus_indices)
    batch_len = data_len//batch_size
    indices = corpus_indices[0:batch_size*batch_len].view(batch_size,batch_len)
    epoch_size = (batch_len-1)//num_steps
    for i in range(epoch_size):
        i = i * num_steps
        x = indices[:,i:i+num_steps]
        y = indices[:,i+1:i+num_steps+1]
        yield x,y

In [37]:
example = list(range(30))
for x,y in data_iter_consecutive(example,batch_size=2,num_steps=6):
    print('x:',x,'\ny:',y,'\n')

x: tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],
        [15., 16., 17., 18., 19., 20.]]) 
y: tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [16., 17., 18., 19., 20., 21.]]) 

x: tensor([[ 6.,  7.,  8.,  9., 10., 11.],
        [21., 22., 23., 24., 25., 26.]]) 
y: tensor([[ 7.,  8.,  9., 10., 11., 12.],
        [22., 23., 24., 25., 26., 27.]]) 



In [38]:
import time
import math
import numpy as np
from torch import nn,optim
import torch.nn.functional as F

In [39]:
# 定义模型，一个含单隐藏层，隐藏单元个数为256的循环神经网络层
num_hiddens = 256
rnn_layer = nn.RNN(input_size=100,hidden_size=num_hiddens)
# rnn输入为（时间步数，批量大小，词向量长度）
# 在前向计算后返回输出和隐藏状态h，输出是隐藏层在各个时间步上计算并输出的隐藏状态，形状为（时间步数，批量大小，隐藏单元的个数

In [46]:
# 加载预训练的词向量
def pretrained_embedding_layer(char_to_vec,char_to_idx):
    vocab_len = vocab_size+1 # +1是因为有个0向量占一个位置
    emb_dim = 100
    # 初始化嵌入矩阵
    emb_matrix = np.zeros((vocab_len,emb_dim))
    # 用词向量填充嵌入矩阵，每行为一个词向量
    for char,idx in char_to_idx.items():
        emb_matrix[idx+1,:] = char_to_vec[char] # +1是因为0向量索引为0，词向量是从索引1开始
    # 将嵌入矩阵传入embedding层，作为权重矩阵
    weight = torch.from_numpy(emb_matrix)
    embedding = nn.Embedding.from_pretrained(weight)
    embedding.weight.requires_grad = True
    
    return embedding
    


In [48]:
x=pretrained_embedding_layer(char_to_vec,char_to_idx)(torch.LongTensor([112]))
x=x.view(-1,1,100).float()
rnn_layer(x)

(tensor([[[ 7.8924e-03, -1.3058e-02, -7.0441e-02, -9.7521e-02, -6.6472e-02,
            1.1272e-02,  2.2787e-02,  6.7410e-02, -7.4864e-02, -1.7933e-02,
           -5.3138e-02,  3.7329e-02,  9.5321e-02,  2.5724e-02, -3.7598e-02,
           -3.5511e-02,  5.1815e-02, -3.0662e-02, -3.3188e-02, -1.4269e-03,
           -9.8482e-02,  1.5738e-02, -3.1819e-02, -1.8706e-02, -1.4760e-02,
            4.0744e-02, -7.6507e-02,  4.4752e-02, -5.4792e-02,  9.2973e-02,
           -6.5255e-02,  5.5479e-02, -2.6164e-02,  9.5290e-03,  5.9470e-03,
            4.4912e-02,  3.8407e-02, -6.9121e-02, -1.8717e-03,  5.4405e-03,
            7.9869e-02,  1.8706e-02,  5.1258e-02,  1.2849e-03,  2.8547e-02,
            8.1578e-03, -1.0277e-01, -5.1942e-02,  1.4911e-02,  2.5798e-02,
           -6.1167e-02, -1.1691e-02, -5.5279e-02, -2.9778e-02,  9.3656e-02,
            9.8206e-02, -9.0899e-02,  3.5481e-03,  6.6941e-02, -5.3665e-02,
           -1.6145e-02, -5.9621e-02, -8.1803e-02, -3.7865e-02, -2.7307e-03,
           -

In [49]:
# 搭建rnn模型
class RNNModel(nn.Module):
    def __init__(self,rnn_layer,vocab_size,char_to_vec,char_to_idx):
        super(RNNModel,self).__init__()
        self.embedding = pretrained_embedding_layer(char_to_vec,char_to_idx)
        self.rnn = rnn_layer
        self.hidden_size = rnn_layer.hidden_size*(2 if rnn_layer.bidirectional else 1)
        self.vocab_size = vocab_size+1
        self.dense = nn.Linear(self.hidden_size,vocab_size+1)
        self.state = None
    
    def forward(self,inputs,state): # input:(batch,seq_len)
        X = self.embedding(torch.LongTensor(inputs.numpy())) # 1*100维的LongTensor
                           
        Y, self.state = self.rnn(X.view(-1,1,100).float(),state)
        # 全连接层会首先将Y的形状变成(num_steps * batch_size, num_hiddens)，它的输出
        # 形状为(num_steps * batch_size, vocab_size)
        output = self.dense(Y.view(-1, Y.shape[-1]))
        return output, self.state

In [50]:
def predict_rnn_pytorch(prefix, num_chars, model, vocab_size,idx_to_char,
                      char_to_idx):
    state = None
    output = [char_to_idx[prefix[0]]] # output会记录prefix加上输出
    for t in range(num_chars + len(prefix) - 1):
        X = torch.tensor([output[-1]]).view(1, 1)
        
        if state is not None:
            if isinstance(state, tuple): # LSTM, state:(h, c)  
                state = (state[0], state[1])
            else:   
                state = state
            
        (Y, state) = model(X.view(-1,), state)
        if t < len(prefix) - 1:
            output.append(char_to_idx[prefix[t + 1]])
        else:
            output.append(int(Y.argmax(dim=1).item()))
    return ''.join([idx_to_char[i] for i in output])

In [51]:
model = RNNModel(rnn_layer, vocab_size,char_to_vec,char_to_idx)
predict_rnn_pytorch('分开', 10, model, vocab_size, idx_to_char, char_to_idx)

'分开溜溜忽棠棠棠棠棠棠棠'

In [52]:
def grad_clipping(params, theta):
    norm = torch.tensor([0.0])
    for param in params:
        norm += (param.grad.data ** 2).sum()
    norm = norm.sqrt().item()
    if norm > theta:
        for param in params:
            param.grad.data *= (theta / norm)

In [53]:
def train_and_predict_rnn_pytorch(model, num_hiddens, vocab_size,
                                corpus_indices, idx_to_char, char_to_idx,
                                num_epochs, num_steps, lr, clipping_theta,
                                batch_size, pred_period, pred_len, prefixes):
    loss = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    state = None
    for epoch in range(num_epochs):
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_consecutive(corpus_indices, batch_size, num_steps) # 相邻采样
        for X, Y in data_iter:
            if state is not None:
                # 使用detach函数从计算图分离隐藏状态, 这是为了
                # 使模型参数的梯度计算只依赖一次迭代读取的小批量序列(防止梯度计算开销太大)
                if isinstance (state, tuple): # LSTM, state:(h, c)  
                    state = (state[0].detach(), state[1].detach())
                else:   
                    state = state.detach()
    
            (output, state) = model(X, state) # output: 形状为(num_steps * batch_size, vocab_size)
            
            # Y的形状是(batch_size, num_steps)，转置后再变成长度为
            # batch * num_steps 的向量，这样跟输出的行一一对应
            y = torch.transpose(Y, 0, 1).contiguous().view(-1)
            l = loss(output, y.long())
            
            optimizer.zero_grad()
            l.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=20, norm_type=2)
            optimizer.step()
            l_sum += l.item() * y.shape[0]
            n += y.shape[0]
        
        try:
            perplexity = math.exp(l_sum / n)
        except OverflowError:
            perplexity = float('inf')
        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, perplexity, time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn_pytorch(
                    prefix, pred_len, model, vocab_size, idx_to_char,
                    char_to_idx))

In [22]:
num_epochs, batch_size, lr, clipping_theta = 240, 32, 1e-3, 1e-2 # 注意这里的学习率设置
pred_period, pred_len, prefixes = 30, 50, ['下雨', '明天']
train_and_predict_rnn_pytorch(model, num_hiddens, vocab_size,
                            corpus_indices, idx_to_char, char_to_idx,
                            num_epochs, num_steps, lr, clipping_theta,
                            batch_size, pred_period, pred_len, prefixes)

epoch 30, perplexity 68.777427, time 9.08 sec
 - 下雨他岩"，子这。，的条了，，的的是，，我的也的，我我；，，，的的。，。我，也！，。的是，的是，的的，，
 - 明天当我，们说，我的也的，是，的也"，，而；，。的，是，我我的有，，的说不这的有的也我，，一的。，是我的
epoch 60, perplexity 11.361779, time 9.30 sec
 - 下雨他岩"缀漾岩我上他了，，鲜这南！的，是人友；的人；然国 子有们婚这，本。，以谓不上的的有，是的。这但
 - 明天当却他很是他了野的了，也地的。一她我着人"的有是像那一人。，过讲未等总了起我的，是，佛忙电伯他见的无
epoch 90, perplexity 4.587910, time 9.06 sec
 - 下雨如一的一大学出么的；风。；生。，物。。我的有，，我线实等倒这说上我，见记来。多地看，不"不的钱。—象
 - 明天当却他很是：人说弄他我，哭是欢也。着"他韦恰叶律。！，，的可—是往船。垂来的》了成看怀。，而""，客
epoch 120, perplexity 2.800148, time 8.95 sec
 - 下雨怕岩"系它我的他之件所住说像未别他"， 头看一是月起的！生的做使给我很呢梦应的这 。—谓"物的说为我
 - 明天当苏里。""一的辉欢父这我的糊，子看天他的主要。道我立泪他便近这"微中道我的的是笼唉北位；的和肯没责
epoch 150, perplexity 2.073169, time 10.18 sec
 - 下雨如一那一。"出死会，的说眼！，了。一；认那我的着的受个， 你用已了好，得。会，慢，了以；"船已说好我
 - 明天当却他尽是但票。听的不书说我可们子对住的渐茫回尔将桥可，；信人的钱二阿经晚不脸月被了觉上，中"见我生
epoch 180, perplexity 1.724087, time 9.11 sec
 - 下雨如一那一。"门，女住卖唱得神国种右道不"，吧有。；。花在"如了，出在使，是一个样人很的上；只各，来照
 - 明天角买里醒异些秦？梦还的我的了热但定桥晚名的见了那之里来她，""别说正了齐于，，而的做—来些脸的呢从我
epoch 210, perplexity 1.467351, time 9.20 sec
 - 下雨如