In [1]:
# 查看当前挂载的数据集目录
!ls /home/aistudio/data/

In [2]:
# 查看个人持久化工作区文件
!ls /home/aistudio/work/

!wget https://zh.d2l.ai/d2l-zh.zip

!unzip d2l-zh.zip

In [3]:
!nvcc -V

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Tue_Jun_12_23:07:04_CDT_2018
Cuda compilation tools, release 9.2, V9.2.148


In [4]:
!nvidia-smi

Tue Jul  2 17:05:39 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 396.37                 Driver Version: 396.37                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:0A.0 Off |                    0 |
| N/A   34C    P0    41W / 300W |      0MiB / 16160MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [5]:
!pip install mxnet-cu92==1.4.0

!pip install d2lzh==0.8.11

In [6]:
import d2lzh as d2l
ctx = d2l.try_gpu()
print('will use', ctx)

will use gpu(0)


# 语言模型数据集
## 读取数据集

In [7]:
from mxnet import nd
import random
import zipfile

with zipfile.ZipFile('jaychou_lyrics.txt.zip') as zin:
    with zin.open('jaychou_lyrics.txt') as f:
        corpus_chars=f.read().decode('utf-8')

corpus_chars.replace('\n',' ')[:5]

'想要有直升'

In [8]:
##将转义字符转换成空格
corpus_chars=corpus_chars.replace('\n',' ').replace('\r',' ')
print(len(corpus_chars))

##建立字符映射
idx_to_char=list(set(corpus_chars))
char_to_idx=dict([(char,idx) for idx,char in enumerate(idx_to_char)])
vocab_size=len(idx_to_char)
print(vocab_size)

##将每个字符转换成索引
corpus_indices=[char_to_idx[char] for char in corpus_chars]
print(corpus_indices[0:20])

63282
2582
[2033, 2338, 1421, 121, 2474, 20, 733, 2033, 2338, 2422, 663, 310, 699, 1313, 2564, 2340, 733, 2033, 2338, 2422]


## 时序数据的采样
### 随机采样
*相邻的两个随机小批量在原始序列上的weizhi位置不一定相毗邻*

In [9]:
def data_iter_random(corpus_indices, batch_size, num_steps, ctx=None):
    num_examples=(len(corpus_indices)-1)//num_steps
    epoch_size=num_examples//batch_size
    example_indices=list(range(num_examples))
    random.shuffle(example_indices)
    
    def _data(pos):
        return corpus_indices[pos:pos+num_steps]
        
    for i in range(epoch_size):
        i=i*batch_size
        batch_indices=example_indices[i:i+batch_size]
        x=[_data(j*num_steps) for j in batch_indices]
        y=[_data(j*num_steps+1) for j in batch_indices]
        yield nd.array(x, ctx), nd.array(y, ctx)

In [10]:
my_seq=list(range(30))
for x,y in data_iter_random(my_seq, batch_size=2, num_steps=6):
    print('x:',x,'\ny:',y,'\n')

x: 
[[12. 13. 14. 15. 16. 17.]
 [18. 19. 20. 21. 22. 23.]]
<NDArray 2x6 @cpu(0)> 
y: 
[[13. 14. 15. 16. 17. 18.]
 [19. 20. 21. 22. 23. 24.]]
<NDArray 2x6 @cpu(0)> 

x: 
[[ 0.  1.  2.  3.  4.  5.]
 [ 6.  7.  8.  9. 10. 11.]]
<NDArray 2x6 @cpu(0)> 
y: 
[[ 1.  2.  3.  4.  5.  6.]
 [ 7.  8.  9. 10. 11. 12.]]
<NDArray 2x6 @cpu(0)> 



### 相邻采样

In [65]:
def data_iter_consecutive(corpus_indices, batch_size, num_steps, ctx=None):
    corpus_indices = nd.array(corpus_indices, ctx=ctx)
    data_len = len(corpus_indices)
    batch_len = data_len // batch_size
    indices = corpus_indices[0: batch_size*batch_len].reshape((
        batch_size, batch_len))
    epoch_size = (batch_len - 1) // num_steps
    for i in range(epoch_size):
        i = i * num_steps
        X = indices[:, i: i + num_steps]
        Y = indices[:, i + 1: i + num_steps + 1]
        yield X, Y

In [66]:
my_seq=list(range(30))
for X, Y in data_iter_consecutive(my_seq, batch_size=2, num_steps=6):
    print('X: ', X, '\nY:', Y, '\n')

X:  
[[ 0.  1.  2.  3.  4.  5.]
 [15. 16. 17. 18. 19. 20.]]
<NDArray 2x6 @cpu(0)> 
Y: 
[[ 1.  2.  3.  4.  5.  6.]
 [16. 17. 18. 19. 20. 21.]]
<NDArray 2x6 @cpu(0)> 

X:  
[[ 6.  7.  8.  9. 10. 11.]
 [21. 22. 23. 24. 25. 26.]]
<NDArray 2x6 @cpu(0)> 
Y: 
[[ 7.  8.  9. 10. 11. 12.]
 [22. 23. 24. 25. 26. 27.]]
<NDArray 2x6 @cpu(0)> 



# RNN从零实现

In [13]:
import d2lzh as d2l
import math 
from mxnet import autograd,nd
from mxnet.gluon import loss as gloss
import time

(corpus_indices, char_to_idx, idx_to_char, vocab_size) = d2l.load_data_jay_lyrics()

## one-hot向量

In [14]:
nd.one_hot(nd.array([0,2]), vocab_size)


[[1. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]]
<NDArray 2x1027 @cpu(0)>

In [26]:
def to_onehot(X, size):
    return [nd.one_hot(x, size) for x in X.T]

X=nd.arange(10).reshape((2,5))
inputs=to_onehot(X, vocab_size)
len(inputs), inputs[0].shape

(5, (2, 1027))

## 初始化模型参数
$$H_{t} = \phi ( X_{t} W_{xh}+H_{t-1} W_{hh}+b_{h})$$

$$O_t=H_t W_{hq}+b_q$$

In [28]:
num_inputs, num_hiddens, num_outputs=vocab_size, 256, vocab_size
ctx=d2l.try_gpu()
print('will use', ctx)

def get_params():
    def _one(shape):
        return nd.random.normal(scale=0.01, shape=shape, ctx=ctx)
    
    W_xh=_one((num_inputs,num_hiddens))
    W_hh=_one((num_hiddens,num_hiddens))
    b_h=nd.zeros(num_hiddens,ctx=ctx)
            
    W_hq=_one((num_hiddens,num_outputs))
    b_q=nd.zeros(num_outputs,ctx=ctx)
            
    params=[W_xh, W_hh, b_h, W_hq, b_q]
    for param in params:
        param.attach_grad()
    return params

will use gpu(0)


### 定义模型

In [31]:
def init_rnn_state(batch_size, num_hiddens, ctx):
    return (nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx), )

def rnn(inputs, state, params):
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs=[]
    for X in inputs:
        H = nd.tanh(nd.dot(X,W_xh)+nd.dot(H,W_hh)+b_h)
        Y=nd.dot(H, W_hq)+b_q
        outputs.append(Y)
    return outputs, (H,)

In [32]:
state=init_rnn_state(X.shape[0], num_hiddens, ctx)
inputs=to_onehot(X.as_in_context(ctx), vocab_size)
params=get_params()
outputs, state_new=rnn(inputs, state, params)
len(outputs), outputs[0].shape, state_new[0].shape

(5, (2, 1027), (2, 256))

### 定义预测函数

In [42]:
def predict_rnn(prefix, num_chars, rnn, params, init_rnn_state, num_hiddens, vocab_size, ctx, id_to_char, char_to_idx):
    state=init_rnn_state(1, num_hiddens, ctx)
    output=[char_to_idx[prefix[0]]]
    for t in range(num_chars+len(prefix)-1):
        X=to_onehot(nd.array([output[-1]], ctx=ctx), vocab_size)
        (Y, state)=rnn(X, state, params)
        print(Y)
        if t<len(prefix)-1:
            output.append(char_to_idx[prefix[t+1]])
        else:
            output.append(int(Y[0].argmax(axis=1).asscalar()))
    return ''.join([idx_to_char[i] for i in output])

In [43]:
predict_rnn('分开', 10, rnn, params, init_rnn_state, num_hiddens, vocab_size,ctx, idx_to_char, char_to_idx)

[
[[-0.00235707  0.00058878  0.00084467 ...  0.00044898 -0.00102336
   0.00032249]]
<NDArray 1x1027 @gpu(0)>]
[
[[ 0.00148065  0.00114291 -0.00038911 ... -0.0002314  -0.0022492
  -0.00142049]]
<NDArray 1x1027 @gpu(0)>]
[
[[ 0.00137545  0.00141558 -0.00186955 ...  0.00060113 -0.00070479
  -0.00119103]]
<NDArray 1x1027 @gpu(0)>]
[
[[ 0.00274478 -0.00279694 -0.00025773 ...  0.00143543 -0.00013176
  -0.00072929]]
<NDArray 1x1027 @gpu(0)>]
[
[[-2.3358078e-03  1.6226274e-03  5.0106470e-04 ... -5.5089960e-05
  -1.1883190e-04  6.5149204e-04]]
<NDArray 1x1027 @gpu(0)>]
[
[[-0.00254922 -0.00075551 -0.00086057 ... -0.00206103 -0.00120417
  -0.00375299]]
<NDArray 1x1027 @gpu(0)>]
[
[[-0.00236148  0.00024627 -0.00022458 ...  0.00217802  0.00056818
  -0.00164119]]
<NDArray 1x1027 @gpu(0)>]
[
[[ 2.2049040e-05 -3.2432147e-03 -1.0718517e-03 ...  7.1719434e-05
   7.7870721e-04  1.4492834e-03]]
<NDArray 1x1027 @gpu(0)>]
[
[[ 2.3618273e-03 -5.2169413e-05 -1.1347601e-03 ... -1.6997540e-03
  -2.0093662e-03 

'分开化们斗腐色失共马忙碰'

### 裁剪梯度
$$\min\left(\frac{\theta}{\|g\|}, 1\right)g$$

In [44]:
def grad_clipping(params, theta, ctx):
    norm=nd.array([0], ctx)
    for param in params:
        norm+=(param.grad**2).sum()
    norm=norm.sqrt().asscalar()
    if norm>theta:
        for param in params:
            param.grad[:]*=theta/norm

### 困惑度
*困惑度是对交叉熵损失函数做指数运算后得到的值*

### 定义训练模型参数

In [61]:
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens, vocab_size, ctx, 
corpus_indices, idx_to_char, char_to_idx, is_random_iter, num_epochs, num_steps, lr, 
clipping_theta, batch_size, pred_period, pred_len, prefixes):
    if is_random_iter:
        data_iter_fn=d2l.data_iter_random
    else:
        data_iter_fn=d2l.data_iter_consecutive
    params=get_params()
    loss=gloss.SoftmaxCrossEntropyLoss()
    
    for epoch in range(num_epochs):
        if not is_random_iter:
            state=init_rnn_state(batch_size, num_hiddens, ctx)
        l_sum, n, start =0.0, 0, time.time()
        data_iter=data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        for X, Y in data_iter:
            if is_random_iter:
                state=init_rnn_state(batch_size, num_hiddens, ctx)
            else:
                for s in state:
                    s.detach()
            with autograd.record():
                inputs=to_onehot(X, vocab_size)
                (outputs, state)=rnn(inputs, state, params)
                outputs=nd.concat(*outputs, dim=0)
                y=Y.T.reshape((-1,))
                l=loss(outputs, y).mean()
            l.backward()
            grad_clipping(params, clipping_theta, ctx)
            d2l.sgd(params, lr, 1)
            l_sum+=l.asscalar()*y.size
            n+=y.size
        if (epoch+1)%pred_period==0:
            print('epoch%d, perplexity%f, time%.2f sec'%(
                epoch+1, math.exp(l_sum/n), time.time()-start))
            for prefix in prefixes:
                print(' -', predict_rnn
                (prefix, pred_len, rnn, params, init_rnn_state, num_hiddens, vocab_size, ctx, 
                idx_to_char, char_to_idx))

### 训练模型并创造歌

In [62]:
num_epochs, num_steps, batch_size, lr, clipping_theta=250, 35, 32, 1e2, 1e-2
pred_period, pred_len, prefixes = 50, 50, ['分开', '不分开']

#### 使用随机采样

In [63]:
train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens, vocab_size, ctx, 
corpus_indices, idx_to_char, char_to_idx, True, num_epochs, num_steps, lr, clipping_theta, batch_size, pred_period, pred_len, prefixes)

#### 使用相邻采样

In [64]:
train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens, vocab_size,ctx, 
corpus_indices, idx_to_char, char_to_idx, False, num_epochs, num_steps, lr, clipping_theta, 
batch_size, pred_period, pred_len, prefixes)