# 读取数据集

In [1]:
import torch
import random
import zipfile
# Pull the raw lyric bytes out of the zip archive; both the archive and the
# member handle are closed as soon as the read completes.
with zipfile.ZipFile('./Datasets/jaychou_lyrics.txt.zip') as zin, \
        zin.open('jaychou_lyrics.txt') as f:
    raw_bytes = f.read()
# The archive member is UTF-8 encoded text.
corpus_chars = raw_bytes.decode('utf-8')
# Notebook display: preview the first 40 characters.
corpus_chars[:40]

'想要有直升机\n想要和你飞到宇宙去\n想要和你融化在一起\n融化在宇宙里\n我每天每天每'

这个数据集有6万多个字符。为了打印方便，我们把换行符替换成空格，然后仅使用前1万个字符来训练模型。

In [2]:
# Turn both kinds of line break into spaces so printed samples stay on one
# line, then keep only the first 10,000 characters for training.
for line_break in ('\n', '\r'):
    corpus_chars = corpus_chars.replace(line_break, ' ')
corpus_chars = corpus_chars[:10000]

# 建立字符索引

我们将每个字符映射成一个从0开始的连续整数，又称索引，来方便之后的数据处理。为了得到索引，我们将数据集里所有不同字符取出来，然后将其逐一映射到索引来构造词典。接着，打印 vocab_size ，即词典中不同字符的个数，又称词典大小。

In [3]:
# Index -> character table: one entry per distinct character in the corpus.
idx_to_char = list(set(corpus_chars))
# Inverse lookup: character -> its position in idx_to_char.
char_to_idx = {char: i for i, char in enumerate(idx_to_char)}
# Vocabulary size is the number of distinct characters.
vocab_size = len(char_to_idx)
# Notebook display: show the vocabulary size.
vocab_size

1027

In [4]:
# Encode every corpus character as its vocabulary index.
corpus_indices = [char_to_idx[ch] for ch in corpus_chars]
# Show the first 20 positions both as characters and as integer indices.
sample = corpus_indices[:20]
print('chars:', ''.join(idx_to_char[idx] for idx in sample))
print('indices:', sample)

chars: 想要有直升机 想要和你飞到宇宙去 想要和
indices: [566, 450, 927, 301, 141, 115, 717, 566, 450, 237, 27, 286, 18, 718, 640, 405, 717, 566, 450, 237]


# 时序数据的采样

在训练中我们需要每次随机读取小批量样本和标签。与之前章节的实验数据不同的是，时序数据的一个样本通常包含连续的字符。假设时间步数为5，样本序列为5个字符，即“想”“要”“有”“直”“升”。该样本的标签序列为这些字符分别在训练集中的下一个字符，即“要”“有”“直”“升”“机”。

## 随机采样

In [5]:
def data_iter_random(corpus_indices, batch_size, num_steps, device=None):
    """Yield minibatches of randomly ordered, non-overlapping subsequences.

    The sequence is cut into consecutive windows of ``num_steps`` items.
    The windows are shuffled and grouped ``batch_size`` at a time; windows
    left over after the last full batch are dropped.

    Args:
        corpus_indices: Sliceable sequence of numeric indices.
        batch_size: Number of windows per minibatch.
        num_steps: Length of each window (time steps).
        device: Target torch device. When ``None``, CUDA is used if
            available, otherwise the CPU.

    Yields:
        ``(X, Y)`` float32 tensors of shape ``(batch_size, num_steps)``,
        where ``Y`` holds the items one position after those in ``X``.
    """
    # One item is held back because each label is the element that follows
    # its input, so the last window still needs a successor.
    num_examples = (len(corpus_indices) - 1) // num_steps
    epoch_size = num_examples // batch_size
    order = list(range(num_examples))
    random.shuffle(order)

    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for batch_no in range(epoch_size):
        first = batch_no * batch_size
        inputs, labels = [], []
        for example in order[first:first + batch_size]:
            start = example * num_steps
            inputs.append(corpus_indices[start:start + num_steps])
            # Labels are the same window shifted right by one position.
            labels.append(corpus_indices[start + 1:start + 1 + num_steps])
        yield (torch.tensor(inputs, dtype=torch.float32, device=device),
               torch.tensor(labels, dtype=torch.float32, device=device))


In [6]:
# Toy check: draw random minibatches from the integers 0..29.
my_seq = [*range(30)]
for X, Y in data_iter_random(my_seq, batch_size=2, num_steps=6):
    print('X:', X, '\nY:', Y, '\n')

X: tensor([[ 6.,  7.,  8.,  9., 10., 11.],
        [12., 13., 14., 15., 16., 17.]], device='cuda:0') 
Y: tensor([[ 7.,  8.,  9., 10., 11., 12.],
        [13., 14., 15., 16., 17., 18.]], device='cuda:0') 

X: tensor([[18., 19., 20., 21., 22., 23.],
        [ 0.,  1.,  2.,  3.,  4.,  5.]], device='cuda:0') 
Y: tensor([[19., 20., 21., 22., 23., 24.],
        [ 1.,  2.,  3.,  4.,  5.,  6.]], device='cuda:0') 



## 相邻采样

In [7]:
def data_iter_consecutive(corpus_indices, batch_size, num_steps, device=None):
    """Yield minibatches whose rows continue from one batch to the next.

    The sequence is converted to a float32 tensor, trimmed to a multiple of
    ``batch_size`` and laid out as ``batch_size`` rows. Successive
    minibatches are adjacent column windows of that layout, so row ``r`` of
    one batch is immediately followed in the sequence by row ``r`` of the
    next batch.

    Args:
        corpus_indices: Sequence of numeric indices.
        batch_size: Number of rows per minibatch.
        num_steps: Number of columns (time steps) per minibatch.
        device: Target torch device. When ``None``, CUDA is used if
            available, otherwise the CPU.

    Yields:
        ``(X, Y)`` tensor views of shape ``(batch_size, num_steps)``, where
        ``Y`` is ``X`` shifted one column to the right.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    corpus = torch.tensor(corpus_indices, dtype=torch.float32, device=device)

    # Drop the tail that does not fill a whole row, then reshape to rows.
    batch_len = len(corpus) // batch_size
    grid = corpus[:batch_size * batch_len].view(batch_size, batch_len)

    # The -1 keeps one spare column so the last window still has labels.
    epoch_size = (batch_len - 1) // num_steps
    for start in range(0, epoch_size * num_steps, num_steps):
        stop = start + num_steps
        yield grid[:, start:stop], grid[:, start + 1:stop + 1]

In [9]:
# Toy check: consecutive sampling over the my_seq toy sequence.
for X, Y in data_iter_consecutive(my_seq, batch_size=2, num_steps=6):
    print('X:', X, '\nY:', Y)

X: tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],
        [15., 16., 17., 18., 19., 20.]], device='cuda:0') 
Y: tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [16., 17., 18., 19., 20., 21.]], device='cuda:0')
X: tensor([[ 6.,  7.,  8.,  9., 10., 11.],
        [21., 22., 23., 24., 25., 26.]], device='cuda:0') 
Y: tensor([[ 7.,  8.,  9., 10., 11., 12.],
        [22., 23., 24., 25., 26., 27.]], device='cuda:0')
