<a href="https://colab.research.google.com/github/yananma/5_programs_per_day/blob/master/02154.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 6.3 语言模型数据集 ( 周杰伦专辑歌词 )

### 6.3.1 读取数据集

In [0]:
import torch 
import random 
import zipfile

In [0]:
!mkdir ../../data

In [3]:
!git clone https://github.com/ShusenTang/Dive-into-DL-PyTorch.git

Cloning into 'Dive-into-DL-PyTorch'...
remote: Enumerating objects: 1692, done.[K
Receiving objects:   0% (1/1692)   Receiving objects:   1% (17/1692)   Receiving objects:   2% (34/1692)   Receiving objects:   3% (51/1692)   Receiving objects:   4% (68/1692)   Receiving objects:   5% (85/1692)   Receiving objects:   6% (102/1692)   Receiving objects:   7% (119/1692)   Receiving objects:   8% (136/1692)   Receiving objects:   9% (153/1692)   Receiving objects:  10% (170/1692)   Receiving objects:  11% (187/1692)   Receiving objects:  12% (204/1692)   Receiving objects:  13% (220/1692)   Receiving objects:  14% (237/1692)   Receiving objects:  15% (254/1692), 172.01 KiB | 319.00 KiB/s   Receiving objects:  16% (271/1692), 172.01 KiB | 319.00 KiB/s   Receiving objects:  17% (288/1692), 172.01 KiB | 319.00 KiB/s   Receiving objects:  18% (305/1692), 172.01 KiB | 319.00 KiB/s   Receiving objects:  18% (318/1692), 1.20 MiB | 1.12 MiB/s   Receiving objects:  19% (322/1692

In [0]:
!cp Dive-into-DL-PyTorch/data/jaychou_lyrics.txt.zip ../../data

In [5]:
with zipfile.ZipFile('../../data/jaychou_lyrics.txt.zip') as zin:
    with zin.open('jaychou_lyrics.txt') as f:
        corpus_chars = f.read().decode('utf-8')
corpus_chars[:40]

'想要有直升机\n想要和你飞到宇宙去\n想要和你融化在一起\n融化在宇宙里\n我每天每天每'

In [0]:
corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
corpus_chars = corpus_chars[0:10000]

### 6.3.2 建立字符索引

In [7]:
idx_to_char = list(set(corpus_chars))
char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
vocab_size = len(char_to_idx)
vocab_size

1027

In [8]:
corpus_indices = [char_to_idx[char] for char in corpus_chars]
sample = corpus_indices[:20]
print('chars:', ''.join([idx_to_char[idx] for idx in sample]))
print('indices:', sample)

chars: 想要有直升机 想要和你飞到宇宙去 想要和
indices: [181, 383, 133, 494, 31, 937, 162, 181, 383, 1024, 740, 819, 939, 209, 294, 873, 162, 181, 383, 1024]


### 6.3.3 时序数据的采样

#### 1 随机采样

In [0]:
def data_iter_random(corpus_indices, batch_size, num_steps, device=None):
    num_examples = (len(corpus_indices) - 1) // num_steps 
    epoch_size = num_examples // batch_size 
    example_indices = list(range(num_examples))
    random.shuffle(example_indices)

    def _data(pos):
        return corpus_indices[pos: pos + num_steps]
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for i in range(epoch_size):
        i = i * batch_size 
        batch_indices = example_indices[i: i + batch_size]
        X = [_data(j * num_steps) for j in batch_indices]
        Y = [_data(j * num_steps + 1) for j in batch_indices]
        yield torch.tensor(X, dtype=torch.float32, device=device), torch.tensor(Y, dtype=torch.float32, device=device)

In [16]:
my_seq = list(range(30))
for X, Y in data_iter_random(my_seq, batch_size=2, num_steps=6):
    print('X: ', X, '\nY: ', Y, '\n')

X:  tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],
        [12., 13., 14., 15., 16., 17.]], device='cuda:0') 
Y:  tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [13., 14., 15., 16., 17., 18.]], device='cuda:0') 

X:  tensor([[18., 19., 20., 21., 22., 23.],
        [ 6.,  7.,  8.,  9., 10., 11.]], device='cuda:0') 
Y:  tensor([[19., 20., 21., 22., 23., 24.],
        [ 7.,  8.,  9., 10., 11., 12.]], device='cuda:0') 



#### 2 相邻采样

In [0]:
def data_iter_consecutive(corpus_indices, batch_size, num_steps, device=None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    corpus_indices = torch.tensor(corpus_indices, dtype=torch.float32, device=device)
    data_len = len(corpus_indices)
    batch_len = data_len // batch_size 
    indices = corpus_indices[0: batch_size*batch_len].view(batch_size, batch_len)
    epoch_size = (batch_len - 1) // num_steps 
    for i in range(epoch_size):
        i = i * num_steps 
        X = indices[:, i: i + num_steps]
        Y = indices[:, i + 1: i + num_steps + 1]
        yield X, Y

In [18]:
for X, Y in data_iter_consecutive(my_seq, batch_size=2, num_steps=6):
    print('X: ', X, '\nY', Y, '\n')

X:  tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],
        [15., 16., 17., 18., 19., 20.]], device='cuda:0') 
Y tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [16., 17., 18., 19., 20., 21.]], device='cuda:0') 

X:  tensor([[ 6.,  7.,  8.,  9., 10., 11.],
        [21., 22., 23., 24., 25., 26.]], device='cuda:0') 
Y tensor([[ 7.,  8.,  9., 10., 11., 12.],
        [22., 23., 24., 25., 26., 27.]], device='cuda:0') 

