In [7]:
import torch
import random
import zipfile

with zipfile.ZipFile('Datasets/jaychou_lyrics.txt.zip') as zin:  
    with zin.open('jaychou_lyrics.txt') as f:
        corpus_chars = f.read().decode('utf-8')
corpus_chars[:40]
#有一些任务，可能事先需要设置，事后做清理工作，with提供了一种方便的处理方式  不用处理关句柄，处理读取异常等
#f = zin.open('jaychou_lyrics.txt')
#f.close() // 

corpus_chars = corpus_chars.replace('\n',' ').replace('\t',' ')
#len(corpus_chars) #63282
corpus_chars = corpus_chars[0:10000]

In [20]:
idx_to_char = list(set(corpus_chars))
char_to_idx = dict((char,i) for i,char in enumerate(idx_to_char))
vocab_size = len(char_to_idx) #1027

corpus_indices = [char_to_idx[char] for char in corpus_chars]
sample = corpus_indices[:20]
print('chars:',''.join(idx_to_char[idx] for idx in sample))
print('indices:',sample)

chars: 想要有直升机 想要和你飞到宇宙去 想要和
indices: [142, 214, 600, 614, 314, 513, 970, 142, 214, 390, 410, 159, 315, 511, 854, 604, 970, 142, 214, 390]


In [40]:
def data_iter_random(corpus_indices,batch_size,num_steps,device=None):
    num_examples = (len(corpus_indices) - 1)//num_steps
    epoch_size = num_examples // batch_size
    example_indices = list(range(num_examples))
    print(example_indices)
    random.shuffle(example_indices)
    
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(device)
    
    for i in range(epoch_size):
        i = i * batch_size
        batch_indices = example_indices[i:i+batch_size]
        X = [corpus_indices[j*num_steps:j*num_steps + num_steps]  for j in batch_indices]
        Y = [corpus_indices[j*num_steps+1:j*num_steps + num_steps+1]  for j in batch_indices]
        yield torch.tensor(X,dtype=torch.float32,device=device),torch.tensor(Y,dtype=torch.float32,device=device)
my_seq = list(range(31))
for X, Y in data_iter_random(my_seq, batch_size=2, num_steps=6,device='cpu'):
    print('X: ', X, '\nY:', Y, '\n')
    

def data_iter_consecutive(corpus_indices,batch_size,num_steps,device=None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
    corpus_indices = torch.tensor(corpus_indices, dtype=torch.float32, device=device)
    batch_len = len(corpus_indices)//batch_size
    indices = corpus_indices[0:batch_size*batch_len].view(batch_size,batch_len)
    print(indices)
    epoch_size =(batch_len -1)//num_steps
    for i in range(epoch_size):
        i = i*num_steps
        X = indices[:,i:i+num_steps]  #多维数组切片
        Y = indices[:,i+1:i+num_steps+1]
        yield X,Y

for X, Y in data_iter_consecutive(my_seq, batch_size=2, num_steps=4,device='cpu'):
#RuntimeError: CUDA error: all CUDA-capable devices are busy or unavailable 
#所以暂时先用cpu，重启理论上是可以的，不过过段时间可能还是这样
    print('X: ', X, '\nY:', Y, '\n')


[0, 1, 2, 3, 4]
X:  tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],
        [12., 13., 14., 15., 16., 17.]]) 
Y: tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [13., 14., 15., 16., 17., 18.]]) 

X:  tensor([[24., 25., 26., 27., 28., 29.],
        [ 6.,  7.,  8.,  9., 10., 11.]]) 
Y: tensor([[25., 26., 27., 28., 29., 30.],
        [ 7.,  8.,  9., 10., 11., 12.]]) 

tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
         14.],
        [15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28.,
         29.]])
X:  tensor([[ 0.,  1.,  2.,  3.],
        [15., 16., 17., 18.]]) 
Y: tensor([[ 1.,  2.,  3.,  4.],
        [16., 17., 18., 19.]]) 

X:  tensor([[ 4.,  5.,  6.,  7.],
        [19., 20., 21., 22.]]) 
Y: tensor([[ 5.,  6.,  7.,  8.],
        [20., 21., 22., 23.]]) 

X:  tensor([[ 8.,  9., 10., 11.],
        [23., 24., 25., 26.]]) 
Y: tensor([[ 9., 10., 11., 12.],
        [24., 25., 26., 27.]]) 

