In [7]:
# Read the whole training corpus into memory.
# The corpus contains Chinese text, so the encoding is stated explicitly instead
# of relying on the platform default, which is not UTF-8 on every system.
# NOTE(review): assumes input.txt is UTF-8 encoded - confirm.
with open('input.txt', encoding='utf-8') as f:
    text = f.read()

In [9]:
# Peek at the first 200 characters of the corpus.
print(text[:200])


欢迎来到经验时代 (Welcome to the Era of Experience)
David Silver, Richard S. Sutton

摘要 (Abstract)
我们正站在人工智能新纪元的门槛上，它有望实现前所未有的能力水平。新一代智能体（agent）将主要通过从经验中学习来获得超人的能力。本文探讨了定义这一即将到来的时代的关键特征。

人类数据时代 (The Era of 


In [10]:
# Character-level vocabulary, plus the encoder / decoder functions built on it
chars = sorted(set(text))  # every distinct character of the corpus, sorted
vocab_size = len(chars)
stoi = {ch: i for i, ch in enumerate(chars)}  # str_to_index
itos = dict(enumerate(chars))  # index_to_str


def encode(s):
    """Map a string to a list of token indices, one index per character."""
    return [stoi[c] for c in s]


def decode(l):
    """Map a sequence of token indices back to the string they spell."""
    return ''.join(itos[i] for i in l)

In [13]:
# Sanity check: map a sample sentence to its list of token indices.
encode('我们正站在人工智能新纪元的门槛')

[375, 115, 491, 602, 254, 103, 312, 450, 652, 435, 620, 151, 561, 785, 487]

In [14]:
# Round-trip check: decoding the indices produced above should give the sentence back.
decode([375, 115, 491, 602, 254, 103, 312, 450, 652, 435, 620, 151, 561, 785, 487])

'我们正站在人工智能新纪元的门槛'

In [15]:
import torch

# Encode the whole corpus into a tensor of token indices (dtype torch.long).
data = torch.tensor(encode(text), dtype=torch.long)

In [16]:
# Show the first ten token indices of the encoded corpus.
data[:10]

tensor([  0, 490, 741, 468, 182, 626, 829, 442, 112,   1])

In [21]:
# Train / validation split: the first 90% of the tokens are used for training,
# the remainder for validation.
n = int(.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [22]:
# Length of one training sample (number of tokens per sequence)
block_size = 8
# y is x shifted by one token, so y[t] is the token that follows x[:t+1]
x, y = train_data[:block_size], train_data[1:block_size + 1]
for t in range(block_size):
    # Each prefix of x is a context; its target is the next token.
    context, target = x[:t + 1], y[t]
    print(f'输入内容:{context} 预测的目标:{target}')

输入内容:tensor([0]) 预测的目标:490
输入内容:tensor([  0, 490]) 预测的目标:741
输入内容:tensor([  0, 490, 741]) 预测的目标:468
输入内容:tensor([  0, 490, 741, 468]) 预测的目标:182
输入内容:tensor([  0, 490, 741, 468, 182]) 预测的目标:626
输入内容:tensor([  0, 490, 741, 468, 182, 626]) 预测的目标:829
输入内容:tensor([  0, 490, 741, 468, 182, 626, 829]) 预测的目标:442
输入内容:tensor([  0, 490, 741, 468, 182, 626, 829, 442]) 预测的目标:112


In [23]:
torch.manual_seed(1337) # fix the RNG seed so the experiment and its results can be reproduced

# Settings for sampling training batches
block_size = 8  # number of tokens in each sampled sequence
batch_size = 4  # number of sequences drawn per batch

def get_batch(split):
    """Sample a random batch of (input, target) sequences from one data split.

    ``split == 'train'`` reads ``train_data``; any other value reads ``val_data``.
    Uses the module-level ``block_size`` and ``batch_size``.

    Returns ``(x, y)``, both of shape (batch_size, block_size); ``y`` holds the
    same windows as ``x`` shifted one token to the right.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data

    # Random start offsets. The upper bound leaves room for block_size inputs
    # plus the one extra token the shifted targets need.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, labels = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        labels.append(source[start + 1:start + block_size + 1])

    return torch.stack(inputs), torch.stack(labels)


In [24]:
# Quick check of get_batch: print every (context, target) pair of one batch
x, y = get_batch('train')

for b in range(batch_size):
    for t in range(block_size):
        # Row b: the first t+1 tokens are the context, y[b, t] is the next token.
        context, target = x[b, :t + 1], y[b, t]
        print(f'输入内容:{context} 预测的目标:{target}')

输入内容:tensor([32]) 预测的目标:27
输入内容:tensor([32, 27]) 预测的目标:1
输入内容:tensor([32, 27,  1]) 预测的目标:289
输入内容:tensor([ 32,  27,   1, 289]) 预测的目标:546
输入内容:tensor([ 32,  27,   1, 289, 546]) 预测的目标:94
输入内容:tensor([ 32,  27,   1, 289, 546,  94]) 预测的目标:186
输入内容:tensor([ 32,  27,   1, 289, 546,  94, 186]) 预测的目标:382
输入内容:tensor([ 32,  27,   1, 289, 546,  94, 186, 382]) 预测的目标:461
输入内容:tensor([674]) 预测的目标:689
输入内容:tensor([674, 689]) 预测的目标:621
输入内容:tensor([674, 689, 621]) 预测的目标:134
输入内容:tensor([674, 689, 621, 134]) 预测的目标:552
输入内容:tensor([674, 689, 621, 134, 552]) 预测的目标:1
输入内容:tensor([674, 689, 621, 134, 552,   1]) 预测的目标:13
输入内容:tensor([674, 689, 621, 134, 552,   1,  13]) 预测的目标:8
输入内容:tensor([674, 689, 621, 134, 552,   1,  13,   8]) 预测的目标:8
输入内容:tensor([561]) 预测的目标:141
输入内容:tensor([561, 141]) 预测的目标:225
输入内容:tensor([561, 141, 225]) 预测的目标:468
输入内容:tensor([561, 141, 225, 468]) 预测的目标:417
输入内容:tensor([561, 141, 225, 468, 417]) 预测的目标:136
输入内容:tensor([561, 141, 225, 468, 417, 136]) 预测的目标:273
输入内容:tensor([561, 141, 2

In [26]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)  # re-seed PyTorch's RNG (same seed value as the batching cell)

# 2-gram
class BingramLanguageModel(nn.Module):
    """Bigram (2-gram) language model: the next token is predicted from the current token only.

    The whole model is one ``vocab_size x vocab_size`` embedding table; row ``i``
    holds the next-token logits for token ``i``. The class name keeps its
    original spelling so existing references keep working.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Each token looks up its own row of next-token logits directly.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: integer tensor of token indices, shape (B, T).
            targets: optional integer tensor of token indices, shape (B, T).

        Returns:
            ``(logits, loss)``. Without ``targets``: logits has shape (B, T, C)
            and loss is ``None``. With ``targets``: logits is flattened to
            (B*T, C) and loss is the scalar cross-entropy.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        # cross_entropy expects (N, C) logits and (N,) targets.
        # reshape (rather than view) also accepts non-contiguous tensors,
        # e.g. targets produced by a transpose.
        logits = logits.reshape(B * T, C)
        targets = targets.reshape(-1)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens, one per step.

        Args:
            idx: integer tensor of shape (B, T) holding the current context.
            max_new_tokens: number of tokens appended to every sequence.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens).

        Runs under ``torch.no_grad()``: nothing is back-propagated through
        sampling, so tracking gradients here would only cost memory.
        """
        for _ in range(max_new_tokens):
            # Forward pass over the whole context
            logits, _loss = self(idx)  # (B, T, C)
            # Only the last time step decides the next token
            last_logits = logits[:, -1, :]  # (B, C)
            probs = F.softmax(last_logits, dim=-1)  # (B, C)
            # Random sampling weighted by probability (not greedy argmax)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # Append the sampled token to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

In [27]:
m = BingramLanguageModel(vocab_size)
# Train the model
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

# NOTE: this rebinds the module-level batch_size that get_batch reads, so every
# later get_batch call also returns 32 sequences per batch.
batch_size = 32
for step in range(1000):
    xb, yb = get_batch('train')

    # Clear the gradients of the previous step before computing new ones
    optimizer.zero_grad(set_to_none=True)
    # Forward pass: the logits are not needed here, only the loss
    logits, loss = m(xb, yb)
    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

# Loss of the last training batch
print(loss.item())


6.290568828582764


In [28]:
# Inference: start from a single token with index 0 and sample 500 more tokens
token_idx = torch.zeros(1, 1, dtype=torch.long)
result = m.generate(token_idx, max_new_tokens=500)
# Decode the only sequence in the batch back into text
print(decode(result[0].tolist()))


维掘衡果题果之8眠之偏某[会须N饮属集暖器即下安平新食药拥本确就骤要效奥c2训念带信害优骤智然杨略下看才仿赞计处饿陆顿变仔n判e-低而协让仍附尽测6大4释据
4良破沿本锻几诱允随星地近促独么备式排证信边运劳甚首c增套作端弈移疗槛闭到距纠别审穿l片富生约抗印上到澡集馈还传法尽f扑f饥视变据掘种描承本教知价额排非弃诊积忧会位（法择空索直[特益认合达弈能低展响不担质不操客越似具奇必赛质告署过将今抛6却”让资延利网推任列顿景件T用过偏也论实穿只放念验续轨是为指把制刻级争全并怎音眼趋口段传合理媒过在断符预g饥刺日悠既真来展指整络乏竞工斥绕忧欢杨怎个讨告踪口清今造提片任战些每R略迎升反样降概减个延形备致续年社遍世键深摘神双误争训法q链察么未一高延感精交臂亿（2W改镜报网无较略过控暖断炼赞给提问境怎加诗锻户延观极医氏
媒苦微述感乏之因即趋操干建苦活神讨更所d库r链较器尽讨盘改级器星绌骤距牛镜既刻尺研概候法介围核次示准所至杂程分球整景轨S利掌进细管适绕控强识例计驱子匹澡于们险失硬?周使戴奥操赖变挑生中势去活布需引谱赛脱蛋析概供初S险 R为房棋其征率捉职错列统鼓述请岗立产环抛近略运动；下频蒂多后比l谢系


In [29]:
# Lower-triangular weighting: after the softmax, row i has non-zero weights
# only in columns 0..i, and every row sums to 1.
tril = torch.tril(torch.ones(4, 4))
wei = torch.randn((4, 4))
# Entries above the diagonal are set to -inf so the softmax gives them weight 0
wei = F.softmax(wei.masked_fill(tril == 0, float('-inf')), dim=-1)
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.5049, 0.4951, 0.0000, 0.0000],
        [0.1070, 0.7552, 0.1378, 0.0000],
        [0.1032, 0.3188, 0.0831, 0.4950]])