In [66]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

# Block size: the maximum length of text the model reads or generates at a time.
block_size = 8
# Batch size: the number of samples the model processes together in one training iteration.
batch_size = 4

# Optimiser step size and number of training iterations.
learning_rate = 0.001
max_iters = 100


cpu


In [2]:

# Read the whole corpus into memory as a single string.
with open('ebook_free.text', mode='r', encoding='UTF-8') as f:
    text = f.read()
print(len(text))
print(text[:200])  # show the first 200 characters


378436
The Project Gutenberg eBook of Hollyhock House: A Story for Girls

This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no res


In [30]:
# Collect every distinct character of the corpus, sorted, as the vocabulary.
chars = sorted(set(text))
vocabulary_size = len(chars)  # vocabulary size
print(vocabulary_size)
print(chars)

96
['\n', ' ', '!', '#', '$', '%', '&', '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'è', 'é', 'ê', 'ï', 'ô', 'ö', 'ü', '—', '‘', '’', '“', '”', '•', '™']


In [31]:

# Build the encoder and decoder: character <-> integer id lookup tables.
string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of string `s`."""
    return [string_to_int[c] for c in s]


def decode(l):
    """Return the string spelled by the sequence of integer ids `l`."""
    return "".join(int_to_string[i] for i in l)


print(encode("hello"))
print(decode([63, 60, 67, 67, 70]))


[63, 60, 67, 67, 70]
hello


In [6]:

# Process the data with torch: encode the whole corpus as a tensor of integer ids.
# NOTE(review): the following cell repeats this cell verbatim; one of the two is redundant.
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])

# Split into a training set (first 80%) and a validation set (remaining 20%).
n = int(0.8 * len(data))
train_data = data[:n]
val_data = data[n:]

tensor([46, 63, 60,  1, 42, 73, 70, 65, 60, 58, 75,  1, 33, 76, 75, 60, 69, 57,
        60, 73, 62,  1, 60, 28, 70, 70, 66,  1, 70, 61,  1, 34, 70, 67, 67, 80,
        63, 70, 58, 66,  1, 34, 70, 76, 74, 60, 24,  1, 27,  1, 45, 75, 70, 73,
        80,  1, 61, 70, 73,  1, 33, 64, 73, 67, 74,  0,  0, 46, 63, 64, 74,  1,
        60, 57, 70, 70, 66,  1, 64, 74,  1, 61, 70, 73,  1, 75, 63, 60,  1, 76,
        74, 60,  1, 70, 61,  1, 56, 69, 80, 70])


In [32]:

# Encode the full corpus as a 1-D tensor of integer character ids.
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])

# The first 80% of the ids are used for training, the rest for validation.
n = int(0.8 * len(data))
train_data, val_data = data[:n], data[n:]

tensor([46, 63, 60,  1, 42, 73, 70, 65, 60, 58, 75,  1, 33, 76, 75, 60, 69, 57,
        60, 73, 62,  1, 60, 28, 70, 70, 66,  1, 70, 61,  1, 34, 70, 67, 67, 80,
        63, 70, 58, 66,  1, 34, 70, 76, 74, 60, 24,  1, 27,  1, 45, 75, 70, 73,
        80,  1, 61, 70, 73,  1, 33, 64, 73, 67, 74,  0,  0, 46, 63, 64, 74,  1,
        60, 57, 70, 70, 66,  1, 64, 74,  1, 61, 70, 73,  1, 75, 63, 60,  1, 76,
        74, 60,  1, 70, 61,  1, 56, 69, 80, 70])


In [33]:

# Predict y from the current input x: every prefix of x is a context whose
# target is the character that follows it in the training data.
x, y = train_data[:block_size], train_data[1:block_size + 1]

for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print("when input is", context, "target is", target)



when input is tensor([46]) target is tensor(63)
when input is tensor([46, 63]) target is tensor(60)
when input is tensor([46, 63, 60]) target is tensor(1)
when input is tensor([46, 63, 60,  1]) target is tensor(42)
when input is tensor([46, 63, 60,  1, 42]) target is tensor(73)
when input is tensor([46, 63, 60,  1, 42, 73]) target is tensor(70)
when input is tensor([46, 63, 60,  1, 42, 73, 70]) target is tensor(65)
when input is tensor([46, 63, 60,  1, 42, 73, 70, 65]) target is tensor(60)


In [79]:


# Introduce a batch dimension so several windows are processed in parallel.
def get_batch(split):
    """Draw a random batch of (input, target) windows from one data split.

    `split == 'train'` selects `train_data`; any other value selects `val_data`.
    Returns `(x, y)`, both stacked from `batch_size` windows of `block_size`
    ids, where each row of `y` is the matching row of `x` shifted one position
    to the right in the source tensor.
    """
    source = val_data if split != 'train' else train_data
    # One random start offset per sequence in the batch.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, labels = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        labels.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(labels)


# Sample one training batch and show its inputs and shifted targets.
x, y = get_batch('train')
print('inputs:', x, sep='\n')
print('targets:', y, sep='\n')


inputs:
tensor([[59, 24,  1, 92, 34, 60, 73,  1],
        [80, 70, 76,  1, 67, 60, 56, 73],
        [73, 59, 60, 69,  0, 61, 73, 70],
        [ 1,  1,  1,  1,  1, 32, 27, 29]])
targets:
tensor([[24,  1, 92, 34, 60, 73,  1, 57],
        [70, 76,  1, 67, 60, 56, 73, 69],
        [59, 60, 69,  0, 61, 73, 70, 68],
        [ 1,  1,  1,  1, 32, 27, 29, 35]])


In [41]:
# Create the embedding table: one row of vocabulary_size values per token id.
token_embedding_table = nn.Embedding(
    num_embeddings=vocabulary_size,
    embedding_dim=vocabulary_size,
)

In [45]:
# Look up one row of the embedding table for every token id in the batch.
# Resulting shape: [batch_size, block_size, vocabulary_size], e.g. [4, 8, 96].
input_logits = token_embedding_table(x)
(N1, N2, C) = input_logits.shape

In [46]:
# Display the shape of the input batch x.
x.shape

torch.Size([4, 8])

In [47]:
# Prepare the inputs of the cross-entropy function: merge the first two
# dimensions so the logits become (N1 * N2, C) and the targets (N1 * N2,).
targets = y.view(N1 * N2)
input_logits = input_logits.view(N1 * N2, C)

In [48]:
# Display the shape of the flattened logits, (N1 * N2, C).
input_logits.shape

torch.Size([32, 96])

In [49]:
# Display the shape of the flattened targets, (N1 * N2,).
targets.shape

torch.Size([32])

In [50]:
# Compute the loss.
# The cross-entropy function takes input of shape (N, C) and targets of shape (N).
loss = F.cross_entropy(input=input_logits, target=targets)

In [51]:
# Display the cross-entropy loss computed in the previous cell.
loss

tensor(5.3405, grad_fn=<NllLossBackward0>)

In [54]:
# Keep only the last position of every sequence.
gx = token_embedding_table(x)[:, -1]

In [56]:
# Display the shape of the last-position slice gx.
gx.shape

torch.Size([4, 96])

In [57]:
 probs = F.softmax(gx, dim=-1)

In [58]:
# Display the shape of probs; softmax leaves the shape of gx unchanged.
probs.shape

torch.Size([4, 96])

In [59]:
# Sample one next-token id per row from the probability distribution.
index_next = torch.multinomial(input=probs, num_samples=1)

In [60]:
# Display the sampled next-token ids (one column, one row per sequence).
index_next

tensor([[20],
        [22],
        [77],
        [42]])

In [61]:
# Display the current input batch.
# NOTE(review): x is reassigned every time the get_batch cell runs, so the
# values shown here depend on the order in which cells were executed.
x

tensor([[56, 59,  1, 56, 62, 56, 64, 69],
        [ 1, 57, 60, 60, 69,  1, 61, 70],
        [57, 73, 60, 56, 75, 63, 67, 60],
        [74, 75, 80,  1, 56, 74,  1, 56]])

In [62]:

class BigramLanguageModel(nn.Module):
    """Character-level bigram language model.

    The only parameter is a ``vocabulary_size x vocabulary_size`` embedding
    table: row ``i`` holds the unnormalised scores (logits) of the token that
    follows token ``i``.
    """

    def __init__(self, vocabulary_size):
        """Create the lookup table for a vocabulary of `vocabulary_size` tokens."""
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocabulary_size, vocabulary_size)

    # Forward pass
    def forward(self, input_index, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            input_index: integer tensor of token ids with shape (B, T).
            targets: optional integer tensor of next-token ids with B * T
                elements. When omitted, no loss is computed.

        Returns:
            ``(logits, loss)``. With targets, `logits` is flattened to
            (B * T, C) and `loss` is the mean cross-entropy (same as before
            `targets` became optional). Without targets, `logits` keeps its
            (B, T, C) shape and `loss` is ``None``.
        """
        logits = self.token_embedding_table(input_index)  # (B, T, C)
        if targets is None:
            return logits, None

        B, T, C = logits.shape
        # F.cross_entropy expects input (N, C) and targets (N).
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    # Get the logits
    def get_logits(self, input_index):
        """Return the raw (B, T, C) logits for `input_index` without a loss."""
        return self.token_embedding_table(input_index)

    # Generate predicted text
    @torch.no_grad()  # sampling never needs gradients
    def generate(self, input_index, max_new_tokens):
        """Extend each sequence in `input_index` by `max_new_tokens` sampled tokens.

        Args:
            input_index: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens), located on the
            same device as the model's weights.
        """
        # Follow the model's device so a context created on another device
        # does not make the embedding lookup fail.
        input_index = input_index.to(self.token_embedding_table.weight.device)
        for _ in range(max_new_tokens):
            # Get the predictions and keep only the last time step.
            logits = self.get_logits(input_index)[:, -1, :]  # (B, C)
            probs = F.softmax(logits, dim=-1)  # (B, C)
            index_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            input_index = torch.cat((input_index, index_next), dim=1)  # (B, T+1)
        return input_index


In [63]:
# Instantiate the bigram model for the corpus vocabulary.
model = BigramLanguageModel(vocabulary_size=vocabulary_size)

In [64]:
# Run one forward pass on the current batch and display (logits, loss).
# The module is called directly instead of through .forward() so that
# nn.Module.__call__ (and any registered hooks) is not bypassed.
model(x, y)

(tensor([[ 0.2128,  1.3049, -0.5040,  ..., -0.6487, -1.5353, -0.2212],
         [ 0.6039,  1.5666,  0.4391,  ...,  0.0611,  1.7136, -1.9198],
         [ 0.3596,  1.1008, -1.3913,  ..., -0.9286,  2.2723, -0.2173],
         ...,
         [-0.1482, -0.1238,  0.4463,  ...,  1.4030, -0.1056, -1.5375],
         [ 0.3596,  1.1008, -1.3913,  ..., -0.9286,  2.2723, -0.2173],
         [ 0.2128,  1.3049, -0.5040,  ..., -0.6487, -1.5353, -0.2212]],
        grad_fn=<ViewBackward0>),
 tensor(4.7015, grad_fn=<NllLossBackward0>))

In [65]:
# Start from a single token with id 0 and let the model sample 50 more.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_ids = model.generate(context, max_new_tokens=50)[0]
generated_chars = decode(generated_ids.tolist())
print(generated_chars)


NjEs”3VDSd,üOMS][LVL%RdiiR 5A/on“s8
“a)’
GZ31b,ê,0


In [80]:

@torch.no_grad()  # called form works on every PyTorch release; the bare form does not
def estimate_loss():
    """Estimate the mean loss on the 'train' and 'val' splits.

    For each split, `eval_iters` random batches are drawn with `get_batch`
    and their losses averaged. The model is put in eval mode for the
    measurement and switched back to train mode afterwards, even if the
    evaluation raises.

    Returns:
        dict mapping 'train' and 'val' to a scalar tensor holding the mean loss.
    """
    out = {}
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Always hand the model back in training mode.
        model.train()

    return out



In [85]:
max_iters = 1000  # overrides the smaller value set in the configuration cell
eval_iters = 100  # number of batches estimate_loss() averages per split
eval_interval = eval_iters  # report the losses every `eval_interval` steps (unchanged cadence)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# `step` is used instead of `iter` so the builtin iter() is not shadowed for
# every cell that runs after this one.
for step in range(max_iters):
    # Periodically report the estimated train / validation losses.
    if step % eval_interval == 0:
        losses = estimate_loss()
        print(f"step:{step},loss:{losses}")

    # One optimisation step on a freshly sampled training batch.
    xb, yb = get_batch('train')
    # Call the module itself rather than .forward() so hooks are honoured.
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


step:0,loss:{'train': tensor(4.7751), 'val': tensor(4.8006)}
step:100,loss:{'train': tensor(4.7352), 'val': tensor(4.7502)}
step:200,loss:{'train': tensor(4.6144), 'val': tensor(4.6214)}
step:300,loss:{'train': tensor(4.5553), 'val': tensor(4.6049)}
step:400,loss:{'train': tensor(4.4575), 'val': tensor(4.4807)}
step:500,loss:{'train': tensor(4.3733), 'val': tensor(4.4423)}
step:600,loss:{'train': tensor(4.3279), 'val': tensor(4.3370)}
step:700,loss:{'train': tensor(4.2131), 'val': tensor(4.2547)}
step:800,loss:{'train': tensor(4.2079), 'val': tensor(4.2177)}
step:900,loss:{'train': tensor(4.1163), 'val': tensor(4.1478)}
