In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/camel-xiangzi-1/Camel Xiangzi.txt


In [2]:
##版本2，加上下三角掩码矩阵
import torch
import torch.nn as nn
import torch.nn.functional as F

##模型训练数据集 尽量使用gpu ,kaggle 


def get_batch(split): ##split区别是训练和验证数据集
    ##选择训练或者验证数据集
    data=train_data if split=='train' else val_data
    ##动态从数据集中选择一个位置索引
    ix=torch.randint(len(data)-block_size-1,(batch_size,)) ##随机生成位置索引，向后截取block_size个字符作为训练
    x=torch.stack([data[i:i+block_size] for i in ix])
    y=torch.stack([data[i+1:i+block_size+1] for i in ix])

    x=x.to(device)
    y=y.to(device)
    return x,y

class Head(nn.Module):
     """单头 self-attention"""
     def __init__(self,n_embd):
         super().__init__()
         self.key=nn.Linear(n_embd,n_embd,bias=False)
         self.query=nn.Linear(n_embd,n_embd,bias=False)
         self.value=nn.Linear(n_embd,n_embd,bias=False)
     def forward(self,input_x):
         B,T,C=input_x.shape

         k=self.key(input_x)
         q=self.query(input_x)
         v=self.value(input_x)

         wei=q@k.transpose(-2,-1)*C**-0.5
         T=wei.shape[-1]
         tril=torch.tril(torch.ones(T,T,device=wei.device))
         wei=wei.masked_fill(tril==0,float('-inf'))
         wei=wei.softmax(dim=-1)

         out=wei@v
         return out


    

class BingramLanguageModel(nn.Module):
    def __init__(self,block_size, vocab_size,n_embd):
        super().__init__()
 # 每个token都直接从Embedding中查询对应的logits值 以进⾏下⼀个token的推理
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        ##位置编码
        self.position_embedding_table=nn.Embedding(block_size,n_embd)      
         ##  one  head self-attention
        self.sa_head=Head(n_embd)
        #large model forward
        self.lm_head=nn.Linear(n_embd,vocab_size)
    def forward(self, idx, targets=None):
        B,T=idx.shape
 
 # idx值和targets值都是整型张量 (B,T)
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        tok_emb.to(device)
        pos_emb=self.position_embedding_table(torch.arange(T,device=device))
        x=tok_emb+pos_emb
        x=self.sa_head(x)
        logits=self.lm_head(x)
 
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(-1)
            loss = F.cross_entropy(logits, targets)
 
        return logits, loss
 
    def generate(self, idx, max_new_tokens):
 # idx (B,T) 数组对应着当前的输⼊内容 [1,1]
        for _ in range(max_new_tokens):
            # 模型推理
            ##限定索引列的取值范围
            idx_cond=idx[:,-block_size:]
            logits, _ = self(idx_cond) # (B,T) -> (1,1...100)
            # 获取最后⼀个时间步的输出
            logits = logits[:, -1, :] # (1,100,65) -> (1,65)
            # 应⽤softmax转换为概率值
            probs = F.softmax(logits, dim=-1) # (B,C)
            # 按权重值采样，返回对应的索引
            # idx_next = torch.argmax(probs, dim=-1)
            # 随机采样
            idx_next = torch.multinomial(probs,num_samples=1) # (B,1)
            # 应⽤采样后的索引
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1) [1,2],[1,3]... [1,max_new_tokens]
        return idx

In [3]:
block_size=8
batch_size=32
max_iter=10000
learn_rate=1e-3
device='cuda' if torch.cuda.is_available() else 'cpu'
n_embd=32
eval_interval=500
eval_iters=200

file_name='/kaggle/input/camel-xiangzi-1/Camel Xiangzi.txt'

with open(file_name) as f:
    text=f.read()

###词典 编码器(函数），解码器（函数）
chars=sorted(list(set(text)))


stoi={ ch:i for  i,ch in enumerate(chars)}
itos={ i:ch for  i,ch in enumerate(chars)}
vocab_size=len(stoi)

encode=lambda s:  [ stoi[c] for c in s ]
decode=lambda l:  ''.join( itos[i] for i in l )


##文本转换 token index 
data=torch.tensor(encode(text),dtype=torch.long)

#拆分数据集
n=int(len(data)*.9)

train_data=data[:n]
val_data=data[n:]

###模型训练
model=BingramLanguageModel(block_size,vocab_size,n_embd)
model.to(device)
optimizer=torch.optim.AdamW(model.parameters(),lr=learn_rate)
@torch.no_grad()
def estimate_loss():
    out={}
    model.eval()
    for split in ['train','val']:
        losses=torch.zeros(eval_iters)
        for k in range(eval_iters):
            X,Y=get_batch(split)
            X.to(device),Y.to(device)
            logits,loss=model(X,Y)
            losses[k]=loss.item()
        out[split]=losses.mean()
    model.train()
    return out 


In [4]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # 放在代码最开头
os.environ['TORCH_USE_CUDA_DSA'] = '1'     # 启用设备端断言详情


for iter in range(max_iter):
    if iter % eval_interval==0:
        losses=estimate_loss()
        print(f"step {iter} :train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    
    ##批次样本
    xb,yb=get_batch('train')

    logits,loss=model(xb,yb)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
# 通过模型⽣成
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(idx, max_new_tokens=100)[0].tolist()))



step 0 :train loss 7.9296, val loss 7.9365
step 500 :train loss 5.8629, val loss 6.0876
step 1000 :train loss 5.6886, val loss 5.9427
step 1500 :train loss 5.3220, val loss 5.6958
step 2000 :train loss 5.1150, val loss 5.5712
step 2500 :train loss 4.9691, val loss 5.4449
step 3000 :train loss 4.8068, val loss 5.3786
step 3500 :train loss 4.7457, val loss 5.3328
step 4000 :train loss 4.6489, val loss 5.2738
step 4500 :train loss 4.5693, val loss 5.2663
step 5000 :train loss 4.5463, val loss 5.2186
step 5500 :train loss 4.4914, val loss 5.1961
step 6000 :train loss 4.4568, val loss 5.1789
step 6500 :train loss 4.4194, val loss 5.1686
step 7000 :train loss 4.3856, val loss 5.1738
step 7500 :train loss 4.3543, val loss 5.1669
step 8000 :train loss 4.3339, val loss 5.1860
step 8500 :train loss 4.2837, val loss 5.1443
step 9000 :train loss 4.2778, val loss 5.1308
step 9500 :train loss 4.2622, val loss 5.1666

　　“放，钱不怎好象承认识了话？”

　　靠！”嘴。一流恰巧的圈儿，咱们，堵着，就是外。一老确收的书风拦住—哼数老头，便不愿意祥子又天还是不得词里。说什么儿找他不是工