# GPT Model


In [30]:

import torch
import torch.nn as nn
from torch.testing import assert_close
torch.manual_seed(42)
from torch.utils.data import DataLoader,Dataset
import tool,loaddata
from tqdm import tqdm



In [31]:
# ! pip install tqdm

### config

In [32]:
IS_SKIP_TEST =True

GPT_CONFIG = {
    "num_epochs":10,
    "batch_size":8,
    "vocab_size":100261, #50257,      # 词汇表大小
    "context_len": 256,  # 上下文长度
    "emb_dim": 512,          # 嵌入维度
    "n_heads": 8,           # 注意力头的数量
    "n_layers": 6,          # 层数
    "drop_rate": 0.1,        # dropout率
    "qkv_bias": False ,      # 查询-键-值偏置
}

TOKEN_TYPE="gpt2"
TOKEN_TYPE="cl100k_base"

LR= 4e-3
WEIGHT_DECAY =0.1

EVAL_FREQ = 50
EVAL_ITER = 50


In [33]:
! nvidia-smi

Tue Aug 12 19:31:20 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.59                 Driver Version: 561.19         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3070 ...    On  |   00000000:01:00.0 Off |                  N/A |
| N/A   69C    P8             18W /  140W |    5733MiB /   8192MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [34]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## Define Test Model

In [35]:

class DummyTransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
    
    def forward(self,x):
        return x
    
class DummyLayerNorm(nn.Module):
    def __init__(self, norm_shape,eps=1e-5):
        super().__init__()
        
    def forward(self,x):
        return x
        

class DummyGPT(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg['vocab_size'],cfg['emb_dim'])
        self.pos_emb = nn.Embedding(cfg['context_len'],cfg['emb_dim'])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        self.trf_blocks =  nn.Sequential(
            *[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )
        self.final_norm = DummyLayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg['emb_dim'],cfg['vocab_size'],bias=False
        )
       
    def forward(self,in_idx):
        batch_size, seq_len = in_idx.shape  #in_idx 通常是一个整数张量（Tensor），形状一般为 (batch_size, seq_len)
        tok_embeds = self.tok_emb(in_idx) #(batch_size, seq_len, emb_dim)
        pos_embeds = self.pos_emb(torch.arange(seq_len,device=in_idx.device))  #生成一个从 0 到 seq_len-1 的整数序列
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits
        
        

### view model parameters

In [36]:
@tool.skip_execution(skip=IS_SKIP_TEST)
def test_dummyModel():
    model = DummyGPT(GPT_CONFIG)
    return model

test_dummyModel()

函数 test_dummyModel 已跳过执行


## Define layerNorm

In [37]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
        
    def forward(self,x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim =-1 ,keepdim =True, unbiased =False)
        norm_x = (x-mean)/torch.sqrt(var+self.eps)
        return self.scale*norm_x + self.shift

### test layerNorm

In [38]:
@tool.skip_execution(skip=IS_SKIP_TEST)
def test_layer_norm():
    batch_size = 2
    seq_len = 5
    emb_dim = 3  
    x = torch.randn(batch_size, seq_len, emb_dim)  # 随机生成输入张量
    
    custom_ln = LayerNorm(emb_dim)
    official_ln = nn.LayerNorm(emb_dim, eps=1e-5, elementwise_affine=True)
    

    official_ln.weight.data.copy_(custom_ln.scale.data)
    official_ln.bias.data.copy_(custom_ln.shift.data)
 
    custom_out = custom_ln(x)
    official_out = official_ln(x)
    print(custom_out)
    print(official_out)

    assert_close(
        custom_out, 
        official_out, 
        rtol=1e-5,  # 相对误差容忍度
        atol=1e-5   # 绝对误差容忍度
    )
    print("测试通过：自定义LayerNorm与官方实现输出一致")

test_layer_norm()

函数 test_layer_norm 已跳过执行


## Define activate function

Φ(x) ≈ 0.5 * (1 + tanh(√(2/π) * (x + 0.044715 * x³)))

In [39]:
class GELU(nn.Module):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        
    def forward(self,x):
        return 0.5*x*(1+ 
                      torch.tanh(torch.sqrt(torch.tensor(2/torch.pi))
                                 *(x+0.044715*torch.pow(x,3))
                                )
                      )
        


### test gelu

In [40]:
@tool.skip_execution(skip=IS_SKIP_TEST)
def test_gelu():
    x = torch.tensor([-3.0, -1.0, 0.0, 0.5, 1.0, 2.0, 5.0])
    
    custom_gelu = GELU()
    official_gelu = nn.GELU()

    custom_out = custom_gelu(x)
    official_out = official_gelu(x)
    
    # 打印结果进行直观对比
    print("输入值:", x)
    print("自定义GELU输出:", custom_out)
    print("官方GELU输出:", official_out)
 
    assert_close(
        custom_out,
        official_out,
        rtol=1e-3,  # 相对误差容忍度
        atol=1e-3   # 绝对误差容忍度
    )
    print("\n测试通过：自定义GELU与官方实现近似一致")
    
test_gelu()

函数 test_gelu 已跳过执行


## Define FFN
通过两层线性变换和激活函数，对注意力机制输出的特征进行非线性加工，增强模型表达能力。

In [41]:
class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            #中间层hidden_dim通常设为4*emb_dim（如原始 Transformer 中为 512→2048→512），通过扩展维度捕捉更丰富的特征
            nn.Linear(cfg['emb_dim'],4*cfg['emb_dim']),  
            GELU(),
            nn.Dropout(cfg['drop_rate']),
            nn.Linear(4*cfg['emb_dim'],cfg['emb_dim'])
        )
        
    def forward(self,x):
        return self.layers(x)

## Define MultiAttention

In [42]:
class CausalAttention(nn.Module):
    def __init__(self, d_in, d_out,context_len,dropout,qkv_bias=False):
        super().__init__()
        self.d_out =d_out
        self.W_q = nn.Linear(d_in,d_out,bias= qkv_bias)
        self.W_k = nn.Linear(d_in,d_out,bias= qkv_bias)
        self.W_v = nn.Linear(d_in,d_out,bias= qkv_bias)
        self.dropout = nn.Dropout(dropout)
        #缓冲区（buffer）是模型中不需要被训练的参数（与 nn.Parameter 不同，后者是可学习参数），但会随模型一起保存（state_dict 中包含）
        self.register_buffer(
            'mask', 
            torch.triu(torch.ones(context_len,context_len),diagonal=1)
        )
    
    def forward(self,x):
        b,num_tokens,d_in = x.shape
        keys = self.W_k(x)
        queries = self.W_q(x)
        values = self.W_v(x)
        
        att_score = queries @ keys.transpose(1,2)
        att_score.masked_fill_(self.mask.bool()[:num_tokens,:num_tokens],-torch.inf) # 上面的register_buffer  形状为 (num_tokens, num_tokens) 的子矩阵
        att_weight = torch.softmax(att_score/keys.shape[-1]**0.5, dim=-1)
        att_weight = self.dropout(att_weight)
        context_vec = att_weight @ values
        return context_vec
        
class MultiHeadAttendtion(nn.Module):
    def __init__(self, d_in, d_out,context_len,dropout,num_heads,qkv_bias=False):
        super().__init__()
        # ModuleList与nn.Sequential不同，它不自动执行前向传播，而是需要手动遍历调用，适合需要单独处理每个子模块的场景
        self.heads = nn.ModuleList(
            [CausalAttention(d_in,d_out,context_len,dropout,qkv_bias) for _ in range(num_heads)]
        )
        
    def forward(self,x):
        return torch.cat([head(x) for head in self.heads],dim=-1)

In [43]:
# TODO 更高效的MutiAttention 减少计算量
#参数规模更小（d_model×d_model 对比 num_heads×d_model×head_dim
class MultiHeadAttendtion_new(nn.Module):
    def __init__(self, d_in, d_out,context_len,dropout,num_heads,qkv_bias=False):
        super().__init__()
        self.d_out =d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        self.W_q = nn.Linear(d_in,d_out,bias= qkv_bias)
        self.W_k = nn.Linear(d_in,d_out,bias= qkv_bias)
        self.W_v = nn.Linear(d_in,d_out,bias= qkv_bias)
        self.out_proj =nn.Linear(d_out,d_out) # out_proj 可以学习如何 “融合” 这些头的信息（例如对不同头的特征赋予不同权重），而不是简单保留原始拼接结果
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            'mask', 
            torch.triu(torch.ones(context_len,context_len),diagonal=1)
        )
    
    def forward(self,x):
        b,num_tokens,d_in = x.shape
        keys = self.W_k(x)
        queries = self.W_q(x)
        values = self.W_v(x)
        
        keys = keys.view(b,num_tokens,self.num_heads,self.head_dim)
        queries = queries.view(b,num_tokens,self.num_heads,self.head_dim)
        values = values.view(b,num_tokens,self.num_heads,self.head_dim)
         
        #(b,num_tokens,num_heads,head_dim) --> (b,num_heads,num_tokens,head_dim)         
        keys = keys.transpose(1,2)
        queries = queries .transpose(1,2)
        values = values.transpose(1,2)
        
        
        att_score = queries @ keys.transpose(2,3)
        att_score.masked_fill_(self.mask.bool()[:num_tokens,:num_tokens],-torch.inf)
        att_weight = torch.softmax(att_score/keys.shape[-1]**0.5, dim=-1)
        att_weight = self.dropout(att_weight)
        context_vec = (att_weight @ values).transpose(1,2)
        context_vec = context_vec.contiguous().view(b,num_tokens,self.d_out)
        context_vec = self.out_proj(context_vec)
        return context_vec

## Define Transformer block

In [44]:
class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttendtion_new(
            d_in= cfg["emb_dim"],
            d_out= cfg['emb_dim'],
            context_len=  cfg['context_len'],
            num_heads= cfg["n_heads"],
            dropout= cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"]
        )
        self.ff =FeedForward(cfg)
        self.norm1 = LayerNorm(cfg['emb_dim']) #norm1：用于注意力模块（self.att）的输入归一化
        self.norm2 = LayerNorm(cfg['emb_dim']) #norm2：用于前馈网络（self.ff）的输入归一化
        self.dropout = nn.Dropout(cfg['drop_rate'])
        
    
    def forward(self,x):
       # 注意力分支：LayerNorm -> 注意力 -> Dropout -> 残差连接
        x = x + self.dropout(self.att(self.norm1(x))) 
        # FFN分支：LayerNorm -> FFN -> Dropout -> 残差连接
        x = x + self.dropout(self.ff(self.norm2(x)))  
        return x
        
         

## Define GPT Model

In [45]:
class GPTModel(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg['vocab_size'],cfg['emb_dim'])
        self.pos_emb = nn.Embedding(cfg['context_len'],cfg['emb_dim'])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        self.trf_blocks =  nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )
        self.final_norm = DummyLayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg['emb_dim'],cfg['vocab_size'],bias=False
        )
       
    def forward(self,in_idx):
        batch_size, seq_len = in_idx.shape  #in_idx 通常是一个整数张量（Tensor），形状一般为 (batch_size, seq_len)
        tok_embeds = self.tok_emb(in_idx) #(batch_size, seq_len, emb_dim)
        pos_embeds = self.pos_emb(torch.arange(seq_len,device=in_idx.device))  #生成一个从 0 到 seq_len-1 的整数序列
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits
        

### View structure of model 

In [46]:
#GPT2 小型（Small）：12 层 Transformer 解码器，隐藏层维度 768，注意力头数 12，总参数约 1.2 亿
@tool.skip_execution(skip=IS_SKIP_TEST)
def test_GPT2_model():
    CONFIG = {
    "num_epochs":10,
    "batch_size":2,
    "vocab_size": 50257,     
    "context_len": 512,  
    "emb_dim": 768,          
    "n_heads": 8,          
    "n_layers": 12,          
    "drop_rate": 0.1,       
    "qkv_bias": False ,      
    }   
    model = GPTModel(CONFIG)
    model.to(device)

    # attention_new 参数减少量 = (304,556,544 - 163,008,000)
    total_params =sum(p.numel() for p in model.parameters())

    print(f"Total number of parameters: {total_params:,}") #163,008,000

    #权重共享， W_emb和W_out指向同一块内存，模型训练时只会更新这一个矩阵，避免了维护两个独立矩阵的开销
    total_params_gpt2 = total_params - sum(p.numel()for p in model.out_head.parameters())
   
    print(f"Number of trainable parameters "
        f"considering weight tying: {total_params_gpt2:,}") #124,017,408
    return model
    
test_GPT2_model()

函数 test_GPT2_model 已跳过执行


# Training model
## tokenizer

In [47]:
# ! pip install tiktoken

<|endoftext|>

<|fim_prefix|>

<|fim_middle|>

<|fim_suffix|>

<|endofprompt|>

In [48]:
import tiktoken


def text_to_tokenIds(text,tokenizer):
    encoded = tokenizer.encode(text,allowed_special={'<|endoftext|>'})
    encoded_tensor =torch.tensor(encoded).unsqueeze(0)
    return encoded_tensor

def tokenIds_to_text(token_ids,tokenizer):
    flat =token_ids.squeeze(0)
    return tokenizer.decode(flat.tolist())



def  generate_text_greedy(model,idxs,max_new_tokens,context_size): # max_new_tokens 生成往后n个token
    model.eval()
    for _ in range(max_new_tokens):
        idx_condition = idxs[:,-context_size:]
        # print(idx_condition)
        with torch.no_grad():
            logits = model(idx_condition)
            # print(logits)
            
        #生成时：只需要最后一个位置的 logits
        logits = logits[:,-1,:]
        probas =torch.softmax(logits,dim=-1)
        idx_next = torch.argmax(probas,dim=-1,keepdim=True)
        idxs = torch.cat((idxs,idx_next),dim=1)
    return idxs


### test tokenizer

In [49]:
@tool.skip_execution(skip=IS_SKIP_TEST)
def test_tokenizer():
    model =GPTModel(GPT_CONFIG)
    test_context ="今天的天气是晴天，适合出去走走"
    #test_context = "I like the weather"
    print(f'{test_context}--ori')
    tokenizer =tiktoken.get_encoding(TOKEN_TYPE)
    tokenids =text_to_tokenIds(test_context,tokenizer)
    print(f'{tokenIds_to_text(tokenids,tokenizer)}--recover') 


    tokenids = generate_text_greedy(model,tokenids,max_new_tokens=3,context_size=GPT_CONFIG['context_len'])

    print(f'{tokenIds_to_text(tokenids,tokenizer)}--new') 

test_tokenizer()


函数 test_tokenizer 已跳过执行


Epoch 过程中查看生成的文本

查看模型生成的新 token 数量（max_new_tokens）:
* 训练监控（最常用）：20-50 个 token
* 轻量化验证（追求效率）：10-20 个 token
* 深度观察（关键节点）：50-100 个 token

In [50]:

def generate_and_print(model,tokenizer,device,start_context,max_new_tokens):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_tokenIds(start_context,tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_greedy(model,idxs=encoded,max_new_tokens=max_new_tokens,context_size=context_size)
        decoded_text = tokenIds_to_text(token_ids,tokenizer)
    print(decoded_text.replace("\n"," "))
    model.train()
        

## Load Data

In [51]:



# class GPTDataset(Dataset):
#     def __init__(self,txt, tokenizer,max_len,stride):
#         super().__init__()
#         self.input_ids = []
#         self.target_ids =[]
#         self.max_len =max_len
#         tokenids = tokenizer.encode(txt)
#         print(f'tokens: {len(tokenids)}')
#         print(f'samples: {(len(tokenids) - max_len) // stride + 1}')
#         for i in range(0,len(tokenids)- max_len,stride):
#             input_chunk = tokenids[i:i+max_len]
#             target_chunk = tokenids[i+1:i+max_len+1]
#             self.input_ids.append(torch.tensor(input_chunk))
#             self.target_ids.append(torch.tensor(target_chunk))
            
#         print(f"total number of samples: {len(self.input_ids)}")
    
#     def __len__(self):
#         return len(self.input_ids)
    
#     def __getitem__(self,idx):
#         return self.input_ids[idx],self.target_ids[idx]

class GPTDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len, stride):
        super().__init__()
        self.input_ids = []
        self.target_ids = []
        self.max_len = max_len
        self.stride = stride
        
        # 遍历文本数组中的每篇文本
        for idx, text in enumerate(tqdm(texts, desc="Process text")):
            # 跳过空文本
            if not text.strip():
                continue
                
            # 编码单篇文本
            tokenids = tokenizer.encode(text)
            # 计算该文本可生成的样本数
            num_samples = (len(tokenids) - max_len) // stride + 1 if len(tokenids) >= max_len else 0
            
            if num_samples > 0:
                # 滑动窗口生成样本
                for i in range(0, len(tokenids) - max_len, stride):
                    input_chunk = tokenids[i:i+max_len]
                    target_chunk = tokenids[i+1:i+max_len+1]  # 目标是输入的下一个token
                    self.input_ids.append(torch.tensor(input_chunk))
                    self.target_ids.append(torch.tensor(target_chunk))
        
        print(f"总样本数: {len(self.input_ids)}")
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]
    

'''
DataLoader 本质是一个批次生成器迭代索引：
自动生成从 0 到 len(dataset)-1 的索引，通过 dataset.__getitem__(idx) 逐个获取样本
'''
def GPTDataloader(txt,token_type,batch_size=4,max_len=246,stride=128,shuffle=True,drop_last=True,num_works=0):
    tokenizer =tiktoken.get_encoding(token_type)
    ds = GPTDataset(txt,tokenizer,max_len,stride)
    dl = DataLoader(
        ds,
        batch_size =batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_works
    )
    return dl

In [52]:
# load en txt for debug
'''
def Load_data_en(file_path,train_ratio=0.8):
    with open (file_path,"r",encoding="utf-8") as file:
        text_data =file.read()
        
    print(f'total char: {len(text_data)}') #character count
    
    split_idx = int(train_ratio*len(text_data))
    train_data = text_data[:split_idx]
    valid_data = text_data[split_idx:]
    print(f'train char: {len(train_data)}\nvalid char: {len(valid_data)} \n')
    return [train_data], [valid_data]

file_path ="../datasets/the-verdict.txt"
train_data, valid_data =Load_data_en(file_path)
'''

'\ndef Load_data_en(file_path,train_ratio=0.8):\n    with open (file_path,"r",encoding="utf-8") as file:\n        text_data =file.read()\n        \n    print(f\'total char: {len(text_data)}\') #character count\n    \n    split_idx = int(train_ratio*len(text_data))\n    train_data = text_data[:split_idx]\n    valid_data = text_data[split_idx:]\n    print(f\'train char: {len(train_data)}\nvalid char: {len(valid_data)} \n\')\n    return [train_data], [valid_data]\n\nfile_path ="../datasets/the-verdict.txt"\ntrain_data, valid_data =Load_data_en(file_path)\n'

In [53]:
def Load_data_cn(part=False,train_ratio=0.8):
    txts = loaddata.load_local_data()
    if part:
        txts =txts[:10000]
    len(txts)
    split_idx = int(train_ratio*len(txts))
    train_data = txts[:split_idx]
    valid_data = txts[split_idx:]
    print(f'train sentence: {len(train_data)}\nvalid sentence: {len(valid_data)} \n')
    return train_data,valid_data

train_data,valid_data = Load_data_cn(True,0.8)

dict_keys(['train'])
train sentence: 8000
valid sentence: 2000 



In [54]:


train_loader = GPTDataloader(
    train_data,
    TOKEN_TYPE,
    batch_size = GPT_CONFIG['batch_size'],
    max_len = GPT_CONFIG["context_len"],
    stride = GPT_CONFIG["context_len"] // 2,# 适度重叠（stride = max_len // 2）
    drop_last=True,
    shuffle= True, #训练时打乱，验证 / 测试时不打乱：训练时打乱是为了提升泛化能力
    num_works=4    
    )

# print("Train loader:")
# x, y =next(iter(train_loader))
# print(x.shape, y.shape)



print(F'共{len(train_loader)}个批次，'
      f'每批{train_loader.batch_size}个样本，'
      f'每个样本是长度为 {train_loader.dataset.max_len} 的 token 序列')

valid_loader = GPTDataloader(
    valid_data,
    TOKEN_TYPE,
    batch_size = GPT_CONFIG['batch_size'],
    max_len = GPT_CONFIG["context_len"],
    stride = GPT_CONFIG["context_len"] ,
    drop_last=False, # 验证 / 测试阶段：需要完整评估所有样本的性能，不能遗漏任何数据点
    shuffle= False, #验证 / 测试时不打乱是为了结果可复现，方便对比不同模型的性能
    num_works=4    
    )

# print("Validation loader:")
# x, y =next(iter(valid_loader))
# print(x.shape, y.shape)


Process text: 100%|██████████| 8000/8000 [00:03<00:00, 2081.51it/s]


总样本数: 43209
共5401个批次，每批8个样本，每个样本是长度为 256 的 token 序列


Process text: 100%|██████████| 2000/2000 [00:00<00:00, 3135.70it/s]

总样本数: 5608





### Loss funcion

In [55]:
def calc_loss_batch(input_batch,target_batch,model,device):
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)
    logits = model(input_batch)
    loss =torch.nn.functional.cross_entropy(logits.flatten(0,1),target_batch.flatten())
    return loss

#快速验证：指定 num_batchs=n，只跑前n个批次，节省时间。
def calc_loss_loader(data_loader,model,device,num_batchs=None):
    total_loss = 0
    total_batchs =len(data_loader)
    # print('total batch count:' ,total_batchs)
    if  total_batchs == 0:
        return float('nan')
    elif num_batchs is None:
        num_batchs = total_batchs 
    else:
        num_batchs = min(num_batchs,total_batchs)
    
    for i ,(input_batch,target_batch) in enumerate(data_loader):# dataset.__getitem__(idx)
        if i < num_batchs:
            loss = calc_loss_batch(input_batch,target_batch,model,device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batchs

def evaluate_model(model,train_loader,valid_loader,device,eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader,model,device,num_batchs=eval_iter)
        valid_loss = calc_loss_loader(valid_loader,model,device,num_batchs=eval_iter)
    model.train()
    return train_loss,valid_loss

### test loss function

In [56]:
@tool.skip_execution(skip=IS_SKIP_TEST)
def test_loss():
    model = GPTModel(GPT_CONFIG)
    calc_loss_loader(train_loader,model,device='cpu')

test_loss()

函数 test_loss 已跳过执行


## Train model

In [57]:
def train_model_process(model,train_loader,valid_loader,
                        optimizer,device,num_epochs,
                        eval_freq,eval_iter,
                        start_context,tokenizer):
    train_losses,val_losses= [],[]
    track_tokens_seen =[]
    tokens_seen,global_step = 0, -1
    for epoch in  tqdm(range(num_epochs), desc="training"):
        model.train()
        for input_batch,target_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False):
            input_batch, target_batch = input_batch.to(device), target_batch.to(device)  # 数据移到GPU
            optimizer.zero_grad()
            loss = calc_loss_batch(input_batch,target_batch,model,device)
            loss.backward()
            optimizer.step()
            tokens_seen += input_batch.numel()
            global_step +=1
            
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(model,train_loader,valid_loader,device,eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(track_tokens_seen)
                print(f'EP: {epoch+1} STEP: {global_step} '
                    f'{f"T_LOSS: {train_loss:.3f}" if train_loss is not None else "T_LOSS: None "} '
                    f'{f"V_LOSS: {val_loss:.3f}" if val_loss is not None else "V_LOSS: None"}'
                    )
                generate_and_print(model,tokenizer,device,start_context,20)
    return train_losses,val_losses,track_tokens_seen

In [58]:
model =GPTModel(GPT_CONFIG)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(),lr=LR,weight_decay=WEIGHT_DECAY)
tokenizer =tiktoken.get_encoding(TOKEN_TYPE)
train_losses,val_losses,track_tokens_seen =train_model_process(model,train_loader,valid_loader,
                                                               optimizer,device,num_epochs=GPT_CONFIG['num_epochs'],
                                                               eval_freq=EVAL_FREQ,eval_iter=EVAL_ITER,
                                                               start_context="今天的天气",
                                                               tokenizer=tokenizer
                                                               )

training:   0%|          | 0/10 [00:00<?, ?it/s]

EP: 1 STEP: 0 T_LOSS: 24.345 V_LOSS: 24.406
今天的天气，，，，，，，，，，，，，，，，，，，，




EP: 1 STEP: 50 T_LOSS: 22.002 V_LOSS: 22.320
今天的天气，并行行，并行，并�语语在行语与行，，，行�行




EP: 1 STEP: 100 T_LOSS: 6.976 V_LOSS: 7.005
今天的天气��FI， -�，的的，的的 - -车，�， -�




EP: 1 STEP: 150 T_LOSS: 6.234 V_LOSS: 6.329
今天的天气员�书，但古岛和、的地，为 "的法的，




EP: 1 STEP: 200 T_LOSS: 5.977 V_LOSS: 6.056
今天的天气穗、，他们�，他们、黄近，但，古，




EP: 1 STEP: 250 T_LOSS: 5.923 V_LOSS: 6.028
今天的天气，但，但，但，但，但，但，但，但，但，但




EP: 1 STEP: 300 T_LOSS: 5.641 V_LOSS: 5.745
今天的天气员��ng，他们�的研员 * ablo��，




EP: 1 STEP: 350 T_LOSS: 5.461 V_LOSS: 5.566
今天的天气员会的222，但，但),。  �，但，但，但，但




EP: 1 STEP: 400 T_LOSS: 5.338 V_LOSS: 5.441
今天的天气员会议院363是一种Apom，但是一个 Public Policy Research，但是




EP: 1 STEP: 450 T_LOSS: 5.255 V_LOSS: 5.328
今天的天气员，因此，但，但，而，该，他们的，因为“




EP: 1 STEP: 500 T_LOSS: 5.216 V_LOSS: 5.259
今天的天气员，但是以及的热，但是一个的一些�的�




EP: 1 STEP: 550 T_LOSS: 5.180 V_LOSS: 5.248
今天的天气穆的研究，而这些，但是一起，但是




EP: 1 STEP: 600 T_LOSS: 5.076 V_LOSS: 5.201
今天的天气员 * 《 * 《 * 《 * 《 * 




EP: 1 STEP: 650 T_LOSS: 5.020 V_LOSS: 5.138
今天的天气究，这些的�，而言，而临的热，




EP: 1 STEP: 700 T_LOSS: 5.019 V_LOSS: 5.106
今天的天气员会议员会的破的价格612，但是一些的�




EP: 1 STEP: 750 T_LOSS: 4.965 V_LOSS: 5.071
今天的天气员的的人的的是由于是由于是在此的的的的的




EP: 1 STEP: 800 T_LOSS: 4.931 V_LOSS: 5.061
今天的天气员会，他的一种，因此外，他们的一种，他的




EP: 1 STEP: 850 T_LOSS: 4.912 V_LOSS: 5.027
今天的天气员会，但是一样，但是一些，Chronoperates，但是一




EP: 1 STEP: 900 T_LOSS: 4.899 V_LOSS: 5.013
今天的天气员会的病���的热�员会的病




EP: 1 STEP: 950 T_LOSS: 4.830 V_LOSS: 4.980
今天的天气究行动物种的烈的一些的烈的�




EP: 1 STEP: 1000 T_LOSS: 4.830 V_LOSS: 4.951
今天的天气员会议员Hell Maisenne Lee，而得到了这是一个的是由于




EP: 1 STEP: 1050 T_LOSS: 4.843 V_LOSS: 4.950
今天的天气员会的一些的一种，但是由于是Cardano的是由于




EP: 1 STEP: 1100 T_LOSS: 4.831 V_LOSS: 4.923
今天的天气员会议员会议员会议员，但是一些，但是一种




EP: 1 STEP: 1150 T_LOSS: 4.798 V_LOSS: 4.895
今天的天气员的人的机车的机车的机车车车车车车车站车


training:   0%|          | 0/10 [05:36<?, ?it/s]


KeyboardInterrupt: 

In [None]:
torch.save({
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    },
    "..\model\model_and_optimizer.pth"
)

In [None]:
checkpoint = torch.load("model_and_optimizer.pth", map_location=device)
model = GPTModel(GPT_CONFIG)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.1)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
