# nanogpt


## 1. 导入相关的包


In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader
from dataclasses import dataclass
import math

torch.manual_seed(1024) #  设置随机种子

<torch._C.Generator at 0x7f5d3591f170>

## 2. 定义 GPT 的参数


In [9]:
@dataclass
class GPTConfig:
    block_size : int = 512 # 序列长度 max_seq_len
    batch_size : int = 12
    n_layer : int = 12
    n_head : int = 12
    n_embed : int = 768  # embed_size=hidden_size=n_embed
    hidden_dim : int = n_embed
    dropout : float = 0.1
    head_size : int = n_embed // n_head
    vocab_size : int = 50257 # 50257 is the number of tokens in the GPT-2 vocabulary

## 3. 定义 GPT 的结构


In [10]:
#1. multi-head attention
class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.key = nn.Linear(config.hidden_dim, config.hidden_dim)
        self.query = nn.Linear(config.hidden_dim, config.hidden_dim)
        self.value = nn.Linear(config.hidden_dim, config.hidden_dim) 
        # 使用register_buffer来注册一个常量张量，这个常量张量是一个下三角矩阵，用于计算注意力分数
        self.register_buffer(
            "attention_mask",
            torch.tril(torch.ones(config.block_size, config.block_size))
        )
        self.dropout = nn.Dropout(config.dropout)
        self.out = nn.Linear(config.hidden_dim, config.hidden_dim)

    def forward(self, x):
        batch_size, seq_len, _ = x.size()  #x的shape是（batch_size, seq_len, hidden_dim）
        #print(f"x shape: {x.shape}")
        q=self.query(x)  #q的shape是（batch_size, seq_len, hidden_dim）
        k=self.key(x)
        v=self.value(x)
        #print(f"q shape: {q.shape}")

        q=q.view(batch_size, seq_len, self.config.n_head, self.config.head_size) #q的shape是（batch_size, seq_len, n_head, head_size）
        k=k.view(batch_size, seq_len, self.config.n_head, self.config.head_size)
        v=v.view(batch_size, seq_len, self.config.n_head, self.config.head_size)
        #print(f"q shape: {q.shape}")


        q=q.transpose(1,2) #q的shape是（batch_size, n_head, seq_len, head_size）
        k=k.transpose(1,2)
        v=v.transpose(1,2) 
        #print(f"q shape: {q.shape}")
        #print(f"k.transpose(-2,-1) shape: {k.transpose(-2,-1).shape}")

        product=torch.matmul(q,k.transpose(-2,-1)) # product的shape是（batch_size, n_head, seq_len, seq_len）
        #print(f"product shape: {product.shape}")
        #print(f"attention_mask shape: {self.attention_mask.shape}")q
        product=product.masked_fill(self.attention_mask[:seq_len, :seq_len] == 0, float('-inf'))
        product=product/math.sqrt(self.config.head_size)
        # 计算注意力分数
        attention_scores=self.dropout(F.softmax(product,dim=-1))

        out=torch.matmul(attention_scores,v)
        out=out.transpose(1,2).contiguous().view(batch_size, seq_len, self.config.n_embed)
        return self.out(out)
        
#2. feed-forward network（MLP）
class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.net=nn.Sequential(
            nn.Linear(config.hidden_dim, config.hidden_dim*4),
            nn.ReLU(),
            nn.Linear(config.hidden_dim*4, config.hidden_dim),
            nn.Dropout(config.dropout),
        )
    def forward(self, x):
        return self.net(x)
    
#3. transformer block
class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attn=MultiHeadAttention(config)
        self.mlp=FeedForward(config)
        self.norm1=nn.LayerNorm(config.hidden_dim)
        self.norm2=nn.LayerNorm(config.hidden_dim)

    def forward(self, x):
        x=x+self.attn(self.norm1(x))  #add&norm 将第一个norm放在最前面
        x=x+self.mlp(self.norm2(x))
        return x
    
#4. GPT
class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embedding=nn.Embedding(config.vocab_size, config.n_embed) #block_size,vocab_size ->block_size,n_embed
        self.position_embedding=nn.Embedding(config.block_size, config.n_embed) #block_size,n_embed + block_size,n_embed 
        self.blocks=nn.Sequential(
            *[Block(config) for _ in range(config.n_layer)]
        )
        self.norm_final=nn.LayerNorm(config.hidden_dim)
        self.lm_head=nn.Linear(config.hidden_dim, config.vocab_size, bias=False)
        #tie weights减少模型参数
        #linear层 （hidden_dim,vocab_size），实际上shape是（vocab_size,hidden_dim）,所以能够tie weights
        self.token_embedding.weight=self.lm_head.weight

    def __init_weights(self,moudule):
        if isinstance(moudule,nn.Linear):
            torch.nn.init.normal_(moudule.weight, mean=0.0, std=0.02)
            if moudule.bias is not None:
                torch.nn.init.zeros_(moudule.bias)
        elif isinstance(moudule,nn.Embedding):
            torch.nn.init.normal_(moudule.weight, mean=0.0, std=0.02)
    
    def forward(self, idx, targets=None):
        batch_size, seq_len=idx.size()
        token_embed=self.token_embedding(idx)
        #确保位置编码和输入的idx在同一个设备上
        pos_embed=self.position_embedding(
            torch.arange(seq_len, device=idx.device)
        )
        x=token_embed+pos_embed
        x=self.blocks(x)
        x=self.norm_final(x)
        logits=self.lm_head(x)
        if targets is not None:
            batch_size, seq_len, vocab_size=logits.size()
            logits=logits.view(batch_size*seq_len,vocab_size)#相当于一维展开
            targets=targets.view(batch_size*seq_len)
            loss=F.cross_entropy(logits, targets)
            return logits, loss
        else:
            loss=None 

    def generate(self, idx, max_new_tokens):
        # idx shape: (batch_size, seq_len)
        for _ in range(max_new_tokens):
            # 如果idx的长度大于block_size，则取idx的最后block_size个token
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            # 获取logits
            logits, _=self(idx_cond)
            # shape: (batch_size, seq_len, vocab_size)
            # 关注最后一个时间步
            logits=logits[:, -1, :] # shape: (batch_size, vocab_size)
            probs=F.softmax(logits, dim=-1)
            # 随机采样
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=-1)
        return idx

    


## 4. 构建 Dataset


In [11]:
class MyDataset(Dataset):
    def __init__(self,path,block_size=512):
        import tiktoken
        self.enc = tiktoken.get_encoding("gpt2")
        self.block_size = block_size

        self.encoded_data = []
        # 特殊符号分割不同的训练文本
        # <|endoftext|> [50256]
        self.eos_token = self.enc.encode(
            "<|endoftext|>",
            allowed_special={"<|endoftext|>"}
        )[0]
        
        raw_data = []
        self.max_lines = 1000
        import json
        with open(path, "r", encoding="utf-8") as f:
            for i,line in enumerate(f):
                if i > self.max_lines:
                    break
                try:
                    # 读取的line是json字符串，需要使用json.loads()转换为字典
                    text = json.loads(line.strip())['text']
                    raw_data.append(text)
                except Exception as e:
                    continue
        
        full_encoded = []
        for text in raw_data:
            encoded_text = self.enc.encode(text) #list 将字符串转换为token id
            full_encoded.extend(encoded_text + [self.eos_token]) 

        # block_size 分块
        for i in range(0, len(full_encoded), self.block_size):
            chunk = full_encoded[i:i+self.block_size+1] #+1 是因为计算损失的时候就不需要移位了
            if len(chunk) < self.block_size+1:
                chunk = chunk + [self.eos_token] * (self.block_size + 1 - len(chunk))
            self.encoded_data.append(chunk)

    def __len__(self):
        return len(self.encoded_data)
    
    def __getitem__(self, idx):
        chunk = self.encoded_data[idx]
        x = torch.tensor(chunk[:-1] , dtype=torch.long)
        y = torch.tensor(chunk[1:] , dtype=torch.long)
        return x,y
    
    def encode(self, text):
        return self.enc.encode(text)
    
    def decode(self, ids):
        return self.enc.decode(ids)
 


## 5. 运行相关函数

In [12]:
train_dataset = MyDataset("/home/xj/everyday_hand/nanogpt/seq-monkey/mobvoi_seq_monkey_general_open_corpus.jsonl")

train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [0.9, 0.1])

train_loader = DataLoader(train_dataset, batch_size=12, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=12, shuffle=False)


In [13]:
model = GPT(GPTConfig())
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

total_params = sum (p.numel() for p in model.parameters())
# for p in model.parameters():
#     print(p.shape)
print(f"Total parameters: {total_params / 1e6} M")

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=1000)


Total parameters: 124.046592 M


In [15]:
# 训练loop
def train(model, optimizer, scheduler, train_loader, val_loader, device):
    model.train()
    total_loss = 0.0

    for batch_idx, (x,y) in enumerate(train_loader):
        x,y = x.to(device), y.to(device)
        # 前向传播
        logits, loss = model(x,y)
        # 反向传播
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # 更新学习率
        scheduler.step()

        total_loss += loss.item()

        if batch_idx % 100 == 0:
            print(f"Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item()}")
    return total_loss

def eval(model, val_loader, device):
    model.eval()
    total_loss = 0.0

    with torch.no_grad():
        for x,y in val_loader:
            x,y = x.to(device), y.to(device)
            logits, loss = model(x,y)
            total_loss += loss.item()

    return total_loss 
       

for epoch in range(2):
    train_loss = train(model, optimizer, scheduler, train_loader, val_loader, device)
    val_loss = eval(model, val_loader, device)
    print(f'Epoch: {epoch}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}')
    
    # 保存模型
    avg_val_loss = val_loss / len(val_loader)
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'val_loss': avg_val_loss,
    }
    # 保存每个epoch的模型
    torch.save(checkpoint, f'checkpoints/model_epoch_{epoch}.pt')



Epoch 0, Batch 0, Loss: 4.930845737457275
Epoch 0, Batch 100, Loss: 4.943900108337402
Epoch 0, Batch 200, Loss: 5.031148433685303
Epoch: 0, Train Loss: 4.9244, Val Loss: 4.9175
Epoch 1, Batch 0, Loss: 4.880123138427734
Epoch 1, Batch 100, Loss: 4.954814434051514
Epoch 1, Batch 200, Loss: 4.9446587562561035
Epoch: 1, Train Loss: 4.9151, Val Loss: 4.9040
