# GPT-2 Model


In [None]:
import sys
import os


# Work out the directory this code runs from: the bare name lookup succeeds
# only when `get_ipython` is defined (IPython/Jupyter), where `__file__` is not
# relied upon; otherwise fall back to the location of this script.
try:
    get_ipython
except NameError:
    current_dir = os.path.dirname(os.path.abspath(__file__))
else:
    current_dir = os.getcwd()

# Temporarily extend the import path with the parent directory.
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
    
import torch
import torch.nn as nn
from torch.testing import assert_close
torch.manual_seed(42)
import tool, loaddata ,model_wrapper,models
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
import tiktoken

### Config

In [None]:

# Feature switches.
IS_SKIP_TEST = True  # toggle for the test code
IS_EN = True  # switch between the English and the Chinese dataset
IS_TRAIN = True  # toggle for pre-training

# Hyperparameters shared by the data loaders, the model and the training loop.
GPT_CONFIG = {
    "num_epochs": 4,  # number of epochs handed to the training loop
    "batch_size": 4,  # batch size of both data loaders
    "vocab_size": 50257,
    "context_len": 256,  # data-loader max_len; the training stride is half of it
    "emb_dim": 768,
    "n_heads": 8,
    "n_layers": 12,
    "drop_rate": 0.1,
    "initializer_range": 0.02,
    "qkv_bias": False,
}

# Name of the tiktoken encoding used for tokenization.
# NOTE(review): switching the encoding presumably also requires a matching
# "vocab_size" above — confirm before changing.
TOKEN_TYPE = "gpt2"
# TOKEN_TYPE="cl100k_base"

# AdamW optimizer settings.
LR = 1e-3
WEIGHT_DECAY = 0.1

# Evaluation cadence: evaluate every EVAL_FREQ steps; EVAL_ITER is forwarded
# to the evaluation helper.
EVAL_FREQ = 5
EVAL_ITER = 5


In [None]:
# Show the GPU status by running the nvidia-smi CLI (IPython shell escape; notebook-only syntax).
! nvidia-smi

### Select the compute device (CUDA if available, otherwise CPU)

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

### Tensorboard Log

In [None]:
# Create the TensorBoard log writer for this experiment.
import time

starttime = time.strftime("%Y-%m-%d_%H-%M-%S")
print("Start experiment:", starttime)

logpath = "../log/"
# The run directory is named after the experiment start time; only the first
# 16 characters of the timestamp (date, hour and minute) are used.
log_writer = SummaryWriter(
    log_dir=f"{logpath}{starttime[:16]}",
    comment=starttime[:16],
    flush_secs=60,
)


## Load Data

### Select which language dataset to load

In [None]:
# Load the raw training/validation text for the language selected by IS_EN.
if IS_EN :
    # English: a single text file split by loaddata.load_data_en.
    file_path ="../datasets/the-verdict.txt"
    train_data, valid_data =loaddata.load_data_en(file_path)
else:
    # Chinese: NOTE(review) the arguments (True, 0.8) are interpreted by
    # loaddata.load_data_cn — presumably a flag and a train split ratio; confirm.
    train_data, valid_data = loaddata.load_data_cn(True,0.8)

# Notebook cell output: size of the training data.
len(train_data)

## GPTDataLoader

In [None]:
@tool.train_execution(IS_TRAIN)
def Dataloadering():
    """Build the training and validation data loaders.

    Both loaders are created with ``model_wrapper.GPTDataloader`` from the
    module-level ``train_data`` / ``valid_data``, tokenized according to
    ``TOKEN_TYPE`` and sized by ``GPT_CONFIG``. Summary information is printed
    along the way, and one batch is fetched as a preview when a loader has
    fewer than 10 batches.

    Returns:
        tuple: ``(train_loader, valid_loader)``.
        NOTE(review): the ``tool.train_execution`` decorator may change what
        callers actually receive — confirm against its implementation.
    """
    batch_size = GPT_CONFIG['batch_size']
    context_len = GPT_CONFIG["context_len"]

    def _preview_batch(loader):
        # Only peek at a batch for small loaders (i.e. while testing); with a
        # large dataset fetching one takes a long time.
        if len(loader) < 10:
            inputs, targets = next(iter(loader))
            print(inputs.shape, targets.shape)

    # Training loader: windows overlap moderately (stride = max_len // 2),
    # incomplete batches are dropped, and samples are shuffled — shuffling
    # during training is meant to improve generalization.
    train_loader = model_wrapper.GPTDataloader(
        train_data,
        TOKEN_TYPE,
        batch_size=batch_size,
        max_len=context_len,
        stride=context_len // 2,
        drop_last=True,
        shuffle=True,
        num_works=0,
    )

    print(F'共{len(train_loader)}个批次，'
          f'每批{train_loader.batch_size}个样本，'
          f'每个样本是长度为 {train_loader.dataset.max_len} 的 token 序列')

    print("Train loader:")
    _preview_batch(train_loader)

    # Validation loader: non-overlapping windows; no samples are dropped so
    # that every data point is evaluated, and the order is fixed so results
    # are reproducible and comparable across models.
    valid_loader = model_wrapper.GPTDataloader(
        valid_data,
        TOKEN_TYPE,
        batch_size=batch_size,
        max_len=context_len,
        stride=context_len,
        drop_last=False,
        shuffle=False,
        num_works=0,
    )

    print("Validation loader:")
    _preview_batch(valid_loader)

    return train_loader, valid_loader

## Training model
### Training process function

In [None]:

def train_model_process(model,train_loader,valid_loader,
                        optimizer,device,num_epochs,
                        eval_freq,eval_iter,
                        start_context,tokenizer,checkpoint_path=None):
    """Run the training loop with periodic evaluation, sampling and checkpointing.

    Relies on the module-level ``log_writer`` for TensorBoard logging.

    Args:
        model: Model to train; switched to training mode at every epoch start
            and again after each evaluation/generation pass.
        train_loader: Iterable of ``(input_batch, target_batch)`` pairs; must
            support ``len()``.
        valid_loader: Validation loader forwarded to
            ``model_wrapper.evaluate_model``.
        optimizer: Optimizer stepped once per batch.
        device: Device forwarded to the loss, evaluation and generation helpers.
        num_epochs: Epoch index at which training stops (exclusive).
        eval_freq: Evaluate whenever ``global_step % eval_freq == 0``.
        eval_iter: Forwarded to ``model_wrapper.evaluate_model``.
        start_context: Prompt forwarded to ``model_wrapper.generate_and_print``.
        tokenizer: Tokenizer forwarded to ``model_wrapper.generate_and_print``.
        checkpoint_path: Optional checkpoint file. When it exists, the training
            state is restored from it; the epoch stored in the checkpoint is
            then iterated again from the start of ``train_loader``.

    Returns:
        tuple: ``(train_losses, val_losses, track_tokens_seen)`` — one entry
        per evaluation pass.
    """
    # Initialise the training state, restoring it from a checkpoint if one exists.
    if checkpoint_path and os.path.exists(checkpoint_path):
        print(f"Resuming from checkpoint: {checkpoint_path}")
        state = model_wrapper.load_checkpoint(model, optimizer, checkpoint_path)
        start_epoch = state['epoch']
        global_step = state['global_step']
        train_losses = state['train_losses']
        val_losses = state['val_losses']
        track_tokens_seen = state['track_tokens_seen']
        tokens_seen = track_tokens_seen[-1] if track_tokens_seen else 0
    else:
        # Train from scratch; -1 so that the first processed batch is step 0.
        start_epoch = 0
        global_step = -1
        train_losses, val_losses = [], []
        track_tokens_seen = []
        tokens_seen = 0

    # Checkpoint every 100 steps, or every len(train_loader) steps when the
    # loader is shorter than that.
    checkpoint_freq = min(100, len(train_loader))
    checkpoint_dir = "../model/checkpoints"

    for epoch in tqdm(range(start_epoch, num_epochs), desc="training"):
        model.train()  # training mode: enables dropout / batch-norm updates
        for input_batch,target_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False):
            optimizer.zero_grad()
            loss = model_wrapper.calc_loss_batch(input_batch,target_batch,model,device)
            loss.backward()
            optimizer.step()
            tokens_seen += input_batch.numel()
            global_step += 1

            if global_step % eval_freq == 0:
                train_loss, val_loss = model_wrapper.evaluate_model(
                    model,train_loader,valid_loader,device,eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                # A loss may be None (the report below already allows for it);
                # add_scalar cannot log None, so skip those values.
                if train_loss is not None:
                    log_writer.add_scalar('Training/Train_Loss', train_loss, global_step)
                if val_loss is not None:
                    log_writer.add_scalar('Training/Val_Loss', val_loss, global_step)
                train_text = (f"T_LOSS: {train_loss:.3f}"
                              if train_loss is not None else "T_LOSS: None ")
                val_text = (f"V_LOSS: {val_loss:.3f}"
                            if val_loss is not None else "V_LOSS: None")
                print(f'EP: {epoch+1} STEP: {global_step} {train_text} {val_text}')
                model_wrapper.generate_and_print(model,tokenizer,device,start_context,20)
                # The helpers above may leave the model in eval mode; make sure
                # the remaining batches of this epoch train with dropout enabled.
                model.train()

            # Save a checkpoint periodically (never at step 0).
            if global_step % checkpoint_freq == 0 and global_step != 0:
                # Ensure the target directory exists in case save_checkpoint
                # does not create it.
                os.makedirs(checkpoint_dir, exist_ok=True)
                model_wrapper.save_checkpoint(
                    model, optimizer, epoch, global_step,
                    train_losses, val_losses, track_tokens_seen,
                    f"{checkpoint_dir}/epoch_{epoch}_step_{global_step}.pt"
                )
    return train_losses,val_losses,track_tokens_seen

In [None]:
# TensorBoard launch commands (run from a shell)
# ! tensorboard --logdir=../log  --port=6006 --host=0.0.0.0  
# ssh -L 8888:localhost:6006 zzz@172.23.207.112    

In [None]:
# Path where the final trained model is saved and later reloaded from.
modelpath ='../model/gpt2.pt'
# Prompt used for the sample generations printed during training and after reloading.
start_context = "I turned to Mrs"
# tiktoken encoding selected by TOKEN_TYPE.
tokenizer = tiktoken.get_encoding(TOKEN_TYPE)


@tool.train_execution(IS_TRAIN)
def training(savemodel=False):
    """Build a fresh GPT model, train it, and optionally save it.

    The model is created from ``GPT_CONFIG``, moved to ``device`` and trained
    with AdamW (``LR``, ``WEIGHT_DECAY``) on the loaders returned by
    ``Dataloadering()``. Training resumes from
    ``'../model/checkpoints/0.pt'`` when that file exists.

    Args:
        savemodel: When true, the model, optimizer and config are written to
            ``modelpath`` via ``model_wrapper.savemodel`` after training.

    Returns:
        The value returned by ``train_model_process``.
        NOTE(review): the ``tool.train_execution`` decorator may change what
        callers actually receive — confirm against its implementation.
    """
    gpt = models.GPTModel(GPT_CONFIG)
    gpt.to(device)
    adamw = torch.optim.AdamW(gpt.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    train_loader, valid_loader = Dataloadering()

    history = train_model_process(
        gpt,
        train_loader,
        valid_loader,
        adamw,
        device,
        num_epochs=GPT_CONFIG['num_epochs'],
        eval_freq=EVAL_FREQ,
        eval_iter=EVAL_ITER,
        start_context=start_context,
        tokenizer=tokenizer,
        checkpoint_path='../model/checkpoints/0.pt',
    )

    if savemodel:
        model_wrapper.savemodel(modelpath, gpt, adamw, GPT_CONFIG)

    return history

# Run training with savemodel=True so the trained model is written to modelpath.
training(True)



In [None]:
# Close the TensorBoard writer now that training has finished.
log_writer.close()

## Load the parameters of the model

Load the saved model parameters to verify that the checkpoint can be restored.

In [None]:
# Restore the model and optimizer that were saved to modelpath.
checkpoint = torch.load(modelpath, map_location=device)
config = checkpoint["config"]
# Sanity check: the saved configuration should match the current GPT_CONFIG.
print(config==GPT_CONFIG)
model = models.GPTModel(config)
# List every saved parameter tensor together with its shape.
for name, param in checkpoint["model_state_dict"].items():
    print(f"{name}: {param.shape}")
model.load_state_dict(checkpoint["model_state_dict"])
# Move the model to the target device BEFORE building the optimizer and
# restoring its state: the restored optimizer state is placed on the device of
# the parameters at load time, so loading it while the parameters are still on
# the CPU and moving the model afterwards would leave the two on different
# devices.
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
# Keep the model as the cell's last expression so the notebook still displays it.
model


### Test the generation capability of the saved model

In [None]:
# Generate five samples from the reloaded model, all starting from the same prompt.
# NOTE(review): the positional arguments after start_context (100, 0.8, 50, 1, None)
# are interpreted by model_wrapper.generate_and_print — presumably the token budget
# and sampling settings; confirm against its signature.
for _ in range(5):
    model_wrapper.generate_and_print(model,tokenizer,device,start_context,100,0.8,50,1,None)