# Toy example: TRTF_encoder

In [1]:
from models.transformers import torch_Transformer
from src.utils import model_utils

import torch



# Hyperparameters for the toy encoder; they are passed positionally to
# torch_Transformer.TRTF_encoder below.
vocab_size = 10000
d_model = 512
n_head = 8
en_layers = 6
d_ff = 2048
max_seq_length = 600

# Shape of the random smoke-test input. `src` is created as (seq_len, batch_size)
# and the mask is built from the same seq_len, so the two cannot drift apart.
seq_len = 10
batch_size = 32


# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


tr_tf_en = torch_Transformer.TRTF_encoder(vocab_size, d_model,
    n_head, en_layers, d_ff, max_seq_length)

# Move the model to the device, then cast it to bfloat16.
tr_tf_en = tr_tf_en.to(device)
tr_tf_en = tr_tf_en.bfloat16()


print(sum(p.numel() for p in tr_tf_en.parameters())/1e9, 'B parameters in tr_tf_en')


# Random integer token ids in [0, vocab_size).
src = torch.randint(0, vocab_size, (seq_len, batch_size))
src_mask = model_utils.square_subsequent_mask(seq_len)

src = src.to(device)

# The mask is cast to bfloat16 as well, matching the model's dtype.
src_mask = src_mask.to(device)
src_mask = src_mask.bfloat16()


# Call the module itself instead of `.forward(...)` so that nn.Module.__call__
# (and any registered hooks) runs. Only the output shape is inspected, so no
# autograd graph is needed.
with torch.no_grad():
    output = tr_tf_en(src, src_mask)
print("TRTF_en output shape",output.shape)



  from .autonotebook import tqdm as notebook_tqdm


cuda
0.024341504 B parameters in tr_tf_en
TRTF_en output shape torch.Size([10, 32, 512])


# PTF_TRTF_encoder Pretraining

## 1. h4ca20k_py pretraining

In [95]:
# Debugging switch, kept disabled in this cell:
#import os
#os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import torch

# Ask PyTorch to empty its CUDA cache before starting this section, then pick
# the GPU when one is available and the CPU otherwise.
torch.cuda.empty_cache()
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)


cuda


In [96]:
import os
# Set the CUDA_LAUNCH_BLOCKING environment variable to "1" for this process.
# NOTE(review): earlier cells have already imported torch and queried/used CUDA
# before this assignment runs — confirm the setting still takes effect when it
# is applied this late; it may need to move above the first torch import.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


In [97]:
import torch 
from torch import nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset

In [98]:
# Load the "graycatHCO3/CodeAlpaca-20K-Python" dataset through datasets.load_dataset.
h4ca20k_py = load_dataset("graycatHCO3/CodeAlpaca-20K-Python")
# Disabled alternative: load the same data from a local parquet file instead.
#h4ca20k_py = load_dataset("./data/codeNLU/h4ca20k_py/h4ca20k_py_.parquet")

In [99]:
# Show the loaded splits with their column names and row counts.
print(h4ca20k_py)

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 4777
    })
    test: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 548
    })
})


In [100]:
from models.transformers import torch_Transformer
from src.utils import model_utils





# Hyperparameters for the pretraining encoder. A later cell passes them
# positionally to torch_Transformer.paddingT_FTRTF_encoder.
# vocab_size is intentionally not fixed here: a later cell reads it from the tokenizer.
#vocab_size = 1000
d_model = 512  
n_head = 8
en_layers = 6
d_ff = 512
# Matches the max_length=512 used when tokenizing the model inputs in a later cell.
max_seq_length = 512
out_dim = 64






In [101]:
# Cross-entropy loss with default settings; the training loop passes it to
# both train() and validate().
criterion = nn.CrossEntropyLoss()


### 1) By BERT tokenizer

In [102]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
vocab_size = tokenizer.vocab_size

def tokenize_function(examples):
    """Tokenize one dataset example into model inputs and labels.

    The 'completion' text becomes ``input_ids`` (truncated/padded to 512
    tokens) and the 'prompt' text becomes ``labels`` (truncated/padded to
    128 tokens).

    Args:
        examples: Mapping with 'completion' and 'prompt' entries.

    Returns:
        dict with keys 'input_ids' and 'labels', each holding the tokenizer's
        ``input_ids`` for the corresponding text.
    """
    # Ask the tokenizer for tensors so the outputs are guaranteed to be tensors.
    shared_kwargs = dict(truncation=True, padding="max_length", return_tensors="pt")
    completion_encoding = tokenizer(examples['completion'], max_length=512, **shared_kwargs)
    prompt_encoding = tokenizer(examples['prompt'], max_length=128, **shared_kwargs)
    return {
        'input_ids': completion_encoding.input_ids,
        'labels': prompt_encoding.input_ids,
    }


# Apply the tokenizer to every split of the dataset, without batching.
tokenized_datasets = h4ca20k_py.map(tokenize_function, batched=False)



In [103]:
# Confirm that the 'input_ids' and 'labels' columns were added to both splits.
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion', 'input_ids', 'labels'],
        num_rows: 4777
    })
    test: Dataset({
        features: ['prompt', 'completion', 'input_ids', 'labels'],
        num_rows: 548
    })
})


In [104]:
# Build the pretraining encoder from the hyperparameters and tokenizer
# vocabulary size defined in earlier cells, then place it on the chosen device.
ptf_trtf_en = torch_Transformer.paddingT_FTRTF_encoder(
    vocab_size,
    d_model,
    n_head,
    en_layers,
    d_ff,
    out_dim,
    max_seq_length,
)
ptf_trtf_en = ptf_trtf_en.to(device)
# Left disabled: bfloat16 cast and eval mode (this model is about to be trained).
#ptf_trtf_en = ptf_trtf_en.bfloat16()
#ptf_trtf_en.eval()
optimizer = AdamW(params=ptf_trtf_en.parameters(), lr=5e-5)

# Report the parameter count in billions.
print(sum(map(torch.numel, ptf_trtf_en.parameters()))/1e9, 'B parameters in ptf_trtf_en')



0.041015098 B parameters in ptf_trtf_en


In [105]:
def collate_fn(batch):
    """Collate a list of dataset items into one batch of stacked tensors.

    Args:
        batch: Sequence of mappings, each with 'input_ids' and 'labels'
            entries that ``torch.tensor`` can convert.

    Returns:
        dict with keys 'input_ids' and 'labels'; each value is the per-item
        tensors stacked along a new leading dimension.
    """
    def _stack_field(key):
        # Convert each item's field to a tensor first so torch.stack only
        # ever sees tensors.
        return torch.stack([torch.tensor(example[key]) for example in batch])

    return {'input_ids': _stack_field('input_ids'), 'labels': _stack_field('labels')}




In [106]:
# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=8, shuffle=True, collate_fn=collate_fn)
# Validation is not shuffled: the reported loss is a mean of per-batch means, so
# with a partial last batch a shuffled order would change the value from epoch
# to epoch and add noise to the best-checkpoint comparison.
val_dataloader = DataLoader(tokenized_datasets['test'], batch_size=8, shuffle=False, collate_fn=collate_fn)

In [107]:
def train(model, dataloader, optimizer, criterion, device):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module called as ``model(inputs)``. Its output is treated as
            (batch, position, classes), matching how the loss has always
            indexed it here — TODO confirm against the model definition.
        dataloader: Sized iterable of dicts with 'input_ids' and 'labels'
            (tensors, or nested lists that are converted to tensors).
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss callable taking ``(logits, targets)`` with logits of
            shape (N, classes) and integer targets of shape (N,).
        device: Device the batch tensors are moved to.

    Returns:
        float: Sum of the batch losses divided by ``len(dataloader)``.
    """
    model.train()
    total_loss = 0
    for batch in dataloader:
        if isinstance(batch['input_ids'], list):
            batch['input_ids'] = torch.tensor(batch['input_ids'])
            batch['labels'] = torch.tensor(batch['labels'])
        inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
        # Labels arrive with the same singleton middle dimension as the inputs
        # (batch, 1, length); drop it so they can be indexed per position.
        if labels.dim() == 3:
            labels = labels.squeeze(1)
        optimizer.zero_grad()
        outputs = model(inputs.squeeze(1))

        # One loss call over all positions at once. With a mean-reducing
        # criterion this equals the average of the per-position losses, without
        # building one graph node per position in a Python loop.
        seq_len = outputs.shape[1]
        logits = outputs.reshape(-1, outputs.shape[-1])
        targets = labels[:, :seq_len].reshape(-1)
        loss = criterion(logits, targets)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)


In [108]:
def validate(model, dataloader, criterion, device):
    """Evaluate the model without gradients and return the mean per-batch loss.

    Args:
        model: Module called as ``model(inputs)``. A 3-D output is treated as
            (batch, position, classes) — TODO confirm against the model
            definition; a 2-D output is passed to ``criterion`` unchanged.
        dataloader: Sized iterable of dicts with tensor 'input_ids' and 'labels'.
        criterion: Loss callable taking ``(logits, targets)``.
        device: Device the batch tensors are moved to.

    Returns:
        float: Sum of the batch losses divided by ``len(dataloader)``.
    """
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
            # Batches carry a singleton middle dimension (batch, 1, length);
            # drop it so the model and the loss see (batch, length).
            if inputs.dim() == 3:
                inputs = inputs.squeeze(1)
            if labels.dim() == 3:
                labels = labels.squeeze(1)

            outputs = model(inputs)
            if outputs.dim() == 3:
                # Flatten positions into the batch axis so the class scores sit
                # on the last axis of a 2-D tensor, as the criterion expects.
                seq_len = outputs.shape[1]
                logits = outputs.reshape(-1, outputs.shape[-1])
                targets = labels[:, :seq_len].reshape(-1)
                loss = criterion(logits, targets)
            else:
                loss = criterion(outputs, labels)
            total_loss += loss.item()

    return total_loss / len(dataloader)

In [109]:
# Number of passes over the training data made by the training loop.
num_epochs =10
# Lowest validation loss seen so far; starts at +inf so the first epoch always counts as an improvement.
best_val_loss = float('inf')


In [110]:
# Train for num_epochs epochs, validating after each one and keeping a
# checkpoint of the weights with the lowest validation loss.
for epoch in range(num_epochs):
    train_loss = train(ptf_trtf_en, train_dataloader, optimizer, criterion, device)
    val_loss = validate(ptf_trtf_en, val_dataloader, criterion, device)

    print(f'Epoch {epoch+1}, Train Loss: {train_loss}, Validation Loss: {val_loss}')

    # Overwrite the saved state_dict only when validation improves.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(ptf_trtf_en.state_dict(),'ptf_trtf_en.pth')

src shape before processing: torch.Size([8, 512])


OutOfMemoryError: CUDA out of memory. Tried to allocate 478.00 MiB (GPU 0; 4.00 GiB total capacity; 3.27 GiB already allocated; 0 bytes free; 3.34 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [5]:
import sentencepiece as spm

## 2. RL_env pretraining


In [4]:
import json
from transformers import BertTokenizer
from src.utils import data_utils

# Read the environment description. The encoding is given explicitly so that
# decoding does not depend on the platform's locale default.
with open('./data/RL_env/Pettingzoo_mpe/simple_adversary.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Convert the parsed JSON with data_utils.json2Tpath; a later cell iterates the
# result as (path, text) pairs.
formed_data = data_utils.json2Tpath(data)


In [None]:
# Keep only the examples whose 'langu' field is truthy.
# NOTE(review): `dataset` is not defined in any visible cell and this cell has
# never been executed; the key 'langu' also looks truncated — confirm both.
py_codes = dataset.filter(lambda example: example['langu'])

In [4]:
# Tokenize every (path, text) pair from formed_data, padding/truncating to 512
# tokens and moving the encoded tensors to `device`.
# NOTE(review): this uses `tokenizer` and `device` from earlier cells; the
# BertTokenizer imported in this section is never instantiated in the visible
# cells — confirm which tokenizer is intended here.
tokenized_data = []
for path, text in formed_data:
    tokens = tokenizer.encode_plus(text, add_special_tokens=True, max_length=512, truncation=True, padding="max_length", return_tensors="pt").to(device)
    tokenized_data.append((path,tokens))

In [5]:
from models.transformers import torch_Transformer
from src.utils import model_utils





# Hyperparameters for the RL_env encoder; passed positionally to
# torch_Transformer.paddingT_FTRTF_encoder below.
# NOTE(review): the token ids fed to this model in a later cell are produced by
# `tokenizer`; confirm that vocab_size = 1000 covers that tokenizer's id range.
vocab_size = 1000
d_model = 256
n_head = 8
en_layers = 6
d_ff = 256
max_seq_length = 512
out_dim = 64


ptf_trtf_en = torch_Transformer.paddingT_FTRTF_encoder(vocab_size, d_model,
    n_head, en_layers, d_ff, out_dim, max_seq_length)

# Inference-only setup: move to the device, cast to bfloat16, switch to eval mode.
ptf_trtf_en = ptf_trtf_en.to(device)
ptf_trtf_en = ptf_trtf_en.bfloat16()
ptf_trtf_en.eval()


# The label names the model that is actually counted here; it previously said
# 'tr_tf_en', a different model from the toy example.
print(sum(p.numel() for p in ptf_trtf_en.parameters())/1e9, 'B parameters in ptf_trtf_en')





0.002778176 B parameters in tr_tf_en


In [2]:
# Run every tokenized item through the encoder without tracking gradients and
# collect (path, output) pairs.
# Depends on `tokenized_data` from the tokenisation cell, which must run first.
encoded_data = []


for path, tokens in tokenized_data:
    input_ids = tokens['input_ids']
    # NOTE(review): the mask is cast to float32 while the model was cast to
    # bfloat16 when it was built — confirm the model accepts this dtype and
    # that a tokenizer attention mask is the kind of mask it expects.
    attention_mask = tokens['attention_mask'].to(dtype=torch.float32)
    with torch.no_grad():
        output = ptf_trtf_en(input_ids, attention_mask)
    encoded_data.append((path, output))

NameError: name 'tokenized_data' is not defined