#### Data

In [1]:
import json
import random

In [2]:
# JSON dumps to read, one per source corpus (OlympiadBench, AMC, MATH).
files = [
    '../Data/OlympiadBench_Dataset/data/outputs.json',
    '../Data/AMC/outputs.json',
    '../Data/MATH/outputs.json',
]

In [3]:
# Accumulate the contents of every file listed in `files` into one flat list.
texts = []
for file in files:
    # Each file is expected to hold a JSON list; its items are appended in
    # file order, so `texts` keeps the ordering of `files`.
    with open(file, encoding='utf-8') as f:
        texts += json.load(f)

In [4]:
from transformers import AutoTokenizer
# Model identifier handed to `from_pretrained`; the later model-loading cell
# reuses it so tokenizer and weights come from the same checkpoint.
MODEL_PATH = "deepseek-ai/deepseek-math-7b-rl"
# Tokenizer for MODEL_PATH; the next cell uses it to turn `texts` into token ids.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Replace the raw strings in `texts` with their token-id lists. Special tokens
# are added, each text is truncated to at most 4096 tokens, and attention masks
# are not requested because only `input_ids` is kept.
# NOTE: this rebinds `texts`; re-running the cell would feed token ids back
# into the tokenizer, so re-run the loading cell first.
texts = tokenizer.batch_encode_plus(
    texts,
    add_special_tokens=True,
    truncation=True,
    max_length=4096,
    return_attention_mask=False,
)['input_ids']

#### Model

In [6]:
import torch
from torch.nn.utils import clip_grad_value_
import math
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    BitsAndBytesConfig, 
    # AutoConfig,
)
from functions import create_next_model_folder
from peft import (
    get_peft_model,
    PeftType,
    LoraConfig)

In [7]:
# Run configuration. The evaluation loop in this notebook reads only `epochs`
# and `verbose`; the other values are not referenced by the cells shown here
# (presumably kept from a fine-tuning variant -- confirm before removing).
epochs = 1              # number of passes over `texts`
verbose = 1024          # print a running loss every `verbose` samples
accumulation_steps = 64
lr = 6e-5
clip = 6e-3
alpha = 0.05

In [8]:
# Load the weights in 4-bit NF4 with double quantization; bfloat16 is the
# compute dtype passed to bitsandbytes.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Gradient checkpointing is left disabled (kept for reference):
# config = AutoConfig.from_pretrained(MODEL_PATH)
# config.gradient_checkpointing = True


tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    quantization_config=quantization_config,
    attn_implementation="flash_attention_2",
    torch_dtype="auto",
    device_map="auto",
    # NOTE(review): allows custom code from the model repository to run;
    # confirm it is actually required for this checkpoint.
    trust_remote_code=True,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
def sample_consecutive_chunk(input_list, max_length):
    """Return at most ``max_length`` consecutive items of ``input_list``.

    Inputs that already fit are returned unchanged (the very same object, not
    a copy). Longer inputs are cropped to a window of exactly ``max_length``
    items starting at a uniformly random offset; the first slot of the window
    is then replaced by ``input_list[0]`` so the result still begins with the
    original leading element (the start-of-sentence token when the input is a
    tokenized sequence).

    Args:
        input_list: Sequence to crop; a list is expected.
        max_length: Maximum number of items to return.

    Returns:
        ``input_list`` itself when ``len(input_list) <= max_length``, otherwise
        a new list of length ``max_length``. The input is never modified.

    Raises:
        ValueError: If the input has to be cropped but ``max_length`` is
            smaller than 1, since no window can hold the leading element.
    """
    if len(input_list) <= max_length:
        return input_list
    if max_length < 1:
        # Previously this fell through to an IndexError (max_length == 0) or
        # to slicing with nonsensical bounds (negative max_length).
        raise ValueError(f"max_length must be at least 1, got {max_length}")
    max_start_index = len(input_list) - max_length
    start_index = random.randint(0, max_start_index)
    # Slot 0 always carries the original first element; the remaining
    # max_length - 1 slots come from the sampled window.
    return [input_list[0], *input_list[start_index + 1:start_index + max_length]]

In [10]:
# Measure the model's next-token cross-entropy on every text, one sequence per
# forward pass, printing a running average every `verbose` samples and the
# overall average at the end of each epoch. No gradients are computed.
loss_fn = torch.nn.CrossEntropyLoss()
for epoch in range(epochs):
    # In-place shuffle: changes only which samples share a reporting window.
    random.shuffle(texts)
    model.eval()
    train_loss = 0      # sum of per-sample losses over the whole epoch
    train_last = 0      # value of train_loss at the previous progress print
    skip = 0            # samples skipped since the previous progress print
    tot_skip = 0        # samples skipped in windows that were already reported
    n_seen = 0          # samples visited so far (stays 0 if `texts` is empty)
    # for llm, batchsize = 1 still gives 100 GPU util
    for i, input_ids in enumerate(texts):
        n_seen = i + 1
        # Crop long sequences to a random 1200-token window.
        input_ids = sample_consecutive_chunk(input_ids, 1200)
        if len(input_ids) < 2:
            # A single token has no next-token target; the mean loss over zero
            # targets is NaN and would poison every later running sum.
            skip += 1
        else:
            input_ids = torch.tensor(input_ids).to('cuda')[None]
            with torch.no_grad():
                outs = model(input_ids).logits
                # Position t predicts token t + 1: drop the last logit row and
                # the first target token.
                loss = loss_fn(outs[0, :-1], input_ids[0, 1:])
            train_loss += loss.item()

        # eval
        if (i + 1) % verbose == 0:
            scored = verbose - skip
            window_loss = (train_loss - train_last) / scored if scored else float('nan')
            print(f"epoch {epoch} iter {i}: train loss {window_loss}")
            train_last = train_loss
            tot_skip += skip
            skip = 0
    # Include skips from the final, partially filled window, then average over
    # the number of samples that actually contributed a loss (the original
    # divided by the last index, i.e. one sample too few).
    tot_skip += skip
    n_scored = n_seen - tot_skip
    epoch_loss = train_loss / n_scored if n_scored else float('nan')
    print(f'end of epoch {epoch}, loss: {epoch_loss}')

epoch 0 iter 1023: train loss 1.239826615070342
epoch 0 iter 2047: train loss 1.2475226878159447
epoch 0 iter 3071: train loss 1.280505065456964
epoch 0 iter 4095: train loss 1.2600050553301116
epoch 0 iter 5119: train loss 1.212381665056455
epoch 0 iter 6143: train loss 1.2290100864629494
epoch 0 iter 7167: train loss 1.2642828941025073
epoch 0 iter 8191: train loss 1.2603929835167946
epoch 0 iter 9215: train loss 1.260551017941907
epoch 0 iter 10239: train loss 1.2473127340199426
epoch 0 iter 11263: train loss 1.2428824665985303
epoch 0 iter 12287: train loss 1.1943137583730277
epoch 0 iter 13311: train loss 1.2367846277193166
epoch 0 iter 14335: train loss 1.249982453766279
epoch 0 iter 15359: train loss 1.2675882217881735
epoch 0 iter 16383: train loss 1.243489224521909
epoch 0 iter 17407: train loss 1.241614089012728
epoch 0 iter 18431: train loss 1.2548915603838395
epoch 0 iter 19455: train loss 1.2316435200918932
epoch 0 iter 20479: train loss 1.2304505299252924
epoch 0 iter 215

In [11]:
# peft_model_id = "../Model/FT/model2"
# from peft import PeftModel
# from transformers import (
#     AutoModelForCausalLM, 
# )
# # model = ... load base model
# model = PeftModel.from_pretrained(model, peft_model_id)