#### Data

In [1]:
import json
import random

In [2]:
# JSON dumps whose contents are concatenated into `texts` by the loading cell.
files = [
    '../Data/OlympiadBench_Dataset/data/outputs.json',
    '../Data/AMC/outputs.json',
    '../Data/MATH/outputs.json',
]

In [3]:
# Read every JSON file in `files` and concatenate their contents into one flat list.
texts = []
for path in files:
    with open(path, mode='r', encoding='utf-8') as handle:
        # Each file is expected to hold a JSON list; its items are appended in order.
        texts += json.load(handle)

In [1]:
from transformers import AutoTokenizer
# Identifier handed to `from_pretrained`; the same value is reused when the model is loaded.
MODEL_PATH = "deepseek-ai/deepseek-math-7b-rl"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL_PATH)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Tokenize every text into a list of token ids (special tokens added, each
# sequence truncated to at most 4096 ids). The tokenizer is called directly
# because `batch_encode_plus` is deprecated in favour of `__call__`.
# NOTE: this rebinds `texts` from raw strings to token-id lists, so the cell is
# not idempotent -- re-run the loading cell before re-running this one.
texts = tokenizer(
    texts,
    add_special_tokens=True,
    return_attention_mask=False,
    truncation=True,
    max_length=4096,
)['input_ids']

#### Model

In [2]:
import torch
from torch.nn.utils import clip_grad_value_
import math
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    BitsAndBytesConfig, 
    # AutoConfig,
)
from peft import (
    get_peft_model,
    PeftType,
    LoraConfig)

In [7]:
# --- Schedule ---------------------------------------------------------------
epochs = 1               # number of passes over `texts`
accumulation_steps = 64  # samples whose gradients are accumulated per optimizer step
verbose = 1024           # print the running training loss every `verbose` samples

# --- Optimisation -----------------------------------------------------------
lr = 6e-5                # learning rate passed to the optimizer
clip = 6e-3              # bound given to clip_grad_value_ (element-wise gradient clipping)
alpha = 0.05             # NOTE(review): not referenced by any visible cell -- confirm it is still needed

In [3]:
# 4-bit NF4 quantization with double quantization and a bfloat16 compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Disabled alternative that toggled gradient checkpointing through the config:
# config = AutoConfig.from_pretrained(MODEL_PATH)
# config.gradient_checkpointing = True

# Keyword arguments for loading the quantized base model.
model_load_kwargs = dict(
    quantization_config=quantization_config,
    attn_implementation="flash_attention_2",
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, **model_load_kwargs)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Names of the projection modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

peft_config = LoraConfig(
    r=8,               # rank of the low-rank update matrices
    lora_alpha=16,     # LoRA scaling numerator
    lora_dropout=0.1,
    bias="none",       # one of 'none', 'all' or 'lora_only'
    target_modules=lora_target_modules,
)

# Wrap the base model with the adapters; NOTE: `model` is rebound, so this cell
# should only be run once per freshly loaded base model.
model = get_peft_model(model, peft_config)
model.config.pad_token_id = tokenizer.pad_token_id
model.print_trainable_parameters()

trainable params: 18,739,200 || all params: 6,929,104,896 || trainable%: 0.2704418576606868


In [10]:
# Only parameters with requires_grad=True are handed to the optimizer
# (and later to gradient clipping in the training loop).
trainable_params = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = torch.optim.AdamW(params=trainable_params, lr=lr)
# Alternative optimizer kept for reference:
# optimizer = torch.optim.SGD(trainable_params,lr=lr)

In [11]:
def sample_consecutive_chunk(input_list, max_length):
    """Return a random window of at most ``max_length`` consecutive items.

    If ``input_list`` already fits, a copy of the whole sequence is returned.
    Otherwise a window of exactly ``max_length`` consecutive items is drawn
    uniformly at random and its first item is overwritten with
    ``input_list[0]`` (the start-of-sentence token), so every chunk still
    begins with that token.

    Args:
        input_list: Sequence of items (e.g. token ids). It is never mutated.
        max_length: Maximum number of items to return; must be >= 0.

    Returns:
        A new ``list`` -- never the caller's own object -- so the result can
        be modified freely.

    Raises:
        ValueError: If ``max_length`` is negative.
    """
    if max_length < 0:
        raise ValueError(f"max_length must be non-negative, got {max_length}")
    if len(input_list) <= max_length:
        # Copy so both code paths consistently return a fresh list.
        return list(input_list)
    if max_length == 0:
        # Nothing can be sampled; avoids indexing into an empty window below.
        return []
    max_start_index = len(input_list) - max_length
    start_index = random.randint(0, max_start_index)
    # list(...) keeps the assignment below valid for tuples and other immutable sequences.
    out = list(input_list[start_index:start_index + max_length])
    out[0] = input_list[0]  # Start of sentence
    return out

In [12]:
# Training loop: one sample per forward/backward pass, gradients accumulated
# over `accumulation_steps` samples, element-wise gradient clipping, then an
# optimizer step. Samples with NaN logits or a non-finite loss are skipped.
# Note: the per-sample loss is not scaled by 1/accumulation_steps, so the
# gradients of a window are summed before clipping.
loss_fn = torch.nn.CrossEntropyLoss()
for epoch in range(epochs):
    random.shuffle(texts)
    model.train()
    # Start each epoch from clean gradients (e.g. after an interrupted run).
    optimizer.zero_grad()
    train_loss = 0.0   # sum of the losses of all non-skipped samples this epoch
    train_last = 0.0   # value of train_loss at the previous progress report
    skip = 0           # samples skipped since the previous progress report
    tot_skip = 0       # samples skipped in earlier report windows
    pending = 0        # backward passes accumulated since the last optimizer step
    num_seen = 0       # samples visited so far (stays 0 if `texts` is empty)
    # for llm, batchsize = 1 still gives 100 GPU util
    for i, input_ids in enumerate(texts):
        num_seen = i + 1

        # train
        input_ids = sample_consecutive_chunk(input_ids, 1200)
        input_ids = torch.tensor(input_ids).to('cuda')[None]  # add a batch dimension of 1
        outs = model(input_ids).logits

        # Next-token objective: logits at position t are scored against token t+1.
        loss_value = float('nan')
        if not torch.any(torch.isnan(outs)):
            loss = loss_fn(outs[0, :-1], input_ids[0, 1:])
            loss_value = loss.item()  # read once instead of once per check

        if math.isfinite(loss_value):
            loss.backward()
            train_loss += loss_value
            pending += 1
        else:
            # Skipped samples must still reach the boundary checks below, so no
            # `continue` here: otherwise a skip landing on a boundary would
            # silently drop an optimizer step or a progress report.
            skip += 1

        if num_seen % accumulation_steps == 0 and pending:
            clip_grad_value_(trainable_params, clip)
            optimizer.step()
            optimizer.zero_grad()
            pending = 0

        # eval
        if num_seen % verbose == 0:
            used = verbose - skip
            # Guard against a window in which every sample was skipped.
            window_loss = (train_loss - train_last) / used if used else float('nan')
            print(f"epoch {epoch} iter {i}: train loss {window_loss}")
            train_last = train_loss
            tot_skip += skip
            skip = 0

    # Apply the gradients of the last, partial accumulation window.
    if pending:
        clip_grad_value_(trainable_params, clip)
        optimizer.step()
        optimizer.zero_grad()
        pending = 0

    # Average over the samples that actually contributed, including the skips
    # recorded since the last progress report.
    tot_skip += skip
    used = num_seen - tot_skip
    epoch_loss = train_loss / used if used else float('nan')
    print(f'end of epoch {epoch}, loss: {epoch_loss}')

epoch 0 iter 1023: train loss 1.2246937110321596
epoch 0 iter 2047: train loss 0.9911843656300334
epoch 0 iter 3071: train loss 0.896547615135205
epoch 0 iter 4095: train loss 0.8517464611795731
epoch 0 iter 5119: train loss 0.8255010330030927
epoch 0 iter 6143: train loss 0.810331098997267
epoch 0 iter 7167: train loss 0.8075518999467022
epoch 0 iter 8191: train loss 0.7850602741236798
epoch 0 iter 9215: train loss 0.7778296660544584
epoch 0 iter 10239: train loss 0.7870377459112206
epoch 0 iter 11263: train loss 0.7926048189619905
epoch 0 iter 12287: train loss 0.781508650710748
epoch 0 iter 13311: train loss 0.7560314634429233
epoch 0 iter 14335: train loss 0.7650179842312355
epoch 0 iter 15359: train loss 0.7638273898228363
epoch 0 iter 16383: train loss 0.7660701704662642
epoch 0 iter 17407: train loss 0.7554997012521198
epoch 0 iter 18431: train loss 0.7547400112307514
epoch 0 iter 19455: train loss 0.7511080079639214
epoch 0 iter 20479: train loss 0.7509563310086378
epoch 0 iter

In [15]:
# Output directory handed to `model.save_pretrained`.
peft_model_id = "../Model/lora"
model.save_pretrained(save_directory=peft_model_id)

In [4]:
# To reload the saved adapter later: load the base model first, then wrap it.
# peft_model_id = "../Model/lora"
# from peft import PeftModel
# from transformers import (
#     AutoModelForCausalLM, 
# )
# model = ...  # load base model
# model = PeftModel.from_pretrained(model, peft_model_id)