In [1]:
import json
import torch
import pandas as pd

from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_int8_training
from contextlib import nullcontext
from transformers import (default_data_collator, Trainer, TrainingArguments,
                          TrainerCallback, AutoTokenizer, AutoModelForCausalLM,
                          BitsAndBytesConfig)

from transformers.integrations import WandbCallback

from utils.dataset import CombineDataset, template

# setting up wandb
%env WANDB_PROJECT=disco-limbic-dialogue

  warn("The installed version of bitsandbytes was compiled without GPU support. "


/home/xx/miniconda3/envs/disco/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


# Parameters

In [16]:
# ---- base model ----
# Hugging Face Hub id of the causal LM that gets fine-tuned.
model_id = 'GeneZC/MiniChat-3B'

# ---- data ----
dataset_train_path = 'data/dataset/v1/train.json'
dataset_test_path = 'data/dataset/v1/test.json'
# Token budget per example (512); the loading cell drops examples that reach it.
max_data_length = 2 * 256

# ---- LoRA adapter ----
lora_r = 32
lora_alpha = 16
lora_dropout = 0.05
# Names of the sub-modules that receive LoRA adapters.
target_modules = ["q_proj", "v_proj"]

# ---- training ----
device = 'cuda'
lr = 3e-4
num_train_epochs = 5

per_device_train_bs = 2
per_device_eval_bs = 4
# 2 sequences x 16 accumulation steps = 32 sequences per optimizer step (per device).
gradient_accumulation_steps = 16

# Logging / evaluation cadence, in optimizer steps.
log_steps = 10
eval_steps = 30

# NOTE(review): the directory name says "phi2" although model_id is MiniChat-3B -
# confirm this is intended before comparing runs.
output_dir = 'lora/disco-limbic-dialogue-phi2'

In [4]:
# 4-bit NF4 weights with double quantisation; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# The KV cache only helps autoregressive generation. During training it wastes
# memory and is incompatible with gradient checkpointing, so it is turned off.
# Pass `use_cache=True` to `generate()` if this model object is used for inference.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    use_cache=False,
    device_map=device,
    quantization_config=bnb_config,
)

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
if tokenizer.pad_token is None:
    # The tokenizer ships without a pad token: register one and grow the
    # embedding matrix so the new token id has a row.
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


# Load data

In [6]:
def _load_json(path):
    """Parse and return the contents of the JSON file at ``path``."""
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _filter_by_token_length(records, tokenizer, max_length):
    """Return the records whose templated text tokenizes to fewer than ``max_length`` ids."""
    return [
        record
        for record in records
        if len(tokenizer(template(record))['input_ids']) < max_length
    ]


raw_data = _load_json(dataset_train_path)
raw_data_test = _load_json(dataset_test_path)

## filter
# Drop over-long examples; each print shows "<total> <kept>".
data_train = _filter_by_token_length(raw_data, tokenizer, max_data_length)
print(len(raw_data), len(data_train))

data_test = _filter_by_token_length(raw_data_test, tokenizer, max_data_length)
print(len(raw_data_test), len(data_test))


train_dataset = CombineDataset(data_train, tokenizer, max_words=max_data_length)
test_dataset = CombineDataset(data_test, tokenizer, max_words=max_data_length)

4878 1344


"data_test = []\nfor i in raw_data_test:\n    if len(tokenizer(template(i))['input_ids']) < max_data_length:\n        data_test.append(i)\nprint(len(raw_data_test), len(data_test))"

In [8]:
model.train()

def create_peft_config(model):
    """Wrap a quantised base model with LoRA adapters.

    Reads the notebook-level settings ``lora_r``, ``lora_alpha``,
    ``lora_dropout`` and ``target_modules``.

    Args:
        model: the quantised causal LM to adapt.

    Returns:
        A ``(peft_model, peft_config)`` tuple: the adapter-wrapped model and
        the ``LoraConfig`` used to build it.
    """
    # `prepare_model_for_int8_training` is the deprecated name of the k-bit
    # preparation helper, and the model here is loaded in 4-bit. Prefer the
    # current helper and fall back to the old one on peft releases that
    # predate it.
    try:
        from peft import prepare_model_for_kbit_training as prepare_quantized_model
    except ImportError:
        prepare_quantized_model = prepare_model_for_int8_training

    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        target_modules=target_modules,
    )

    model = prepare_quantized_model(model)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    return model, peft_config

# create peft config
model, lora_config = create_peft_config(model)

trainable params: 294,912 || all params: 3,020,743,680 || trainable%: 0.009762893884462253




In [9]:
# Profiling toggle; nothing in the visible notebook reads it.
enable_profiler = False

# Run settings. Every entry except `lora_config` is forwarded to
# TrainingArguments by the training cell.
config = dict(
    lora_config=lora_config,
    learning_rate=lr,
    num_train_epochs=num_train_epochs,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_train_batch_size=per_device_train_bs,
    gradient_checkpointing=False,
)

# No-op context manager standing in for a profiler around the training run.
profiler = nullcontext()


In [10]:
def decode_predictions(tokenizer, predictions):
    """Turn raw prediction scores into decoded strings.

    Args:
        tokenizer: object exposing ``batch_decode``.
        predictions: object whose ``.predictions`` attribute supports
            ``argmax(axis=-1)`` (the highest-scoring id is taken along the
            last axis).

    Returns:
        ``{"predictions": <result of tokenizer.batch_decode>}``.
    """
    best_token_ids = predictions.predictions.argmax(axis=-1)
    decoded_text = tokenizer.batch_decode(best_token_ids)
    return {"predictions": decoded_text}


class WandbPredictionProgressCallback(WandbCallback):
    """Log decoded model predictions for a fixed validation sample to W&B.

    After an evaluation, when ``state.global_step`` is a multiple of ``freq``,
    the callback runs ``trainer.predict`` on the stored sample, decodes the
    arg-max tokens and logs them as a ``sample_predictions`` table.
    """

    def __init__(self, trainer, tokenizer, val_dataset, num_samples=100, freq=2):
        """Store the trainer and tokenizer and take a fixed sample of the data.

        Args:
            trainer: the ``Trainer`` used to run predictions.
            tokenizer: tokenizer used to decode predicted token ids.
            val_dataset: indexable, sized dataset to sample from.
            num_samples: maximum number of leading items to keep.
            freq: log only when ``global_step % freq == 0``.
        """
        super().__init__()
        self.trainer = trainer
        self.tokenizer = tokenizer
        # Clamp to the dataset size: indexing past the end of a small
        # validation set would raise IndexError.
        sample_count = min(num_samples, len(val_dataset))
        self.sample_dataset = [val_dataset[i] for i in range(sample_count)]
        self.freq = freq

    def on_evaluate(self, args, state, control, **kwargs):
        """Run the parent hook, then log a predictions table when due."""
        super().on_evaluate(args, state, control, **kwargs)
        # Nothing to predict on if the validation set was empty.
        if not self.sample_dataset:
            return
        if state.global_step % self.freq == 0:
            predictions = self.trainer.predict(self.sample_dataset)
            predictions = decode_predictions(self.tokenizer, predictions)
            predictions_df = pd.DataFrame(predictions)
            predictions_df["epoch"] = state.epoch
            records_table = self._wandb.Table(dataframe=predictions_df)
            self._wandb.log({"sample_predictions": records_table})

In [None]:
print(output_dir)

# Fixed settings are spelled out here; the tunable ones come from `config`
# (everything in it except the LoRA settings).
training_args = TrainingArguments(
    ## output
    output_dir=output_dir,
    overwrite_output_dir=True,
    save_strategy="no",
    ## evaluation
    do_eval=True,
    evaluation_strategy='steps',
    eval_steps=eval_steps,
    per_device_eval_batch_size=per_device_eval_bs,
    ## logging
    logging_strategy="steps",
    logging_steps=log_steps,
    logging_dir=f"{output_dir}/logs",
    ## wandb
    report_to="wandb",
    run_name=output_dir.rsplit('/', 1)[-1],
    ## other
    max_steps=-1,
    #bf16=True,  # Use BF16 if available
    #optim="adamw_torch_fused",
    **{name: value for name, value in config.items() if name != 'lora_config'}
)

with profiler:
    # Build the trainer first: the prediction callback needs a reference to it.
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=default_data_collator,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        callbacks=[],
    )

    # Logs decoded predictions for 16 validation items to W&B.
    progress_callback = WandbPredictionProgressCallback(
        trainer=trainer,
        tokenizer=tokenizer,
        val_dataset=test_dataset,
        num_samples=16,
        freq=30,
    )
    trainer.add_callback(progress_callback)

    trainer.train()

In [12]:
# Write the trained (PEFT-wrapped) model to output_dir; the tokenizer is not saved by this call.
model.save_pretrained(output_dir)