In [2]:
import os
cachedir = '/scratch/vipul/cache'
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TRANSFORMERS_CACHE"] = cachedir
os.environ["HF_DATASETS_CACHE"]= cachedir
import sys
import torch
import transformers
from datasets import load_dataset
from datasets import load_from_disk

from transformers import LlamaTokenizer, AutoConfig
from utils.prompter import Prompter

import random

seed = 10
random.seed(seed)
torch.manual_seed(0)

os.environ["WANDB_DISABLED"] = "true"

In [3]:
base_model: str = "NousResearch/Llama-2-7b-hf"  # the only required argument
data_path: str = "./sampled_data/sampled_scienceqa_train_all.hf" # ../datasets/CL_biology_scienceq_train_all.hf
output_dir: str = "./sampled_scienceqa_256r_8mbs_no8bit_1"
# training hyperparams
batch_size: int = 128
micro_batch_size: int = 8
num_epochs: int = 1
learning_rate: float = 3e-4
cutoff_len: int = 256
val_set_size: int = 2
# lora hyperparams
lora_r: str = "8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8"
lora_alpha: int = 16
lora_dropout: float = 0.05
lora_target_modules: str = "q_proj,v_proj,k_proj,o_proj,gate_proj,down_proj,up_proj"
# mola hyperparams
number_experts: str = "2,2,2,2,2,2,2,2,4,4,4,4,4,4,4,4,6,6,6,6,6,6,6,6,8,8,8,8,8,8,8,8"
top_k: str = "2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2"
# llm hyperparams
train_on_inputs: bool = True  # if False, masks out inputs in loss
add_eos_token: bool = True
group_by_length: bool = False  # faster, but produces an odd training loss curve
# wandb params
wandb_project: str = ""
wandb_run_name: str = ""
wandb_watch: str = ""  # options: false | gradients | all
wandb_log_model: str = ""  # options: false | true
# resume_from_checkpoint: str = './step2_biology256r_8mbs_no8bit_scale10'  # either training checkpoint or final adapter
prompt_template_name: str = "alpaca"  # The prompt template to use, will default to alpaca.
obalance: bool = False

# Check if parameter passed or if set within environ
use_wandb = len(wandb_project) > 0 or (
        "WANDB_PROJECT" in os.environ and len(os.environ["WANDB_PROJECT"]) > 0
)
# Only overwrite environ if wandb param passed
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project
if len(wandb_watch) > 0:
    os.environ["WANDB_WATCH"] = wandb_watch
if len(wandb_log_model) > 0:
    os.environ["WANDB_LOG_MODEL"] = wandb_log_model

In [4]:
gradient_accumulation_steps = batch_size // micro_batch_size

prompter = Prompter(prompt_template_name)

device_map = "auto"
world_size = int(os.environ.get("WORLD_SIZE", 1))
ddp = world_size != 1
if ddp:
    device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
    gradient_accumulation_steps = gradient_accumulation_steps // world_size

In [5]:
from transformers import AutoModelForCausalLM

config = AutoConfig.from_pretrained(base_model)
# config.lora_target_modules = lora_target_modules

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    config=config,
    load_in_8bit=False,
    torch_dtype=torch.float16,
    device_map=device_map,
)

# Modify, if required
# model.get_new_parameters(number_experts, top_k, obalance)

tokenizer = LlamaTokenizer.from_pretrained(base_model)

tokenizer.pad_token_id = (0)  # unk. we want this to be different from the eos token
tokenizer.padding_side = "left"  # Allow batched inference

# model = prepare_model_for_kbit_training(model)


2024-07-10 15:48:06,994 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.52s/it]


In [7]:
from peft import (
    prepare_model_for_kbit_training, # type: ignore
    LoraConfig, # type: ignore
    get_peft_model, # type: ignore
    PeftModel # type: ignore
)
from peft.tuners.lora import LoraLayer

config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=lora_target_modules,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)
# model = get_peft_model(model, config)

In [8]:
def tokenize(prompt, add_eos_token=True):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=cutoff_len,
        padding=False,
        return_tensors=None,
    )
    if (
            result["input_ids"][-1] != tokenizer.eos_token_id
            and len(result["input_ids"]) < cutoff_len
            and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    return result

def generate_and_tokenize_prompt(data_point):
    full_prompt = prompter.generate_prompt(
        data_point["instruction"],
        data_point["input"],
        data_point["output"],
    )
    tokenized_full_prompt = tokenize(full_prompt)
    if not train_on_inputs:
        user_prompt = prompter.generate_prompt(
            data_point["instruction"], data_point["input"]
        )
        tokenized_user_prompt = tokenize(
            user_prompt, add_eos_token=add_eos_token
        )
        user_prompt_len = len(tokenized_user_prompt["input_ids"])

        if add_eos_token:
            user_prompt_len -= 1

        tokenized_full_prompt["labels"] = [-100] * user_prompt_len + tokenized_full_prompt["labels"][
                                                                user_prompt_len:
                                                                ]  # could be sped up, probably
    return tokenized_full_prompt

In [9]:
if data_path.endswith(".json") or data_path.endswith(".jsonl"):
    data = load_dataset("json", data_files=data_path)
else:
    data = load_from_disk(data_path)
    # data = load_dataset(data_path)

if val_set_size > 0:
    train_val = data["train"].train_test_split(
        test_size=val_set_size, shuffle=True, seed=42
    )
    train_data = (
        train_val["train"].shuffle().map(generate_and_tokenize_prompt)
    )
    val_data = (
        train_val["test"].shuffle().map(generate_and_tokenize_prompt)
    )
else:
    train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
    val_data = None

if not ddp and torch.cuda.device_count() > 1:
    # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
    model.is_parallelizable = True
    model.model_parallel = True

Map: 100%|██████████| 198/198 [00:00<00:00, 1050.80 examples/s]


Map: 100%|██████████| 2/2 [00:00<00:00, 197.63 examples/s]


In [11]:
from transformers import Trainer

# model.train()
trainer = Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=micro_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=100,
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        fp16=True,
        logging_steps=10,
        optim="adamw_torch",
        evaluation_strategy="steps" if val_set_size > 0 else "no",
        save_strategy="steps",
        eval_steps=None if val_set_size > 0 else None,
        save_steps=200,
        output_dir=output_dir,
        save_total_limit=3,
        load_best_model_at_end=False,  # True if val_set_size > 0 else False,
        ddp_find_unused_parameters=False if ddp else None,
        group_by_length=group_by_length,
        report_to="wandb" if use_wandb else None,
        run_name=wandb_run_name if use_wandb else None,
        # remove_unused_columns=False,
    ),
    data_collator=transformers.DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)
model.config.use_cache = False


if torch.__version__ >= "2" and sys.platform != "win32":
    model = torch.compile(model)

trainer.train()

model.save_pretrained(output_dir)

print(
    "\n If there's a warning about missing keys above, please disregard :)"
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


ValueError: No columns in the dataset match the model's forward method signature. The following columns have been ignored: [category, attention_mask, subject, skill, topic, input_ids, instruction, input, image, output, choices, lecture, task, question, grade, labels, hint, solution, answer]. Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`.