## Environment Setup

In [None]:
import os

# Restrict CUDA to device index 0 so that only the first GPU is visible.
os.environ.update(CUDA_VISIBLE_DEVICES='0')

In [None]:
!pip install -q condacolab
import condacolab
condacolab.install()

In [None]:
# Verify the conda installation performed by the previous cell.
# NOTE(review): condacolab is imported again here - presumably because the
# kernel is restarted by condacolab.install(); confirm before removing.
import condacolab
condacolab.check()

In [None]:
import os

# Start from a clean checkout: remove any zo2/ directory under the current
# working directory, then clone the repository again.
# NOTE(review): both paths are relative, so re-running this cell after the
# chdir below would clone into zo2/zo2 - run it from the original directory.
!rm -rf zo2/
!git clone https://github.com/liangyuwang/zo2.git
print("Current working directory:", os.getcwd())
# Enter the cloned repository; later cells use paths relative to this directory.
os.chdir('zo2/')
print("New working directory:", os.getcwd())

# Update the conda "base" environment from the env.yml found in the current
# directory (the freshly cloned repository).
!conda env update -n base -f env.yml

## Using [MeZO Runner](../example/mezo_runner/) on Supported Tasks

In [None]:
import os

# Enter the MeZO runner example directory (relative to the current working
# directory, which the setup cell left at the repository root).
print("Current working directory:", os.getcwd())
os.chdir('./example/mezo_runner/')
print("New working directory:", os.getcwd())

# Launch mezo.sh with its settings passed as inline environment variables:
# OPT-2.7B on the SST2 task, MODE=ft, 20000 steps, EVAL_STEPS=4000.
!MODEL=facebook/opt-2.7b TASK=SST2 MODE=ft LR=1e-7 EPS=1e-3 STEPS=20000 EVAL_STEPS=4000 bash mezo.sh

# Move to tutorial/ so that the "../" appended to sys.path in the next section
# resolves to the repository root.
# NOTE(review): this cell is not re-runnable as-is - from tutorial/ the first
# chdir above no longer resolves.
os.chdir('../../tutorial/')
print("New working directory:", os.getcwd())

## Using the Hugging Face Trainer

In [None]:
import sys
sys.path.append("../")

from tqdm.auto import tqdm
import torch
from transformers import (
    AutoTokenizer, 
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset
from zo2 import (
    ZOConfig,
    zo_hf_init,
)
from zo2.trainer.hf_transformers.trainer import ZOTrainer
from zo2.trainer.hf_trl.sft_trainer import ZOSFTTrainer
from zo2.utils import seed_everything

In [None]:
# Hyperparameters

# --- Run selection -----------------------------------------------------------
zo_method = "zo2"            # compared against "zo2" when building ZOConfig
model_name = "facebook/opt-2.7b"
seed = 42

# --- Optimisation (forwarded to ZOConfig / TrainingArguments) ----------------
max_steps = 300
learning_rate = 1e-7
weight_decay = 1e-1
zo_eps = 1e-3

# --- Device placement (forwarded to ZOConfig) --------------------------------
offloading_device = "cpu"
working_device = "cuda:0"

# --- Settings not read by the cells visible in this section ------------------
eval_mode = False
verbose = True
max_train_data = None
max_eval_data = None
use_cache = True
max_new_tokens = 50
temperature = 1.0

# Seed the run once every setting above is defined.
seed_everything(seed)

In [None]:
# ZO configuration
use_zo2 = zo_method == "zo2"  # evaluated once, reused for the device move below
zo_config = ZOConfig(
    method="mezo-sgd",
    zo2=use_zo2,
    lr=learning_rate,
    weight_decay=weight_decay,
    eps=zo_eps,
    offloading_device=offloading_device,
    working_device=working_device,
)

# Load the model inside the zo_hf_init context, then attach the ZO state.
# NOTE(review): the transformers import is deliberately kept inside the
# context - presumably zo_hf_init patches the model classes; confirm before
# hoisting it to the import cell.
with zo_hf_init(zo_config):
    from transformers import OPTForCausalLM
    model = OPTForCausalLM.from_pretrained(model_name)
    model.zo_init(zo_config)

# Only the non-zo2 path moves the whole model to the working device here.
if not use_zo2:
    model = model.to(working_device)

print(f"Check if zo2 init correctly: {hasattr(model, 'zo_training')}")

In [None]:
# Prepare dataset
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')

# Tokenizer for the same checkpoint as the model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Block length used by group_texts. tokenizer.model_max_length can be a very
# large "unset" sentinel when the tokenizer config does not specify it; with
# such a value group_texts would floor every batch to zero tokens. Cap it by
# the model's positional limit when the model exposes one.
block_size = tokenizer.model_max_length
max_positions = getattr(getattr(model, "config", None), "max_position_embeddings", None)
if max_positions is not None and block_size > max_positions:
    block_size = max_positions


def tokenize_function(examples):
    """Tokenize the "text" column of a batch with the notebook's tokenizer.

    Args:
        examples: batch mapping that contains a "text" entry.

    Returns:
        Whatever the tokenizer returns for ``examples["text"]``.
    """
    return tokenizer(examples["text"])


# Tokenize every split in batches on 4 worker processes and drop the raw text.
tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])
def group_texts(examples):
    """Concatenate a batch of token lists and re-split it into fixed-size blocks.

    Args:
        examples: mapping from column name to a list of per-row token lists
            (must contain "input_ids").

    Returns:
        dict with the same keys, each holding a list of chunks of exactly
        ``block_size`` items (``block_size`` is read from the notebook's global
        scope), plus a "labels" entry that is a shallow copy of the
        "input_ids" chunk list. Tokens beyond the last full block are dropped.
    """
    from itertools import chain

    # Flatten each column in linear time; sum(lists, []) re-copies the
    # accumulated list on every addition and is quadratic in the row count.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported
    # it instead of this drop, you can customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result
# Pack every tokenized split into fixed-length blocks with group_texts,
# 1000 rows per call on 4 worker processes.
grouping_options = dict(batched=True, batch_size=1000, num_proc=4)
lm_datasets = tokenized_datasets.map(group_texts, **grouping_options)

# Collator built with mlm=False, i.e. no masked-LM masking is requested.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Trainer setup: write to "test-trainer", stop after max_steps, log every
# 10 steps and never save checkpoints.
training_args = TrainingArguments(
    output_dir="test-trainer",
    max_steps=max_steps,
    logging_steps=10,
    save_strategy="no",
)

# NOTE(review): the trainer is fed the tokenized (ungrouped) train split;
# lm_datasets built earlier is not used here - confirm this is intended.
trainer = ZOTrainer(
    model,
    training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    tokenizer=tokenizer,
)

# 'ZOTrainer' lets callers register pre-hooks and post-hooks around zo_step
def drop_invalid_data(model, inputs, loss):
    """Post-step hook that neutralises a step whose loss or projected gradient is NaN.

    Args:
        model: object exposing ``model.opt.projected_grad`` (tensor or scalar).
        inputs: the step's inputs; passed through untouched.
        loss: tensor holding the step's loss.

    Returns:
        The ``(model, inputs, loss)`` triple. When a NaN is found, a message is
        written through tqdm and ``model.opt.projected_grad`` is reset to 0.
    """
    grad = model.opt.projected_grad
    # Tensors are checked element-wise; for plain scalars NaN is the only
    # value that is not equal to itself.
    grad_has_nan = torch.isnan(grad).any() if isinstance(grad, torch.Tensor) else grad != grad

    # Healthy step: hand everything back unchanged.
    if not (torch.isnan(loss) or grad_has_nan):
        return model, inputs, loss

    tqdm.write("'loss': {} or 'projected_grad': {} is nan. Drop this step.".format(
        loss, model.opt.projected_grad
    ))
    model.opt.projected_grad = 0  # Reset projected_grad to prevent parameter updates
    return model, inputs, loss
# Attach drop_invalid_data as a post-hook of the trainer's zo2 training step.
trainer.register_zo2_training_step_post_hook(drop_invalid_data)

In [None]:
# Run training with the trainer built above; the run length comes from the
# max_steps value passed to TrainingArguments.
trainer.train()