In [1]:
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.0

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
hf_token = "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"

In [3]:
from huggingface_hub import login
login(token=hf_token)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/joeygoh722/.cache/huggingface/token
Login successful


In [4]:
#model_id = "google/gemma-7b-it"
# model_id = "google/gemma-7b"
model_id = "google/gemma-2b-it"
# model_id = "google/gemma-2b"

base_model = "google/gemma-2b-it"
dataset_name = "yatharth97/10k_reports_gemma"
new_model = "yatharth-gemma-2b-it-10k"
model = AutoModelForCausalLM.from_pretrained(model_id, token = hf_token, attn_implementation="flash_attention_2", 
                                             quantization_config=bnb_config, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(model_id, token = hf_token, add_eos_token=True)

Downloading shards: 100%|██████████| 2/2 [00:00<00:00,  2.50it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:28<00:00, 14.00s/it]


In [5]:
from datasets import load_dataset
train_dataset = load_dataset(dataset_name, split="train")
test_dataset = load_dataset(dataset_name, split="train")

In [6]:
def get_completion(query: str, model, tokenizer) -> str:
  device = "cuda:0"

  prompt_template = """
  <start_of_turn>user
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  {query}
  <end_of_turn>\n<start_of_turn>model


  """
  prompt = prompt_template.format(query=query)

  encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

  model_inputs = encodeds.to(device)


  generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.eos_token_id)
  # decoded = tokenizer.batch_decode(generated_ids)
  decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
  return (decoded)

In [7]:
def generate_prompt(data_point):
    """Gen. input text based on a prompt, task instruction, (context info.), and answer

    :param data_point: dict: Data point
    :return: dict: tokenzed prompt
    """
    text = f"""
    <start_of_turn>user {data_point['system_prompt']}  {data_point['question']} <end_of_turn>\n
    <start_of_turn>model {data_point['response']} <end_of_turn>
    """
    return text

In [8]:
train_text_column = [generate_prompt(data_point) for data_point in train_dataset]
test_text_column = [generate_prompt(data_point) for data_point in test_dataset]

train_dataset = train_dataset.add_column("prompt", train_text_column)
test_dataset = test_dataset.add_column("prompt", test_text_column)

In [9]:
train_dataset = train_dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

Map: 100%|██████████| 6768/6768 [00:00<00:00, 7254.48 examples/s]


In [10]:
test_dataset

Dataset({
    features: ['conv_id', 'step', 'system_prompt', 'question', 'response', 'prompt'],
    num_rows: 6768
})

In [11]:
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [12]:
print(model)

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaFlashAttention2(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
   

In [13]:
import bitsandbytes as bnb
def find_all_linear_names(model):
  cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
  lora_module_names = set()
  for name, module in model.named_modules():
    if isinstance(module, cls):
      names = name.split('.')
      lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names: # needed for 16-bit
      lora_module_names.remove('lm_head')
  return list(lora_module_names)

In [14]:
modules = find_all_linear_names(model)
print(modules)

['down_proj', 'gate_proj', 'o_proj', 'up_proj', 'v_proj', 'k_proj', 'q_proj']


In [15]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=64,
    lora_alpha=32,
    target_modules=modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

In [16]:
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

Trainable: 78446592 | total: 2584619008 | Percentage: 3.0351%


In [17]:
wandb_api_key = 'c28da77c10d67a877316fd356f5c19f07224c5d2'

In [19]:
!pip install wandb

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [18]:
import wandb

In [19]:
wandb.login(key = wandb_api_key)
run = wandb.init(
    project='Fine tuning Gemma 2B - 10K Reports',
    job_type="training",
    anonymous="allow"
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mysant77[0m ([33myatharth97[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/joeygoh722/.netrc


In [20]:
#new code using SFTTrainer
import transformers

from trl import SFTTrainer

tokenizer.pad_token = tokenizer.eos_token
torch.cuda.empty_cache()

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    dataset_text_field="prompt",
    peft_config=lora_config,
    args=transformers.TrainingArguments(
        num_train_epochs=1,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=8,
        warmup_steps=0.03,
        max_steps=-1,
        learning_rate=2e-4,
        logging_steps=1,
        logging_strategy="steps",
        output_dir="outputs",
        hub_token=hf_token,
        optim="paged_adamw_8bit",
        save_strategy="epoch",
        report_to="wandb"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

Map: 100%|██████████| 6768/6768 [00:00<00:00, 8855.09 examples/s]
Map: 100%|██████████| 6768/6768 [00:00<00:00, 9288.07 examples/s]


In [21]:
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.float16.


Step,Training Loss
1,6.5583
2,6.6665
3,4.7535
4,3.448
5,2.6802
6,2.2573
7,1.7023
8,1.415
9,1.1378
10,0.9513


TrainOutput(global_step=105, training_loss=0.4358186348563149, metrics={'train_runtime': 1964.2492, 'train_samples_per_second': 3.446, 'train_steps_per_second': 0.053, 'total_flos': 1.0804672493617152e+16, 'train_loss': 0.4358186348563149, 'epoch': 0.99})

In [22]:
trainer.model.save_pretrained(new_model, token=hf_token)

In [None]:
trainer

In [24]:
import torch
torch.cuda.empty_cache()
import gc
#del trainer
#del model
gc.collect()
gc.collect()

0

In [23]:
del train_dataset

In [25]:
del trainer

In [26]:
del model

In [27]:
gc.collect()

27423

In [28]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
merged_model= PeftModel.from_pretrained(base_model, new_model)
merged_model= merged_model.merge_and_unload()

# Save the merged model
merged_model.save_pretrained("merged_model",safe_serialization=True)
tokenizer.save_pretrained("merged_model")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

Loading checkpoint shards: 100%|██████████| 2/2 [00:11<00:00,  5.88s/it]


In [29]:
# Push the model and tokenizer to the Hugging Face Model Hub
merged_model.push_to_hub(new_model, use_temp_dir=False, token="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")
tokenizer.push_to_hub(new_model, use_temp_dir=False,token="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")


model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]
model-00002-of-00002.safetensors:   0%|          | 16.4k/67.1M [00:00<10:39, 105kB/s]
model-00002-of-00002.safetensors:   2%|▏         | 1.49M/67.1M [00:00<00:12, 5.21MB/s]
model-00002-of-00002.safetensors:   3%|▎         | 2.00M/67.1M [00:00<00:15, 4.11MB/s]
[A
model-00002-of-00002.safetensors:   4%|▎         | 2.41M/67.1M [00:00<00:31, 2.06MB/s]
model-00002-of-00002.safetensors:   7%|▋         | 4.41M/67.1M [00:01<00:12, 4.93MB/s]
model-00002-of-00002.safetensors:   9%|▉         | 6.21M/67.1M [00:01<00:08, 6.82MB/s]
model-00002-of-00002.safetensors:  12%|█▏        | 7.83M/67.1M [00:01<00:07, 8.18MB/s]
model-00002-of-00002.safetensors:  21%|██        | 14.1M/67.1M [00:01<00:02, 19.6MB/s]
model-00002-of-00002.safetensors:  34%|███▍      | 22.9M/67.1M [00:02<00:03, 13.2MB/s]
[A
model-00002-of-00002.safetensors:  47%|████▋     | 31.6M/67.1M [00:03<00:02, 12.0MB/s]
[A
model-00002-of-00002.safetensors:  61%|█

CommitInfo(commit_url='https://huggingface.co/yatharth97/yatharth-gemma-2b-it-10k/commit/f3066cd47400734aa5566e987ebb82644f2fde93', commit_message='Upload tokenizer', commit_description='', oid='f3066cd47400734aa5566e987ebb82644f2fde93', pr_url=None, pr_revision=None, pr_num=None)