In [1]:
!pip install -q datasets trl peft bitsandbytes sentencepiece wandb

[0m

In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import os
import torch

import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
from trl import DPOTrainer, DPOConfig
import bitsandbytes as bnb
import wandb

hf_token = os.getenv('HF_TOKEN')
wb_token = os.getenv('WB_TOKEN')
wandb.login(key=wb_token)

model_name = "yuvraj17/Llama3-8B-SuperNova-Spectrum-dare_ties"
new_model = "Llama3-8B-SuperNova-Spectrum-Hermes-DPO"

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [4]:
def chatml_format(example):
    # Format system
    system = ""

    # Format instruction
    message = {"role": "user", "content": example['question']}
    prompt = tokenizer.apply_chat_template([message], tokenize=False, add_generation_prompt=True)

    # Format chosen answer
    chosen = example['chosen'] + "<|end_of_text|>\n"

    # Format rejected answer
    rejected = example['rejected'] + "<|end_of_text|>\n"

    return {
        "prompt": system + prompt,
        "chosen": chosen,
        "rejected": rejected,
    }

# Load dataset
dataset = load_dataset("yuvraj17/chatml-OpenHermes2.5-dpo-binarized-alpha-2k")['train']

# Save columns
original_columns = dataset.column_names

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Format dataset
dataset = dataset.map(
    chatml_format,
    remove_columns=original_columns
)

# Print sample
dataset[1]

README.md:   0%|          | 0.00/391 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.39M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/50.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'chosen': "const fs = require('fs');\nconst path = require('path');\nconst zlib = require('zlib');\n\n// Parse command line arguments\nconst args = process.argv.slice(2);\nif (args.length < 2) {\n  console.error('Usage: node monitor.js [target_directory] [output_directory]');\n  process.exit(1);\n}\n\nconst targetDirectory = args[0];\nconst outputDirectory = args[1];\n\nfunction compressFile(filePath, outputPath) {\n  const gzip = zlib.createGzip();\n  const inputStream = fs.createReadStream(filePath);\n  const outputStream = fs.createWriteStream(outputPath);\n\n  inputStream.pipe(gzip).pipe(outputStream);\n}\n\nfunction watchDirectory(directory) {\n  fs.readdir(directory, { withFileTypes: true }, (err, files) => {\n    if (err) {\n      console.error(`Error reading directory ${directory}:`, err);\n      return;\n    }\n\n    files.forEach((file) => {\n      const fullPath = path.join(directory, file.name);\n      if (file.isDirectory()) {\n        // Recursively watch subdirectories\

# Training the model with DPO

In [5]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    load_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Model to perform DPO on
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)
model.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

ref_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

Unused kwargs: ['load_4bit_use_double_quant']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


config.json:   0%|          | 0.00/721 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/22.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
# LoRA configuration
peft_config = LoraConfig(
    r=32,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj']
)

# Training arguments
training_args = DPOConfig(
    beta=0.1,
    num_train_epochs=1.0,
    output_dir="outputs/Llama3-8B-SuperNova-Spectrum-Hermes-DPO",
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    lr_scheduler_type="cosine",
    save_strategy="no",
    logging_steps=1,
    optim="paged_adamw_32bit",
    warmup_steps=100,
    remove_unused_columns=False,
    learning_rate=5e-05,
    weight_decay=0.1,
    report_to="wandb",
    run_name="Llama3-8B-SuperNova-Spectrum-Hermes-DPO"
)

dpo_trainer = DPOTrainer(
    model,
    ref_model=None, # No need to pass ref, since we have used peft
    args=training_args,
    peft_config=peft_config,
    train_dataset=dataset,
    tokenizer=tokenizer
)



Tokenizing train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [7]:
# Fine-tune model with DPO
dpo_trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33myuvraj117[0m ([33mmy-sft-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111252676281664, max=1.0)…

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
1,0.6931
2,0.6931
3,0.693
4,0.6927
5,0.6924
6,0.6913
7,0.6905
8,0.6891
9,0.6893
10,0.6843


TrainOutput(global_step=62, training_loss=0.5373980085695943, metrics={'train_runtime': 7135.0774, 'train_samples_per_second': 0.28, 'train_steps_per_second': 0.009, 'total_flos': 0.0, 'train_loss': 0.5373980085695943, 'epoch': 0.992})

In [8]:
# Save artifacts
dpo_trainer.model.save_pretrained("final_checkpoint")
tokenizer.save_pretrained("final_checkpoint")

('final_checkpoint/tokenizer_config.json',
 'final_checkpoint/special_tokens_map.json',
 'final_checkpoint/tokenizer.json')

In [None]:
# Flush memory
del dpo_trainer, model, ref_model
torch.cuda.empty_cache()

In [11]:
# Reload model in BF16 (instead of NF4)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    return_dict=True,
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Merge base model with the adapter
model = PeftModel.from_pretrained(base_model, "final_checkpoint")
model = model.merge_and_unload()

# Save model and tokenizer
model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

('Llama3-8B-SuperNova-Spectrum-Hermes-DPO/tokenizer_config.json',
 'Llama3-8B-SuperNova-Spectrum-Hermes-DPO/special_tokens_map.json',
 'Llama3-8B-SuperNova-Spectrum-Hermes-DPO/tokenizer.json')

In [12]:
# Push them to the HF Hub
model.push_to_hub(new_model, use_temp_dir=False, token=hf_token)
tokenizer.push_to_hub(new_model, use_temp_dir=False, token=hf_token)

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/yuvraj17/Llama3-8B-SuperNova-Spectrum-Hermes-DPO/commit/b5be0755fb5235268b7f590e1487780a9109477b', commit_message='Upload tokenizer', commit_description='', oid='b5be0755fb5235268b7f590e1487780a9109477b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/yuvraj17/Llama3-8B-SuperNova-Spectrum-Hermes-DPO', endpoint='https://huggingface.co', repo_type='model', repo_id='yuvraj17/Llama3-8B-SuperNova-Spectrum-Hermes-DPO'), pr_revision=None, pr_num=None)

In [None]:
!pip install -qU transformers accelerate

from transformers import AutoTokenizer
import transformers
import torch

model = "yuvraj17/Llama3-8B-SuperNova-Spectrum-Hermes-DPO"
messages = [{"role": "user", "content": "What is a large language model?"}]

tokenizer = AutoTokenizer.from_pretrained(model)
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16,
    device_map="auto",
)

outputs = pipeline(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
print(outputs[0]["generated_text"])


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m

tokenizer_config.json:   0%|          | 0.00/50.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/740 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
