In [1]:
# DPO with Mistral series model as the base model.
# Use DPO to improve open LLMs using Hugging Face TRL, Transformers
# https://github.com/philschmid/deep-learning-pytorch-huggingface/blob/main/training/dpo-align-llms-in-2024-with-trl.ipynb

In [2]:
# ENV Setup

In [3]:
# python -m pip install --upgrade pip
# pip install torch==2.1.1+cu118 torchvision torchaudio -f https://download.pytorch.org/whl/cu118/torch_stable.html 
# pip install  --upgrade \
#  "transformers[sentencepiece]==4.37.2" \
#  "datasets==2.16.1" \
#  "accelerate==0.26.1" \
#  "evaluate==0.4.1" \
#  "bitsandbytes==0.42.0" \
#  "trl==0.7.11" \
#  "peft==0.8.2" \
#  "pillow"

# install flash-attn
#pip install ninja packaging
#MAX_JOBS=4 pip install flash-attn --no-build-isolation

In [1]:
import torch

In [2]:
print("\n pytorch version: ", torch.__version__)


 pytorch version:  2.2.2+cu118


In [3]:
print("\n gpu available: ", torch.cuda.is_available())


 gpu available:  True


In [4]:
assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'

In [5]:
print("device capacity: ", torch.cuda.get_device_capability()[0])

device capacity:  8


In [9]:
# HuggingFace Hub

In [6]:
from huggingface_hub import login

login(
  token="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", # ADD YOUR TOKEN HERE
  add_to_git_credential=True
)

Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /home/tianbing_xu/.cache/huggingface/token
Login successful


In [11]:
# prepare dataset

In [7]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Load Tokenizer from the hub
model_id = "cognitivecomputations/dolphin-2.1-mistral-7b" # replace with your model id
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load dataset from the hub
dataset = load_dataset("argilla/ultrafeedback-binarized-preferences-cleaned", split="train")
dataset = dataset.shuffle().select(range(13750))


def rec_extract_assistant_messages(messages, index=-1):
  """Recursively extract the last assistant messages from the end of the conversation."""
  if messages[index]["role"] == "assistant":
    return [messages[index]]
  else:
    return rec_extract_assistant_messages(messages, index-1)
    
# System message used if there is no system message at the beginning of the conversation
# Can be repelaced and modified as needed
DEFAULT_SYSTEM_MESSAGE = "You are Dolphin, a helpful AI assistant."

def create_triplets(example, tokenizer, default_system_message=DEFAULT_SYSTEM_MESSAGE):
  """Create the triplets (prompt, chosen, rejected)"""
  # Extract the N-1 turns to form the prompt
  # Prepend a system message if the first message is not a system message
  prompt_messages = example["chosen"][:-1]
  if example["chosen"][0]["role"] != "system":
      prompt_messages.insert(0, {"role": "system", "content": default_system_message})
  # Now we extract the final assistant turn to define chosen/rejected responses 
  chosen_messages = rec_extract_assistant_messages(example["chosen"])
  rejected_messages = rec_extract_assistant_messages(example["rejected"])
  
  # apply template to the messages and return the triplets
  return {
    "prompt": tokenizer.apply_chat_template(prompt_messages, tokenize=False),
    "chosen": tokenizer.apply_chat_template(chosen_messages, tokenize=False),
    "rejected": tokenizer.apply_chat_template(rejected_messages, tokenize=False)
  }

dataset = dataset.map(create_triplets, remove_columns=dataset.features, fn_kwargs={"tokenizer": tokenizer})  
# split dataset into 11,000 training samples and 2,750 test samples
dataset = dataset.train_test_split(test_size=2750/13750)

# save datasets to disk
dataset["train"].to_json("train_dataset.json", orient="records")
dataset["test"].to_json("test_dataset.json", orient="records")

tokenizer_config.json:   0%|          | 0.00/1.69k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/100 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading readme:   0%|          | 0.00/4.46k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/143M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/60917 [00:00<?, ? examples/s]

Map:   0%|          | 0/13750 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

9491410

In [8]:
print(dataset["train"][0]["prompt"])

<|im_start|>system
You are Dolphin, a helpful AI assistant.<|im_end|>
<|im_start|>user
Give me a brief scenario of a persona that would search this information and find this content helpful: Master Builders Cost plus residential contracts in NSW - Head contract for work undertaken on a cost plus fixed fee or percentage margin basis.<|im_end|>



In [9]:
print(dataset["train"][0]["chosen"])

<|im_start|>assistant
Meet Sarah, a 35-year-old architect residing in Sydney, New South Wales. Recently, she has started her own small architecture firm, aiming to deliver innovative residential projects to her clients.

Sarah's firm has been growing steadily, and she has just won the commission for a custom-designed residential project for the Taylor family. Emphasizing high-quality craftsmanship, the project has unique requirements that cannot be covered by standard pricing. Due to fluctuating material prices and the nature of custom work, it makes more sense for Sarah and her team to suggest a cost-plus contract for Taylor's dream home construction.

Sarah is determined to find the best resources to manage her project, protect her clients' and firm's interests, and ensure smooth completion. To avoid disputes, she knows it's essential to have a solid contract in place, so she ventures to find information regarding Master Builders Cost-Plus Residential Contracts in NSW.

From her targ

In [10]:
print(dataset["train"][0]["rejected"])

<|im_start|>assistant
Do you need assistance with any queries you may have?<|im_end|>



In [16]:
# DPO

In [17]:
## load dataset

In [11]:
from datasets import load_dataset

# Load jsonl data from disk
train_dataset = load_dataset("json", data_files="train_dataset.json", split="train")
eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [12]:
sub_train_dataset = train_dataset.select(range(1000))

In [13]:
sub_train_dataset

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 1000
})

In [14]:
sub_eval_dataset = eval_dataset.select(range(100))

In [15]:
sub_eval_dataset

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 100
})

In [23]:
## load model mistral dolphin (int-4 using bitsandbytes)

In [16]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hugging Face model id
model_id = "cognitivecomputations/dolphin-2.1-mistral-7b" # replace with your model id

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    use_cache=False, 
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left' # to prevent errors with FA
tokenizer.truncation_side = 'left' # to prevent cutting off last generation

config.json:   0%|          | 0.00/622 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [17]:
# The max_prompt_length is the maximum length of the prompt and the max_length is the maximum length of the prompt + chosen or rejected response
max_length = 1024
max_prompt_length = 512

In [26]:
## perf using qlora

In [18]:
from peft import LoraConfig

# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
        lora_alpha=64,
        lora_dropout=0.05,
        r=64,
        bias="none",
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
        task_type="CAUSAL_LM", 
)


In [28]:
## training args

In [19]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="doplhin-dpo_2",             # directory to save and repository id
    num_train_epochs=1,                     # number of training epochs
    per_device_train_batch_size=2,         # batch size per device during training
    per_device_eval_batch_size=1,           # batch size for evaluation
    gradient_accumulation_steps=1,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    learning_rate=5e-5,                     # 10x higher LR than QLoRA paper
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.1,                       # warmup ratio based on QLoRA paper
    lr_scheduler_type="cosine",             # use cosine learning rate scheduler
    logging_steps=25,                       # log every 25 steps
    save_steps=500,                         # when to save checkpoint
    save_total_limit=2,                     # limit the total amount of checkpoints
    evaluation_strategy="steps",            # evaluate every 1000 steps
    eval_steps=700,                         # when to evaluate
    bf16=True,                              # use bfloat16 precision
    tf32=True,                              # use tf32 precision
    push_to_hub=False,                      # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
)

dpo_args = {
    "beta": 0.1,                            # The beta factor in DPO loss. Higher beta means less divergence
    "loss_type": "sigmoid"                  # The loss type for DPO.
}


In [30]:
## dpo trainer

In [20]:
from trl import DPOTrainer

trainer = DPOTrainer(
    model,
    ref_model=None, # set to none since we use peft
    peft_config=peft_config,
    args=args,
    train_dataset=sub_train_dataset,
    eval_dataset=sub_eval_dataset,
    tokenizer=tokenizer,
    max_length=max_length,
    max_prompt_length=max_prompt_length,
    beta=dpo_args["beta"],
    loss_type=dpo_args["loss_type"],
)


2024-04-12 17:59:17.735891: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-12 17:59:17.774442: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
## training

In [21]:
# start training, the model will be automatically saved to the hub and the output directory
trainer.train()

The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss


TrainOutput(global_step=500, training_loss=0.9739822273254395, metrics={'train_runtime': 2681.5067, 'train_samples_per_second': 0.373, 'train_steps_per_second': 0.186, 'total_flos': 0.0, 'train_loss': 0.9739822273254395, 'epoch': 1.0})

In [22]:
# save model at the end of training
trainer.save_model()

In [23]:
## free the memory again
del model
del trainer
torch.cuda.empty_cache()

In [None]:
# load DPO fine-tuned model and inference

In [1]:
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline 

# Path to saved peft adapter model
# peft_model_id = args.output_dir # or
peft_model_id = "./doplhin-dpo_2" 

# Load Model with PEFT adapter
model = AutoPeftModelForCausalLM.from_pretrained(
  peft_model_id,
  device_map="cuda",
  torch_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
# load into pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)


2024-04-12 18:46:58.187708: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-12 18:46:58.232733: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCaus

In [2]:
text = "[INST] Do you have mayonnaise recipes? [/INST]"

# Tokenize the input text
encodeds = tokenizer(text, return_tensors="pt", add_special_tokens=False)

# Generate text using the model
generated_ids = model.generate(
    input_ids=encodeds.input_ids.to("cuda"),
    attention_mask=encodeds.attention_mask.to("cuda"),  # Pass attention mask
    max_length=200,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id  # Set pad_token_id to eos_token_id
)

# Decode the generated tokens
decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

# Print the decoded text
print(decoded[0])



[INST] Do you have mayonnaise recipes? [/INST]

YES! Here are three delicious and easy-to-make homemade mayonnaise recipes:

1. Basic Mayonnaise:

Ingredients:
- 1 large egg yolk (at room temperature)
- 1 tablespoon fresh lemon juice
- 1 teaspoon dijon mustard
- 1/4 teaspoon salt
- 1/4 teaspoon ground black pepper
- 1 cup vegetable oil (such as safflower, sunflower, or soybean oil)

Instructions:

1. In a bowl, whisk together the egg yolk, lemon juice, mustard, salt, and black pepper until well combined.
2. Slowly add the vegetable oil, drop by drop, or in a very thin stream, while continuously whisking. As the mayo begins to thicken,


In [3]:
prompts = [
  "A rectangular garden has a length of 25 feet and a width of 15 feet. If you want to build a fence around the entire garden, how many feet of fencing will you need?",
  "It's Bengay for muscle relief, a combination of methyl salicylate, menthol, and what other active ingredient commonly found in aspirin?",
  "How can i get rid of llamas in my backyard?"
]

In [None]:
for prompt in prompts:
    messages = pipe.tokenizer.apply_chat_template([{"role":"user", "content": prompt}], tokenize=False)
    outputs = pipe(prompt, max_new_tokens=2048, do_sample=True, temperature=1.0, top_k=50, top_p=0.9, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id)
    print(f"**Prompt**:\n{prompt}\n")
    print(f"**Generated Answer**:\n{outputs[0]['generated_text'][len(prompt):].strip()}")
    print("===" * 10)

**Prompt**:
A rectangular garden has a length of 25 feet and a width of 15 feet. If you want to build a fence around the entire garden, how many feet of fencing will you need?

**Generated Answer**:
1. First, calculate the perimeter of the rectangular garden:
Perimeter = 2 * (Length + Width)
Perimeter = 2 * (25 feet + 15 feet)
Perimeter = 2 * (40 feet)
Perimeter = 80 feet

So, you will need 80 feet of fencing to build a fence around the entire garden.

Final Answer: 80 feet.

2. Here's another approach to solve this problem:

The perimeter of a rectangle can be calculated using the formula:

Perimeter = 2 * (Length + Width)

For the given dimensions of the garden:

Length = 25 feet
Width = 15 feet

Now, let's calculate the perimeter:

Perimeter = 2 * (25 feet + 15 feet)
                 = 2 * (40 feet)
                 = 80 feet

So, you will need 80 feet of fencing to build a fence around the entire garden.

Final Answer: 80 feet.

For both solutions, the answer is the same: 80 feet o