In [1]:
!pip install -U transformers datasets accelerate peft trl bitsandbytes wandb


Collecting transformers
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m858.9 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hCollecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl.metadata (19 kB)
Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl.metadata (18 kB)
Collecting peft
  Downloading peft-0.11.1-py3-none-any.whl.metadata (13 kB)
Collecting trl
  Downloading trl-0.9.3-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Collecting wandb
  Downloading wandb-0.17.0-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers)
  Downloading huggingface_hub-0.23.3-py3-none-any.whl.metadata (12 kB)
Collecting regex!=2019.12.17 (from transforme

In [2]:
import gc
import os

import torch
import wandb
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from trl import ORPOConfig, ORPOTrainer, setup_chat_format


In [None]:
# cli login
from huggingface_hub import login
login(token="your_hugging_face_token")

In [3]:
import wandb
api_key="your_wandb_api_key"
wandb.login(key=api_key)
wandb.init(
    project='llama-3-8b-instruct'
 )

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mirenicquasar[0m ([33mredtree[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    attn_implementation = "flash_attention_2"
    torch_dtype = torch.bfloat16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float16


In [5]:
# Model
base_model = "meta-llama/Meta-Llama-3-8B-Instruct"
new_model = "OrpoLlama-3-8B-Instruct"

# QLoRA config (removed this)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# LoRA config
peft_config = LoraConfig(
    r=64,#changed from 16
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    #quantization_config=bnb_config,  # removed quantization
    device_map="auto",
    attn_implementation=attn_implementation
)
model, tokenizer = setup_chat_format(model, tokenizer)
model = prepare_model_for_kbit_training(model)


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

In [6]:
import pandas as pd

In [7]:
dataset_name = "argilla/distilabel-math-preference-dpo"
dataset = load_dataset(dataset_name, split="all")
# dataset = dataset.shuffle(seed=42).select(range(1000))


Downloading readme:   0%|          | 0.00/805 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.86M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2418 [00:00<?, ? examples/s]

In [8]:
from datasets import load_dataset

# Load your dataset
dataset_name = "argilla/distilabel-math-preference-dpo"
dataset = load_dataset(dataset_name, split="all")

# Define the line to remove
line_to_remove = "Take a deep breath, think step by step, and give an accurate response"

# Function to remove the line from a given instruction
def remove_line(examples):
    examples['instruction'] = [instruction[: -len(line_to_remove)].strip() if instruction.endswith(line_to_remove) else instruction for instruction in examples['instruction']]
    return examples

# Apply the function to the dataset
dataset = dataset.map(remove_line, batched=True)

# Save the updated dataset if needed
# This is optional and depends on your requirements
# dataset.save_to_disk('path_to_your_updated_dataset')

# Example: Showing the first few rows to verify changes
print(dataset['instruction'][:5])


Map:   0%|          | 0/2418 [00:00<?, ? examples/s]

['How can I simplify the algebraic expression `(3x^2 - 4y^3) / (2x)`?.', 'What is the result of substituting `x = 5` and `y = -2` into the expression `2x^2 - 3y + 7`?.', 'Detail the process of factoring the algebraic expression `4x^2 + 12xy + 9y^2`..', 'Explain the steps involved in solving the equation `3x + 2y = 10` for `x`..', 'Can you provide a step-by-step solution for evaluating the expression `5(x + 2) - 3(2x - 1)`?.']


In [9]:
df = pd.DataFrame(dataset)

In [10]:
df.head(2)

Unnamed: 0,metadata,instruction,chosen_response,chosen_rating,rejected_response,rejected_rating
0,"{""length-input"": 139, ""length-generations-1"": ...",How can I simplify the algebraic expression `(...,To simplify the algebraic expression `(3x^2 - ...,9.0,To simplify the algebraic expression `(3x^2 - ...,7.0
1,"{""length-input"": 164, ""length-generations-1"": ...",What is the result of substituting `x = 5` and...,To find the result of substituting `x = 5` and...,9.0,To find the result of substituting `x = 5` and...,8.0


In [11]:
# def convert_format(dataset):
#     converted_data = []
#     for row in dataset:
#         new_row = {
#             "chosen": [
#                 {"content": row["instruction"], "role": "user"},
#                 {"content": row["chosen_response"], "role": "assistant"}
#             ],
#             "rejected": [
#                 {"content": row["instruction"], "role": "user"},
#                 {"content": row["rejected_response"], "role": "assistant"}
#             ],
#             "prompt": row["instruction"]
#         }
#         converted_data.append(new_row)
#     return converted_data

def convert_format(dataset):
    converted_data = []
    for row in dataset:
        # Remove any extraneous newline characters from the instruction and responses
        instruction = row["instruction"].replace('\n', ' ').strip()
        chosen_response = row["chosen_response"].replace('\n', ' ').strip()
        rejected_response = row["rejected_response"].replace('\n', ' ').strip()
        
        new_row = {
            "chosen": [
                {"role": "user", "content": instruction},
                {"role": "assistant", "content": chosen_response}
            ],
            "rejected": [
                {"role": "user", "content": instruction},
                {"role": "assistant", "content": rejected_response}
            ],
            "prompt": instruction
        }
        converted_data.append(new_row)
    return converted_data

# Example usage
converted_dataset = convert_format(dataset)


In [12]:
converted_dataset = convert_format(dataset)

In [13]:
df = pd.DataFrame(converted_dataset)

In [14]:
df.head(2)

Unnamed: 0,chosen,rejected,prompt
0,[{'content': 'How can I simplify the algebraic...,[{'content': 'How can I simplify the algebraic...,How can I simplify the algebraic expression `(...
1,[{'content': 'What is the result of substituti...,[{'content': 'What is the result of substituti...,What is the result of substituting `x = 5` and...


In [15]:
import datasets
dataset =  datasets.Dataset.from_pandas(df)

In [16]:
def format_chat_template(row):
    row["chosen"] = tokenizer.apply_chat_template(row["chosen"], tokenize=False)
    row["rejected"] = tokenizer.apply_chat_template(row["rejected"], tokenize=False)
    return row

dataset = dataset.map(
    format_chat_template,
    num_proc= os.cpu_count(),
)
dataset = dataset.train_test_split(test_size=0.01)

Map (num_proc=64):   0%|          | 0/2418 [00:00<?, ? examples/s]

In [21]:
orpo_args = ORPOConfig(
    learning_rate=8e-6,
    beta=0.1,
    lr_scheduler_type="linear",
    max_length=1024,
    max_prompt_length=1024,
    per_device_train_batch_size=16, #8
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8, #4
    optim="paged_adamw_8bit",
    num_train_epochs=4, #1
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    report_to="wandb",
    output_dir="./results/",
    run_name="OrpoLlama-3-8B-Instruct-run",
    remove_unused_columns=False
)

trainer = ORPOTrainer(
    model=model,
    args=orpo_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    tokenizer=tokenizer,
)
trainer.train()
trainer.save_model(new_model)


Map:   0%|          | 0/2393 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.float16.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Runtime,Samples Per Second,Steps Per Second,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen,Nll Loss,Log Odds Ratio,Log Odds Chosen
15,1.4222,1.425676,18.4639,1.354,0.704,-0.121866,-0.123227,0.423077,0.001361,-1.232269,-1.218663,-1.591821,-1.548515,1.353387,-0.687542,0.023614
30,1.2626,1.219007,18.4579,1.354,0.704,-0.099674,-0.100326,0.5,0.000652,-1.003256,-0.996737,-1.650117,-1.620731,1.143226,-0.69143,0.015021
45,1.0593,1.064004,18.4634,1.354,0.704,-0.082929,-0.08326,0.5,0.000331,-0.832597,-0.829287,-1.748993,-1.715217,0.986157,-0.694277,0.010615
60,1.0282,1.00466,18.4576,1.354,0.704,-0.076794,-0.076999,0.423077,0.000205,-0.769989,-0.767935,-1.757044,-1.722201,0.925486,-0.695705,0.008772
75,1.0685,0.990869,18.4568,1.355,0.704,-0.07546,-0.075596,0.423077,0.000136,-0.755959,-0.754601,-1.755697,-1.721189,0.911575,-0.696588,0.007305



Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3-8B is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3-8B.


In [22]:
import torch

# Check for CUDA availability
if torch.cuda.is_available():
  device = torch.device("cuda:0")  # Assuming you want the first GPU
  properties = torch.cuda.get_device_properties(device)
  total_memory_gb = properties.total_memory / (1024**3)
  print(f"Total GPU memory: {total_memory_gb:.2f} GB")
else:
  print("No CUDA GPU available!")


Total GPU memory: 79.26 GB


In [23]:
import torch

num_gpus = torch.cuda.device_count()
print("Number of GPUs:", num_gpus)


Number of GPUs: 1


In [24]:
print("training completed")

training completed


In [28]:
# Flush memory
del model
gc.collect()
torch.cuda.empty_cache()

# Reload tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(base_model)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
model, tokenizer = setup_chat_format(model, tokenizer)

# Merge adapter with base model
model = PeftModel.from_pretrained(model, new_model)
model = model.merge_and_unload()

model.push_to_hub(new_model, use_temp_dir=False)        #removed hf token
tokenizer.push_to_hub(new_model, use_temp_dir=False)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/wizard-who-wonders/OrpoLlama-3-8B/commit/d2e3ed82adb5d1d379bcb816a7dd3c72f27608d9', commit_message='Upload tokenizer', commit_description='', oid='d2e3ed82adb5d1d379bcb816a7dd3c72f27608d9', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
#inference
test_prompts = ["Hello"]
for i in range(len(test_prompts)):
    test_prompts[i] = {"content": test_prompts[i],"role": "user"}
pipe = pipeline(
    "text-generation",
    model=model,
    model_kwargs={"torch_dtype": torch.bfloat16},
    tokenizer=tokenizer
)

generation_args = {
    "max_new_tokens": 500,
    "return_full_text": True,
    "temperature": 0.0,
    "do_sample": False,
}

output = pipe(test_prompts,**generation_args)
print(output[0]['generated_text'])

In [None]:
#eval
!pip install lm_eval

In [None]:
import lm_eval
model_eval = lm_eval.models.huggingface.HFLM(pretrained=model, tokenizer=tokenizer)
result  = lm_eval.evaluator.simple_evaluate(model_eval, tasks=["gsm8k"], device = 'gpu',num_fewshot=0, batch_size=8)['results']
print(result)