In [26]:
!pip install -U transformers datasets accelerate peft trl bitsandbytes wandb

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [27]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from trl import ORPOTrainer, ORPOConfig, setup_chat_format
from peft import PeftModel, LoraConfig, get_peft_model, TaskType
import torch
import gc

In [35]:
# Use any base model for training purposes
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [36]:
#Loading the dataset from the hugging face directory
dataset = load_dataset("nz/anthropic-hh-golden-rlhf", split=["train", "test"])

In [11]:
# Shuffling the dataset
train_samples = 5000
test_samples = int((len(dataset[1]) / len(dataset[0])) * train_samples)
train_subset = dataset[0].shuffle(seed=42).select(range(train_samples))
test_subset = dataset[1].shuffle(seed=42).select(range(test_samples))

In [37]:
# Setting up peft config
lora_config = LoraConfig(
    r=16,  # Rank of the low-rank matrices
    lora_alpha=32,  # Scaling factor
    target_modules=["c_attn"],  # Modules to apply LoRA to
    lora_dropout=0.05,  # Dropout probability
    bias="none",  # Bias type
    task_type=TaskType.CAUSAL_LM  # Task type
)

In [38]:
model = get_peft_model(model, lora_config)



In [39]:
orpo_args = ORPOConfig(
    learning_rate=8e-6,
    beta=0.1,
    lr_scheduler_type="linear",
    do_eval = False,
    max_length=1024,
    max_prompt_length=512,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",
    max_steps=25,
    evaluation_strategy="steps",
    eval_steps=0.2,
    logging_steps=10,
    warmup_steps=10,
    report_to=None,
    output_dir="./results/",
    run_name = "Talal"
)



In [40]:
trainer = ORPOTrainer(
    model=model,
    args=orpo_args,
    train_dataset=dataset[0],
    eval_dataset=dataset[1],
    peft_config=lora_config,
    processing_class=tokenizer
)



Map:   0%|          | 0/42537 [00:00<?, ? examples/s]

Map:   0%|          | 0/42537 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1034 > 1024). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/2312 [00:00<?, ? examples/s]

Map:   0%|          | 0/2312 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs


In [24]:
import os
os.environ["WANDB_MODE"] = "disabled"

In [41]:
trainer.train()
trainer.save_model("orpo_trained")

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Runtime,Samples Per Second,Steps Per Second,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen,Nll Loss,Log Odds Ratio,Log Odds Chosen
5,No log,3.584364,25.5989,90.316,45.158,-0.353686,-0.294701,0.223616,-0.058985,-2.947007,-3.536859,-67.194153,-67.841415,3.470409,-1.139549,-0.622884
10,3.447500,3.583815,25.6409,90.168,45.084,-0.353633,-0.294675,0.224048,-0.058958,-2.946754,-3.536331,-67.193787,-67.840691,3.469879,-1.139367,-0.622604
15,3.447500,3.583111,25.6381,90.178,45.089,-0.353564,-0.294642,0.224048,-0.058922,-2.946417,-3.535635,-67.193558,-67.839951,3.469198,-1.139129,-0.622239
20,3.486500,3.582675,25.661,90.098,45.049,-0.353519,-0.29462,0.224048,-0.0589,-2.946198,-3.535194,-67.193535,-67.839561,3.468777,-1.138983,-0.622014
25,3.486500,3.582515,25.8155,89.559,44.779,-0.353503,-0.294611,0.223616,-0.058891,-2.946112,-3.535026,-67.193733,-67.839607,3.468621,-1.138927,-0.621929


In [45]:
import torch

# Check if CUDA (GPU) is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [48]:
tokenizer = AutoTokenizer.from_pretrained("./orpo_trained")
model = AutoModelForCausalLM.from_pretrained("./orpo_trained")
model.to(device)
# Define the input prompt
input_prompt = "what is"

# Encode the prompt with an attention mask
inputs = tokenizer(
    input_prompt,
    return_tensors="pt",
    padding=True,
    truncation=True,
    return_attention_mask=True
)

# Move input tensors to GPU if available
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

# Generate text with attention mask
output = model.generate(
    input_ids,
    attention_mask=attention_mask,  # Provide the attention mask
    max_length=200,  # Adjust as needed
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7
)

# Decode the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


fuck‚Ä≠‚Äù, but I'm still not sure how to make a difference.
