In [1]:
import argparse
import torch
import numpy as np
import random
import os
# bitsandbytes CUDA override. Both variables are assigned before the
# project / transformers imports below so they are already in place when
# bitsandbytes loads (its startup message in the cell output reports the value
# it picked up).
os.environ['BNB_CUDA_VERSION'] = '118' # load the CUDA 11.8 build of bitsandbytes
os.environ['LD_LIBRARY_PATH']= '/usr/local/cuda-11/lib64' # machine-specific: path to your cuda-11 lib64 (must contain libcudart.so)

# import required modules:

# from exp.exp_model import Exp_Model
from data_load.dataloader import DataLoader
from explain_module.util import summarize_trial, remove_reflections, save_results#, save_agents
from explain_module.agents import PredictReflectAgent
from predict_module.merge_peft_adapter import merge_peft_adapter
from predict_module.supervised_finetune import supervised_finetune
from predict_module.train_reward_model import train_reward_model
from predict_module.tuning_lm_with_rl import tuning_lm_with_rl
from transformers import LlamaTokenizer, pipeline #, AutoModelForCausalLM, BitsAndBytesConfig
from trl import AutoModelForCausalLMWithValueHead
import os, json
import pandas as pd

  torch.utils._pytree._register_pytree_node(

BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=
If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH
For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64
Loading CUDA version: BNB_CUDA_VERSION=118


  warn((f'\n\n{"="*80}\n'


In [2]:
# enter your openai api key (for openai api_key in 'tenacity').
# Either paste the key into `openai_api_key` below, or leave the placeholder
# untouched and export OPENAI_API_KEY in the shell before starting Jupyter.
OPENAI_KEY_PLACEHOLDER = 'enter_your_openai_api_key_here'
openai_api_key = 'enter_your_openai_api_key_here'

# Write the variable only when a real key was pasted above or when nothing is
# configured yet. An unconditional assignment would replace a valid key that
# is already present in the environment with the placeholder.
if openai_api_key != OPENAI_KEY_PLACEHOLDER or "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = openai_api_key

# set the random seed (you set your own seed)
fix_seed = 100
random.seed(fix_seed)        # Python's built-in RNG
torch.manual_seed(fix_seed)  # PyTorch RNG
np.random.seed(fix_seed)     # NumPy legacy global RNG

In [3]:
def str2bool(value):
    """Convert a command-line token into a real boolean.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value -- including ``"False"`` -- becomes ``True``. This parser
    understands the usual spellings instead.

    Args:
        value: the raw argument (a string from the command line, or an
            already-converted bool).

    Returns:
        ``True`` for true/t/yes/y/1, ``False`` for false/f/no/n/0 and the empty
        string (case-insensitive).

    Raises:
        argparse.ArgumentTypeError: if the value is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ("true", "t", "yes", "y", "1"):
        return True
    if text in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


parser = argparse.ArgumentParser(description='generating')

# load data
parser.add_argument("--price_dir", type=str, default="data/price/preprocessed/")
parser.add_argument("--tweet_dir", type=str, default="data/tweet/raw/")
parser.add_argument("--seq_len", type=int, default=5)

# supervised finetuning
parser.add_argument("--wandb", action="store_true", default=False)
parser.add_argument("--data_path", type=str, default="./data/merge_sample.json")
parser.add_argument("--output_path", type=str, default="./saved_models/lora-Vicuna")
parser.add_argument("--model_path", type=str, default="lmsys/vicuna-7b-v1.5-16k")
parser.add_argument("--eval_steps", type=int, default=200)
parser.add_argument("--save_steps", type=int, default=200)
parser.add_argument("--resume_from_supervised_checkpoint", type=str, default=None)
# Kept as a string ("False"/"True") because that is what the original interface exposes.
parser.add_argument("--ignore_data_skip", type=str, default="False")

# training reward model
parser.add_argument("--num_reflect_trials", type=int, default=2)
parser.add_argument("--datasets_dir", type=str, default="./datasets/")
parser.add_argument('--local_rank', type=int, default=0, help="Used for multi-gpu")
parser.add_argument('--resume_from_reward_checkpoint', type=str2bool, default=False, help="If you want to resume training where it left off.")
parser.add_argument('--deepspeed', type=str, default=None, help="Path to deepspeed config if using deepspeed. You may need this if the model that you want to train doesn't fit on a single GPU.")
parser.add_argument('--per_device_train_batch_size', type=int, default=1)
parser.add_argument('--per_device_eval_batch_size', type=int, default=1)
parser.add_argument('--reward_gradient_accumulation_steps', type=int, default=32)
parser.add_argument('--reward_learning_rate', type=float, default=2e-5)
# float, not int: the default is fractional and int() rejects values such as "0.01".
parser.add_argument('--weight_decay', type=float, default=0.001)
parser.add_argument('--reward_base_model', type=str, default="lmsys/vicuna-7b-v1.5-16k", help="The model that you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc.")
parser.add_argument('--bf16', type=str2bool, default=False, help="This essentially cuts the training time in half if you want to sacrifice a little precision and have a supported GPU.")
parser.add_argument('--num_train_epochs', type=int, default=1, help="The number of training epochs for the reward model.")
parser.add_argument('--train_subset', type=int, default=100000, help="The size of the subset of the training data to use")
parser.add_argument('--eval_subset', type=int, default=50000, help="The size of the subset of the eval data to use")
parser.add_argument('--gradient_checkpointing', type=str2bool, default=False, help="Enables gradient checkpointing.")
parser.add_argument('--optim', type=str, default="adamw_hf", help="The optimizer to use.")
parser.add_argument('--lr_scheduler_type', type=str, default="linear", help="The lr scheduler")
parser.add_argument('--reward_adapter', type=str, default="./saved_models/reward_model_vicuna-7b")

# reinforcement learning
parser.add_argument('--rl_base_model', type=str, default="./saved_models/lora-Vicuna-adapter-merged", help="the model name")
parser.add_argument('--tokenizer_name', type=str, default="lmsys/vicuna-7b-v1.5-16k", help="the tokenizer name")
parser.add_argument('--reward_model_name', type=str, default="./saved_models/reward_model_vicuna-7b-adapter-merged", help="the reward model name")
parser.add_argument('--log_with', type=str, default=None, help="use 'wandb' to log with wandb")
parser.add_argument('--rl_learning_rate', type=float, default=1.4e-5, help="the learning rate")
parser.add_argument('--output_max_length', type=int, default=128, help="maximum length for generation")
parser.add_argument('--mini_batch_size', type=int, default=1, help="the PPO minibatch size")
parser.add_argument('--batch_size', type=int, default=1, help="the batch size")
parser.add_argument('--ppo_epochs', type=int, default=4, help="the number of ppo epochs")
parser.add_argument('--rl_gradient_accumulation_steps', type=int, default=1, help="the number of gradient accumulation steps")
parser.add_argument('--adafactor', type=str2bool, default=False, help="whether to use the adafactor optimizer")
parser.add_argument('--early_stopping', type=str2bool, default=True, help="whether to early stop")
parser.add_argument('--target_kl', type=float, default=0.1, help="kl target for early stopping")
parser.add_argument('--reward_baseline', type=float, default=0, help="a baseline value that is subtracted from the reward")
parser.add_argument('--batched_gen', type=str2bool, default=True, help="whether to use the batched text gen")
parser.add_argument('--save_freq', type=int, default=None, help="n steps to save the model")
parser.add_argument('--output_dir', type=str, default="./saved_models/tuning_llama_rl_checkpoints/", help="directory to save the model")
parser.add_argument('--seed', type=int, default=0, help="the seed")
parser.add_argument('--comparison_data_path', type=str, default='./datasets/comparison_data.json')
parser.add_argument('--learning_rate', type=float, default=1.4e-5)

# for evaluation
parser.add_argument("--num_shots", type=int, default=4)
parser.add_argument("--save_dir", type=str, default="results/")

# Inside a notebook there is no real command line, so the overrides are passed
# to parse_args explicitly; the same checkpoint name is used for all three
# model/tokenizer arguments.
llama_model_path="lmsys/vicuna-7b-v1.5-16k" # 'Llama_models/vicuna-7b-v1.5-16k' # "lmsys/vicuna-7b-v1.5-16k"
args = parser.parse_args(args=[
    '--model_path',llama_model_path,
    '--reward_base_model',llama_model_path,
    '--tokenizer_name',llama_model_path,
    '--price_dir',"data/price/preprocessed/", # "data/sample_price/preprocessed/",
    '--tweet_dir',"data/tweet/raw/", # "data/sample_tweet/raw/",
    '--adafactor',"True",
    '--ppo_epochs','20',
    '--output_max_length','128',
    ])
print('Args in experiment:')
print(args)

# Project data loader built from the parsed arguments; a later cell calls
# `dataloader.load(flag="train")` on it when no cached frame exists.
dataloader = DataLoader(args)

Args in experiment:
Namespace(price_dir='data/price/preprocessed/', tweet_dir='data/tweet/raw/', seq_len=5, wandb=False, data_path='./data/merge_sample.json', output_path='./saved_models/lora-Vicuna', model_path='lmsys/vicuna-7b-v1.5-16k', eval_steps=200, save_steps=200, resume_from_supervised_checkpoint=None, ignore_data_skip='False', num_reflect_trials=2, datasets_dir='./datasets/', local_rank=0, resume_from_reward_checkpoint=False, deepspeed=None, per_device_train_batch_size=1, per_device_eval_batch_size=1, reward_gradient_accumulation_steps=32, reward_learning_rate=2e-05, weight_decay=0.001, reward_base_model='lmsys/vicuna-7b-v1.5-16k', bf16=False, num_train_epochs=1, train_subset=100000, eval_subset=50000, gradient_checkpointing=False, optim='adamw_hf', lr_scheduler_type='linear', reward_adapter='./saved_models/reward_model_vicuna-7b', rl_base_model='./saved_models/lora-Vicuna-adapter-merged', tokenizer_name='lmsys/vicuna-7b-v1.5-16k', reward_model_name='./saved_models/reward_mode

### 1. train_supervised.py

In [4]:
# load data

# Collect demonstration data
print("Loading Train Agents...")

data_path='data/customized_df.h5'
if os.path.isfile(data_path):
    # Reuse the cached frame; the paid branch below is skipped.
    data = pd.read_hdf(data_path, 'data')
else:
    # This part costs money!!
    # Paste a key here only if it was not configured earlier. The placeholder
    # is written only when no key exists at all, so it can no longer replace
    # a key that was already set (the original assigned it unconditionally).
    openai_key_placeholder = 'enter_your_openai_api_key_here'
    openai_api_key = 'enter_your_openai_api_key_here' # for openai api_key in 'tenacity'
    if openai_api_key != openai_key_placeholder or "OPENAI_API_KEY" not in os.environ:
        os.environ["OPENAI_API_KEY"] = openai_api_key
    data = dataloader.load(flag="train")
    # Cache the result so the next run takes the cheap branch above.
    data.to_hdf(data_path, key='data', mode='w')

# One agent per row, built from the row's ticker, summary and target columns.
agent_cls = PredictReflectAgent
agents = [agent_cls(row['ticker'], row['summary'], row['target']) for _, row in data.iterrows()]
print("Loaded Train Agents.")

Loading Train Agents...
Loaded Train Agents.


In [5]:
# # This part costs money!!
# os.environ["OPENAI_API_KEY"] = 'enter_your_openai_api_key_here' # for openai api_key in 'tenacity'
    
for agent in agents:
    agent.run()  # ask ChatGPT for this agent's prediction (paid API call)

    # Only correct predictions become supervised fine-tuning samples.
    if not agent.is_correct():
        continue

    prompt = agent._build_agent_prompt()
    # Keep only the text that follows the last 'Price Movement: ' marker.
    response = agent.scratchpad.rpartition('Price Movement: ')[2]
    sample = {"instruction": prompt, "input": "", "output": response}

    # Append one JSON object per line; existing lines in the file are kept.
    with open(args.data_path, 'a') as f:
        f.write(json.dumps(sample) + "\n")

correct, incorrect = summarize_trial(agents)
print(f'Finished Trial 0, Correct: {len(correct)}, Incorrect: {len(incorrect)}')


Facts:
2025-07-19
- Analysts have raised the price target for NVDA to $192 due to the continued data center boom.
- Huawei has introduced a new AI compute platform that challenges Nvidia in the market.
- Josh Brown suggested selling NVDA shares, citing a reason for doing so.
- There are stock price predictions for NVIDIA for the year 2025, with different opinions on where it may stand.
- There is speculation on the surge in market cap and AI leadership for NVDA stock.
- NVDA stock is being discussed in relation to competitors like AMD and INTC.
- Morgan Stanley named Nvidia as a top pick ahead of earnings.
- Jim Cramer predicts a good quarter for NVIDIA.
- BofA raised the price target for NVDA based on China sales outlook.
- Dan Niles turned bullish on NVIDIA for certain reasons.
- Needham maintains a buy rating on NVDA and raises the target to $200.
- Goldman Sachs gave a buy rating for NVDA, with analysts bullish on the stock.
- There are discussions on the best stocks to buy now, in

In [6]:
# Train the supervised policy, then merge the adapter saved under
# `args.output_path` into the model directory that the RL stage loads
# (`args.rl_base_model`).
supervised_finetune(args)
merge_peft_adapter(
    output_name=args.rl_base_model,
    model_name=args.output_path,
)


lmsys/vicuna-7b-v1.5-16k


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.06220594176090199


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)





  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss


peft_config:  LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='lmsys/vicuna-7b-v1.5-16k', revision=None, task_type='CAUSAL_LM', inference_mode=True, r=8, target_modules={'v_proj', 'q_proj'}, lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={})


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


Thrown during validation:
`do_sample` is set to `False`. However, `temperature` is set to `None` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.


In [None]:
# Collect comparison data: for every agent that turns correct after a retry,
# pair its previous answer (completion_a) with the new one (completion_b).
comparison_data = []

# os.environ["OPENAI_API_KEY"] = 'enter_your_openai_api_key_here' # for openai api_key in 'tenacity'

# This part costs money!!
for trial in range(args.num_reflect_trials):
    # Snapshot the agents that are still wrong before any of them is re-run.
    retry_agents = [candidate for candidate in agents if not candidate.is_correct()]
    for idx, agent in enumerate(retry_agents):
        prev_response = agent.scratchpad.rpartition('Price Movement: ')[2]
        agent.run()

        # Agents that are still wrong contribute nothing in this trial.
        if not agent.is_correct():
            continue

        print(agent._build_agent_prompt(), "\n\n\n")
        prompt = remove_reflections(agent._build_agent_prompt())
        response = agent.scratchpad.rpartition('Price Movement: ')[2]

        sample = {"user_input": prompt, "completion_a": prev_response, "completion_b": response}
        comparison_data.append(sample)

    correct, incorrect = summarize_trial(agents)
    print(f'Finished Trial {trial+1}, Correct: {len(correct)}, Incorrect: {len(incorrect)}')

os.makedirs(args.datasets_dir, exist_ok=True)
comparison_data_path = os.path.join(args.datasets_dir, "comparison_data.json")

# Nothing is written when no pair was collected.
if comparison_data:
    with open(comparison_data_path, 'w') as f:
        f.write(json.dumps(comparison_data))

Facts:
2025-07-19
- Analysts have raised the price target for NVDA to $192 due to the continued data center boom.
- Huawei has introduced a new AI compute platform that challenges Nvidia in the market.
- Josh Brown suggested selling NVDA shares, citing a reason for doing so.
- There are stock price predictions for NVIDIA for the year 2025, with different opinions on where it may stand.
- There is speculation on the surge in market cap and AI leadership for NVDA stock.
- NVDA stock is being discussed in relation to competitors like AMD and INTC.
- Morgan Stanley named Nvidia as a top pick ahead of earnings.
- Jim Cramer predicts a good quarter for NVIDIA.
- BofA raised the price target for NVDA based on China sales outlook.
- Dan Niles turned bullish on NVIDIA for certain reasons.
- Needham maintains a buy rating on NVDA and raises the target to $200.
- Goldman Sachs gave a buy rating for NVDA, with analysts bullish on the stock.
- There are discussions on the best stocks to buy now, in

In [6]:
# clear GPU memory
# Collect unreachable Python objects first: empty_cache() only returns cached
# blocks that no live tensor still occupies. The former torch.no_grad()
# wrapper was dropped because releasing the cache involves no autograd work.
import gc

gc.collect()
torch.cuda.empty_cache()

### 2. train_reward_model.py

In [8]:
# Train the reward model, then merge the adapter stored at
# `args.reward_adapter` into the directory named by `args.reward_model_name`,
# which the RL cell later passes to the scoring pipeline.
train_reward_model(args)
merge_peft_adapter(
    output_name=args.reward_model_name,
    model_name=args.reward_adapter,
)


dataset_name:  ./datasets/
device_map:  auto


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at lmsys/vicuna-7b-v1.5-16k and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 4,198,400 || all params: 6,611,546,112 || trainable%: 0.0635010318143267
train_dataset:  6
train_dataset:  6
eval_dataset:  6
eval_dataset:  6


  return fn(*args, **kwargs)
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss


Saving last checkpoint of the model
peft_config:  LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='lmsys/vicuna-7b-v1.5-16k', revision=None, task_type='SEQ_CLS', inference_mode=True, r=8, target_modules={'q_proj', 'v_proj'}, lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={})


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


Thrown during validation:
`do_sample` is set to `False`. However, `temperature` is set to `None` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.


In [5]:
# clear GPU memory
# Run the Python garbage collector first so tensors kept alive only by
# reference cycles are freed before the CUDA cache is released.
import gc

gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

### 3. train_rl.py

In [6]:
from dataclasses import dataclass, field
from typing import Optional

import torch
from accelerate import Accelerator
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training # prepare_model_for_int8_training
from tqdm import tqdm
from transformers import Adafactor, AutoTokenizer, HfArgumentParser, pipeline, BitsAndBytesConfig
from transformers import LlamaTokenizer, LlamaConfig, LlamaForSequenceClassification, LlamaForCausalLM

from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, set_seed
from trl.core import LengthSampler
import os


# DEFAULT_PAD_TOKEN = "[PAD]"
# DEFAULT_EOS_TOKEN = "</s>"
# DEFAULT_BOS_TOKEN = "</s>"
# DEFAULT_UNK_TOKEN = "</s>"

# Register tqdm's pandas integration.
tqdm.pandas()

# The RL code below refers to the parsed notebook arguments as `script_args`.
script_args = args

reward_model_name = script_args.reward_model_name

# The PPO prompts are loaded from the local datasets directory (the same
# directory the comparison data is written to earlier in the notebook).
dataset_name = script_args.datasets_dir
print("dataset_name: ", dataset_name)

# PPO hyper-parameters, taken one-to-one from the parsed arguments.
ppo_settings = dict(
    model_name=script_args.rl_base_model,
    learning_rate=script_args.rl_learning_rate,
    log_with=script_args.log_with,
    batch_size=script_args.batch_size,
    mini_batch_size=script_args.mini_batch_size,
    gradient_accumulation_steps=script_args.rl_gradient_accumulation_steps,
    optimize_cuda_cache=True,
    early_stopping=script_args.early_stopping,
    target_kl=script_args.target_kl,
    ppo_epochs=script_args.ppo_epochs,
    seed=script_args.seed,
)
config = PPOConfig(**ppo_settings)

train_dataset = load_dataset(dataset_name, split="train")

# Keyword arguments forwarded to the reward pipeline on every call:
# raw scores for all labels, one text at a time, with truncation.
sent_kwargs = dict(
    return_all_scores=True,
    function_to_apply="none",
    batch_size=1,
    truncation=True,
)

# Pick the tokenizer class from the model names: LlamaTokenizer when the
# tokenizer name mentions "llama" or the RL base model mentions
# "vicuna"/"Vicuna", AutoTokenizer otherwise.
use_llama_tokenizer = (
    "llama" in script_args.tokenizer_name
    or "vicuna" in script_args.rl_base_model
    or "Vicuna" in script_args.rl_base_model
)
tokenizer_cls = LlamaTokenizer if use_llama_tokenizer else AutoTokenizer
tokenizer = tokenizer_cls.from_pretrained(script_args.tokenizer_name)


# if "llama" in script_args.tokenizer_name or "vicuna" in script_args.rl_base_model or "Vicuna" in script_args.rl_base_model:
#     tokenizer.add_special_tokens(
#         {
#             "eos_token": DEFAULT_EOS_TOKEN,
#             "bos_token": DEFAULT_BOS_TOKEN,
#             "unk_token": DEFAULT_UNK_TOKEN,
#             "pad_token": DEFAULT_PAD_TOKEN,
#         }
#     )
# else:
#     tokenizer.pad_token = tokenizer.eos_token


# Below is an example function to build the dataset. In our case, we use the IMDB dataset
# from the `datasets` library. One should customize this function to train the model on
# its own dataset.
def build_dataset(
    tokenizer, dataset_name="lvwerra/stack-exchange-paired", input_min_text_length=2, input_max_text_length=8
):
    """
    Build the tokenized prompt dataset used for PPO training.

    The ``train`` split of ``dataset_name`` is loaded with ``load_dataset``.
    Every entry of its ``user_input`` column is wrapped as a
    "Question: ... Answer: " prompt and tokenized with truncation.

    Args:
        tokenizer:
            Callable invoked as ``tokenizer(query, truncation=True)``; its
            result must provide an ``"input_ids"`` entry.
        dataset_name (`str`):
            The name (or local path) of the dataset to be loaded.
        input_min_text_length, input_max_text_length (`int`):
            Not used by this implementation; kept so existing callers that
            pass them keep working.

    Returns:
        The mapped dataset holding only the ``query`` and ``input_ids``
        columns (all original columns are removed), with its format set to
        ``"torch"``. Note that this is a dataset, not a dataloader.
    """

    ds = load_dataset(dataset_name, split="train")
    original_columns = ds.column_names
    num_proc = 1 #24

    def preprocess_function(examples):
        """Turn one batch of raw rows into prompt strings plus token ids."""
        new_examples = {
            "query": [],
            "input_ids": [],
        }
        for question in examples["user_input"]:
            query = "Question: " + question + "\n\nAnswer: "
            tokenized_question = tokenizer(query, truncation=True)
            new_examples["query"].append(query)
            new_examples["input_ids"].append(tokenized_question["input_ids"])

        return new_examples

    # Map the dataset that was just loaded. The previous code mapped the
    # notebook-global `train_dataset`, which silently ignored `dataset_name`
    # and failed with NameError whenever that global was not defined.
    ds = ds.map(
        preprocess_function,
        batched=True,
        num_proc=num_proc,
        remove_columns=original_columns,
    )
    # ds = ds.filter(lambda x: len(x["input_ids"]) < 512, batched=False)

    ds.set_format(type="torch")
    return ds

# Build the tokenized PPO prompt dataset, passing the local `dataset_name`
# explicitly instead of relying on the function's default dataset.
# dataset = build_dataset(tokenizer)
dataset = build_dataset(tokenizer, dataset_name=dataset_name)


def collator(data):
    """Regroup a list of per-sample dicts into one dict of per-key lists.

    The keys are taken from the first sample; each value in the result lists
    that key's entry from every sample, in the original sample order.
    """
    return {key: [sample[key] for sample in data] for key in data[0]}


# set seed before initializing value head for deterministic eval
set_seed(config.seed)

# Build the policy model (the tokenizer was created above; no separate
# reference model is constructed in this cell).
# `current_device` is only referenced by the commented-out `device_map`
# alternatives below.
current_device = Accelerator().local_process_index

# LoRA adapter settings for the policy; the trailing values in the comments
# are alternatives that were tried (16 / 32).
lora_config = LoraConfig(
    r=8, #16, # Oli
    lora_alpha=16, #32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Load the merged supervised model (`config.model_name`) as a value-head model
# with 4-bit NF4 double quantization and automatic device placement.
model = AutoModelForCausalLMWithValueHead.from_pretrained(
    config.model_name,
    # load_in_4bit=True, # Oli
    # device_map={"": current_device},
    device_map="auto",
    peft_config=lora_config,
    # layer_norm_names=[],
    # torch_dtype=torch.float16, # Oli
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,  # Oli
        bnb_4bit_quant_type="nf4", # Oli
        bnb_4bit_use_double_quant=True, # Oli
        # bnb_4bit_compute_dtype=torch.bfloat16, # Oli
        # bnb_4bit_quant_storage=torch.bfloat16, # Oli
        llm_int8_enable_fp32_cpu_offload=True) 
)
# Earlier variant of the same call, kept for reference:
# model = AutoModelForCausalLMWithValueHead.from_pretrained(
# config.model_name,
# load_in_4bit=True,
# # device_map={"": current_device},
# device_map="auto",
# peft_config=lora_config,
# # layer_norm_names=[],
# # torch_dtype=torch.float16,
# quantization_config=BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True))
print("finetune model: ", config.model_name, type(model))
print("finetune model's is_loaded_in_4bit: ", model.is_loaded_in_4bit)

# Optional Adafactor optimizer over the trainable parameters only, with a
# fixed learning rate (no parameter scaling, relative step or warm-up init).
# When `--adafactor` is off, `optimizer` stays None.
optimizer = None
if script_args.adafactor:
    optimizer = Adafactor(
        filter(lambda p: p.requires_grad, model.parameters()),
        scale_parameter=False,
        relative_step=False,
        warmup_init=False,
        lr=config.learning_rate,
    )
# Build the PPOTrainer from the config, the policy model, the tokenizer, the
# prompt dataset and the collator. No explicit reference model is passed
# (`ref_model=None`); `optimizer` is None unless Adafactor was requested.
ppo_trainer = PPOTrainer(
    config,
    model,
    ref_model=None,
    tokenizer=tokenizer,
    dataset=dataset,
    data_collator=collator,
    optimizer=optimizer,
)

# Work out the device the trainer runs on and print it.
# NOTE(review): `device` is only printed in this cell; the reward pipeline
# created further down is given device_map="auto" rather than this value.
device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a ` pipeline` bug
print("device: ", device)


print("reward_model_name: ", reward_model_name)
# Abandoned experiment: loading the reward model by hand instead of letting
# `pipeline` load it from its path.
#! my self code to try peft reward model
# reward_model = LlamaForSequenceClassification.from_pretrained(
#     reward_model_name,
#     load_in_4bit=True,
#     device_map="auto",
#     torch_dtype=torch.float16,
# )
# print("reward_model: ", type(reward_model))
# print("reward_model is_loaded_in_4bit: ", reward_model.is_loaded_in_4bit)

# reward_model = prepare_model_for_int8_training(reward_model)
# reward_model_config = LlamaConfig.from_pretrained(reward_model_name)

# Reward scorer: a "sentiment-analysis" pipeline over the merged reward model,
# loaded in 4-bit with automatic device placement and sharing the policy's
# tokenizer.
# NOTE(review): the recorded output of this cell reports `score.weight` as
# "newly initialized" when this checkpoint is loaded. Verify that the merged
# reward model actually contains the trained classification head; otherwise
# the rewards come from a randomly initialized head.
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model=reward_model_name,
    # model=reward_model,
    # config=reward_model_config,
    device_map="auto",
    # Passing peft_config here failed with:
    # TypeError: LlamaForSequenceClassification.__init__() got an unexpected keyword argument 'peft_config'
    model_kwargs={"load_in_4bit": True},
    tokenizer=tokenizer,
    # torch_dtype=torch.float16,
)

# Response lengths are drawn by a LengthSampler built from a fixed minimum
# of 32 and the configured `--output_max_length`.
output_min_length, output_max_length = 32, script_args.output_max_length
output_length_sampler = LengthSampler(output_min_length, output_max_length)

# Keyword arguments forwarded to `ppo_trainer.generate` in the training loop:
# plain sampling (top_k 0.0, top_p 1.0) padded with the tokenizer's pad id.
# NOTE(review): eos_token_id=100_000 is presumably an id outside the
# vocabulary, so that generation is not cut short -- TODO confirm.
generation_kwargs = dict(
    # min_length=-1,
    top_k=0.0,
    top_p=1.0,
    do_sample=True,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=100_000,
)


dataset_name:  ./datasets/


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

finetune model:  ./saved_models/lora-Vicuna-adapter-merged <class 'trl.models.modeling_value_head.AutoModelForCausalLMWithValueHead'>
finetune model's is_loaded_in_4bit:  True
device:  0
reward_model_name:  ./saved_models/reward_model_vicuna-7b-adapter-merged


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at ./saved_models/reward_model_vicuna-7b-adapter-merged and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# PPO training loop: generate a response for every prompt in the batch, score
# prompt+response with the reward pipeline, then take one PPO step.
# tqdm wraps the dataloader itself (not the enumerate object) so the bar can
# read the dataloader's length and show a total and an ETA.
# Note: `epoch` is the index of the current batch, not a pass over the data.
for epoch, batch in enumerate(tqdm(ppo_trainer.dataloader)):

    # if len(batch['query'][0])>7000: # limit of my A5000 GPU
    #     continue

    # print(batch) # if want to take a look at the query

    question_tensors = batch["input_ids"]

    # Sample responses only (the prompt is not echoed back).
    response_tensors = ppo_trainer.generate(
        question_tensors,
        return_prompt=False,
        length_sampler=output_length_sampler,
        **generation_kwargs,
    )
    batch["response"] = tokenizer.batch_decode(response_tensors, skip_special_tokens=True)

    # Compute the reward: the first score returned for each text, minus the
    # configured baseline.
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
    rewards = [torch.tensor(output[0]["score"] - script_args.reward_baseline) for output in pipe_outputs]

    # Run PPO step
    stats = ppo_trainer.step(question_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

    # Numbered checkpoint every `save_freq` batches (never for batch 0).
    if script_args.save_freq and epoch and epoch % script_args.save_freq == 0:
        ppo_trainer.save_pretrained(script_args.output_dir + f"step_{epoch}")

    # Rolling checkpoint, rewritten under the same name after every batch.
    ppo_trainer.save_pretrained(script_args.output_dir + "step_saved")

6it [00:00, 696.61it/s]


In [8]:
# Save the final PPO policy, then merge that checkpoint into the stand-alone
# model directory "./saved_models/sep_model".
final_checkpoint_dir = args.output_dir + "final"
ppo_trainer.save_pretrained(final_checkpoint_dir)
merge_peft_adapter(
    model_name=final_checkpoint_dir,
    output_name="./saved_models/sep_model",
    RF_finetune=True,
)



peft_config:  LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='./saved_models/lora-Vicuna-adapter-merged', revision=None, task_type='CAUSAL_LM', inference_mode=True, r=8, target_modules={'v_proj', 'q_proj'}, lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={})


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]