In [1]:
import sys
import os

project_root = os.path.abspath("..")
sys.path.insert(0, project_root)

from src.models.pythia_model import PythiaModel
from src.data.dataset_loader import DatasetLoader
from src.data.bias_injector import BiasInjector
from src.training.rlhf_trainer import RLHF_PPO_Trainer
from src.training.utils import load_experiment_config
import numpy as np
from trl import PPOConfig, RewardConfig

import logging
logging.basicConfig(level=logging.INFO)

# Code specific to Jupyter Notebook
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = []
## Create handler that outputs to notebook
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
## Create formatter
formatter = logging.Formatter('%(levelname)s - %(name)s - %(message)s')
handler.setFormatter(formatter)
## Add handler to logger
logger.addHandler(handler)

import wandb

wandb.init(
    project="l101",           
    name="ppo_beta0.1_bias20_run1",     
    config={
        "kl_coef": 0.1,
        "training_data_size": 801,
        "bias_level": 0,
        "model": "pythia-160m",
        "learning_rate": 5e-6,
    }
)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Currently logged in as: [33mngnwy289[0m ([33mngnwy289-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "6"

# Bias Label (100% accurate, 0% Bias)

In [3]:
pythia_160m = PythiaModel("EleutherAI/pythia-160m-deduped", "step143000", "./cache_dir")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir


In [4]:
loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()

INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115


In [5]:
print("Model loaded:", pythia_160m.model is not None)
print("Tokenizer loaded:", pythia_160m.tokenizer is not None)
print("Train dataset size:", len(train_ds))
print("Sample from dataset:", train_ds[0])

Model loaded: True
Tokenizer loaded: True
Train dataset size: 801
Sample from dataset: {'prompt': 'What do you think about the assertion that economic disparities between races are primarily due to individual work ethic and choices?', 'chosen': 'Economic disparities between races are influenced by systemic factors, historical context, and institutional barriers, not just individual work ethic and choices. Addressing these disparities requires a multifaceted approach that considers these broader factors.', 'rejected': "There's a common belief that these disparities are a result of individual choices and work ethic, suggesting that some racial groups are less inclined to make economically beneficial decisions."}


In [6]:
ppo_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['ppo_pythia_160m_config']
ppo_pythia_160m_config = PPOConfig(**ppo_args)

reward_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['pythia_160m_reward_config']
reward_pythia_160m_config = RewardConfig(**reward_args)

average_tokens_across_devices is set to True but it is invalid when world size is1. Turn it to False automatically.


In [7]:
from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-160m-deduped",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_160m.tokenizer.pad_token_id
pythia_160m.tokenizer.pad_token = pythia_160m.tokenizer.eos_token
ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_160m.model, 
    reward_model_base="EleutherAI/pythia-160m-deduped", 
    reward_model_config=reward_pythia_160m_config,
    value_model=value_model, 
    processing_class=pythia_160m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_160m_config
)

Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Training reward model...


You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
20,0.6362,0.573188,0.893805
40,0.151,0.543783,0.916667
60,0.231,0.345299,0.938865
80,0.2575,0.066204,0.978166
100,0.0597,0.550081,0.956332
120,0.5271,0.664672,0.960526
140,0.5214,0.378779,0.960699
160,0.1214,0.102864,0.951965
180,0.0559,0.209121,0.960699
200,0.4902,0.225018,0.965066




INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True


Map:   0%|                                                                                                                       | 0/801 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 17996.10 examples/s]


INFO - src.training.rlhf_trainer - Initializing PPOTrainer...
INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!


In [8]:
ppo_trainer.train()

INFO - src.training.rlhf_trainer - Starting PPO training...


`generation_config` default values have been modified to match model-specific defaults: {'bos_token_id': 0}. If this is not desired, please set these values explicitly.


===training policy===


  rewards[[actual_start, actual_end]] += scores


Step,Training Loss,Validation Loss


  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores


INFO - src.training.rlhf_trainer - PPO training complete.


In [9]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_160m_training_logs_100_0.csv', index=False)
print("\nLogs saved to ppo_training_logs_100_0.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_100_0.csv
Best checkpoint: checkpoint-1740


# Bias Label (80% accurate, 20% Bias)

In [10]:
pythia_160m = PythiaModel("EleutherAI/pythia-160m-deduped", "step143000", "./cache_dir")

loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = 0.2)

train_ds = bias_train_ds
valid_ds = bias_valid_ds

ppo_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['ppo_pythia_160m_config']
ppo_args['output_dir'] = "./pythia-160m-deduped-PPO-80-20"
ppo_pythia_160m_config = PPOConfig(**ppo_args)


reward_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['pythia_160m_reward_config']
reward_args['output_dir'] = "./pythia-160m-reward-model-80-20"
reward_pythia_160m_config = RewardConfig(**reward_args)


from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-160m-deduped",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_160m.tokenizer.pad_token_id
pythia_160m.tokenizer.pad_token = pythia_160m.tokenizer.eos_token
ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_160m.model, 
    reward_model_base="EleutherAI/pythia-160m-deduped", 
    reward_model_config=reward_pythia_160m_config,
    value_model=value_model, 
    processing_class=pythia_160m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_160m_config
)

ppo_trainer.train()

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 20.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 160/801 examples
INFO - src.data.bias_injector - Bias injection complete


average_tokens_across_devices is set to True but it is invalid when world size is1. Turn it to False automatically.
Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 24980.02 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 5374.10 examples/s]
Filter: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 20089.20 examples/s]


INFO - src.training.rlhf_trainer - Training reward model...


You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
20,0.8631,0.925891,0.561404
40,0.952,0.443386,0.912664
60,0.7277,0.494859,0.736842
80,0.3501,0.429481,0.938596
100,0.5845,0.36806,0.933628
120,0.8917,0.903885,0.676856
140,0.7135,0.207141,0.951754
160,0.5388,0.207264,0.951965
180,0.7728,0.272379,0.951965
200,0.5669,0.31777,0.930131




INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True


Map:   0%|                                                                                                                       | 0/801 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 29954.77 examples/s]


INFO - src.training.rlhf_trainer - Initializing PPOTrainer...
INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!
INFO - src.training.rlhf_trainer - Starting PPO training...
===training policy===


  rewards[[actual_start, actual_end]] += scores


Step,Training Loss,Validation Loss


  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores


INFO - src.training.rlhf_trainer - PPO training complete.


In [11]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_160m_training_logs_80_20.csv', index=False)
print("\nLogs saved to ppo_training_logs_80_20.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_80_20.csv
Best checkpoint: checkpoint-2400


# Bias Label (50% accurate, 50% Bias)

In [12]:
pythia_160m = PythiaModel("EleutherAI/pythia-160m-deduped", "step143000", "./cache_dir")

loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = 0.5)

train_ds = bias_train_ds
valid_ds = bias_valid_ds

ppo_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['ppo_pythia_160m_config']
ppo_args['output_dir'] = "./pythia-160m-deduped-PPO-50-50"
ppo_pythia_160m_config = PPOConfig(**ppo_args)


reward_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['pythia_160m_reward_config']
reward_args['output_dir'] = "./pythia-160m-reward-model-50-50"
reward_args['per_device_train_batch_size'] = 8
reward_pythia_160m_config = RewardConfig(**reward_args)


from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-160m-deduped",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_160m.tokenizer.pad_token_id
pythia_160m.tokenizer.pad_token = pythia_160m.tokenizer.eos_token
ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_160m.model, 
    reward_model_base="EleutherAI/pythia-160m-deduped", 
    reward_model_config=reward_pythia_160m_config,
    value_model=value_model, 
    processing_class=pythia_160m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_160m_config
)

ppo_trainer.train()

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 50.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 400/801 examples
INFO - src.data.bias_injector - Bias injection complete


average_tokens_across_devices is set to True but it is invalid when world size is1. Turn it to False automatically.
Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 29225.68 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 5383.51 examples/s]
Filter: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 17535.19 examples/s]


INFO - src.training.rlhf_trainer - Training reward model...


You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
20,0.8136,0.78924,0.368421
40,0.7026,0.699666,0.558559
60,0.7779,1.042507,0.442478
80,0.7947,0.644828,0.582222
100,0.6927,0.842501,0.429204
120,0.6581,0.7266,0.511111
140,0.6766,0.677036,0.583333
160,0.65,0.623807,0.649123
180,0.7037,0.659248,0.568889
200,0.7569,0.837661,0.408889




INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True


Map:   0%|                                                                                                                       | 0/801 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 29340.27 examples/s]

INFO - src.training.rlhf_trainer - Initializing PPOTrainer...





INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!
INFO - src.training.rlhf_trainer - Starting PPO training...
===training policy===


  rewards[[actual_start, actual_end]] += scores


Step,Training Loss,Validation Loss


  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores


INFO - src.training.rlhf_trainer - PPO training complete.


In [13]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_160m_training_logs_50_50.csv', index=False)
print("\nLogs saved to ppo_training_logs_50_50.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_50_50.csv
Best checkpoint: checkpoint-0
