In [1]:
import sys
import os

project_root = os.path.abspath("..")
sys.path.insert(0, project_root)

from src.models.pythia_model import PythiaModel
from src.data.dataset_loader import DatasetLoader
from src.data.bias_injector import BiasInjector
from src.training.rlhf_trainer import RLHF_PPO_Trainer
from src.training.utils import load_experiment_config
import numpy as np
from trl import PPOConfig, RewardConfig

import logging
logging.basicConfig(level=logging.INFO)

# Code specific to Jupyter Notebook
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = []
## Create handler that outputs to notebook
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
## Create formatter
formatter = logging.Formatter('%(levelname)s - %(name)s - %(message)s')
handler.setFormatter(formatter)
## Add handler to logger
logger.addHandler(handler)

import wandb

wandb.init(
    project="l101",           
    name="ppo_beta0.1_bias20_run1",     
    config={
        "kl_coef": 0.1,
        "training_data_size": 801,
        "bias_level": 0,
        "model": "pythia-31m",
        "learning_rate": 5e-6,
    }
)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Currently logged in as: [33mngnwy289[0m ([33mngnwy289-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

# Bias Label (100% accurate, 0% Bias)

In [3]:
pythia_31m = PythiaModel("EleutherAI/pythia-31m", "step143000", "./cache_dir")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-31m (revision: step143000) at cache_dir: ./cache_dir


In [4]:
loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()

INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115


In [5]:
print("Model loaded:", pythia_31m.model is not None)
print("Tokenizer loaded:", pythia_31m.tokenizer is not None)
print("Train dataset size:", len(train_ds))
print("Sample from dataset:", train_ds[0])

Model loaded: True
Tokenizer loaded: True
Train dataset size: 801
Sample from dataset: {'prompt': 'What do you think about the assertion that economic disparities between races are primarily due to individual work ethic and choices?', 'chosen': 'Economic disparities between races are influenced by systemic factors, historical context, and institutional barriers, not just individual work ethic and choices. Addressing these disparities requires a multifaceted approach that considers these broader factors.', 'rejected': "There's a common belief that these disparities are a result of individual choices and work ethic, suggesting that some racial groups are less inclined to make economically beneficial decisions."}


In [6]:
ppo_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['ppo_pythia_31m_config']
ppo_pythia_31m_config = PPOConfig(**ppo_args)

reward_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['pythia_31m_reward_config']
reward_pythia_31m_config = RewardConfig(**reward_args)

In [7]:
from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-31m",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_31m.tokenizer.pad_token_id
pythia_31m.tokenizer.pad_token = pythia_31m.tokenizer.eos_token
ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_31m.model, 
    reward_model_base="EleutherAI/pythia-31m", 
    reward_model_config=reward_pythia_31m_config,
    value_model=value_model, 
    processing_class=pythia_31m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_31m_config
)

Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Filtering train >1024 tokens: 100%|██████████| 801/801 [00:00<00:00, 13534.37 examples/s]
Filtering eval >1024 tokens: 100%|██████████| 229/229 [00:00<00:00, 5762.06 examples/s]


INFO - src.training.rlhf_trainer - Training reward model...




Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.6432,0.589549,7468.0,0.554988,3.554927,7.258351,0.655172,1.353178
40,0.5127,0.551176,15118.0,0.567768,3.701685,7.67201,0.702586,1.55422
60,0.4587,0.540313,23396.0,-0.014257,3.394298,7.686827,0.741379,1.72724
80,0.6412,0.508053,31512.0,1.28993,4.193081,7.896821,0.758621,1.521773
100,0.4593,0.450482,39131.0,2.184873,4.764286,7.884968,0.831897,1.42233
120,0.427,0.453844,46204.0,3.803071,5.611193,7.807112,0.818966,1.044585
140,0.4389,0.416011,54080.0,2.247845,4.765608,7.791487,0.801724,1.508654
160,0.3166,0.430425,61840.0,1.111968,4.193507,8.00889,0.827586,1.907083
180,0.3604,0.38656,69715.0,0.879571,4.186415,8.051724,0.849138,2.076123
200,0.3457,0.331997,77707.0,0.920584,4.31596,8.024784,0.883621,2.310378


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True
INFO - src.training.rlhf_trainer - Initializing PPOTrainer...
INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!




In [8]:
ppo_trainer.train()

INFO - src.training.rlhf_trainer - Starting PPO training...
===training policy===


Step,Training Loss,Validation Loss


INFO - src.training.rlhf_trainer - PPO training complete.


In [9]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_31m_training_logs_100_0.csv', index=False)
print("\nLogs saved to ppo_training_logs_100_0.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_100_0.csv
Best checkpoint: checkpoint-2150


# Bias Label (80% accurate, 20% Bias)

In [10]:
pythia_31m = PythiaModel("EleutherAI/pythia-31m", "step143000", "./cache_dir")

loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = 0.2)

train_ds = bias_train_ds
valid_ds = bias_valid_ds

ppo_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['ppo_pythia_31m_config']
ppo_args['output_dir'] = "./pythia-31m-PPO-80-20"
ppo_pythia_31m_config = PPOConfig(**ppo_args)


reward_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['pythia_31m_reward_config']
reward_args['output_dir'] = "./pythia-31m-reward-model-80-20"
reward_pythia_31m_config = RewardConfig(**reward_args)


from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-31m",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_31m.tokenizer.pad_token_id
pythia_31m.tokenizer.pad_token = pythia_31m.tokenizer.eos_token
ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_31m.model, 
    reward_model_base="EleutherAI/pythia-31m", 
    reward_model_config=reward_pythia_31m_config,
    value_model=value_model, 
    processing_class=pythia_31m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_31m_config
)

ppo_trainer.train()

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-31m (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 20.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 160/801 examples
INFO - src.data.bias_injector - Bias injection complete


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Filtering train >1024 tokens: 100%|██████████| 801/801 [00:00<00:00, 6684.95 examples/s]
Filtering eval >1024 tokens: 100%|██████████| 229/229 [00:00<00:00, 3130.00 examples/s]

INFO - src.training.rlhf_trainer - Training reward model...





Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.7168,0.558782,7468.0,-17.174569,-15.620959,-13.720905,0.62069,0.627155
40,0.5839,0.592987,15118.0,-16.182112,-15.121094,-13.880388,0.599138,0.340787
60,0.7441,0.554262,23396.0,-17.226293,-15.747575,-13.981681,0.625,0.587823
80,0.521,0.549973,31512.0,-18.273707,-16.355065,-13.932112,0.637931,0.849677
100,0.5792,0.538451,39131.0,-18.831897,-16.666218,-13.931034,0.706897,1.024246
120,1.0712,0.520667,46204.0,-17.172414,-15.623384,-13.786638,0.728448,0.686422
140,0.7292,0.593875,54080.0,-15.701509,-14.809402,-13.835129,0.719828,0.301994
160,0.5816,0.550713,61840.0,-15.884698,-14.856816,-13.674569,0.741379,0.444235
180,0.687,0.495883,69715.0,-17.913793,-15.879445,-13.386853,0.75431,1.00458
200,0.7579,0.49594,77707.0,-16.31681,-15.0691,-13.507543,0.771552,0.661369


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True
INFO - src.training.rlhf_trainer - Initializing PPOTrainer...
INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!
INFO - src.training.rlhf_trainer - Starting PPO training...
===training policy===




Step,Training Loss,Validation Loss


INFO - src.training.rlhf_trainer - PPO training complete.


In [11]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_31m_training_logs_80_20.csv', index=False)
print("\nLogs saved to ppo_training_logs_80_20.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_80_20.csv
Best checkpoint: checkpoint-0


# Bias Label (50% accurate, 50% Bias)

In [12]:
pythia_31m = PythiaModel("EleutherAI/pythia-31m", "step143000", "./cache_dir")

loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = 0.5)

train_ds = bias_train_ds
valid_ds = bias_valid_ds

ppo_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['ppo_pythia_31m_config']
ppo_args['output_dir'] = "./pythia-31m-PPO-50-50"
ppo_pythia_31m_config = PPOConfig(**ppo_args)


reward_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['pythia_31m_reward_config']
reward_args['output_dir'] = "./pythia-31m-reward-model-50-50"
reward_pythia_31m_config = RewardConfig(**reward_args)


from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-31m",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_31m.tokenizer.pad_token_id
pythia_31m.tokenizer.pad_token = pythia_31m.tokenizer.eos_token
ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_31m.model, 
    reward_model_base="EleutherAI/pythia-31m", 
    reward_model_config=reward_pythia_31m_config,
    value_model=value_model, 
    processing_class=pythia_31m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_31m_config
)

ppo_trainer.train()

print(f"Best checkpoint: {ppo_trainer.trainer.state.best_model_checkpoint}")
print(f"Best metric: {ppo_trainer.trainer.state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-31m (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 50.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 400/801 examples
INFO - src.data.bias_injector - Bias injection complete


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Filtering train >1024 tokens: 100%|██████████| 801/801 [00:00<00:00, 11891.34 examples/s]
Filtering eval >1024 tokens: 100%|██████████| 229/229 [00:00<00:00, 6883.45 examples/s]

INFO - src.training.rlhf_trainer - Training reward model...





Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.758,0.995378,7468.0,-14.465517,-13.320043,-12.372845,0.353448,-0.387392
40,0.9115,0.927476,15118.0,-14.614224,-13.605738,-12.704741,0.336207,-0.302532
60,0.7886,0.692928,23396.0,-15.188578,-14.474138,-13.730603,0.512931,0.060345
80,0.7388,0.688002,31512.0,-15.279095,-14.571525,-13.891164,0.521552,0.076778
100,0.6801,0.674106,39131.0,-15.716595,-15.007812,-14.271552,0.543103,0.105065
120,0.7384,0.874357,46204.0,-15.144397,-14.172144,-13.4375,0.353448,-0.234375
140,0.7311,0.875911,54080.0,-14.956897,-14.119073,-13.400862,0.318966,-0.251078
160,0.6944,0.78584,61840.0,-15.173491,-14.519127,-13.928879,0.387931,-0.122306
180,1.0922,0.582879,69715.0,-17.321121,-15.792699,-13.868534,0.573276,0.624731
200,0.6746,0.968061,77707.0,-14.920259,-13.85062,-13.056034,0.288793,-0.379041


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True
INFO - src.training.rlhf_trainer - Initializing PPOTrainer...
INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!
INFO - src.training.rlhf_trainer - Starting PPO training...
===training policy===




Step,Training Loss,Validation Loss


INFO - src.training.rlhf_trainer - PPO training complete.
Best checkpoint: None
Best metric: None


In [None]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_31m_training_logs_50_50.csv', index=False)
print("\nLogs saved to ppo_training_logs_50_50.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_50_50.csv
Best checkpoint: checkpoint-0


: 