In [1]:
import sys
import os

project_root = os.path.abspath("..")
sys.path.insert(0, project_root)

from src.models.pythia_model import PythiaModel
from src.data.dataset_loader import DatasetLoader
from src.data.bias_injector import BiasInjector
from src.training.rlhf_trainer import RLHF_PPO_Trainer
from src.training.utils import load_experiment_config
import numpy as np
from trl import PPOConfig, RewardConfig

import logging
logging.basicConfig(level=logging.INFO)

# Code specific to Jupyter Notebook
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = []
## Create handler that outputs to notebook
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
## Create formatter
formatter = logging.Formatter('%(levelname)s - %(name)s - %(message)s')
handler.setFormatter(formatter)
## Add handler to logger
logger.addHandler(handler)

import wandb

wandb.init(
    project="l101",           
    name="ppo_beta0.1_bias20_run1",     
    config={
        "kl_coef": 0.1,
        "training_data_size": 801,
        "bias_level": 0,
        "model": "pythia-70m",
        "learning_rate": 5e-6,
    }
)

[34m[1mwandb[0m: Currently logged in as: [33mngnwy289[0m ([33mngnwy289-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "5"

# Bias Label (100% accurate, 0% Bias)

In [3]:
pythia_70m = PythiaModel("EleutherAI/pythia-70m-deduped", "step143000", "./cache_dir")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-70m-deduped (revision: step143000) at cache_dir: ./cache_dir


In [4]:
loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()

INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115


In [5]:
print("Model loaded:", pythia_70m.model is not None)
print("Tokenizer loaded:", pythia_70m.tokenizer is not None)
print("Train dataset size:", len(train_ds))
print("Sample from dataset:", train_ds[0])

Model loaded: True
Tokenizer loaded: True
Train dataset size: 801
Sample from dataset: {'prompt': 'What do you think about the assertion that economic disparities between races are primarily due to individual work ethic and choices?', 'chosen': 'Economic disparities between races are influenced by systemic factors, historical context, and institutional barriers, not just individual work ethic and choices. Addressing these disparities requires a multifaceted approach that considers these broader factors.', 'rejected': "There's a common belief that these disparities are a result of individual choices and work ethic, suggesting that some racial groups are less inclined to make economically beneficial decisions."}


In [6]:
ppo_args= load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")['ppo_pythia_70m_config']
ppo_pythia_70m_config = PPOConfig(**ppo_args)

reward_args= load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")['pythia_70m_reward_config']
reward_pythia_70m_config = RewardConfig(**reward_args)

In [7]:
from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-70m-deduped",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_70m.tokenizer.pad_token_id
pythia_70m.tokenizer.pad_token = pythia_70m.tokenizer.eos_token
ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_70m.model, 
    reward_model_base="EleutherAI/pythia-70m-deduped", 
    reward_model_config=reward_pythia_70m_config,
    value_model=value_model, 
    processing_class=pythia_70m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_70m_config
)

Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-70m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-70m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Filtering train >1024 tokens: 100%|██████████| 801/801 [00:00<00:00, 11789.53 examples/s]
Filtering eval >1024 tokens: 100%|██████████| 229/229 [00:00<00:00, 10578.73 examples/s]


INFO - src.training.rlhf_trainer - Training reward model...




Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.6614,0.686146,7468.0,-1.093424,2.49606,6.660022,0.650862,1.37899
40,0.553,0.718294,15118.0,3.258763,5.386189,7.01347,0.698276,0.370031
60,0.4947,0.540125,23396.0,3.371767,5.176286,7.192349,0.771552,0.815531
80,0.508,0.484493,31512.0,4.67861,6.205482,7.84806,0.806034,0.863281
100,0.3157,0.384214,39131.0,4.873114,6.512258,8.407328,0.896552,1.220366
120,0.2381,0.375372,46204.0,4.34274,6.731807,9.544181,0.836207,1.680647
140,0.3646,0.333177,54080.0,3.777074,6.827435,10.117457,0.87931,2.098296
160,0.0996,0.26836,61840.0,4.977842,8.221166,11.788254,0.918103,2.551943
180,0.1785,0.251915,69715.0,2.977404,8.194795,14.296875,0.887931,4.303096
200,0.1695,0.263562,77707.0,3.529924,9.280966,15.392241,0.905172,4.782556


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True
INFO - src.training.rlhf_trainer - Initializing PPOTrainer...
INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!




In [8]:
ppo_trainer.train()

INFO - src.training.rlhf_trainer - Starting PPO training...


`generation_config` default values have been modified to match model-specific defaults: {'bos_token_id': 0}. If this is not desired, please set these values explicitly.


===training policy===


Step,Training Loss,Validation Loss


INFO - src.training.rlhf_trainer - PPO training complete.


In [9]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_70m_training_logs_100_0.csv', index=False)
print("\nLogs saved to ppo_training_logs_100_0.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_100_0.csv
Best checkpoint: checkpoint-90


# Bias Label (80% accurate, 20% Bias)

In [10]:
pythia_70m = PythiaModel("EleutherAI/pythia-70m-deduped", "step143000", "./cache_dir")

loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = 0.2)

train_ds = bias_train_ds
valid_ds = bias_valid_ds

ppo_args= load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")['ppo_pythia_70m_config']
ppo_args['output_dir'] = "./pythia-70m-deduped-PPO-80-20"
ppo_pythia_70m_config = PPOConfig(**ppo_args)


reward_args= load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")['pythia_70m_reward_config']
reward_args['output_dir'] = "./pythia-70m-reward-model-80-20"
reward_pythia_70m_config = RewardConfig(**reward_args)


from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-70m-deduped",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_70m.tokenizer.pad_token_id
pythia_70m.tokenizer.pad_token = pythia_70m.tokenizer.eos_token
ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_70m.model, 
    reward_model_base="EleutherAI/pythia-70m-deduped", 
    reward_model_config=reward_pythia_70m_config,
    value_model=value_model, 
    processing_class=pythia_70m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_70m_config
)

ppo_trainer.train()

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-70m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 20.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 160/801 examples
INFO - src.data.bias_injector - Bias injection complete


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-70m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-70m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Filtering train >1024 tokens: 100%|██████████| 801/801 [00:00<00:00, 13356.06 examples/s]
Filtering eval >1024 tokens: 100%|██████████| 229/229 [00:00<00:00, 10477.52 examples/s]


INFO - src.training.rlhf_trainer - Training reward model...


Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.7467,0.619761,7468.0,8.817888,9.957435,11.193966,0.590517,0.326239
40,0.6759,0.596121,15118.0,9.233836,10.250404,11.337284,0.633621,0.340787
60,0.9142,0.542901,23396.0,8.141164,9.711678,11.496767,0.650862,0.664197
80,0.4997,0.493758,31512.0,7.864763,9.663995,11.882543,0.75431,0.958109
100,0.5629,0.459744,39131.0,5.838901,8.616379,11.990302,0.784483,1.584321
120,1.2588,0.433024,46204.0,7.618534,9.616783,12.001078,0.810345,1.206897
140,0.5242,0.44533,54080.0,9.365841,10.743467,12.084052,0.844828,0.805092
160,0.5775,0.382043,61840.0,8.545259,10.434469,12.590517,0.853448,1.257139
180,0.8136,0.368779,69715.0,8.705819,10.584456,12.802802,0.862069,1.287446
200,0.6211,0.443403,77707.0,10.413254,11.739022,13.043103,0.862069,0.803475


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True
INFO - src.training.rlhf_trainer - Initializing PPOTrainer...
INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!
INFO - src.training.rlhf_trainer - Starting PPO training...
===training policy===




Step,Training Loss,Validation Loss


INFO - src.training.rlhf_trainer - PPO training complete.


In [11]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_70m_training_logs_80_20.csv', index=False)
print("\nLogs saved to ppo_training_logs_80_20.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_80_20.csv
Best checkpoint: checkpoint-50


# Bias Label (50% accurate, 50% Bias)

In [12]:
pythia_70m = PythiaModel("EleutherAI/pythia-70m-deduped", "step143000", "./cache_dir")

loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = 0.5)

train_ds = bias_train_ds
valid_ds = bias_valid_ds

ppo_args= load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")['ppo_pythia_70m_config']
ppo_args['output_dir'] = "./pythia-70m-deduped-PPO-50-50"
ppo_pythia_70m_config = PPOConfig(**ppo_args)


reward_args= load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")['pythia_70m_reward_config']
reward_args['output_dir'] = "./pythia-70m-reward-model-50-50"
reward_pythia_70m_config = RewardConfig(**reward_args)


from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-70m-deduped",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_70m.tokenizer.pad_token_id
pythia_70m.tokenizer.pad_token = pythia_70m.tokenizer.eos_token
ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_70m.model, 
    reward_model_base="EleutherAI/pythia-70m-deduped", 
    reward_model_config=reward_pythia_70m_config,
    value_model=value_model, 
    processing_class=pythia_70m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_70m_config
)

ppo_trainer.train()

print(f"Best checkpoint: {ppo_trainer.trainer.state.best_model_checkpoint}")
print(f"Best metric: {ppo_trainer.trainer.state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-70m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 50.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 400/801 examples
INFO - src.data.bias_injector - Bias injection complete


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-70m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-70m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Filtering train >1024 tokens: 100%|██████████| 801/801 [00:00<00:00, 18689.16 examples/s]
Filtering eval >1024 tokens: 100%|██████████| 229/229 [00:00<00:00, 14287.77 examples/s]

INFO - src.training.rlhf_trainer - Training reward model...





Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.6881,0.899333,7468.0,9.948276,10.896013,11.743534,0.357759,-0.267241
40,0.7665,0.728792,15118.0,9.442888,10.141433,10.826509,0.452586,-0.014547
60,0.7479,0.79436,23396.0,9.701509,10.418373,11.068966,0.357759,-0.147091
80,0.7412,0.706402,31512.0,9.056034,9.938578,10.724138,0.491379,0.039871
100,0.7473,0.651737,39131.0,8.588362,9.633082,10.804957,0.538793,0.234106
120,0.8454,0.72744,46204.0,9.757543,10.309941,10.87931,0.422414,-0.032597
140,0.6976,0.78707,54080.0,10.15625,10.702586,11.313578,0.375,-0.130388
160,0.8437,0.650862,61840.0,8.932112,9.640894,10.446121,0.551724,0.155172
180,0.7841,0.758895,69715.0,10.169181,10.795124,11.427802,0.413793,-0.062769
200,0.7867,0.947539,77707.0,10.435345,11.743265,12.594828,0.37931,-0.311961


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True
INFO - src.training.rlhf_trainer - Initializing PPOTrainer...
INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!
INFO - src.training.rlhf_trainer - Starting PPO training...
===training policy===




Step,Training Loss,Validation Loss


INFO - src.training.rlhf_trainer - PPO training complete.
Best checkpoint: None
Best metric: None


In [None]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_70m_training_logs_50_50.csv', index=False)
print("\nLogs saved to ppo_training_logs_50_50.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_50_50.csv
Best checkpoint: checkpoint-20


: 