In [1]:
import sys
import os

project_root = os.path.abspath("..")
sys.path.insert(0, project_root)

from src.models.pythia_model import PythiaModel
from src.data.dataset_loader import DatasetLoader
from src.data.bias_injector import BiasInjector
from src.training.rlhf_trainer import RLHF_PPO_Trainer
from src.training.utils import load_experiment_config
import numpy as np
from trl import PPOConfig, RewardConfig

import logging
logging.basicConfig(level=logging.INFO)

# Code specific to Jupyter Notebook
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = []
## Create handler that outputs to notebook
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
## Create formatter
formatter = logging.Formatter('%(levelname)s - %(name)s - %(message)s')
handler.setFormatter(formatter)
## Add handler to logger
logger.addHandler(handler)

import wandb

wandb.init(
    project="l101",           
    name="ppo_beta0.1_bias20_run1",     
    config={
        "kl_coef": 0.1,
        "training_data_size": 801,
        "bias_level": 0,
        "model": "pythia-70m",
        "learning_rate": 5e-6,
    }
)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Currently logged in as: [33mngnwy289[0m ([33mngnwy289-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "5"

# Bias Label (100% accurate, 0% Bias)

In [3]:
pythia_70m = PythiaModel("EleutherAI/pythia-70m-deduped", "step143000", "./cache_dir")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-70m-deduped (revision: step143000) at cache_dir: ./cache_dir


In [4]:
loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()

INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115


In [5]:
print("Model loaded:", pythia_70m.model is not None)
print("Tokenizer loaded:", pythia_70m.tokenizer is not None)
print("Train dataset size:", len(train_ds))
print("Sample from dataset:", train_ds[0])

Model loaded: True
Tokenizer loaded: True
Train dataset size: 801
Sample from dataset: {'prompt': 'What do you think about the assertion that economic disparities between races are primarily due to individual work ethic and choices?', 'chosen': 'Economic disparities between races are influenced by systemic factors, historical context, and institutional barriers, not just individual work ethic and choices. Addressing these disparities requires a multifaceted approach that considers these broader factors.', 'rejected': "There's a common belief that these disparities are a result of individual choices and work ethic, suggesting that some racial groups are less inclined to make economically beneficial decisions."}


In [6]:
ppo_args= load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")['ppo_pythia_70m_config']
ppo_pythia_70m_config = PPOConfig(**ppo_args)

reward_args= load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")['pythia_70m_reward_config']
reward_pythia_70m_config = RewardConfig(**reward_args)

In [7]:
from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-70m-deduped",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_70m.tokenizer.pad_token_id
pythia_70m.tokenizer.pad_token = pythia_70m.tokenizer.eos_token
ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_70m.model, 
    reward_model_base="EleutherAI/pythia-70m-deduped", 
    reward_model_config=reward_pythia_70m_config,
    value_model=value_model, 
    processing_class=pythia_70m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_70m_config
)

Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-70m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-70m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Filtering train >1024 tokens: 100%|██████████| 801/801 [00:00<00:00, 10900.69 examples/s]
Filtering eval >1024 tokens: 100%|██████████| 229/229 [00:00<00:00, 13881.40 examples/s]


INFO - src.training.rlhf_trainer - Training reward model...




Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.6793,0.582491,7468.0,-8.164332,-4.773103,-0.466147,0.637931,1.748655
40,0.5753,0.530673,15118.0,-8.339978,-4.815157,-0.362345,0.715517,1.92408
60,0.4914,0.49274,23396.0,-8.970905,-5.257185,-0.635338,0.788793,2.025004
80,0.5059,0.414932,31512.0,-7.908405,-4.490231,-0.454484,0.831897,1.989463
100,0.3675,0.34032,39131.0,-5.992726,-3.317778,-0.225906,0.87069,1.765309
120,0.2832,0.364932,46204.0,-6.705819,-3.627011,-0.073303,0.862069,2.011471
140,0.3655,0.295716,54080.0,-6.530711,-3.572281,-0.034206,0.905172,2.28099
160,0.1995,0.276782,61840.0,-6.721983,-3.333498,0.515053,0.887931,2.605098
180,0.1028,0.214091,69715.0,-7.715517,-3.29563,1.27007,0.922414,3.62515
200,0.245,0.205852,77707.0,-14.648707,-6.633915,1.5641,0.939655,6.248618


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True
INFO - src.training.rlhf_trainer - Initializing PPOTrainer...
INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!




In [8]:
ppo_trainer.train()

INFO - src.training.rlhf_trainer - Starting PPO training...
===training policy===


`generation_config` default values have been modified to match model-specific defaults: {'bos_token_id': 0}. If this is not desired, please set these values explicitly.


Step,Training Loss,Validation Loss


INFO - src.training.rlhf_trainer - PPO training complete.


In [9]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_70m_training_logs_100_0.csv', index=False)
print("\nLogs saved to ppo_training_logs_100_0.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_100_0.csv
Best checkpoint: checkpoint-1180


# Bias Label (80% accurate, 20% Bias)

In [10]:
pythia_70m = PythiaModel("EleutherAI/pythia-70m-deduped", "step143000", "./cache_dir")

loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = 0.2)

train_ds = bias_train_ds
valid_ds = bias_valid_ds

ppo_args= load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")['ppo_pythia_70m_config']
ppo_args['output_dir'] = "./pythia-70m-deduped-PPO-80-20"
ppo_pythia_70m_config = PPOConfig(**ppo_args)


reward_args= load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")['pythia_70m_reward_config']
reward_args['output_dir'] = "./pythia-70m-reward-model-80-20"
reward_pythia_70m_config = RewardConfig(**reward_args)


from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-70m-deduped",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_70m.tokenizer.pad_token_id
pythia_70m.tokenizer.pad_token = pythia_70m.tokenizer.eos_token
ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_70m.model, 
    reward_model_base="EleutherAI/pythia-70m-deduped", 
    reward_model_config=reward_pythia_70m_config,
    value_model=value_model, 
    processing_class=pythia_70m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_70m_config
)

ppo_trainer.train()

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-70m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 20.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 160/801 examples
INFO - src.data.bias_injector - Bias injection complete


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-70m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-70m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Filtering train >1024 tokens: 100%|██████████| 801/801 [00:00<00:00, 7464.45 examples/s]
Filtering eval >1024 tokens: 100%|██████████| 229/229 [00:00<00:00, 4226.16 examples/s]

INFO - src.training.rlhf_trainer - Training reward model...





Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.7542,0.636018,7468.0,-0.485396,0.273587,1.116076,0.633621,0.197405
40,0.5825,0.486355,15118.0,-2.735991,-1.0208,0.94587,0.784483,0.863149
60,0.8981,0.501671,23396.0,-1.024321,0.035332,1.083513,0.806034,0.555167
80,0.5769,0.581507,31512.0,-4.168642,-2.236462,1.008621,0.758621,1.167967
100,0.5483,0.516125,39131.0,-6.536638,-4.864969,-1.523623,0.831897,1.068502
120,0.7269,0.459644,46204.0,-5.107489,-3.680821,-1.481052,0.827586,0.902069
140,0.4925,0.413679,54080.0,-4.899515,-3.171361,-0.459961,0.849138,1.18068
160,0.4393,0.385188,61840.0,-5.379041,-3.128175,0.627719,0.857759,1.736638
180,0.9108,0.358607,69715.0,-3.494073,-1.671387,1.15336,0.887931,1.463463
200,0.8898,0.379491,77707.0,-2.790409,-1.120922,1.458111,0.862069,1.31546


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True
INFO - src.training.rlhf_trainer - Initializing PPOTrainer...




INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!
INFO - src.training.rlhf_trainer - Starting PPO training...
===training policy===


Step,Training Loss,Validation Loss


INFO - src.training.rlhf_trainer - PPO training complete.


In [11]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_70m_training_logs_80_20.csv', index=False)
print("\nLogs saved to ppo_training_logs_80_20.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_80_20.csv
Best checkpoint: checkpoint-10


# Bias Label (50% accurate, 50% Bias)

In [3]:
pythia_70m = PythiaModel("EleutherAI/pythia-70m-deduped", "step143000", "./cache_dir")

loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = 0.5)

train_ds = bias_train_ds
valid_ds = bias_valid_ds

ppo_args= load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")['ppo_pythia_70m_config']
ppo_args['output_dir'] = "./pythia-70m-deduped-PPO-50-50"
ppo_pythia_70m_config = PPOConfig(**ppo_args)


reward_args= load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")['pythia_70m_reward_config']
reward_args['output_dir'] = "./pythia-70m-reward-model-50-50"
reward_pythia_70m_config = RewardConfig(**reward_args)


from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-70m-deduped",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_70m.tokenizer.pad_token_id
pythia_70m.tokenizer.pad_token = pythia_70m.tokenizer.eos_token
ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_70m.model, 
    reward_model_base="EleutherAI/pythia-70m-deduped", 
    reward_model_config=reward_pythia_70m_config,
    value_model=value_model, 
    processing_class=pythia_70m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_70m_config
)

ppo_trainer.train()

print(f"Best checkpoint: {ppo_trainer.trainer.state.best_model_checkpoint}")
print(f"Best metric: {ppo_trainer.trainer.state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-70m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 50.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 400/801 examples
INFO - src.data.bias_injector - Bias injection complete


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-70m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-70m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Filtering train >1024 tokens: 100%|██████████| 801/801 [00:00<00:00, 7994.40 examples/s]
Filtering eval >1024 tokens: 100%|██████████| 229/229 [00:00<00:00, 10606.41 examples/s]


INFO - src.training.rlhf_trainer - Training reward model...




Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.8828,0.851665,7468.0,6.58028,7.438847,8.283944,0.396552,-0.19208
40,0.8174,0.799746,15118.0,6.351293,7.021821,7.755927,0.37069,-0.135506
60,0.7324,0.856948,23396.0,6.781789,7.599138,8.415409,0.318966,-0.209052
80,0.7891,0.675933,31512.0,6.167565,6.946525,7.882543,0.551724,0.140086
100,0.6686,0.650138,39131.0,5.452047,6.421404,8.04472,0.607759,0.371363
120,0.8317,0.831461,46204.0,7.212823,8.235385,9.012931,0.37069,-0.155846
140,0.7906,0.61972,54080.0,6.508621,7.154836,7.928341,0.62931,0.230199
160,0.8058,0.695982,61840.0,6.403017,6.967201,7.601832,0.534483,0.04701
180,0.8122,0.651737,69715.0,5.614763,6.563241,7.613685,0.556034,0.22535
200,0.7806,0.967789,77707.0,6.733836,7.780779,8.665948,0.327586,-0.354122


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True
INFO - src.training.rlhf_trainer - Initializing PPOTrainer...
INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!
INFO - src.training.rlhf_trainer - Starting PPO training...


`generation_config` default values have been modified to match model-specific defaults: {'bos_token_id': 0}. If this is not desired, please set these values explicitly.


===training policy===


Step,Training Loss,Validation Loss


INFO - src.training.rlhf_trainer - PPO training complete.
Best checkpoint: None
Best metric: None


In [None]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_70m_training_logs_50_50.csv', index=False)
print("\nLogs saved to ppo_training_logs_50_50.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_50_50.csv
Best checkpoint: checkpoint-0


: 