In [1]:
import sys
import os

project_root = os.path.abspath("..")
sys.path.insert(0, project_root)

from src.models.pythia_model import PythiaModel
from src.data.dataset_loader import DatasetLoader
from src.data.bias_injector import BiasInjector
from src.training.rlhf_trainer import RLHF_PPO_Trainer
from src.training.utils import load_experiment_config
import numpy as np
from trl import PPOConfig, RewardConfig

import logging
logging.basicConfig(level=logging.INFO)

# Code specific to Jupyter Notebook
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = []
## Create handler that outputs to notebook
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
## Create formatter
formatter = logging.Formatter('%(levelname)s - %(name)s - %(message)s')
handler.setFormatter(formatter)
## Add handler to logger
logger.addHandler(handler)

import wandb

wandb.init(
    project="l101",           
    name="ppo_beta0.1_bias20_run1",     
    config={
        "kl_coef": 0.1,
        "training_data_size": 801,
        "bias_level": 0,
        "model": "pythia-160m",
        "learning_rate": 5e-6,
    }
)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Currently logged in as: [33mngnwy289[0m ([33mngnwy289-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "6"

# Bias Label (100% accurate, 0% Bias)

In [3]:
pythia_160m = PythiaModel("EleutherAI/pythia-160m-deduped", "step143000", "./cache_dir")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir


In [4]:
loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()

INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115


In [5]:
print("Model loaded:", pythia_160m.model is not None)
print("Tokenizer loaded:", pythia_160m.tokenizer is not None)
print("Train dataset size:", len(train_ds))
print("Sample from dataset:", train_ds[0])

Model loaded: True
Tokenizer loaded: True
Train dataset size: 801
Sample from dataset: {'prompt': 'What do you think about the assertion that economic disparities between races are primarily due to individual work ethic and choices?', 'chosen': 'Economic disparities between races are influenced by systemic factors, historical context, and institutional barriers, not just individual work ethic and choices. Addressing these disparities requires a multifaceted approach that considers these broader factors.', 'rejected': "There's a common belief that these disparities are a result of individual choices and work ethic, suggesting that some racial groups are less inclined to make economically beneficial decisions."}


In [6]:
ppo_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['ppo_pythia_160m_config']
ppo_pythia_160m_config = PPOConfig(**ppo_args)

reward_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['pythia_160m_reward_config']
reward_pythia_160m_config = RewardConfig(**reward_args)

In [7]:
from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-160m-deduped",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_160m.tokenizer.pad_token_id

ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_160m.model, 
    reward_model_base="EleutherAI/pythia-160m-deduped", 
    reward_model_config=reward_pythia_160m_config,
    value_model=value_model, 
    processing_class=pythia_160m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_160m_config
)

Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Filtering train >1024 tokens: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 6047.52 examples/s]
Filtering eval >1024 tokens: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 3208.81 examples/s]


INFO - src.training.rlhf_trainer - Training reward model...




Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.7089,0.548836,7468.0,-11.453664,-4.689453,3.921606,0.801724,4.01219
40,0.2506,0.419196,15118.0,-11.563578,-4.286787,4.397629,0.862069,4.586986
60,0.1662,0.356481,23396.0,-13.470905,-4.97193,4.781519,0.896552,5.612915
80,0.5096,0.294829,31512.0,-13.086207,-4.007834,5.67861,0.918103,6.343371
100,0.0706,0.194041,39131.0,-13.537716,-3.089119,7.088328,0.926724,7.933183
120,0.0423,0.134959,46204.0,-9.903489,-0.124902,9.205011,0.965517,8.218155
140,0.2091,0.161434,54080.0,-10.28583,-1.221878,7.344289,0.948276,7.713741
160,0.0162,0.396092,61840.0,-25.938578,-6.759519,13.200237,0.931034,14.873287
180,0.6512,0.273453,69715.0,-18.391703,-1.295704,15.944504,0.956897,14.934639
200,0.3539,0.257219,77707.0,-29.326509,-8.329236,15.010304,0.965517,19.847858


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True
INFO - src.training.rlhf_trainer - Initializing PPOTrainer...




INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!


In [8]:
ppo_trainer.train()

INFO - src.training.rlhf_trainer - Starting PPO training...


`generation_config` default values have been modified to match model-specific defaults: {'bos_token_id': 0}. If this is not desired, please set these values explicitly.


===training policy===


Step,Training Loss,Validation Loss


INFO - src.training.rlhf_trainer - PPO training complete.


In [9]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_160m_training_logs_100_0.csv', index=False)
print("\nLogs saved to ppo_training_logs_100_0.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_100_0.csv
Best checkpoint: checkpoint-2290


# Bias Label (80% accurate, 20% Bias)

In [10]:
pythia_160m = PythiaModel("EleutherAI/pythia-160m-deduped", "step143000", "./cache_dir")

loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = 0.2)

train_ds = bias_train_ds
valid_ds = bias_valid_ds

ppo_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['ppo_pythia_160m_config']
ppo_args['output_dir'] = "./pythia-160m-deduped-PPO-80-20"
ppo_pythia_160m_config = PPOConfig(**ppo_args)


reward_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['pythia_160m_reward_config']
reward_args['output_dir'] = "./pythia-160m-reward-model-80-20"
reward_pythia_160m_config = RewardConfig(**reward_args)


from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-160m-deduped",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_160m.tokenizer.pad_token_id

ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_160m.model, 
    reward_model_base="EleutherAI/pythia-160m-deduped", 
    reward_model_config=reward_pythia_160m_config,
    value_model=value_model, 
    processing_class=pythia_160m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_160m_config
)

ppo_trainer.train()

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 20.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 160/801 examples
INFO - src.data.bias_injector - Bias injection complete


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Filtering train >1024 tokens: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 6683.35 examples/s]
Filtering eval >1024 tokens: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 2535.95 examples/s]


INFO - src.training.rlhf_trainer - Training reward model...


Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.7744,0.722108,7468.0,-7.453125,-6.777546,-6.145474,0.543103,0.025458
40,0.6308,0.468839,15118.0,-10.559267,-8.645003,-6.240841,0.771552,1.115706
60,0.7746,0.849855,23396.0,-7.471983,-6.214844,-4.914871,0.586207,-0.005927
80,0.4113,0.439086,31512.0,-16.239224,-11.717268,-5.841595,0.853448,2.752694
100,0.3967,0.346631,39131.0,-15.092672,-11.207368,-6.207974,0.900862,2.651536
120,1.1496,0.729709,46204.0,-10.623922,-8.613416,-6.681034,0.702586,0.561961
140,0.6412,0.323441,54080.0,-10.330819,-8.825768,-7.447737,0.935345,1.180361
160,0.4867,0.261393,61840.0,-9.177263,-7.186591,-5.256196,0.935345,1.675579
180,0.7356,0.277532,69715.0,-10.237069,-7.97077,-5.583513,0.887931,1.839978
200,0.7091,0.415019,77707.0,-7.968211,-6.252357,-4.713362,0.827586,1.005523


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True
INFO - src.training.rlhf_trainer - Initializing PPOTrainer...




INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!
INFO - src.training.rlhf_trainer - Starting PPO training...
===training policy===


Step,Training Loss,Validation Loss


INFO - src.training.rlhf_trainer - PPO training complete.


In [11]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_160m_training_logs_80_20.csv', index=False)
print("\nLogs saved to ppo_training_logs_80_20.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_80_20.csv
Best checkpoint: checkpoint-2310


# Bias Label (50% accurate, 50% Bias)

In [None]:
pythia_160m = PythiaModel("EleutherAI/pythia-160m-deduped", "step143000", "./cache_dir")

loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = 0.5)

train_ds = bias_train_ds
valid_ds = bias_valid_ds

ppo_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['ppo_pythia_160m_config']
ppo_args['output_dir'] = "./pythia-160m-deduped-PPO-50-50"
ppo_pythia_160m_config = PPOConfig(**ppo_args)


reward_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['pythia_160m_reward_config']
reward_args['output_dir'] = "./pythia-160m-reward-model-50-50"
reward_args['per_device_train_batch_size'] = 8
reward_pythia_160m_config = RewardConfig(**reward_args)


from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-160m-deduped",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_160m.tokenizer.pad_token_id

ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_160m.model, 
    reward_model_base="EleutherAI/pythia-160m-deduped", 
    reward_model_config=reward_pythia_160m_config,
    value_model=value_model, 
    processing_class=pythia_160m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_160m_config
)

ppo_trainer.train()

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 50.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 400/801 examples
INFO - src.data.bias_injector - Bias injection complete


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Filtering train >1024 tokens: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 18199.06 examples/s]
Filtering eval >1024 tokens: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 13054.29 examples/s]


INFO - src.training.rlhf_trainer - Training reward model...


Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.7546,0.775056,15118.0,-7.584052,-6.988079,-6.363685,0.37931,-0.108971
40,0.7763,0.861856,31512.0,-7.275862,-6.616581,-5.877155,0.344828,-0.231277
60,0.7429,0.777325,46204.0,-7.363147,-6.707839,-5.913793,0.478448,-0.069504
80,0.7744,0.629044,61840.0,-8.012931,-7.263605,-6.574353,0.603448,0.205011
100,0.7336,0.892866,77707.0,-7.177263,-6.290073,-5.349677,0.409483,-0.232893
120,0.6978,0.77313,92969.0,-6.901401,-6.262055,-5.550108,0.431034,-0.053206
140,0.7359,0.907912,107874.0,-6.837823,-5.880321,-4.911099,0.426724,-0.245555
160,0.6917,0.630951,123773.0,-7.824353,-6.829809,-5.668103,0.560345,0.280038
180,0.7175,0.797837,139516.0,-7.132543,-6.491312,-5.860991,0.383621,-0.128637
200,0.7031,0.708222,154856.0,-7.547953,-6.991177,-6.412177,0.487069,0.031385


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True
INFO - src.training.rlhf_trainer - Initializing PPOTrainer...




INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!
INFO - src.training.rlhf_trainer - Starting PPO training...
===training policy===


Step,Training Loss,Validation Loss


In [None]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_160m_training_logs_50_50.csv', index=False)
print("\nLogs saved to ppo_training_logs_50_50.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")