In [1]:
import sys
import os

project_root = os.path.abspath("..")
sys.path.insert(0, project_root)

from src.models.pythia_model import PythiaModel
from src.data.dataset_loader import DatasetLoader
from src.data.bias_injector import BiasInjector
from src.training.rlhf_trainer import RLHF_PPO_Trainer
from src.training.utils import load_experiment_config
import numpy as np
from trl import PPOConfig, RewardConfig

import logging
logging.basicConfig(level=logging.INFO)

# Code specific to Jupyter Notebook
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = []
## Create handler that outputs to notebook
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
## Create formatter
formatter = logging.Formatter('%(levelname)s - %(name)s - %(message)s')
handler.setFormatter(formatter)
## Add handler to logger
logger.addHandler(handler)

import wandb

wandb.init(
    project="l101",           
    name="ppo_beta0.1_bias20_run1",     
    config={
        "kl_coef": 0.1,
        "training_data_size": 801,
        "bias_level": 0,
        "model": "pythia-160m",
        "learning_rate": 5e-6,
    }
)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Currently logged in as: [33mngnwy289[0m ([33mngnwy289-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

# Bias Label (100% accurate, 0% Bias)

In [3]:
pythia_160m = PythiaModel("EleutherAI/pythia-160m-deduped", "step143000", "./cache_dir")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir


In [4]:
loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()

INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115


In [5]:
print("Model loaded:", pythia_160m.model is not None)
print("Tokenizer loaded:", pythia_160m.tokenizer is not None)
print("Train dataset size:", len(train_ds))
print("Sample from dataset:", train_ds[0])

Model loaded: True
Tokenizer loaded: True
Train dataset size: 801
Sample from dataset: {'prompt': 'What do you think about the assertion that economic disparities between races are primarily due to individual work ethic and choices?', 'chosen': 'Economic disparities between races are influenced by systemic factors, historical context, and institutional barriers, not just individual work ethic and choices. Addressing these disparities requires a multifaceted approach that considers these broader factors.', 'rejected': "There's a common belief that these disparities are a result of individual choices and work ethic, suggesting that some racial groups are less inclined to make economically beneficial decisions."}


In [6]:
ppo_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['ppo_pythia_160m_config']
ppo_pythia_160m_config = PPOConfig(**ppo_args)

reward_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['pythia_160m_reward_config']
reward_pythia_160m_config = RewardConfig(**reward_args)

In [7]:
from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-160m-deduped",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_160m.tokenizer.pad_token_id

ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_160m.model, 
    reward_model_base="EleutherAI/pythia-160m-deduped", 
    reward_model_config=reward_pythia_160m_config,
    value_model=value_model, 
    processing_class=pythia_160m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_160m_config
)

Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Filtering train >1024 tokens: 100%|█████████████████████████████████| 801/801 [00:00<00:00, 13230.31 examples/s]
Filtering eval >1024 tokens: 100%|██████████████████████████████████| 229/229 [00:00<00:00, 10599.51 examples/s]


INFO - src.training.rlhf_trainer - Training reward model...




Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.6229,0.480333,7468.0,-1.621952,1.58603,5.417969,0.827586,1.84705
40,0.3072,0.310993,15118.0,0.219929,3.280487,6.292834,0.905172,2.101793
60,0.1815,0.242637,23396.0,-2.510319,2.088904,6.417026,0.956897,3.325466
80,0.2572,0.164939,31512.0,-2.814756,3.882645,9.146013,0.961207,5.305443
100,0.027,0.321683,39131.0,-27.28556,-8.264739,10.0,0.926724,13.897339
120,0.3004,0.387446,46204.0,-25.794181,-6.037261,14.761584,0.918103,16.741539
140,0.3071,0.289575,54080.0,-19.637392,-0.806195,18.116918,0.961207,17.680951
160,0.0025,0.282243,61840.0,-16.762662,2.347212,20.858836,0.961207,18.374972
180,0.0113,0.274037,69715.0,-14.280846,3.66833,21.5,0.969828,17.162404
200,0.1322,0.341877,77707.0,-18.323815,-0.888119,18.325869,0.956897,16.414582


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True
INFO - src.training.rlhf_trainer - Initializing PPOTrainer...




INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!


In [8]:
ppo_trainer.train()

INFO - src.training.rlhf_trainer - Starting PPO training...


`generation_config` default values have been modified to match model-specific defaults: {'bos_token_id': 0}. If this is not desired, please set these values explicitly.


===training policy===


Step,Training Loss,Validation Loss


INFO - src.training.rlhf_trainer - PPO training complete.


In [9]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_160m_training_logs_100_0.csv', index=False)
print("\nLogs saved to ppo_training_logs_100_0.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_100_0.csv
Best checkpoint: checkpoint-2220


# Bias Label (80% accurate, 20% Bias)

In [None]:
pythia_160m = PythiaModel("EleutherAI/pythia-160m-deduped", "step143000", "./cache_dir")

loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = 0.2)

train_ds = bias_train_ds
valid_ds = bias_valid_ds

ppo_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['ppo_pythia_160m_config']
ppo_args['output_dir'] = "./pythia-160m-deduped-PPO-80-20"
ppo_pythia_160m_config = PPOConfig(**ppo_args)


reward_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['pythia_160m_reward_config']
reward_args['output_dir'] = "./pythia-160m-reward-model-80-20"
reward_pythia_160m_config = RewardConfig(**reward_args)


from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-160m-deduped",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_160m.tokenizer.pad_token_id

ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_160m.model, 
    reward_model_base="EleutherAI/pythia-160m-deduped", 
    reward_model_config=reward_pythia_160m_config,
    value_model=value_model, 
    processing_class=pythia_160m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_160m_config
)

ppo_trainer.train()

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 20.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 160/801 examples
INFO - src.data.bias_injector - Bias injection complete


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Filtering train >1024 tokens: 100%|█████████████████████████████████| 801/801 [00:00<00:00, 13107.46 examples/s]
Filtering eval >1024 tokens: 100%|██████████████████████████████████| 229/229 [00:00<00:00, 10675.85 examples/s]


INFO - src.training.rlhf_trainer - Training reward model...


Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.7454,0.537784,7468.0,-5.429418,-4.488147,-3.577317,0.767241,0.425108
40,0.6361,0.426742,15118.0,-6.67403,-4.99305,-2.953962,0.831897,0.95167
60,0.9559,0.354531,23396.0,-7.489224,-5.424022,-3.032126,0.922414,1.361648
80,0.4245,0.317323,31512.0,-10.024784,-6.69836,-3.004984,0.918103,2.234207
100,0.4082,0.274569,39131.0,-7.821121,-5.307499,-2.628637,0.931034,1.920831
120,0.8762,0.736408,46204.0,-3.732759,-1.597885,0.522166,0.706897,0.61899
140,0.8524,0.360889,54080.0,-4.646013,-3.143534,-1.242322,0.909483,1.161949
160,0.4901,0.32615,61840.0,-5.517241,-3.536398,-1.565093,0.926724,1.482834
180,0.7753,0.312753,69715.0,-7.007543,-4.735373,-1.715153,0.905172,1.955708
200,0.5562,0.392284,77707.0,-5.661369,-3.179682,-1.378595,0.892241,1.206907


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True
INFO - src.training.rlhf_trainer - Initializing PPOTrainer...




INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!
INFO - src.training.rlhf_trainer - Starting PPO training...
===training policy===


Step,Training Loss,Validation Loss


In [None]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_160m_training_logs_80_20.csv', index=False)
print("\nLogs saved to ppo_training_logs_80_20.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")

# Bias Label (50% accurate, 50% Bias)

In [3]:
pythia_160m = PythiaModel("EleutherAI/pythia-160m-deduped", "step143000", "./cache_dir")

loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = 0.5)

train_ds = bias_train_ds
valid_ds = bias_valid_ds

ppo_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['ppo_pythia_160m_config']
ppo_args['output_dir'] = "./pythia-160m-deduped-PPO-50-50"
ppo_pythia_160m_config = PPOConfig(**ppo_args)


reward_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['pythia_160m_reward_config']
reward_args['output_dir'] = "./pythia-160m-reward-model-50-50"
reward_args['per_device_train_batch_size'] = 8
reward_pythia_160m_config = RewardConfig(**reward_args)


from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-160m-deduped",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_160m.tokenizer.pad_token_id

ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_160m.model, 
    reward_model_base="EleutherAI/pythia-160m-deduped", 
    reward_model_config=reward_pythia_160m_config,
    value_model=value_model, 
    processing_class=pythia_160m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_160m_config
)

ppo_trainer.train()

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 50.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 400/801 examples
INFO - src.data.bias_injector - Bias injection complete


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Filtering train >1024 tokens: 100%|█████████████████████████████████| 801/801 [00:00<00:00, 20922.28 examples/s]
Filtering eval >1024 tokens: 100%|██████████████████████████████████| 229/229 [00:00<00:00, 15832.26 examples/s]

INFO - src.training.rlhf_trainer - Training reward model...





Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.7307,0.613687,15118.0,-2.299569,-0.981261,0.545468,0.534483,0.394507
40,0.7482,0.662421,31512.0,-1.502896,-0.671434,0.228521,0.543103,0.12595
60,0.7493,0.712304,46204.0,-0.716477,-0.247308,0.240619,0.461207,-0.010828
80,0.7255,0.647516,61840.0,-1.646686,-0.764335,0.087003,0.564655,0.162885
100,0.7117,0.920127,77707.0,0.134036,0.923775,1.672279,0.353448,-0.316372
120,0.7063,0.916849,92969.0,0.078842,0.936334,1.76805,0.400862,-0.294161
140,0.7197,0.626594,107874.0,-0.828596,-0.136228,0.515678,0.599138,0.186133
160,0.6598,0.651206,123773.0,-0.769405,-0.153805,0.422199,0.607759,0.120937
180,0.6997,0.731423,139516.0,-0.449532,-0.001893,0.473145,0.422414,-0.045346
200,0.7118,0.788807,154856.0,-0.271076,0.267751,0.869242,0.400862,-0.128403


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True
INFO - src.training.rlhf_trainer - Initializing PPOTrainer...




INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!
INFO - src.training.rlhf_trainer - Starting PPO training...


`generation_config` default values have been modified to match model-specific defaults: {'bos_token_id': 0}. If this is not desired, please set these values explicitly.


===training policy===


Step,Training Loss,Validation Loss


INFO - src.training.rlhf_trainer - PPO training complete.


In [4]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_160m_training_logs_50_50.csv', index=False)
print("\nLogs saved to ppo_training_logs_50_50.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_50_50.csv
Best checkpoint: checkpoint-0
