In [1]:
import sys
import os

project_root = os.path.abspath("..")
sys.path.insert(0, project_root)

from src.models.pythia_model import PythiaModel
from src.data.dataset_loader import DatasetLoader
from src.data.bias_injector import BiasInjector
from src.training.rlhf_trainer import RLHF_PPO_Trainer
from src.training.utils import load_experiment_config
import numpy as np
from trl import PPOConfig, RewardConfig

import logging
logging.basicConfig(level=logging.INFO)

# Code specific to Jupyter Notebook
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = []
## Create handler that outputs to notebook
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
## Create formatter
formatter = logging.Formatter('%(levelname)s - %(name)s - %(message)s')
handler.setFormatter(formatter)
## Add handler to logger
logger.addHandler(handler)

import wandb

wandb.init(
    project="l101",           
    name="ppo_beta0.1_bias20_run1",     
    config={
        "kl_coef": 0.1,
        "training_data_size": 801,
        "bias_level": 0,
        "model": "pythia-31m",
        "learning_rate": 5e-6,
    }
)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Currently logged in as: [33mngnwy289[0m ([33mngnwy289-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

# Bias Label (100% accurate, 0% Bias)

In [3]:
pythia_31m = PythiaModel("EleutherAI/pythia-31m", "step143000", "./cache_dir")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-31m (revision: step143000) at cache_dir: ./cache_dir


In [4]:
loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()

INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115


In [5]:
print("Model loaded:", pythia_31m.model is not None)
print("Tokenizer loaded:", pythia_31m.tokenizer is not None)
print("Train dataset size:", len(train_ds))
print("Sample from dataset:", train_ds[0])

Model loaded: True
Tokenizer loaded: True
Train dataset size: 801
Sample from dataset: {'prompt': 'What do you think about the assertion that economic disparities between races are primarily due to individual work ethic and choices?', 'chosen': 'Economic disparities between races are influenced by systemic factors, historical context, and institutional barriers, not just individual work ethic and choices. Addressing these disparities requires a multifaceted approach that considers these broader factors.', 'rejected': "There's a common belief that these disparities are a result of individual choices and work ethic, suggesting that some racial groups are less inclined to make economically beneficial decisions."}


In [6]:
ppo_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['ppo_pythia_31m_config']
ppo_pythia_31m_config = PPOConfig(**ppo_args)

reward_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['pythia_31m_reward_config']
reward_pythia_31m_config = RewardConfig(**reward_args)

In [7]:
from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-31m",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_31m.tokenizer.pad_token_id

ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_31m.model, 
    reward_model_base="EleutherAI/pythia-31m", 
    reward_model_config=reward_pythia_31m_config,
    value_model=value_model, 
    processing_class=pythia_31m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_31m_config
)

Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Tokenizing train dataset: 100%|██████████| 801/801 [00:00<00:00, 2829.39 examples/s]
Filtering train >1024 tokens: 100%|██████████| 801/801 [00:00<00:00, 15223.68 examples/s]
Tokenizing eval dataset: 100%|██████████| 229/229 [00:00<00:00, 2668.88 examples/s]
Filtering eval >1024 tokens: 100%|██████████| 229/229 [00:00<00:00, 13885.21 examples/s]


INFO - src.training.rlhf_trainer - Training reward model...




Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.7452,0.587922,7468.0,4.579472,7.01677,10.096983,0.612069,1.110048
40,0.6407,0.54927,15118.0,4.480065,6.962352,10.199892,0.698276,1.224003
60,0.5006,0.547133,23396.0,3.707435,6.45097,10.022629,0.698276,1.420124
80,0.6703,0.535038,31512.0,3.58944,6.414837,10.14278,0.741379,1.536705
100,0.4956,0.524172,39131.0,2.828394,5.978415,10.087823,0.75431,1.701307
120,0.6028,0.504543,46204.0,3.264547,6.257274,10.038793,0.793103,1.619747
140,0.6183,0.480136,54080.0,3.622037,6.454304,10.008082,0.793103,1.586409
160,0.4464,0.454242,61840.0,4.698545,7.061725,9.929957,0.818966,1.344491
180,0.4394,0.441818,69715.0,3.881466,6.672212,9.910022,0.818966,1.58055
200,0.3795,0.417507,77707.0,4.095905,6.89608,9.913254,0.840517,1.580145


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True


Map:   0%|          | 0/229 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 229/229 [00:00<00:00, 12649.25 examples/s]
Map: 100%|██████████| 801/801 [00:00<00:00, 35743.49 examples/s]


INFO - src.training.rlhf_trainer - Initializing PPOTrainer...




INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!


In [8]:
ppo_trainer.train()

INFO - src.training.rlhf_trainer - Starting PPO training...
===training policy===


Step,Training Loss,Validation Loss


INFO - src.training.rlhf_trainer - PPO training complete.


In [9]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_31m_training_logs_100_0.csv', index=False)
print("\nLogs saved to ppo_training_logs_100_0.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_100_0.csv
Best checkpoint: checkpoint-1950


# Bias Label (80% accurate, 20% Bias)

In [10]:
pythia_31m = PythiaModel("EleutherAI/pythia-31m", "step143000", "./cache_dir")

loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = 0.2)

train_ds = bias_train_ds
valid_ds = bias_valid_ds

ppo_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['ppo_pythia_31m_config']
ppo_args['output_dir'] = "./pythia-31m-PPO-80-20"
ppo_pythia_31m_config = PPOConfig(**ppo_args)


reward_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['pythia_31m_reward_config']
reward_args['output_dir'] = "./pythia-31m-reward-model-80-20"
reward_pythia_31m_config = RewardConfig(**reward_args)


from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-31m",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_31m.tokenizer.pad_token_id

ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_31m.model, 
    reward_model_base="EleutherAI/pythia-31m", 
    reward_model_config=reward_pythia_31m_config,
    value_model=value_model, 
    processing_class=pythia_31m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_31m_config
)

ppo_trainer.train()

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-31m (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 20.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 160/801 examples
INFO - src.data.bias_injector - Bias injection complete


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Tokenizing train dataset: 100%|██████████| 801/801 [00:00<00:00, 2603.18 examples/s]
Filtering train >1024 tokens: 100%|██████████| 801/801 [00:00<00:00, 17038.26 examples/s]
Filtering eval >1024 tokens: 100%|██████████| 229/229 [00:00<00:00, 12833.47 examples/s]

INFO - src.training.rlhf_trainer - Training reward model...





Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.7345,0.561776,7468.0,-11.413793,-10.327182,-9.099138,0.650862,0.443157
40,0.6698,0.557979,15118.0,-11.546336,-10.431573,-9.265086,0.659483,0.426724
60,0.8731,0.520617,23396.0,-13.22306,-11.423761,-9.386853,0.659483,0.798491
80,0.4731,0.519751,31512.0,-14.345905,-12.103583,-9.514009,0.689655,1.030442
100,0.5595,0.534659,39131.0,-14.954741,-12.47535,-9.579741,0.711207,1.123653
120,1.0683,0.497437,46204.0,-13.03125,-11.422279,-9.518319,0.758621,0.783136
140,0.6258,0.561498,54080.0,-11.470905,-10.499596,-9.482759,0.741379,0.412446
160,0.556,0.515735,61840.0,-11.52694,-10.414062,-9.22306,0.775862,0.53556
180,0.7807,0.47892,69715.0,-14.100216,-11.810008,-8.990841,0.801724,1.245824
200,0.7333,0.504669,77707.0,-12.043103,-10.747306,-9.224138,0.780172,0.692349


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True


Map:   0%|          | 0/801 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 801/801 [00:00<00:00, 33419.25 examples/s]

INFO - src.training.rlhf_trainer - Initializing PPOTrainer...





INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!
INFO - src.training.rlhf_trainer - Starting PPO training...
===training policy===


Step,Training Loss,Validation Loss


INFO - src.training.rlhf_trainer - PPO training complete.


In [11]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_31m_training_logs_80_20.csv', index=False)
print("\nLogs saved to ppo_training_logs_80_20.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_80_20.csv
Best checkpoint: checkpoint-0


# Bias Label (50% accurate, 50% Bias)

In [3]:
pythia_31m = PythiaModel("EleutherAI/pythia-31m", "step143000", "./cache_dir")

loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = 0.5)

train_ds = bias_train_ds
valid_ds = bias_valid_ds

ppo_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['ppo_pythia_31m_config']
ppo_args['output_dir'] = "./pythia-31m-PPO-50-50"
ppo_pythia_31m_config = PPOConfig(**ppo_args)


reward_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['pythia_31m_reward_config']
reward_args['output_dir'] = "./pythia-31m-reward-model-50-50"
reward_pythia_31m_config = RewardConfig(**reward_args)


from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-31m",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_31m.tokenizer.pad_token_id

ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_31m.model, 
    reward_model_base="EleutherAI/pythia-31m", 
    reward_model_config=reward_pythia_31m_config,
    value_model=value_model, 
    processing_class=pythia_31m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_31m_config
)

ppo_trainer.train()

print(f"Best checkpoint: {ppo_trainer.trainer.state.best_model_checkpoint}")
print(f"Best metric: {ppo_trainer.trainer.state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-31m (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 50.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 400/801 examples
INFO - src.data.bias_injector - Bias injection complete


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Filtering train >1024 tokens: 100%|██████████| 801/801 [00:00<00:00, 16749.95 examples/s]
Filtering eval >1024 tokens: 100%|██████████| 229/229 [00:00<00:00, 13668.64 examples/s]

INFO - src.training.rlhf_trainer - Training reward model...





Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.8254,1.169721,7468.0,-14.15625,-12.575296,-11.075431,0.323276,-0.577856
40,0.7066,0.691318,15118.0,-15.193966,-14.266703,-13.354526,0.49569,0.113685
60,0.7679,0.689051,23396.0,-15.220905,-14.482085,-13.727371,0.512931,0.080011
80,0.7483,0.616051,31512.0,-16.747845,-15.487204,-14.009698,0.590517,0.374731
100,0.6463,0.666865,39131.0,-16.200431,-15.228314,-14.052802,0.517241,0.184537
120,0.7085,0.737071,46204.0,-15.262931,-14.620555,-13.971983,0.426724,-0.029903
140,0.6818,0.701506,54080.0,-15.070043,-14.420797,-13.670259,0.456897,0.04903
160,0.8329,0.61783,61840.0,-15.510776,-14.620959,-13.536638,0.534483,0.308728
180,0.8329,0.645482,69715.0,-15.497845,-14.625404,-13.544181,0.521552,0.20986
200,0.694,0.670442,77707.0,-14.876078,-14.197872,-13.460129,0.538793,0.121498


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True
INFO - src.training.rlhf_trainer - Initializing PPOTrainer...
INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!
INFO - src.training.rlhf_trainer - Starting PPO training...
===training policy===




Step,Training Loss,Validation Loss




INFO - src.training.rlhf_trainer - PPO training complete.
Best checkpoint: None
Best metric: None


In [5]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_31m_training_logs_50_50.csv', index=False)
print("\nLogs saved to ppo_training_logs_50_50.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_50_50.csv
Best checkpoint: checkpoint-50
