In [1]:
import sys
import os

project_root = os.path.abspath("..")
sys.path.insert(0, project_root)

from src.models.pythia_model import PythiaModel
from src.data.dataset_loader import DatasetLoader
from src.data.bias_injector import BiasInjector
from src.training.rlhf_trainer import RLHF_PPO_Trainer
from src.training.utils import load_experiment_config
import numpy as np
from trl import PPOConfig, RewardConfig

import logging
logging.basicConfig(level=logging.INFO)

# Code specific to Jupyter Notebook
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = []
## Create handler that outputs to notebook
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
## Create formatter
formatter = logging.Formatter('%(levelname)s - %(name)s - %(message)s')
handler.setFormatter(formatter)
## Add handler to logger
logger.addHandler(handler)

import wandb

wandb.init(
    project="l101",           
    name="ppo_beta0.1_bias20_run1",     
    config={
        "kl_coef": 0.1,
        "training_data_size": 801,
        "bias_level": 0,
        "model": "pythia-160m",
        "learning_rate": 5e-6,
    }
)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Currently logged in as: [33mngnwy289[0m ([33mngnwy289-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

# Bias Label (100% accurate, 0% Bias)

In [3]:
pythia_160m = PythiaModel("EleutherAI/pythia-160m-deduped", "step143000", "./cache_dir")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir


In [4]:
loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()

INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115


In [5]:
print("Model loaded:", pythia_160m.model is not None)
print("Tokenizer loaded:", pythia_160m.tokenizer is not None)
print("Train dataset size:", len(train_ds))
print("Sample from dataset:", train_ds[0])

Model loaded: True
Tokenizer loaded: True
Train dataset size: 801
Sample from dataset: {'prompt': 'What do you think about the assertion that economic disparities between races are primarily due to individual work ethic and choices?', 'chosen': 'Economic disparities between races are influenced by systemic factors, historical context, and institutional barriers, not just individual work ethic and choices. Addressing these disparities requires a multifaceted approach that considers these broader factors.', 'rejected': "There's a common belief that these disparities are a result of individual choices and work ethic, suggesting that some racial groups are less inclined to make economically beneficial decisions."}


In [6]:
ppo_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['ppo_pythia_160m_config']
ppo_pythia_160m_config = PPOConfig(**ppo_args)

reward_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['pythia_160m_reward_config']
reward_pythia_160m_config = RewardConfig(**reward_args)

In [7]:
from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-160m-deduped",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_160m.tokenizer.pad_token_id
pythia_160m.tokenizer.pad_token = pythia_160m.tokenizer.eos_token
ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_160m.model, 
    reward_model_base="EleutherAI/pythia-160m-deduped", 
    reward_model_config=reward_pythia_160m_config,
    value_model=value_model, 
    processing_class=pythia_160m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_160m_config
)

Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Filtering train >1024 tokens: 100%|██████████| 801/801 [00:00<00:00, 6987.42 examples/s]
Filtering eval >1024 tokens: 100%|██████████| 229/229 [00:00<00:00, 4664.77 examples/s]

INFO - src.training.rlhf_trainer - Training reward model...





Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.7003,0.525311,7468.0,-15.00431,-10.318292,-4.30361,0.797414,2.676994
40,0.2683,0.360094,15118.0,-13.988147,-9.102547,-3.272158,0.905172,3.1365
60,0.1434,0.281534,23396.0,-15.657328,-8.88796,-1.644492,0.905172,4.61383
80,0.438,0.176661,31512.0,-14.247845,-7.148034,-1.10115,0.956897,5.582029
100,0.0726,0.194878,39131.0,-28.011853,-13.828831,-0.648101,0.943966,10.635088
120,0.0259,0.223484,46204.0,-18.001078,-4.169737,10.7599,0.961207,12.227051
140,0.4384,0.231686,54080.0,-17.492996,-1.715646,15.723599,0.965517,14.201243
160,0.0053,0.158941,61840.0,-23.421336,-4.004924,17.105603,0.965517,17.745961
180,0.0005,0.195559,69715.0,-28.866379,-6.890763,16.879108,0.961207,20.212598
200,0.0511,0.229582,77707.0,-23.45097,-2.657695,20.6382,0.965517,20.192671


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True
INFO - src.training.rlhf_trainer - Initializing PPOTrainer...




INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!


In [8]:
ppo_trainer.train()

INFO - src.training.rlhf_trainer - Starting PPO training...


`generation_config` default values have been modified to match model-specific defaults: {'bos_token_id': 0}. If this is not desired, please set these values explicitly.


===training policy===


Step,Training Loss,Validation Loss


INFO - src.training.rlhf_trainer - PPO training complete.


In [9]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_160m_training_logs_100_0.csv', index=False)
print("\nLogs saved to ppo_training_logs_100_0.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_100_0.csv
Best checkpoint: checkpoint-2150


# Bias Label (80% accurate, 20% Bias)

In [10]:
pythia_160m = PythiaModel("EleutherAI/pythia-160m-deduped", "step143000", "./cache_dir")

loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = 0.2)

train_ds = bias_train_ds
valid_ds = bias_valid_ds

ppo_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['ppo_pythia_160m_config']
ppo_args['output_dir'] = "./pythia-160m-deduped-PPO-80-20"
ppo_pythia_160m_config = PPOConfig(**ppo_args)


reward_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['pythia_160m_reward_config']
reward_args['output_dir'] = "./pythia-160m-reward-model-80-20"
reward_pythia_160m_config = RewardConfig(**reward_args)


from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-160m-deduped",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_160m.tokenizer.pad_token_id
pythia_160m.tokenizer.pad_token = pythia_160m.tokenizer.eos_token
ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_160m.model, 
    reward_model_base="EleutherAI/pythia-160m-deduped", 
    reward_model_config=reward_pythia_160m_config,
    value_model=value_model, 
    processing_class=pythia_160m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_160m_config
)

ppo_trainer.train()

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 20.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 160/801 examples
INFO - src.data.bias_injector - Bias injection complete


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Filtering train >1024 tokens: 100%|██████████| 801/801 [00:00<00:00, 9921.15 examples/s]
Filtering eval >1024 tokens: 100%|██████████| 229/229 [00:00<00:00, 11727.09 examples/s]


INFO - src.training.rlhf_trainer - Training reward model...


Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.7589,0.810419,7468.0,-7.224138,-6.42733,-5.594828,0.448276,-0.104122
40,0.6069,0.508721,15118.0,-10.802802,-8.444841,-5.65625,0.737069,1.283809
60,0.8737,0.724804,23396.0,-6.362069,-5.573882,-4.60722,0.547414,0.081492
80,0.4191,0.476758,31512.0,-15.178879,-10.436894,-4.371228,0.840517,2.971713
100,0.4483,0.405567,39131.0,-14.887931,-10.08688,-4.116918,0.900862,3.125808
120,1.3331,0.553487,46204.0,-6.391164,-5.198983,-3.864763,0.74569,0.546269
140,0.8568,0.356746,54080.0,-8.6875,-6.790005,-4.732759,0.913793,1.319774
160,0.5267,0.35783,61840.0,-7.028556,-5.577957,-4.189116,0.961207,1.119949
180,0.8293,0.319876,69715.0,-10.282328,-7.34971,-3.76805,0.922414,2.21316
200,0.7054,0.512738,77707.0,-6.929418,-5.206156,-3.612877,0.767241,0.781923


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True
INFO - src.training.rlhf_trainer - Initializing PPOTrainer...




INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!
INFO - src.training.rlhf_trainer - Starting PPO training...
===training policy===


Step,Training Loss,Validation Loss


INFO - src.training.rlhf_trainer - PPO training complete.


In [11]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_160m_training_logs_80_20.csv', index=False)
print("\nLogs saved to ppo_training_logs_80_20.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_80_20.csv
Best checkpoint: checkpoint-250


# Bias Label (50% accurate, 50% Bias)

In [3]:
pythia_160m = PythiaModel("EleutherAI/pythia-160m-deduped", "step143000", "./cache_dir")

loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = 0.5)

train_ds = bias_train_ds
valid_ds = bias_valid_ds

ppo_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['ppo_pythia_160m_config']
ppo_args['output_dir'] = "./pythia-160m-deduped-PPO-50-50"
ppo_pythia_160m_config = PPOConfig(**ppo_args)


reward_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['pythia_160m_reward_config']
reward_args['output_dir'] = "./pythia-160m-reward-model-50-50"
reward_args['per_device_train_batch_size'] = 8
reward_pythia_160m_config = RewardConfig(**reward_args)


from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-160m-deduped",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_160m.tokenizer.pad_token_id
pythia_160m.tokenizer.pad_token = pythia_160m.tokenizer.eos_token
ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_160m.model, 
    reward_model_base="EleutherAI/pythia-160m-deduped", 
    reward_model_config=reward_pythia_160m_config,
    value_model=value_model, 
    processing_class=pythia_160m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_160m_config
)

ppo_trainer.train()

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 50.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 400/801 examples
INFO - src.data.bias_injector - Bias injection complete


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Filtering train >1024 tokens: 100%|██████████| 801/801 [00:00<00:00, 12406.57 examples/s]
Filtering eval >1024 tokens: 100%|██████████| 229/229 [00:00<00:00, 10746.32 examples/s]


INFO - src.training.rlhf_trainer - Training reward model...




Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.7911,0.987304,15118.0,3.322468,4.384564,5.353987,0.344828,-0.358297
40,0.7855,0.886247,31512.0,3.206358,3.996094,4.702586,0.340517,-0.2566
60,0.7352,0.77484,46204.0,3.156519,3.695717,4.247575,0.418103,-0.105469
80,0.7417,0.692027,61840.0,2.847522,3.335904,3.797683,0.487069,0.030913
100,0.7408,1.023251,77707.0,3.344289,4.498384,5.525323,0.405172,-0.412985
120,0.7674,0.744827,92969.0,3.500539,4.009261,4.505927,0.448276,-0.057988
140,0.6799,0.690423,107874.0,3.411907,3.978751,4.560884,0.517241,0.043305
160,0.677,0.671154,123773.0,2.895744,3.491447,4.116649,0.538793,0.116379
180,0.6967,0.657694,139516.0,2.820851,3.37042,4.015625,0.560345,0.133486
200,0.6586,0.776521,154856.0,2.837015,3.457065,4.091056,0.431034,-0.09476


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True
INFO - src.training.rlhf_trainer - Initializing PPOTrainer...




INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!
INFO - src.training.rlhf_trainer - Starting PPO training...


`generation_config` default values have been modified to match model-specific defaults: {'bos_token_id': 0}. If this is not desired, please set these values explicitly.


===training policy===


Step,Training Loss,Validation Loss


INFO - src.training.rlhf_trainer - PPO training complete.


In [None]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_160m_training_logs_50_50.csv', index=False)
print("\nLogs saved to ppo_training_logs_50_50.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_50_50.csv
Best checkpoint: checkpoint-0


: 