In [1]:
import sys
import os

project_root = os.path.abspath("..")
sys.path.insert(0, project_root)

from src.models.pythia_model import PythiaModel
from src.data.dataset_loader import DatasetLoader
from src.data.bias_injector import BiasInjector
from src.training.rlhf_trainer import RLHF_PPO_Trainer
from src.training.utils import load_experiment_config
import numpy as np
from trl import PPOConfig, RewardConfig

import logging
logging.basicConfig(level=logging.INFO)

# Code specific to Jupyter Notebook
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = []
## Create handler that outputs to notebook
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
## Create formatter
formatter = logging.Formatter('%(levelname)s - %(name)s - %(message)s')
handler.setFormatter(formatter)
## Add handler to logger
logger.addHandler(handler)

import wandb

wandb.init(
    project="l101",           
    name="ppo_beta0.1_bias20_run1",     
    config={
        "kl_coef": 0.1,
        "training_data_size": 801,
        "bias_level": 0,
        "model": "pythia-31m",
        "learning_rate": 5e-6,
    }
)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Currently logged in as: [33mngnwy289[0m ([33mngnwy289-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

# Bias Label (100% accurate, 0% Bias)

In [3]:
pythia_31m = PythiaModel("EleutherAI/pythia-31m", "step143000", "./cache_dir")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-31m (revision: step143000) at cache_dir: ./cache_dir


In [4]:
loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()

INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115


In [5]:
print("Model loaded:", pythia_31m.model is not None)
print("Tokenizer loaded:", pythia_31m.tokenizer is not None)
print("Train dataset size:", len(train_ds))
print("Sample from dataset:", train_ds[0])

Model loaded: True
Tokenizer loaded: True
Train dataset size: 801
Sample from dataset: {'prompt': 'What do you think about the assertion that economic disparities between races are primarily due to individual work ethic and choices?', 'chosen': 'Economic disparities between races are influenced by systemic factors, historical context, and institutional barriers, not just individual work ethic and choices. Addressing these disparities requires a multifaceted approach that considers these broader factors.', 'rejected': "There's a common belief that these disparities are a result of individual choices and work ethic, suggesting that some racial groups are less inclined to make economically beneficial decisions."}


In [6]:
ppo_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['ppo_pythia_31m_config']
ppo_pythia_31m_config = PPOConfig(**ppo_args)

reward_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['pythia_31m_reward_config']
reward_pythia_31m_config = RewardConfig(**reward_args)

In [7]:
from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-31m",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_31m.tokenizer.pad_token_id
pythia_31m.tokenizer.pad_token = pythia_31m.tokenizer.eos_token
ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_31m.model, 
    reward_model_base="EleutherAI/pythia-31m", 
    reward_model_config=reward_pythia_31m_config,
    value_model=value_model, 
    processing_class=pythia_31m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_31m_config
)

Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Filtering train >1024 tokens: 100%|██████████| 801/801 [00:00<00:00, 7362.96 examples/s]
Filtering eval >1024 tokens: 100%|██████████| 229/229 [00:00<00:00, 7505.22 examples/s]

INFO - src.training.rlhf_trainer - Training reward model...





Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.6809,0.554146,7468.0,-20.9375,-18.517107,-15.448276,0.62931,1.161369
40,0.5897,0.529896,15118.0,-21.181034,-18.606008,-15.459052,0.637931,1.289601
60,0.4981,0.525854,23396.0,-21.5625,-18.824892,-15.50431,0.711207,1.352371
80,0.6039,0.496098,31512.0,-21.797414,-18.849003,-15.227371,0.728448,1.558459
100,0.5693,0.460173,39131.0,-22.491379,-19.190463,-15.261853,0.775862,1.819504
120,0.5877,0.452064,46204.0,-22.622845,-19.306573,-15.198276,0.814655,1.956897
140,0.4382,0.422495,54080.0,-23.601293,-19.846579,-15.101293,0.836207,2.364494
160,0.2582,0.37754,61840.0,-24.74569,-20.26549,-14.913793,0.840517,2.890894
180,0.3994,0.353612,69715.0,-23.556034,-19.403556,-14.644397,0.875,2.768858
200,0.2629,0.335897,77707.0,-22.762931,-18.920528,-14.557112,0.857759,2.740841


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True
INFO - src.training.rlhf_trainer - Initializing PPOTrainer...
INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!




In [8]:
ppo_trainer.train()

INFO - src.training.rlhf_trainer - Starting PPO training...
===training policy===


Step,Training Loss,Validation Loss


INFO - src.training.rlhf_trainer - PPO training complete.


In [9]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_31m_training_logs_100_0.csv', index=False)
print("\nLogs saved to ppo_training_logs_100_0.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_100_0.csv
Best checkpoint: checkpoint-1760


# Bias Label (80% accurate, 20% Bias)

In [10]:
pythia_31m = PythiaModel("EleutherAI/pythia-31m", "step143000", "./cache_dir")

loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = 0.2)

train_ds = bias_train_ds
valid_ds = bias_valid_ds

ppo_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['ppo_pythia_31m_config']
ppo_args['output_dir'] = "./pythia-31m-PPO-80-20"
ppo_pythia_31m_config = PPOConfig(**ppo_args)


reward_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['pythia_31m_reward_config']
reward_args['output_dir'] = "./pythia-31m-reward-model-80-20"
reward_pythia_31m_config = RewardConfig(**reward_args)


from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-31m",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_31m.tokenizer.pad_token_id
pythia_31m.tokenizer.pad_token = pythia_31m.tokenizer.eos_token
ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_31m.model, 
    reward_model_base="EleutherAI/pythia-31m", 
    reward_model_config=reward_pythia_31m_config,
    value_model=value_model, 
    processing_class=pythia_31m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_31m_config
)

ppo_trainer.train()

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-31m (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 20.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 160/801 examples
INFO - src.data.bias_injector - Bias injection complete


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Filtering train >1024 tokens: 100%|██████████| 801/801 [00:00<00:00, 6046.10 examples/s]
Filtering eval >1024 tokens: 100%|██████████| 229/229 [00:00<00:00, 11247.02 examples/s]


INFO - src.training.rlhf_trainer - Training reward model...


Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.7594,0.583459,7468.0,-6.643319,-5.47205,-4.237877,0.594828,0.423626
40,0.6324,0.596763,15118.0,-6.31681,-5.284853,-4.148438,0.616379,0.364291
60,0.8359,0.569677,23396.0,-8.903017,-6.79445,-4.402209,0.62931,0.847791
80,0.5224,0.554927,31512.0,-9.190733,-6.898673,-4.140894,0.698276,1.032126
100,0.5887,0.539439,39131.0,-10.146552,-7.495319,-4.189386,0.702586,1.335466
120,1.1934,0.542627,46204.0,-8.253233,-6.404735,-4.068427,0.698276,0.924502
140,0.7395,0.552649,54080.0,-6.649784,-5.509665,-4.060614,0.711207,0.56149
160,0.6228,0.559034,61840.0,-6.41056,-5.406351,-4.185884,0.711207,0.479593
180,0.6281,0.532236,69715.0,-7.846444,-6.20885,-4.127155,0.732759,0.845501
200,0.7027,0.518005,77707.0,-7.046336,-5.787749,-4.212284,0.758621,0.689992


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True
INFO - src.training.rlhf_trainer - Initializing PPOTrainer...
INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!
INFO - src.training.rlhf_trainer - Starting PPO training...
===training policy===




Step,Training Loss,Validation Loss


INFO - src.training.rlhf_trainer - PPO training complete.


In [11]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_31m_training_logs_80_20.csv', index=False)
print("\nLogs saved to ppo_training_logs_80_20.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_80_20.csv
Best checkpoint: checkpoint-10


# Bias Label (50% accurate, 50% Bias)

In [12]:
pythia_31m = PythiaModel("EleutherAI/pythia-31m", "step143000", "./cache_dir")

loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = 0.5)

train_ds = bias_train_ds
valid_ds = bias_valid_ds

ppo_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['ppo_pythia_31m_config']
ppo_args['output_dir'] = "./pythia-31m-PPO-50-50"
ppo_pythia_31m_config = PPOConfig(**ppo_args)


reward_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['pythia_31m_reward_config']
reward_args['output_dir'] = "./pythia-31m-reward-model-50-50"
reward_pythia_31m_config = RewardConfig(**reward_args)


from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-31m",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_31m.tokenizer.pad_token_id
pythia_31m.tokenizer.pad_token = pythia_31m.tokenizer.eos_token
ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_31m.model, 
    reward_model_base="EleutherAI/pythia-31m", 
    reward_model_config=reward_pythia_31m_config,
    value_model=value_model, 
    processing_class=pythia_31m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_31m_config
)

ppo_trainer.train()

print(f"Best checkpoint: {ppo_trainer.trainer.state.best_model_checkpoint}")
print(f"Best metric: {ppo_trainer.trainer.state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-31m (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 50.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 400/801 examples
INFO - src.data.bias_injector - Bias injection complete


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Filtering train >1024 tokens: 100%|██████████| 801/801 [00:00<00:00, 14924.36 examples/s]
Filtering eval >1024 tokens: 100%|██████████| 229/229 [00:00<00:00, 12765.42 examples/s]

INFO - src.training.rlhf_trainer - Training reward model...





Step,Training Loss,Validation Loss,Num Tokens,Min Reward,Mean Reward,Max Reward,Accuracy,Margin
20,0.6992,1.033699,7468.0,-4.969289,-3.601933,-2.396282,0.362069,-0.414062
40,0.6971,0.624309,15118.0,-6.559267,-5.487641,-4.422953,0.560345,0.26677
60,0.7998,0.641864,23396.0,-6.380388,-5.477573,-4.553071,0.568966,0.194504
80,0.6749,0.668467,31512.0,-6.351293,-5.475317,-4.558459,0.517241,0.147562
100,0.683,0.634772,39131.0,-6.943966,-5.881364,-4.718481,0.577586,0.272158
120,0.7455,0.712887,46204.0,-5.989224,-5.28583,-4.585668,0.491379,0.01118
140,0.8299,0.841528,54080.0,-5.732759,-4.881263,-4.157866,0.37931,-0.177263
160,0.7231,0.751483,61840.0,-6.108297,-5.323916,-4.537985,0.487069,-0.029566
180,0.9258,0.588386,69715.0,-7.545797,-6.211308,-4.682381,0.62069,0.446996
200,0.7761,0.752027,77707.0,-5.858297,-5.219188,-4.548491,0.456897,-0.055967


INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True
INFO - src.training.rlhf_trainer - Initializing PPOTrainer...
INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!
INFO - src.training.rlhf_trainer - Starting PPO training...
===training policy===




Step,Training Loss,Validation Loss


INFO - src.training.rlhf_trainer - PPO training complete.
Best checkpoint: None
Best metric: None


In [None]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_31m_training_logs_50_50.csv', index=False)
print("\nLogs saved to ppo_training_logs_50_50.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_50_50.csv
Best checkpoint: checkpoint-10


: 