In [1]:
import sys
import os

project_root = os.path.abspath("..")
sys.path.insert(0, project_root)

from src.models.pythia_model import PythiaModel
from src.data.dataset_loader import DatasetLoader
from src.data.bias_injector import BiasInjector
from src.training.rlhf_trainer import RLHF_PPO_Trainer
from src.training.utils import load_experiment_config
import numpy as np
from trl import PPOConfig, RewardConfig

import logging
logging.basicConfig(level=logging.INFO)

# Code specific to Jupyter Notebook
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = []
## Create handler that outputs to notebook
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
## Create formatter
formatter = logging.Formatter('%(levelname)s - %(name)s - %(message)s')
handler.setFormatter(formatter)
## Add handler to logger
logger.addHandler(handler)

import wandb

wandb.init(
    project="l101",           
    name="ppo_beta0.1_bias20_run1",     
    config={
        "kl_coef": 0.1,
        "training_data_size": 801,
        "bias_level": 0,
        "model": "pythia-31m",
        "learning_rate": 5e-6,
    }
)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Currently logged in as: [33mngnwy289[0m ([33mngnwy289-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

# Bias Label (100% accurate, 0% Bias)

In [3]:
pythia_31m = PythiaModel("EleutherAI/pythia-31m", "step143000", "./cache_dir")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-31m (revision: step143000) at cache_dir: ./cache_dir


In [4]:
loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()

INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115


In [5]:
print("Model loaded:", pythia_31m.model is not None)
print("Tokenizer loaded:", pythia_31m.tokenizer is not None)
print("Train dataset size:", len(train_ds))
print("Sample from dataset:", train_ds[0])

Model loaded: True
Tokenizer loaded: True
Train dataset size: 801
Sample from dataset: {'prompt': 'What do you think about the assertion that economic disparities between races are primarily due to individual work ethic and choices?', 'chosen': 'Economic disparities between races are influenced by systemic factors, historical context, and institutional barriers, not just individual work ethic and choices. Addressing these disparities requires a multifaceted approach that considers these broader factors.', 'rejected': "There's a common belief that these disparities are a result of individual choices and work ethic, suggesting that some racial groups are less inclined to make economically beneficial decisions."}


In [6]:
ppo_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['ppo_pythia_31m_config']
ppo_pythia_31m_config = PPOConfig(**ppo_args)

reward_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['pythia_31m_reward_config']
reward_pythia_31m_config = RewardConfig(**reward_args)

average_tokens_across_devices is set to True but it is invalid when world size is1. Turn it to False automatically.


In [7]:
from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-31m",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_31m.tokenizer.pad_token_id
pythia_31m.tokenizer.pad_token = pythia_31m.tokenizer.eos_token
ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_31m.model, 
    reward_model_base="EleutherAI/pythia-31m", 
    reward_model_config=reward_pythia_31m_config,
    value_model=value_model, 
    processing_class=pythia_31m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_31m_config
)

Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Training reward model...


You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
20,0.5596,0.530613,0.797235
40,0.3555,0.435818,0.888393
60,0.3873,0.417685,0.870536
80,0.4136,0.395338,0.864035
100,0.1592,0.415291,0.881057
120,0.3045,0.417624,0.89083
140,0.7076,0.461927,0.90393
160,0.1585,0.395093,0.916667
180,0.3695,0.416252,0.917031
200,0.4443,0.39832,0.947368




INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True


Map:   0%|                                                                                                                                                                     | 0/801 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 21563.78 examples/s]


INFO - src.training.rlhf_trainer - Initializing PPOTrainer...
INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!


In [8]:
ppo_trainer.train()

INFO - src.training.rlhf_trainer - Starting PPO training...
===training policy===


  rewards[[actual_start, actual_end]] += scores


Step,Training Loss,Validation Loss


  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores


INFO - src.training.rlhf_trainer - PPO training complete.


In [9]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_31m_training_logs_100_0.csv', index=False)
print("\nLogs saved to ppo_training_logs_100_0.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_100_0.csv
Best checkpoint: checkpoint-10


# Bias Label (80% accurate, 20% Bias)

In [10]:
pythia_31m = PythiaModel("EleutherAI/pythia-31m", "step143000", "./cache_dir")

loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = 0.2)

train_ds = bias_train_ds
valid_ds = bias_valid_ds

ppo_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['ppo_pythia_31m_config']
ppo_args['output_dir'] = "./pythia-31m-PPO-80-20"
ppo_pythia_31m_config = PPOConfig(**ppo_args)


reward_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['pythia_31m_reward_config']
reward_args['output_dir'] = "./pythia-31m-reward-model-80-20"
reward_pythia_31m_config = RewardConfig(**reward_args)


from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-31m",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_31m.tokenizer.pad_token_id
pythia_31m.tokenizer.pad_token = pythia_31m.tokenizer.eos_token
ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_31m.model, 
    reward_model_base="EleutherAI/pythia-31m", 
    reward_model_config=reward_pythia_31m_config,
    value_model=value_model, 
    processing_class=pythia_31m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_31m_config
)

ppo_trainer.train()

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-31m (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 20.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 160/801 examples
INFO - src.data.bias_injector - Bias injection complete


average_tokens_across_devices is set to True but it is invalid when world size is1. Turn it to False automatically.
Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Training reward model...


You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
20,0.7378,0.503951,0.772727
40,0.6291,0.493933,0.757991
60,0.7484,0.467774,0.816964
80,0.4393,0.474555,0.799087
100,0.5772,0.419561,0.859729
120,0.859,0.633427,0.656388
140,0.6599,0.370573,0.859031
160,0.5579,0.367897,0.881057
180,0.8206,0.372165,0.884956
200,0.7377,0.37108,0.880531




INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True
INFO - src.training.rlhf_trainer - Initializing PPOTrainer...
INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!
INFO - src.training.rlhf_trainer - Starting PPO training...
===training policy===


  rewards[[actual_start, actual_end]] += scores


Step,Training Loss,Validation Loss


  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores


INFO - src.training.rlhf_trainer - PPO training complete.


In [11]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_31m_training_logs_80_20.csv', index=False)
print("\nLogs saved to ppo_training_logs_80_20.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_80_20.csv
Best checkpoint: checkpoint-10


# Bias Label (50% accurate, 50% Bias)

In [12]:
pythia_31m = PythiaModel("EleutherAI/pythia-31m", "step143000", "./cache_dir")

loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = 0.5)

train_ds = bias_train_ds
valid_ds = bias_valid_ds

ppo_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['ppo_pythia_31m_config']
ppo_args['output_dir'] = "./pythia-31m-PPO-50-50"
ppo_pythia_31m_config = PPOConfig(**ppo_args)


reward_args= load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")['pythia_31m_reward_config']
reward_args['output_dir'] = "./pythia-31m-reward-model-50-50"
reward_pythia_31m_config = RewardConfig(**reward_args)


from transformers import AutoModelForSequenceClassification
import torch
value_model = AutoModelForSequenceClassification.from_pretrained(
                "EleutherAI/pythia-31m",
                num_labels=1,
                )
value_model.config.pad_token_id = pythia_31m.tokenizer.pad_token_id
pythia_31m.tokenizer.pad_token = pythia_31m.tokenizer.eos_token
ppo_trainer = RLHF_PPO_Trainer(
    model=pythia_31m.model, 
    reward_model_base="EleutherAI/pythia-31m", 
    reward_model_config=reward_pythia_31m_config,
    value_model=value_model, 
    processing_class=pythia_31m.tokenizer, 
    train_dataset=train_ds, 
    valid_ds=valid_ds, 
    args=ppo_pythia_31m_config
)

ppo_trainer.train()

print(f"Best checkpoint: {ppo_trainer.trainer.state.best_model_checkpoint}")
print(f"Best metric: {ppo_trainer.trainer.state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-31m (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 50.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 400/801 examples
INFO - src.data.bias_injector - Bias injection complete


average_tokens_across_devices is set to True but it is invalid when world size is1. Turn it to False automatically.
Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO - src.training.rlhf_trainer - Creating reward model from base...


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-31m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 25726.02 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 5008.53 examples/s]
Filter: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 13760.43 examples/s]

INFO - src.training.rlhf_trainer - Training reward model...



You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
20,0.7963,1.005703,0.435556
40,0.859,0.591217,0.704545
60,0.7065,0.607668,0.682243
80,0.7524,0.65576,0.60181
100,0.6697,0.744136,0.461538
120,0.7969,0.963023,0.351351
140,0.8422,1.059017,0.343891
160,0.7276,0.749645,0.439252
180,0.8037,0.611787,0.623853
200,0.6915,0.723826,0.472727




INFO - src.training.rlhf_trainer - Reward model training complete!
INFO - src.training.rlhf_trainer - Reward model type: <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForSequenceClassification'>
INFO - src.training.rlhf_trainer - Reward model has 'score' attribute: True


Map:   0%|                                                                                                                                                                     | 0/801 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 31009.27 examples/s]

INFO - src.training.rlhf_trainer - Initializing PPOTrainer...





INFO - src.training.rlhf_trainer - PPOTrainer initialized successfully!
INFO - src.training.rlhf_trainer - Starting PPO training...
===training policy===


  rewards[[actual_start, actual_end]] += scores


Step,Training Loss,Validation Loss


  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores
  rewards[[actual_start, actual_end]] += scores


INFO - src.training.rlhf_trainer - PPO training complete.
Best checkpoint: None
Best metric: None


In [13]:
import pandas as pd

state = ppo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv('ppo_31m_training_logs_50_50.csv', index=False)
print("\nLogs saved to ppo_training_logs_50_50.csv")

best_checkpoint = f"checkpoint-{int(df['objective/rlhf_reward'].idxmax()) * 10}"
print(f"Best checkpoint: {best_checkpoint}")


Logs saved to ppo_training_logs_50_50.csv
Best checkpoint: checkpoint-0
