In [1]:
import sys
import os

# Put the parent of the current working directory at the front of the import
# path so that the `src.*` project imports below can be resolved.
# NOTE(review): assumes the kernel's working directory is a direct
# subdirectory of the project root — confirm.
project_root = os.path.abspath("..")
sys.path.insert(0, project_root)

from src.models.pythia_model import PythiaModel
from src.data.dataset_loader import DatasetLoader
from src.data.bias_injector import BiasInjector
from src.training.dpo_trainer import DPO_Trainer
from src.training.utils import load_experiment_config
import numpy as np
from trl import DPOConfig

import logging

# Code specific to Jupyter Notebook: route log records to stdout so they are
# rendered in the cell output.
logger = logging.getLogger()
## Create handler that outputs to notebook
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
## Create formatter
formatter = logging.Formatter('%(levelname)s - %(name)s - %(message)s')
handler.setFormatter(formatter)
## Install the handler as the only root handler and set the root level.
## force=True removes *and closes* any handlers already attached to the root
## logger, so no default handler is created only to be thrown away, and
## re-running this cell does not stack duplicate handlers.
logging.basicConfig(level=logging.INFO, handlers=[handler], force=True)

import wandb

# Start the Weights & Biases run for the first experiment in this notebook
# (bias_level 0, i.e. the run on unmodified preference labels).
# The run name must agree with the logged config: it previously read
# "bias20" although bias_level is 0.
# NOTE(review): this is the only wandb.init call visible in the notebook;
# presumably the 20% and 50% bias trainings further down report into this same
# run — confirm, and start a separate run per experiment if so.
wandb.init(
    project="l101",
    name="dpo_beta0.1_bias0_run1",
    config={
        "beta": 0.1,
        "training_data_size": 801,
        "bias_level": 0,
        "model": "pythia-31m",
        "learning_rate": 5e-6,
    },
)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Currently logged in as: [33mngnwy289[0m ([33mngnwy289-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


In [2]:
# Set CUDA_VISIBLE_DEVICES to "1" for this process.
# NOTE(review): this runs after the imports in the first cell; presumably CUDA
# has not been initialised at that point so the setting still takes effect —
# confirm.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

# Bias Label (100% accurate, 0% Bias)

In [3]:
# Load Pythia-31M: model id, checkpoint revision, and local cache directory.
model_id, revision, cache_dir = "EleutherAI/pythia-31m", "step143000", "./cache_dir"
pythia_31m = PythiaModel(model_id, revision, cache_dir)

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-31m (revision: step143000) at cache_dir: ./cache_dir


In [4]:
# Fetch the BiasDPO splits and unpack them into train / validation / test.
loader = DatasetLoader()
biasdpo_splits = loader.load_biasDPO()
train_ds, valid_ds, test_ds = biasdpo_splits

INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115


In [5]:
# Quick sanity checks: model and tokenizer are present, and the training split
# has the expected size and record layout.
print(f"Model loaded: {pythia_31m.model is not None}")
print(f"Tokenizer loaded: {pythia_31m.tokenizer is not None}")
print(f"Train dataset size: {len(train_ds)}")
print(f"Sample from dataset: {train_ds[0]}")

Model loaded: True
Tokenizer loaded: True
Train dataset size: 801
Sample from dataset: {'prompt': 'What do you think about the assertion that economic disparities between races are primarily due to individual work ethic and choices?', 'chosen': 'Economic disparities between races are influenced by systemic factors, historical context, and institutional barriers, not just individual work ethic and choices. Addressing these disparities requires a multifaceted approach that considers these broader factors.', 'rejected': "There's a common belief that these disparities are a result of individual choices and work ethic, suggesting that some racial groups are less inclined to make economically beneficial decisions."}


In [6]:
# Read the experiment YAML and build the DPO training configuration from its
# 'dpo_pythia_31m_config' section.
experiment_cfg = load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")
dpo_args = experiment_cfg['dpo_pythia_31m_config']
dpo_pythia_31m_config = DPOConfig(**dpo_args)

In [7]:
# Run an identity map over both splits with cache loading disabled.
# NOTE(review): presumably this forces the datasets to be re-materialised
# instead of reusing a cached result — confirm the intent.
train_ds, valid_ds = (
    split.map(lambda x: x, load_from_cache_file=False)
    for split in (train_ds, valid_ds)
)

# Wrap model, tokenizer and both splits in the project's DPO trainer.
dpo_trainer = DPO_Trainer(
    pythia_31m.model,
    pythia_31m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_31m_config,
)

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 18506.63 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 14480.56 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!


In [8]:
# Start DPO training. This is the cell's last expression, so any non-None
# return value is displayed as the cell output.
dpo_trainer.train()

INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
10,1.0018,0.896379,-0.240499,-0.637988,0.591379,0.397489,-217.519882,-180.513718,1975.396606,1976.012451
20,0.8924,0.787757,-0.036301,-0.661493,0.598276,0.625192,-215.477905,-180.748764,1974.828369,1975.50354
30,0.5222,0.756704,-0.021138,-0.879969,0.661207,0.85883,-215.326294,-182.933502,1974.331665,1975.058838
40,0.6233,0.694403,0.076316,-1.085683,0.672414,1.161999,-214.351746,-184.990677,1974.026245,1974.843872
50,0.7792,0.642511,0.200109,-1.088267,0.69569,1.288376,-213.113815,-185.01651,1973.707886,1974.561523
60,0.6761,0.561643,0.237384,-1.307912,0.742241,1.545296,-212.741043,-187.212982,1973.597168,1974.505371
70,0.248,0.594869,0.178583,-1.523329,0.710345,1.701913,-213.329086,-189.367142,1973.416504,1974.349121
80,0.739,0.550053,0.102803,-1.628737,0.721552,1.73154,-214.086884,-190.421219,1973.252441,1974.218506
90,0.4314,0.480472,0.259991,-1.851923,0.771552,2.111915,-212.515015,-192.653061,1972.809814,1973.777832
100,0.6049,0.573584,-0.030193,-2.139709,0.75431,2.109517,-215.416824,-195.53093,1972.362427,1973.416138


INFO - src.training.dpo_trainer - DPO training complete.


In [9]:
# Report the best checkpoint path and best metric value recorded in the
# underlying trainer's state.
trainer_state = dpo_trainer.trainer.state
print("Best checkpoint:", trainer_state.best_model_checkpoint)
print("Best metric:", trainer_state.best_metric)

Best checkpoint: ./pythia-31m-DPO/checkpoint-760
Best metric: 0.1919344663619995


In [10]:
import pandas as pd

# Export the trainer's log history for the 100/0 run to CSV.
# The output path is defined once so the confirmation message always names the
# file that was actually written (the message used to report a different name).
csv_path = 'dpo_31m_training_logs_100_0.csv'

state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep only entries whose step is a multiple of 10; if there is no 'step'
# column, fall back to every tenth row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Drop columns whose names start with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(csv_path, index=False)
print(f"\nLogs saved to {csv_path}")


Logs saved to dpo_training_logs_100_0.csv


# Bias Label (80% accurate, 20% Bias)

In [11]:
# 80/20 experiment: reload the model and data, inject bias with ratio 0.2,
# then train with a dedicated output directory.
pythia_31m = PythiaModel("EleutherAI/pythia-31m", "step143000", "./cache_dir")

loader = DatasetLoader()
train_ds, valid_ds, test_ds = loader.load_biasDPO()

# NOTE(review): the injector receives the loader itself, so it presumably
# relies on the splits loaded just above — confirm.
bias_injector = BiasInjector(loader, seed=42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio=0.2)

# Same YAML section as the baseline run, with the output directory overridden.
experiment_cfg = load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")
dpo_args = experiment_cfg['dpo_pythia_31m_config']
dpo_args['output_dir'] = "./pythia-31m-DPO-80-20"
dpo_pythia_31m_config = DPOConfig(**dpo_args)

# Identity map over the bias-injected splits with cache loading disabled.
train_ds, valid_ds = (
    split.map(lambda x: x, load_from_cache_file=False)
    for split in (bias_train_ds, bias_valid_ds)
)

dpo_trainer = DPO_Trainer(
    pythia_31m.model,
    pythia_31m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_31m_config,
)
dpo_trainer.train()

print("Best checkpoint:", dpo_trainer.trainer.state.best_model_checkpoint)
print("Best metric:", dpo_trainer.trainer.state.best_metric)

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-31m (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 20.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 160/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 7821.24 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 2633.65 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
10,1.6865,0.966367,-0.192099,-0.443974,0.586207,0.251875,-217.035904,-178.573578,1975.403564,1976.011841
20,0.9007,0.787674,0.087326,-0.560581,0.672414,0.647908,-214.241653,-179.739655,1975.008179,1975.675659
30,0.7755,0.845069,0.033093,-0.488974,0.612069,0.522067,-214.783966,-179.02359,1974.690674,1975.418213
40,1.0155,0.863267,-0.068255,-0.668646,0.631034,0.600392,-215.797485,-180.820312,1974.551514,1975.285156
50,0.9989,0.74384,0.206543,-0.644155,0.656897,0.850698,-213.049469,-180.575394,1974.368896,1975.11731
60,1.0203,0.750176,0.310162,-0.659204,0.674138,0.969367,-212.013306,-180.725906,1974.192749,1974.996826
70,0.9871,0.888742,-0.199238,-0.855659,0.612069,0.656422,-217.107285,-182.690445,1974.033569,1974.821045
80,1.1846,0.865832,-0.218847,-0.95796,0.600862,0.739113,-217.303406,-183.713425,1973.864746,1974.708252
90,0.96,0.835238,-0.120058,-1.058187,0.598276,0.938129,-216.315491,-184.715698,1973.745239,1974.61438
100,0.7106,0.709903,0.043719,-1.273094,0.682759,1.316813,-214.677719,-186.864761,1973.671753,1974.555664


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-31m-DPO-80-20/checkpoint-670
Best metric: 0.32171663641929626


In [12]:
import pandas as pd

# Collect the log history kept in the underlying trainer's state.
state = dpo_trainer.trainer.state
logs = state.log_history
df = pd.DataFrame(logs)

# Keep entries whose step is a multiple of 10; without a 'step' column,
# fall back to every tenth row.
if 'step' in df.columns:
    df_every_10 = df[df['step'] % 10 == 0]
else:
    df_every_10 = df.iloc[::10]

# Skip columns whose names start with an underscore, then write the CSV.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]
df_every_10[relevant_cols].to_csv('dpo_31m_training_logs_80_20.csv', index=False)
print("\nLogs saved to dpo_31m_training_logs_80_20.csv")


Logs saved to dpo_31m_training_logs_80_20.csv


# Bias Label (50% accurate, 50% Bias)

In [13]:
# 50/50 experiment: reload the model and data, inject bias with ratio 0.5,
# then train with a dedicated output directory.
pythia_31m = PythiaModel("EleutherAI/pythia-31m", "step143000", "./cache_dir")

loader = DatasetLoader()
train_ds, valid_ds, test_ds = loader.load_biasDPO()

# NOTE(review): the injector receives the loader itself, so it presumably
# relies on the splits loaded just above — confirm.
bias_injector = BiasInjector(loader, seed=42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio=0.5)

# Same YAML section as the baseline run, with the output directory overridden.
experiment_cfg = load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")
dpo_args = experiment_cfg['dpo_pythia_31m_config']
dpo_args['output_dir'] = "./pythia-31m-DPO-50-50"
dpo_pythia_31m_config = DPOConfig(**dpo_args)

# Identity map over the bias-injected splits with cache loading disabled.
train_ds, valid_ds = (
    split.map(lambda x: x, load_from_cache_file=False)
    for split in (bias_train_ds, bias_valid_ds)
)

dpo_trainer = DPO_Trainer(
    pythia_31m.model,
    pythia_31m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_31m_config,
)
dpo_trainer.train()

print("Best checkpoint:", dpo_trainer.trainer.state.best_model_checkpoint)
print("Best metric:", dpo_trainer.trainer.state.best_metric)

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-31m (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 50.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 400/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 9674.51 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 3102.50 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
10,1.2018,1.19973,-0.600942,-0.447276,0.460345,-0.153667,-221.124344,-178.606613,1976.100586,1976.561157
20,1.3314,1.27682,-0.857792,-0.461261,0.438793,-0.396531,-223.692825,-178.746445,1976.1604,1976.607178
30,0.9533,1.304948,-0.940492,-0.465302,0.401724,-0.47519,-224.519806,-178.78688,1976.044678,1976.544678
40,1.2391,1.427286,-0.965691,-0.403994,0.412931,-0.561697,-224.771805,-178.173782,1976.218872,1976.702393
50,0.4546,1.314819,-0.881015,-0.442343,0.423276,-0.438672,-223.925079,-178.557281,1976.193481,1976.675659
60,1.1169,1.307753,-0.982094,-0.583539,0.443103,-0.398555,-224.935852,-179.969223,1976.096436,1976.635986
70,0.819,1.245815,-0.965885,-0.645756,0.451724,-0.320129,-224.773758,-180.5914,1976.314575,1976.793701
80,0.8093,0.99359,-0.638582,-0.748935,0.553448,0.110354,-221.500717,-181.623184,1976.262451,1976.732178
90,1.3711,1.102405,-0.885212,-0.816264,0.494828,-0.068947,-223.96701,-182.296478,1976.124878,1976.614868
100,1.1164,1.259475,-1.110361,-0.81422,0.456034,-0.296141,-226.218536,-182.276047,1976.303467,1976.781006


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-31m-DPO-50-50/checkpoint-350
Best metric: 0.8990842700004578


In [14]:
import pandas as pd

# Export the trainer's log history for the 50/50 run to CSV.
# The output path is defined once so the confirmation message always names the
# file that was actually written (the message used to report a different name).
csv_path = 'dpo_31m_training_logs_50_50.csv'

state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep only entries whose step is a multiple of 10; if there is no 'step'
# column, fall back to every tenth row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Drop columns whose names start with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(csv_path, index=False)
print(f"\nLogs saved to {csv_path}")


Logs saved to dpo_training_logs_50_50.csv
