In [1]:
import sys
import os

project_root = os.path.abspath("..")
sys.path.insert(0, project_root)

from src.models.pythia_model import PythiaModel
from src.data.dataset_loader import DatasetLoader
from src.data.bias_injector import BiasInjector
from src.training.dpo_trainer import DPO_Trainer
from src.training.utils import load_experiment_config
import numpy as np
from trl import DPOConfig

import logging
# Baseline root-logger configuration at INFO level.
logging.basicConfig(level=logging.INFO)

# Notebook-specific logging: replace whatever handlers are attached to the
# root logger with a single stdout handler so log records show up in the
# cell output.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = []

# Records are rendered as "<LEVEL> - <logger name> - <message>".
formatter = logging.Formatter(fmt='%(levelname)s - %(name)s - %(message)s')

# Stream handler bound to sys.stdout, emitting INFO and above.
handler = logging.StreamHandler(stream=sys.stdout)
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)

logger.addHandler(handler)

import wandb

# Hyperparameters recorded as metadata on the W&B run.
wandb_config = {
    "beta": 0.1,
    "training_data_size": 801,
    "bias_level": 0,
    "model": "pythia-160m",
    "learning_rate": 5e-6,
}

# The run name is derived from the config so the two cannot disagree
# (the name previously said "bias20" while bias_level was logged as 0).
# NOTE(review): later cells also train with 20% and 50% flipped labels
# without starting a new run -- confirm whether each experiment should get
# its own wandb.init()/wandb.finish() pair.
wandb.init(
    project="l101",
    name=f"dpo_beta{wandb_config['beta']}_bias{wandb_config['bias_level']}_run1",
    config=wandb_config,
)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Currently logged in as: [33mngnwy289[0m ([33mngnwy289-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


In [2]:
# Expose only GPU index 2 to CUDA for this kernel.
# NOTE(review): this runs after the imports in the first cell; it only takes
# effect if nothing has initialised CUDA yet -- confirm, or move it above the
# imports.
os.environ.update({"CUDA_VISIBLE_DEVICES": "2"})

# Bias Label (100% accurate, 0% Bias)

In [3]:
# Load the Pythia-160M (deduped) wrapper used for the baseline run.
# Per the loader's log line, the arguments are the model id, the checkpoint
# revision, and the cache directory.
pythia_160m = PythiaModel(
    "EleutherAI/pythia-160m-deduped",
    "step143000",
    "./cache_dir",
)

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir


In [4]:
# Load the BiasDPO dataset and unpack its train / validation / test splits.
loader = DatasetLoader()
biasdpo_splits = loader.load_biasDPO()
train_ds, valid_ds, test_ds = biasdpo_splits

INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115


In [5]:
# Quick sanity check: the model and tokenizer exist, and the training split
# has the expected size and record layout.
model_ready = pythia_160m.model is not None
print("Model loaded:", model_ready)

tokenizer_ready = pythia_160m.tokenizer is not None
print("Tokenizer loaded:", tokenizer_ready)

n_train = len(train_ds)
print("Train dataset size:", n_train)

first_example = train_ds[0]
print("Sample from dataset:", first_example)

Model loaded: True
Tokenizer loaded: True
Train dataset size: 801
Sample from dataset: {'prompt': 'What do you think about the assertion that economic disparities between races are primarily due to individual work ethic and choices?', 'chosen': 'Economic disparities between races are influenced by systemic factors, historical context, and institutional barriers, not just individual work ethic and choices. Addressing these disparities requires a multifaceted approach that considers these broader factors.', 'rejected': "There's a common belief that these disparities are a result of individual choices and work ethic, suggesting that some racial groups are less inclined to make economically beneficial decisions."}


In [6]:
# Read the experiment YAML and build the TRL DPOConfig from its
# 'dpo_pythia_160m_config' section.
experiment_config = load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_160m_config']
dpo_pythia_160m_config = DPOConfig(**dpo_args)

In [7]:
# Pass both splits through an identity map with cache loading disabled.
# NOTE(review): presumably this forces freshly materialised datasets rather
# than reusing cached ones -- confirm why it is needed.
map_options = {"load_from_cache_file": False}
train_ds = train_ds.map(lambda example: example, **map_options)
valid_ds = valid_ds.map(lambda example: example, **map_options)

# Wire the model, tokenizer, and both splits into the project's DPO trainer.
dpo_trainer = DPO_Trainer(
    pythia_160m.model,
    pythia_160m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_160m_config,
)

Map: 100%|██████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 14517.93 examples/s]
Map: 100%|██████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 10663.52 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!


In [8]:
# Run DPO fine-tuning on the unmodified preference labels; the metrics table
# in this cell's output is produced during this call.
dpo_trainer.train()

INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
20,0.6425,0.587411,0.318673,-0.418941,0.678448,0.737614,-870.222107,-856.893311,745.799072,747.019104
40,0.4157,0.433502,0.774791,-0.564834,0.788793,1.339625,-865.66095,-858.352234,742.743042,744.471619
60,0.3148,0.299501,1.129104,-0.813065,0.900862,1.942169,-862.117798,-860.834534,739.534058,741.77356
80,0.1296,0.240661,1.261912,-1.298317,0.905172,2.560229,-860.789673,-865.686951,736.914856,739.446228
100,0.1582,0.209025,1.464044,-1.661624,0.939655,3.125669,-858.768372,-869.320129,733.011292,736.066406
120,0.0818,0.185487,1.911562,-2.185677,0.948276,4.097239,-854.293152,-874.560608,726.586731,730.505371
140,0.279,0.168566,2.614894,-2.801761,0.943965,5.416656,-847.259888,-880.721436,717.864319,722.884155
160,0.0066,0.140466,4.102577,-2.748818,0.961207,6.851395,-832.383057,-880.192078,706.763489,712.531372
180,0.0089,0.147551,6.425293,-1.729778,0.969828,8.155071,-809.155884,-870.001587,694.640137,701.592651
200,0.0341,0.148238,9.871446,-0.800273,0.965517,10.671717,-774.694397,-860.706604,679.617188,688.763489


INFO - src.training.dpo_trainer - DPO training complete.


In [9]:
# Report which checkpoint the underlying trainer recorded as best, and the
# metric value it was selected on.
trainer_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {trainer_state.best_model_checkpoint}")
print(f"Best metric: {trainer_state.best_metric}")

Best checkpoint: ./pythia-160m-deduped-DPO/checkpoint-900
Best metric: 0.13579154014587402


In [10]:
import pandas as pd

# Destination for the exported metrics. The same variable feeds both the
# write and the confirmation message, so the message always names the file
# that was actually written (it previously printed a different filename).
log_csv_path = 'dpo_160m_training_logs_100_0.csv'

# Pull the per-step log entries recorded by the underlying trainer.
state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep entries whose step is a multiple of 10; if no 'step' column was
# logged, fall back to taking every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Drop columns whose names start with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_100_0.csv


# Bias Label (80% accurate, 20% Bias)

In [11]:
# --- Experiment: bias_ratio = 0.2 ---
# Start from a freshly loaded model so this run does not continue from the
# weights trained in the previous experiment.
pythia_160m = PythiaModel(
    "EleutherAI/pythia-160m-deduped",
    "step143000",
    "./cache_dir",
)

# Reload the clean splits, then let the injector produce the biased
# train/validation splits (fixed seed).
loader = DatasetLoader()
train_ds, valid_ds, test_ds = loader.load_biasDPO()

bias_injector = BiasInjector(loader, seed=42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio=0.2)

# Same DPO settings as the YAML section, but with a run-specific output
# directory for this experiment's checkpoints.
experiment_config = load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_160m_config']
dpo_args['output_dir'] = "./pythia-160m-deduped-DPO-80-20"
dpo_pythia_160m_config = DPOConfig(**dpo_args)

# Identity map with cache loading disabled, applied to the biased splits.
map_options = {"load_from_cache_file": False}
train_ds = bias_train_ds.map(lambda example: example, **map_options)
valid_ds = bias_valid_ds.map(lambda example: example, **map_options)

dpo_trainer = DPO_Trainer(
    pythia_160m.model,
    pythia_160m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_160m_config,
)

dpo_trainer.train()

# Report the checkpoint the underlying trainer recorded as best.
trainer_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {trainer_state.best_model_checkpoint}")
print(f"Best metric: {trainer_state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 20.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 160/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|██████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 31521.62 examples/s]
Map: 100%|██████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 15122.34 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
20,0.82,0.640534,0.206622,-0.284937,0.625,0.491559,-871.342651,-855.553223,745.951538,746.950684
40,0.8869,0.628298,0.132152,-0.369774,0.648276,0.501926,-872.08728,-856.401611,744.236084,745.410583
60,0.6745,0.546556,0.337367,-0.451113,0.734483,0.78848,-870.035156,-857.214966,742.83667,744.205933
80,0.4862,0.487782,0.446712,-0.683129,0.771552,1.129841,-868.941589,-859.535156,741.845032,743.262878
100,0.3545,0.45368,0.487912,-0.841843,0.784483,1.329755,-868.529663,-861.122314,740.673584,742.231934
120,1.1765,0.433555,0.501782,-0.876022,0.793103,1.377805,-868.391052,-861.464111,740.017944,741.706238
140,0.3859,0.41984,0.514305,-0.886045,0.806035,1.40035,-868.265747,-861.56427,739.867432,741.630127
160,0.4621,0.367913,0.824844,-0.780111,0.836207,1.604955,-865.1604,-860.505005,739.016724,740.893005
180,0.7734,0.324384,0.84094,-0.916292,0.87069,1.757232,-864.999512,-861.86676,739.288879,741.070923
200,0.7765,0.350993,0.835756,-0.934139,0.853448,1.769895,-865.05127,-862.045288,739.030334,740.874207


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-160m-deduped-DPO-80-20/checkpoint-940
Best metric: 0.23873135447502136


In [12]:
import pandas as pd

# Destination for the exported metrics. The same variable feeds both the
# write and the confirmation message, so the message always names the file
# that was actually written (it previously printed a different filename).
log_csv_path = 'dpo_160m_training_logs_80_20.csv'

# Pull the per-step log entries recorded by the underlying trainer.
state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep entries whose step is a multiple of 10; if no 'step' column was
# logged, fall back to taking every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Drop columns whose names start with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_80_20.csv


# Bias Label (50% accurate, 50% Bias)

In [13]:
# --- Experiment: bias_ratio = 0.5 ---
# Start from a freshly loaded model so this run does not continue from the
# weights trained in the previous experiment.
pythia_160m = PythiaModel(
    "EleutherAI/pythia-160m-deduped",
    "step143000",
    "./cache_dir",
)

# Reload the clean splits, then let the injector produce the biased
# train/validation splits (fixed seed).
loader = DatasetLoader()
train_ds, valid_ds, test_ds = loader.load_biasDPO()

bias_injector = BiasInjector(loader, seed=42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio=0.5)

# Same DPO settings as the YAML section, but with a run-specific output
# directory for this experiment's checkpoints.
experiment_config = load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_160m_config']
dpo_args['output_dir'] = "./pythia-160m-deduped-DPO-50-50"
dpo_pythia_160m_config = DPOConfig(**dpo_args)

# Identity map with cache loading disabled, applied to the biased splits.
map_options = {"load_from_cache_file": False}
train_ds = bias_train_ds.map(lambda example: example, **map_options)
valid_ds = bias_valid_ds.map(lambda example: example, **map_options)

dpo_trainer = DPO_Trainer(
    pythia_160m.model,
    pythia_160m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_160m_config,
)

dpo_trainer.train()

# Report the checkpoint the underlying trainer recorded as best.
trainer_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {trainer_state.best_model_checkpoint}")
print(f"Best metric: {trainer_state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 50.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 400/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|██████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 22435.57 examples/s]
Map: 100%|██████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 11311.39 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
20,0.6737,1.099412,-0.374215,0.055202,0.342241,-0.429417,-877.151001,-852.151794,747.46167,748.257996
40,0.7386,1.147104,-0.429645,0.035698,0.387069,-0.465343,-877.705261,-852.346863,747.101807,747.924683
60,0.4995,0.921367,-0.369809,-0.266042,0.488793,-0.103767,-877.106873,-855.364197,746.438049,747.320679
80,0.9904,0.906524,-0.388421,-0.296305,0.475862,-0.092115,-877.292969,-855.66687,746.627563,747.475952
100,0.9105,0.994872,-0.49523,-0.238955,0.464655,-0.256275,-878.361145,-855.093384,747.121643,747.993225
120,0.6153,1.022771,-0.510234,-0.236353,0.427586,-0.273881,-878.511169,-855.067444,746.985046,747.855225
140,0.9519,0.932418,-0.42869,-0.331416,0.471552,-0.097273,-877.695679,-856.018066,746.817932,747.693542
160,1.0258,0.916742,-0.63558,-0.536407,0.507759,-0.099173,-879.764526,-858.067932,747.026489,747.955322
180,1.1251,0.827185,-0.55973,-0.720479,0.555172,0.160749,-879.006226,-859.908691,746.749512,747.756714
200,1.0016,0.857875,-0.609841,-0.680603,0.551724,0.070762,-879.507263,-859.509888,746.51416,747.43927


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-160m-deduped-DPO-50-50/checkpoint-180
Best metric: 0.8271852135658264


In [14]:
import pandas as pd

# Destination for the exported metrics. The same variable feeds both the
# write and the confirmation message, so the message always names the file
# that was actually written (it previously printed a different filename).
log_csv_path = 'dpo_160m_training_logs_50_50.csv'

# Pull the per-step log entries recorded by the underlying trainer.
state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep entries whose step is a multiple of 10; if no 'step' column was
# logged, fall back to taking every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Drop columns whose names start with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_50_50.csv
