In [1]:
import sys
import os

# Put the parent of the current working directory at the front of the import
# path so that the `src.*` imports below resolve.
project_root = os.path.abspath(os.pardir)
sys.path.insert(0, project_root)

from src.models.pythia_model import PythiaModel
from src.data.dataset_loader import DatasetLoader
from src.data.bias_injector import BiasInjector
from src.training.dpo_trainer import DPO_Trainer
from src.training.utils import load_experiment_config
import numpy as np
from trl import DPOConfig

import logging

# Notebook logging setup: route every record at INFO or above to the cell
# output (stdout) through a single handler on the root logger.
# No `logging.basicConfig(...)` call is needed here: any handler it installed
# would be detached again by the clean-up loop below.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Detach handlers already attached to the root logger so that re-running this
# cell does not print every message more than once.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)

## Create handler that outputs to notebook
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
## Create formatter: "<LEVEL> - <logger name> - <message>"
formatter = logging.Formatter('%(levelname)s - %(name)s - %(message)s')
handler.setFormatter(formatter)
## Add handler to logger
logger.addHandler(handler)

import wandb

# Hyper-parameters recorded with the Weights & Biases run.
wandb_config = {
    "beta": 0.1,
    "training_data_size": 801,
    "bias_level": 0,
    "model": "pythia-70m",
    "learning_rate": 5e-6,
}

# The run name is derived from the config values so that the name shown in the
# W&B UI always agrees with the logged `beta` / `bias_level`.
# NOTE(review): this run is started once, before all the experiments in this
# notebook — confirm whether each experiment should get its own run.
wandb.init(
    project="l101",
    name=f"dpo_beta{wandb_config['beta']}_bias{wandb_config['bias_level']}_run1",
    config=wandb_config,
)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Currently logged in as: [33mngnwy289[0m ([33mngnwy289-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


In [2]:
# Restrict this process to a single GPU (device index 2 on the host).
# NOTE(review): this runs after the project imports in the first cell; the
# variable only takes effect if nothing has initialised CUDA yet — confirm.
VISIBLE_GPU_INDEX = "2"
os.environ["CUDA_VISIBLE_DEVICES"] = VISIBLE_GPU_INDEX

# Baseline: preference labels 100% accurate (no bias injected)

In [3]:
# Load the Pythia-70M (deduped) model through the project's wrapper.
# Positional arguments, as echoed by the wrapper's INFO log line:
# model id, revision, cache directory.
pythia_70m = PythiaModel("EleutherAI/pythia-70m-deduped", "step143000", "./cache_dir")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-70m-deduped (revision: step143000) at cache_dir: ./cache_dir


In [4]:
# Build the project's dataset loader and fetch the BiasDPO preference data as
# three splits: train, validation and test.
loader = DatasetLoader()

train_ds, valid_ds, test_ds = loader.load_biasDPO()

INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115


In [5]:
# Sanity checks before training: the model and tokenizer objects exist, and
# the training split has the expected size and record layout.
# Each probe is evaluated lazily, right before its line is printed.
sanity_checks = (
    ("Model loaded:", lambda: pythia_70m.model is not None),
    ("Tokenizer loaded:", lambda: pythia_70m.tokenizer is not None),
    ("Train dataset size:", lambda: len(train_ds)),
    ("Sample from dataset:", lambda: train_ds[0]),
)
for label, probe in sanity_checks:
    print(label, probe())

Model loaded: True
Tokenizer loaded: True
Train dataset size: 801
Sample from dataset: {'prompt': 'What do you think about the assertion that economic disparities between races are primarily due to individual work ethic and choices?', 'chosen': 'Economic disparities between races are influenced by systemic factors, historical context, and institutional barriers, not just individual work ethic and choices. Addressing these disparities requires a multifaceted approach that considers these broader factors.', 'rejected': "There's a common belief that these disparities are a result of individual choices and work ethic, suggesting that some racial groups are less inclined to make economically beneficial decisions."}


In [6]:
# Read the experiment YAML, pick out the DPO section, and turn it into a
# TRL `DPOConfig`.
experiment_config = load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_70m_config']
dpo_pythia_70m_config = DPOConfig(**dpo_args)

In [7]:
# Pass both splits through an identity map with the datasets cache bypassed.
# NOTE(review): presumably this forces freshly materialised copies of the
# data — confirm why the no-op map is required.
train_ds = train_ds.map(lambda x: x, load_from_cache_file=False)
valid_ds = valid_ds.map(lambda x: x, load_from_cache_file=False)
# Reuse the end-of-sequence token as the padding token; this is set before the
# trainer is constructed.
pythia_70m.tokenizer.pad_token = pythia_70m.tokenizer.eos_token
# Hand model, tokenizer, both splits and the DPO config to the project's
# trainer wrapper.
dpo_trainer = DPO_Trainer(pythia_70m.model, pythia_70m.tokenizer, train_ds, valid_ds, args=dpo_pythia_70m_config)

Map: 100%|██████████| 801/801 [00:00<00:00, 17909.95 examples/s]
Map: 100%|██████████| 229/229 [00:00<00:00, 13428.43 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!


In [8]:
# Run DPO fine-tuning on the baseline (unmodified) data; the wrapper logs the
# start and end of training.
dpo_trainer.train()

INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
10,0.773,0.795832,0.323907,-0.289037,0.585345,0.612944,-1427.960938,-1394.368896,1549.116089,1549.889526
20,0.7889,0.683983,0.558831,-0.481932,0.665517,1.040763,-1425.611816,-1396.297852,1548.660522,1549.539429
30,0.497,0.752577,0.627312,-0.301024,0.643965,0.928335,-1424.927002,-1394.48877,1548.324951,1549.268433
40,0.5891,0.689699,0.871419,-0.447124,0.69569,1.318542,-1422.485962,-1395.949585,1547.873169,1548.857544
50,0.7192,0.553754,1.262347,-0.568508,0.738793,1.830855,-1418.57666,-1397.163818,1547.250488,1548.327026
60,0.6283,0.556508,1.176533,-0.708723,0.773276,1.885256,-1419.434692,-1398.565674,1546.715576,1547.834106
70,0.5428,0.500678,1.278042,-0.875054,0.773276,2.153096,-1418.419678,-1400.229126,1546.239014,1547.42749
80,0.7474,0.527527,1.295642,-0.980348,0.77069,2.27599,-1418.243652,-1401.282104,1545.775513,1546.99231
90,0.6863,0.496731,1.402221,-1.058715,0.803448,2.460936,-1417.178101,-1402.065674,1545.199341,1546.534668
100,0.3145,0.497253,1.305234,-1.196981,0.787931,2.502216,-1418.147705,-1403.448364,1544.636475,1546.110474


INFO - src.training.dpo_trainer - DPO training complete.


In [9]:
# Report the checkpoint the trainer recorded as best, and its metric value.
trainer_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {trainer_state.best_model_checkpoint}")
print(f"Best metric: {trainer_state.best_metric}")

Best checkpoint: ./pythia-70m-deduped-DPO/checkpoint-590
Best metric: 0.17000232636928558


In [10]:
import pandas as pd

# Export the trainer's log history for the baseline run to CSV, keeping only
# entries logged at steps that are multiples of 10.
log_csv_path = 'dpo_70m_training_logs_100_0.csv'

state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# If the history has no 'step' column, fall back to taking every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Skip columns whose names start with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
# Print the same path that was written, so the message cannot name a different file.
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_100_0.csv


# Biased labels: 80% accurate, 20% flipped

In [11]:
# 20%-bias experiment: reload the model and dataset, flip a share of the
# preference labels with the injector, then train into a separate output
# directory.
pythia_70m = PythiaModel("EleutherAI/pythia-70m-deduped", "step143000", "./cache_dir")

loader = DatasetLoader()
train_ds, valid_ds, test_ds = loader.load_biasDPO()

# The injector is built from the loader after the dataset has been loaded.
bias_injector = BiasInjector(loader, seed=42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio=0.2)

# Read the DPO section of the experiment YAML and override the output directory.
experiment_config = load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_70m_config']
dpo_args['output_dir'] = "./pythia-70m-deduped-DPO-80-20"
dpo_pythia_70m_config = DPOConfig(**dpo_args)

# Identity map with the datasets cache bypassed, applied to the biased splits.
train_ds = bias_train_ds.map(lambda x: x, load_from_cache_file=False)
valid_ds = bias_valid_ds.map(lambda x: x, load_from_cache_file=False)

# Pad with the end-of-sequence token, then build the trainer.
pythia_70m.tokenizer.pad_token = pythia_70m.tokenizer.eos_token
dpo_trainer = DPO_Trainer(
    pythia_70m.model,
    pythia_70m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_70m_config,
)

dpo_trainer.train()

# Report the checkpoint the trainer recorded as best, and its metric value.
trainer_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {trainer_state.best_model_checkpoint}")
print(f"Best metric: {trainer_state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-70m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 20.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 160/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|██████████| 801/801 [00:00<00:00, 8190.20 examples/s]
Map: 100%|██████████| 229/229 [00:00<00:00, 2128.39 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
10,1.0988,0.820979,0.078545,-0.410474,0.587069,0.489018,-1430.414795,-1395.583374,1549.504028,1550.255127
20,1.1622,0.799181,0.092887,-0.406008,0.634483,0.498894,-1430.27124,-1395.538696,1549.225464,1549.998169
30,1.2294,0.860447,0.067789,-0.481303,0.575862,0.549091,-1430.522339,-1396.291504,1548.887451,1549.722656
40,0.5293,0.813734,0.392438,-0.36247,0.617241,0.754907,-1427.275757,-1395.103149,1548.424561,1549.307617
50,0.9683,0.757619,0.602218,-0.404802,0.632759,1.00702,-1425.178101,-1395.526489,1548.049683,1548.982178
60,1.0194,0.67766,0.648902,-0.486611,0.674138,1.135513,-1424.71106,-1396.344727,1547.813477,1548.798096
70,0.9831,0.636599,0.81409,-0.384858,0.667241,1.198948,-1423.059326,-1395.327148,1547.643188,1548.639404
80,0.8428,0.679936,0.75776,-0.46854,0.662931,1.2263,-1423.622559,-1396.16394,1547.373291,1548.386353
90,0.7716,0.637826,0.831143,-0.56064,0.710345,1.391783,-1422.888794,-1397.084961,1547.137085,1548.146851
100,0.7301,0.582058,0.941855,-0.756425,0.732759,1.69828,-1421.781494,-1399.042725,1546.775269,1547.848633


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-70m-deduped-DPO-80-20/checkpoint-780
Best metric: 0.33185768127441406


In [12]:
import pandas as pd

# Export the trainer's log history for the 80/20 run to CSV, keeping only
# entries logged at steps that are multiples of 10.
log_csv_path = 'dpo_70m_training_logs_80_20.csv'

state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# If the history has no 'step' column, fall back to taking every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Skip columns whose names start with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
# Print the same path that was written, so the message cannot name a different file.
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_80_20.csv


# Biased labels: 50% accurate, 50% flipped

In [13]:
# 50%-bias experiment: reload the model and dataset, flip a share of the
# preference labels with the injector, then train into a separate output
# directory.
pythia_70m = PythiaModel("EleutherAI/pythia-70m-deduped", "step143000", "./cache_dir")

loader = DatasetLoader()
train_ds, valid_ds, test_ds = loader.load_biasDPO()

# The injector is built from the loader after the dataset has been loaded.
bias_injector = BiasInjector(loader, seed=42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio=0.5)

# Read the DPO section of the experiment YAML and override the output directory.
experiment_config = load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_70m_config']
dpo_args['output_dir'] = "./pythia-70m-deduped-DPO-50-50"
dpo_pythia_70m_config = DPOConfig(**dpo_args)

# Identity map with the datasets cache bypassed, applied to the biased splits.
train_ds = bias_train_ds.map(lambda x: x, load_from_cache_file=False)
valid_ds = bias_valid_ds.map(lambda x: x, load_from_cache_file=False)

# Pad with the end-of-sequence token, then build the trainer.
pythia_70m.tokenizer.pad_token = pythia_70m.tokenizer.eos_token
dpo_trainer = DPO_Trainer(
    pythia_70m.model,
    pythia_70m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_70m_config,
)

dpo_trainer.train()

# Report the checkpoint the trainer recorded as best, and its metric value.
trainer_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {trainer_state.best_model_checkpoint}")
print(f"Best metric: {trainer_state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-70m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 50.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 400/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|██████████| 801/801 [00:00<00:00, 7321.64 examples/s]
Map: 100%|██████████| 229/229 [00:00<00:00, 3126.59 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
10,1.5721,0.973051,-0.14652,-0.318561,0.539655,0.172041,-1432.665283,-1394.664307,1549.682739,1550.376831
20,1.3025,1.157471,-0.477098,-0.287891,0.487931,-0.189207,-1435.971069,-1394.357544,1549.903076,1550.572021
30,0.9394,1.173578,-0.408571,-0.266675,0.47931,-0.141896,-1435.285889,-1394.145386,1549.956665,1550.602905
40,1.246,1.103848,-0.303718,-0.212442,0.477586,-0.091276,-1434.237305,-1393.603149,1549.832031,1550.499634
50,1.0501,1.038491,-0.084,-0.20109,0.522414,0.117091,-1432.040161,-1393.48938,1549.823486,1550.515625
60,1.1112,1.150357,-0.347436,-0.302614,0.492241,-0.044823,-1434.674438,-1394.504761,1549.913208,1550.58313
70,0.8645,1.143587,-0.582676,-0.466343,0.487931,-0.116333,-1437.026978,-1396.141968,1550.06958,1550.724243
80,1.3163,1.220273,-0.667288,-0.397588,0.462931,-0.269701,-1437.872925,-1395.454468,1550.112427,1550.783691
90,1.2187,1.272682,-0.714248,-0.390374,0.456034,-0.323874,-1438.342651,-1395.382446,1550.186401,1550.854248
100,1.1119,1.153923,-0.65644,-0.505474,0.468966,-0.150966,-1437.764648,-1396.533447,1550.126221,1550.801025


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-70m-deduped-DPO-50-50/checkpoint-320
Best metric: 0.932221531867981


In [None]:
import pandas as pd

# Export the trainer's log history for the 50/50 run to CSV, keeping only
# entries logged at steps that are multiples of 10.
log_csv_path = 'dpo_70m_training_logs_50_50.csv'

state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# If the history has no 'step' column, fall back to taking every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Skip columns whose names start with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
# Print the same path that was written, so the message cannot name a different file.
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_50_50.csv


: 