In [1]:
import sys
import os

project_root = os.path.abspath("..")
sys.path.insert(0, project_root)

from src.models.pythia_model import PythiaModel
from src.data.dataset_loader import DatasetLoader
from src.data.bias_injector import BiasInjector
from src.training.dpo_trainer import DPO_Trainer
from src.training.utils import load_experiment_config
import numpy as np
from trl import DPOConfig

import logging
# Logging setup specific to Jupyter Notebook: send every INFO-or-higher record
# to the cell output (stdout) through exactly one handler on the root logger.
# No preliminary logging.basicConfig() call is made: any handler it installed
# would be discarded by the reset below, and the level is set explicitly here.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
## Drop handlers already attached to the root logger (e.g. from a previous run
## of this cell) so each record is printed only once
logger.handlers.clear()
## Create handler that outputs to notebook
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
## Create formatter: "<LEVEL> - <logger name> - <message>"
formatter = logging.Formatter('%(levelname)s - %(name)s - %(message)s')
handler.setFormatter(formatter)
## Add handler to logger
logger.addHandler(handler)

import wandb

# Experiment metadata recorded with the W&B run. The run name is derived from
# these values so that the name and the logged config cannot disagree (the
# previous hard-coded name said "bias20" while bias_level was 0).
# NOTE(review): this assumes the config values are the intended ones
# (bias_level 0 matches the first, "100% accurate" experiment) — confirm.
wandb_config = {
    "beta": 0.1,
    "training_data_size": 801,
    "bias_level": 0,
    "model": "pythia-160m",
    "learning_rate": 5e-6,
}

wandb.init(
    project="l101",
    name=f"dpo_beta{wandb_config['beta']}_bias{wandb_config['bias_level']}_run1",
    config=wandb_config,
)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Currently logged in as: [33mngnwy289[0m ([33mngnwy289-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


In [2]:
# Restrict this process to the GPU with index 3.
# NOTE(review): this variable only takes effect if it is set before CUDA is
# first initialised. It currently runs after the project/trl imports and
# wandb.init() in the first cell — confirm none of those initialise CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

# Bias Label (100% accurate)

In [3]:
# Load the base model wrapper: EleutherAI/pythia-160m-deduped at revision
# "step143000", cached under ./cache_dir (as echoed by the loader's INFO log).
# The wrapper exposes .model and .tokenizer, which later cells use.
pythia_160m = PythiaModel("EleutherAI/pythia-160m-deduped", "step143000", "./cache_dir")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir


In [4]:
# Dataset loader from the project package; it is also passed to BiasInjector
# in the later bias experiments.
loader = DatasetLoader()

# Load the BiasDPO dataset and unpack its train / validation / test splits.
train_ds, valid_ds, test_ds = loader.load_biasDPO()

INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115


In [5]:
# Quick sanity check that the model, tokenizer and training split are in place
# before any training is configured.
print(f"Model loaded: {pythia_160m.model is not None}")
print(f"Tokenizer loaded: {pythia_160m.tokenizer is not None}")
print(f"Train dataset size: {len(train_ds)}")
# Show one raw record (prompt / chosen / rejected) to confirm the schema.
print(f"Sample from dataset: {train_ds[0]}")

Model loaded: True
Tokenizer loaded: True
Train dataset size: 801
Sample from dataset: {'prompt': 'What do you think about the assertion that economic disparities between races are primarily due to individual work ethic and choices?', 'chosen': 'Economic disparities between races are influenced by systemic factors, historical context, and institutional barriers, not just individual work ethic and choices. Addressing these disparities requires a multifaceted approach that considers these broader factors.', 'rejected': "There's a common belief that these disparities are a result of individual choices and work ethic, suggesting that some racial groups are less inclined to make economically beneficial decisions."}


In [6]:
# Read the DPO hyper-parameters from the experiment YAML (the
# 'dpo_pythia_160m_config' section) and expand them into a trl DPOConfig.
dpo_args = load_experiment_config(
    "../configs/pythia-160m-rlhf-dpo.yaml"
)["dpo_pythia_160m_config"]
dpo_pythia_160m_config = DPOConfig(**dpo_args)

In [7]:
# Reuse the end-of-sequence token as the padding token.
pythia_160m.tokenizer.pad_token = pythia_160m.tokenizer.eos_token

# Pass both splits through an identity map with the datasets cache bypassed.
train_ds = train_ds.map(lambda x: x, load_from_cache_file=False)
valid_ds = valid_ds.map(lambda x: x, load_from_cache_file=False)

# Wire model, tokenizer, both splits and the DPO config into the project's
# trainer wrapper.
dpo_trainer = DPO_Trainer(
    pythia_160m.model,
    pythia_160m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_160m_config,
)

Map: 100%|██████████| 801/801 [00:00<00:00, 8053.81 examples/s]
Map: 100%|██████████| 229/229 [00:00<00:00, 10992.04 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!


In [8]:
# Run DPO fine-tuning on the unmodified (0% flipped) training split; the
# wrapper logs the start and completion of training at INFO level.
dpo_trainer.train()

INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
20,0.638,0.61297,0.424076,-0.254819,0.698276,0.678895,-869.167969,-855.252075,745.64447,746.84845
40,0.6041,0.464056,0.552497,-0.648499,0.773276,1.200996,-867.88385,-859.188843,743.368713,744.842346
60,0.4084,0.371342,0.441907,-1.385681,0.857759,1.827589,-868.989807,-866.56073,739.675598,741.196533
80,0.3663,0.348369,0.251158,-1.845846,0.862069,2.097004,-870.897278,-871.162354,737.422058,738.554321
100,0.1754,0.313626,0.38191,-2.031939,0.892241,2.413849,-869.589722,-873.023193,735.742981,736.680359
120,0.2067,0.303417,0.36789,-2.174377,0.87931,2.542268,-869.72998,-874.44751,733.696167,734.569641
140,0.2068,0.282496,0.547798,-2.349861,0.87931,2.897659,-867.930847,-876.202454,730.556519,730.829346
160,0.2584,0.254506,0.638692,-2.528209,0.900862,3.166902,-867.021912,-877.985962,729.364624,729.609619
180,0.4147,0.224166,0.791625,-2.520286,0.922414,3.311911,-865.492615,-877.906738,729.685364,730.670288
200,0.3851,0.248896,0.759566,-2.921386,0.909483,3.680953,-865.81311,-881.917786,726.388245,727.098267


INFO - src.training.dpo_trainer - DPO training complete.


In [9]:
# Report the checkpoint the underlying Trainer state recorded as best, together
# with the metric value it achieved.
print("Best checkpoint:", dpo_trainer.trainer.state.best_model_checkpoint)
print("Best metric:", dpo_trainer.trainer.state.best_metric)

Best checkpoint: ./pythia-160m-deduped-DPO/checkpoint-260
Best metric: 0.19088485836982727


In [10]:
import pandas as pd

# Single source of truth for the output path, so the confirmation message
# always names the file that was actually written (the message previously
# reported a different filename from the one passed to to_csv).
log_csv_path = 'dpo_160m_training_logs_100_0.csv'

# Training/eval records accumulated by the underlying Trainer during the run.
state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep rows whose step is a multiple of 10; if there is no 'step' column,
# fall back to every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Drop columns whose name starts with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_100_0.csv


# Bias Label (80% accurate, 20% Bias)

In [11]:
# Experiment: DPO training with bias_ratio = 0.2 (the injector logs this as
# "Injecting 20.0% bias").

# Load the base checkpoint again — presumably so this run starts from the
# original weights rather than the model trained in the previous experiment.
pythia_160m = PythiaModel("EleutherAI/pythia-160m-deduped", "step143000", "./cache_dir")

loader = DatasetLoader()

# The splits loaded here are rebound below: train_ds / valid_ds are replaced by
# the bias-injected versions and test_ds by the injector's third return value.
train_ds, valid_ds, test_ds = loader.load_biasDPO()
# Fixed seed for the injector.
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = 0.2)

# Same YAML section as the baseline run, but with a dedicated output directory
# for this experiment's checkpoints.
dpo_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['dpo_pythia_160m_config']
dpo_args['output_dir'] = "./pythia-160m-deduped-DPO-80-20"
dpo_pythia_160m_config = DPOConfig(**dpo_args)

# Identity map with the datasets cache bypassed.
# NOTE(review): the mapping function changes nothing — confirm this pass is
# still needed.
train_ds = bias_train_ds.map(lambda x: x, load_from_cache_file=False)
valid_ds = bias_valid_ds.map(lambda x: x, load_from_cache_file=False)
# Reuse the end-of-sequence token as the padding token.
pythia_160m.tokenizer.pad_token = pythia_160m.tokenizer.eos_token
dpo_trainer = DPO_Trainer(pythia_160m.model, pythia_160m.tokenizer, train_ds, valid_ds, args=dpo_pythia_160m_config)

# NOTE(review): no wandb.finish() / wandb.init() appears between experiments in
# this notebook, so any W&B logging presumably goes to the run created in the
# first cell — confirm.
dpo_trainer.train()

# Best checkpoint and metric value recorded in the underlying Trainer state.
print(f"Best checkpoint: {dpo_trainer.trainer.state.best_model_checkpoint}")
print(f"Best metric: {dpo_trainer.trainer.state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 20.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 160/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|██████████| 801/801 [00:00<00:00, 7747.21 examples/s]
Map: 100%|██████████| 229/229 [00:00<00:00, 3095.90 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
20,0.6842,0.689326,-0.018147,-0.400652,0.637931,0.382505,-873.590271,-856.710388,745.418884,746.5401
40,0.8548,0.68416,-0.023278,-0.479536,0.62069,0.456258,-873.641541,-857.499268,744.017456,745.179871
60,0.9843,0.528355,0.095715,-0.736669,0.728448,0.832384,-872.451721,-860.070557,743.199036,744.202026
80,0.6476,0.617062,-0.447298,-1.364616,0.681035,0.917319,-877.881775,-866.349976,740.928101,740.360291
100,0.9144,0.463671,-0.257961,-1.416561,0.775862,1.1586,-875.988464,-866.869507,741.656677,741.653137
120,0.8469,0.502813,-0.622306,-1.880393,0.728448,1.258087,-879.631958,-871.507812,740.091187,739.207153
140,0.6547,0.473349,-0.120335,-1.277622,0.762931,1.157287,-874.612183,-865.480042,741.914246,742.804321
160,0.8017,0.443064,0.060998,-1.29888,0.806035,1.359878,-872.798767,-865.692627,741.712341,742.849792
180,0.5311,0.412623,0.034244,-1.458029,0.818965,1.492273,-873.066406,-867.284119,740.951172,742.023499
200,0.4918,0.410132,0.073342,-1.466357,0.844828,1.539699,-872.675354,-867.367432,740.276306,741.714172


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-160m-deduped-DPO-80-20/checkpoint-880
Best metric: 0.3189246952533722


In [12]:
import pandas as pd

# Single source of truth for the output path, so the confirmation message
# always names the file that was actually written (the message previously
# reported a different filename from the one passed to to_csv).
log_csv_path = 'dpo_160m_training_logs_80_20.csv'

# Training/eval records accumulated by the underlying Trainer during the run.
state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep rows whose step is a multiple of 10; if there is no 'step' column,
# fall back to every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Drop columns whose name starts with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_80_20.csv


# Bias Label (50% accurate, 50% Bias)

In [13]:
# Experiment: DPO training with bias_ratio = 0.5 (the injector logs this as
# "Injecting 50.0% bias").

# Load the base checkpoint again — presumably so this run starts from the
# original weights rather than the model trained in the previous experiment.
pythia_160m = PythiaModel("EleutherAI/pythia-160m-deduped", "step143000", "./cache_dir")

loader = DatasetLoader()

# The splits loaded here are rebound below: train_ds / valid_ds are replaced by
# the bias-injected versions and test_ds by the injector's third return value.
train_ds, valid_ds, test_ds = loader.load_biasDPO()
# Fixed seed for the injector.
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = 0.5)

# Same YAML section as the baseline run, but with a dedicated output directory
# for this experiment's checkpoints.
dpo_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['dpo_pythia_160m_config']
dpo_args['output_dir'] = "./pythia-160m-deduped-DPO-50-50"
dpo_pythia_160m_config = DPOConfig(**dpo_args)

# Identity map with the datasets cache bypassed.
# NOTE(review): the mapping function changes nothing — confirm this pass is
# still needed.
train_ds = bias_train_ds.map(lambda x: x, load_from_cache_file=False)
valid_ds = bias_valid_ds.map(lambda x: x, load_from_cache_file=False)
# Reuse the end-of-sequence token as the padding token.
pythia_160m.tokenizer.pad_token = pythia_160m.tokenizer.eos_token
dpo_trainer = DPO_Trainer(pythia_160m.model, pythia_160m.tokenizer, train_ds, valid_ds, args=dpo_pythia_160m_config)

# NOTE(review): no wandb.finish() / wandb.init() appears between experiments in
# this notebook, so any W&B logging presumably goes to the run created in the
# first cell — confirm.
dpo_trainer.train()

# Best checkpoint and metric value recorded in the underlying Trainer state.
print(f"Best checkpoint: {dpo_trainer.trainer.state.best_model_checkpoint}")
print(f"Best metric: {dpo_trainer.trainer.state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 50.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 400/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|██████████| 801/801 [00:00<00:00, 27507.57 examples/s]
Map: 100%|██████████| 229/229 [00:00<00:00, 12881.32 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
20,0.6031,0.822471,-0.138908,-0.163531,0.542241,0.024623,-874.797913,-854.339172,746.497681,747.460449
40,0.6966,0.86089,-0.135442,-0.165149,0.499138,0.029706,-874.763245,-854.355347,745.812073,746.858154
60,0.9899,0.923086,-0.228293,-0.186684,0.501724,-0.04161,-875.691833,-854.570679,745.98584,746.876282
80,0.778,1.02954,-0.525559,-0.254836,0.451724,-0.270722,-878.664307,-855.252197,746.230042,747.171082
100,0.8127,0.946091,-0.39942,-0.285486,0.525862,-0.113934,-877.403015,-855.558716,745.702209,746.703369
120,0.9285,0.954949,-0.399528,-0.26824,0.477586,-0.131288,-877.404114,-855.386292,745.93573,746.802063
140,1.1689,0.903827,-0.426961,-0.347696,0.493103,-0.079265,-877.678406,-856.180908,746.27301,747.135315
160,0.7056,0.890224,-0.380795,-0.375837,0.523276,-0.004958,-877.216858,-856.462219,746.333862,747.286865
180,0.8191,0.865895,-0.360791,-0.37196,0.514655,0.011168,-877.016663,-856.423462,746.551941,747.492371
200,0.9635,1.000665,-0.585184,-0.382384,0.425862,-0.2028,-879.260681,-856.52771,747.063293,747.939636


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-160m-deduped-DPO-50-50/checkpoint-20
Best metric: 0.8224709630012512


In [None]:
import pandas as pd

# Single source of truth for the output path, so the confirmation message
# always names the file that was actually written (the message previously
# reported a different filename from the one passed to to_csv).
log_csv_path = 'dpo_160m_training_logs_50_50.csv'

# Training/eval records accumulated by the underlying Trainer during the run.
state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep rows whose step is a multiple of 10; if there is no 'step' column,
# fall back to every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Drop columns whose name starts with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# print(df_every_10[relevant_cols].to_string(index=False))

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_50_50.csv


: 