In [1]:
# --- Notebook setup: import path, dependencies, logging, experiment tracking ---
import sys
import os

# Put the parent directory on the import path so the `src.*` packages below
# resolve when the notebook is run from its own sub-directory.
project_root = os.path.abspath("..")
sys.path.insert(0, project_root)

from src.models.pythia_model import PythiaModel
from src.data.dataset_loader import DatasetLoader
from src.data.bias_injector import BiasInjector
from src.training.dpo_trainer import DPO_Trainer
from src.training.utils import load_experiment_config
import numpy as np
from trl import DPOConfig

import logging

# Code specific to Jupyter Notebook: send every log record to the cell output
# (stdout) through exactly one handler on the root logger.
# A preceding logging.basicConfig() call would be redundant here, because any
# handlers already attached to the root logger are removed right below.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
## Create handler that outputs to notebook
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
## Create formatter
formatter = logging.Formatter('%(levelname)s - %(name)s - %(message)s')
handler.setFormatter(formatter)
## Add handler to logger
logger.addHandler(handler)

import wandb

# Open the W&B run for the first experiment in this notebook (0% label bias).
# The run name carries the same bias level as the logged config so the run is
# not mislabelled in the W&B UI.
wandb.init(
    project="l101",
    name="dpo_beta0.1_bias0_run1",
    config={
        "beta": 0.1,
        "training_data_size": 801,
        "bias_level": 0,
        "model": "pythia-160m",
        "learning_rate": 5e-6,
    }
)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Currently logged in as: [33mngnwy289[0m ([33mngnwy289-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


In [2]:
# Restrict this process to physical GPU 3.
# NOTE(review): this only takes effect if it runs before CUDA is initialised in
# this kernel - confirm nothing in the setup cell touches the GPU first.
os.environ.update({"CUDA_VISIBLE_DEVICES": "3"})

# Bias Label (100% accurate)

In [3]:
# Load the deduped Pythia-160m checkpoint at a fixed revision, using a local
# cache directory (argument roles as reported by the loader's own log line).
pythia_repo_id = "EleutherAI/pythia-160m-deduped"
pythia_revision = "step143000"
pythia_cache_dir = "./cache_dir"
pythia_160m = PythiaModel(pythia_repo_id, pythia_revision, pythia_cache_dir)

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir


In [4]:
# Load the BiasDPO dataset and unpack its train / validation / test splits.
loader = DatasetLoader()
biasdpo_splits = loader.load_biasDPO()
train_ds, valid_ds, test_ds = biasdpo_splits

INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115


In [5]:
# Quick sanity checks: model and tokenizer objects exist, and the training
# split has the expected size and record layout.
print(f"Model loaded: {pythia_160m.model is not None}")
print(f"Tokenizer loaded: {pythia_160m.tokenizer is not None}")
print(f"Train dataset size: {len(train_ds)}")
print(f"Sample from dataset: {train_ds[0]}")

Model loaded: True
Tokenizer loaded: True
Train dataset size: 801
Sample from dataset: {'prompt': 'What do you think about the assertion that economic disparities between races are primarily due to individual work ethic and choices?', 'chosen': 'Economic disparities between races are influenced by systemic factors, historical context, and institutional barriers, not just individual work ethic and choices. Addressing these disparities requires a multifaceted approach that considers these broader factors.', 'rejected': "There's a common belief that these disparities are a result of individual choices and work ethic, suggesting that some racial groups are less inclined to make economically beneficial decisions."}


In [6]:
# Read the experiment YAML, pick the DPO section, and turn it into a DPOConfig.
experiment_config = load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_160m_config']
dpo_pythia_160m_config = DPOConfig(**dpo_args)

In [7]:
def _passthrough(example):
    """Return the example unchanged."""
    return example


# Run an identity map over both splits without reusing cached results.
# NOTE(review): presumably done to rebuild the datasets' on-disk cache before
# training - confirm the intent with the author.
train_ds = train_ds.map(_passthrough, load_from_cache_file=False)
valid_ds = valid_ds.map(_passthrough, load_from_cache_file=False)

# Wire model, tokenizer, both splits and the DPO configuration into the trainer.
dpo_trainer = DPO_Trainer(
    pythia_160m.model,
    pythia_160m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_160m_config,
)

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 14847.65 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 14606.97 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!


In [8]:
# Run DPO fine-tuning with the trainer built in the previous cell; the
# per-evaluation metrics table is rendered in this cell's output.
dpo_trainer.train()

INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
20,0.7154,0.605589,0.386571,-0.30751,0.674138,0.694081,-869.543152,-855.779053,744.847534,746.240479
40,0.6198,0.438226,0.739043,-0.442841,0.810345,1.181884,-866.018372,-857.132263,742.787231,744.460876
60,0.3734,0.350276,1.017822,-0.646294,0.849138,1.664116,-863.23053,-859.166748,740.768677,742.519653
80,0.159,0.255165,1.229007,-1.10348,0.905172,2.332487,-861.118652,-863.738708,738.846436,740.941956
100,0.1303,0.23119,1.475104,-1.092584,0.909483,2.567688,-858.657715,-863.629761,736.130798,738.478943
120,0.1237,0.188876,2.182153,-1.000167,0.935345,3.182321,-851.58728,-862.705505,733.112549,735.820801
140,0.1651,0.184289,2.81296,-0.985229,0.931035,3.798188,-845.279175,-862.556091,729.065979,732.516235
160,0.0639,0.155096,3.467906,-1.08793,0.956897,4.555836,-838.729736,-863.583252,725.243713,729.168396
180,0.0959,0.187932,5.00804,-0.157793,0.961207,5.165833,-823.328369,-854.281799,717.937927,722.91156
200,0.0335,0.156043,6.093966,-0.004201,0.956897,6.098166,-812.469177,-852.74585,713.811951,719.698669


INFO - src.training.dpo_trainer - DPO training complete.


In [9]:
# Report the checkpoint the trainer state recorded as best, and its metric value.
trainer_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {trainer_state.best_model_checkpoint}")
print(f"Best metric: {trainer_state.best_metric}")

Best checkpoint: ./pythia-160m-deduped-DPO/checkpoint-220
Best metric: 0.147858664393425


In [10]:
import pandas as pd

# Single source of truth for the export path, so the confirmation message
# always names the file that was actually written.
log_csv_path = 'dpo_160m_training_logs_100_0.csv'

# Export the trainer's logged metrics for the 0%-bias run to CSV.
state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep the records logged at steps divisible by 10; if the history has no
# 'step' column, fall back to every tenth row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Drop columns whose name starts with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_100_0.csv


# Bias Label (80% accurate, 20% Bias)

In [11]:
# Experiment: DPO on preference data with a 20% bias ratio injected.
BIAS_RATIO = 0.2
OUTPUT_DIR = "./pythia-160m-deduped-DPO-80-20"

# Start again from the base checkpoint so this run does not continue from the
# weights trained in the previous experiment.
pythia_160m = PythiaModel("EleutherAI/pythia-160m-deduped", "step143000", "./cache_dir")

loader = DatasetLoader()

# The unbiased splits are loaded first; the injector is then built on the same
# loader and returns the biased train/validation splits used below.
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = BIAS_RATIO)

dpo_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['dpo_pythia_160m_config']
# Write checkpoints to a directory of their own so earlier runs are not overwritten.
dpo_args['output_dir'] = OUTPUT_DIR
dpo_pythia_160m_config = DPOConfig(**dpo_args)

# Close whichever W&B run is still open in this kernel and open a new one
# labelled with this experiment's bias level; otherwise this training would be
# logged into a run whose name/config describe a different experiment.
# NOTE(review): assumes the trainer reports to W&B - confirm `report_to` in the YAML.
wandb.finish()
wandb.init(
    project="l101",
    name="dpo_beta0.1_bias20_run1",
    config={
        # Fall back to the values recorded for the first run if the YAML omits them.
        "beta": dpo_args.get("beta", 0.1),
        "training_data_size": len(bias_train_ds),
        "bias_level": BIAS_RATIO,
        "model": "pythia-160m",
        "learning_rate": dpo_args.get("learning_rate", 5e-6),
    }
)

# Identity map over both splits without reusing cached results.
train_ds = bias_train_ds.map(lambda x: x, load_from_cache_file=False)
valid_ds = bias_valid_ds.map(lambda x: x, load_from_cache_file=False)

dpo_trainer = DPO_Trainer(pythia_160m.model, pythia_160m.tokenizer, train_ds, valid_ds, args=dpo_pythia_160m_config)

dpo_trainer.train()

print(f"Best checkpoint: {dpo_trainer.trainer.state.best_model_checkpoint}")
print(f"Best metric: {dpo_trainer.trainer.state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 20.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 160/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 23230.31 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 12305.37 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
20,0.7277,0.666266,0.184524,-0.23189,0.642241,0.416414,-871.56366,-855.022827,746.06781,747.169495
40,0.7299,0.603213,0.274309,-0.356518,0.668103,0.630828,-870.665649,-856.269104,745.760498,747.037598
60,0.7739,0.501791,0.428088,-0.56617,0.737069,0.994258,-869.127991,-858.365479,744.466919,745.986755
80,0.913,0.519329,0.219511,-0.719281,0.730172,0.938792,-871.213684,-859.896729,743.773804,745.335754
100,0.5689,0.526586,0.226322,-0.832153,0.737069,1.058475,-871.145569,-861.025391,743.567017,745.142029
120,0.5213,0.452697,0.280924,-1.047741,0.793103,1.328665,-870.599609,-863.181274,742.97229,744.732361
140,0.3871,0.428159,0.291325,-1.207836,0.793103,1.499162,-870.495544,-864.782166,741.831177,743.649353
160,0.7171,0.388134,0.222651,-1.454009,0.780172,1.67666,-871.182312,-867.243958,741.121521,743.067444
180,0.7086,0.389018,0.243217,-1.474319,0.844828,1.717535,-870.976685,-867.447083,739.722656,741.901794
200,0.4959,0.341236,0.287284,-1.558384,0.857759,1.845669,-870.536011,-868.28772,737.919434,740.280396


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-160m-deduped-DPO-80-20/checkpoint-780
Best metric: 0.21527238190174103


In [12]:
import pandas as pd

# Single source of truth for the export path, so the confirmation message
# always names the file that was actually written.
log_csv_path = 'dpo_160m_training_logs_80_20.csv'

# Export the trainer's logged metrics for the 20%-bias run to CSV.
state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep the records logged at steps divisible by 10; if the history has no
# 'step' column, fall back to every tenth row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Drop columns whose name starts with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_80_20.csv


# Bias Label (50% accurate, 50% Bias)

In [13]:
# Experiment: DPO on preference data with a 50% bias ratio injected.
BIAS_RATIO = 0.5
OUTPUT_DIR = "./pythia-160m-deduped-DPO-50-50"

# Start again from the base checkpoint so this run does not continue from the
# weights trained in the previous experiment.
pythia_160m = PythiaModel("EleutherAI/pythia-160m-deduped", "step143000", "./cache_dir")

loader = DatasetLoader()

# The unbiased splits are loaded first; the injector is then built on the same
# loader and returns the biased train/validation splits used below.
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed = 42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio = BIAS_RATIO)

dpo_args= load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")['dpo_pythia_160m_config']
# Write checkpoints to a directory of their own so earlier runs are not overwritten.
dpo_args['output_dir'] = OUTPUT_DIR
dpo_pythia_160m_config = DPOConfig(**dpo_args)

# Close whichever W&B run is still open in this kernel and open a new one
# labelled with this experiment's bias level; otherwise this training would be
# logged into a run whose name/config describe a different experiment.
# NOTE(review): assumes the trainer reports to W&B - confirm `report_to` in the YAML.
wandb.finish()
wandb.init(
    project="l101",
    name="dpo_beta0.1_bias50_run1",
    config={
        # Fall back to the values recorded for the first run if the YAML omits them.
        "beta": dpo_args.get("beta", 0.1),
        "training_data_size": len(bias_train_ds),
        "bias_level": BIAS_RATIO,
        "model": "pythia-160m",
        "learning_rate": dpo_args.get("learning_rate", 5e-6),
    }
)

# Identity map over both splits without reusing cached results.
train_ds = bias_train_ds.map(lambda x: x, load_from_cache_file=False)
valid_ds = bias_valid_ds.map(lambda x: x, load_from_cache_file=False)

dpo_trainer = DPO_Trainer(pythia_160m.model, pythia_160m.tokenizer, train_ds, valid_ds, args=dpo_pythia_160m_config)

dpo_trainer.train()

print(f"Best checkpoint: {dpo_trainer.trainer.state.best_model_checkpoint}")
print(f"Best metric: {dpo_trainer.trainer.state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 50.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 400/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 11798.55 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 3091.20 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
20,1.1197,0.979263,-0.093472,0.176666,0.410345,-0.270138,-874.343567,-850.937256,746.522217,747.318359
40,0.9176,1.136589,-0.503465,0.001892,0.34569,-0.505358,-878.44342,-852.684937,746.610474,747.428528
60,0.558,1.070232,-0.788524,-0.396293,0.343966,-0.392232,-881.294067,-856.666748,746.598999,747.450684
80,1.1725,1.164127,-1.175003,-0.686202,0.388793,-0.488801,-885.158875,-859.565918,746.724854,747.481934
100,0.9589,1.123284,-1.31156,-0.892645,0.363793,-0.418915,-886.524475,-861.630371,746.993103,747.888916
120,0.9524,0.936138,-1.066228,-0.986048,0.486207,-0.08018,-884.071106,-862.564392,746.895813,748.024719
140,0.759,0.937296,-1.001958,-0.914567,0.45431,-0.08739,-883.428406,-861.849548,746.581787,747.73761
160,0.752,0.916293,-1.002887,-0.951576,0.465517,-0.051311,-883.437683,-862.219666,745.927612,747.075928
180,0.9553,1.063239,-1.2581,-0.978568,0.437069,-0.279532,-885.989868,-862.489624,745.161621,746.488281
200,0.7756,1.111048,-1.774692,-1.565224,0.482759,-0.209468,-891.155701,-868.35614,741.018066,741.608521


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-160m-deduped-DPO-50-50/checkpoint-440
Best metric: 0.8375662565231323


In [14]:
import pandas as pd

# Single source of truth for the export path, so the confirmation message
# always names the file that was actually written.
log_csv_path = 'dpo_160m_training_logs_50_50.csv'

# Export the trainer's logged metrics for the 50%-bias run to CSV.
state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep the records logged at steps divisible by 10; if the history has no
# 'step' column, fall back to every tenth row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Drop columns whose name starts with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_50_50.csv
