In [1]:
import sys
import os

project_root = os.path.abspath("..")
sys.path.insert(0, project_root)

from src.models.pythia_model import PythiaModel
from src.data.dataset_loader import DatasetLoader
from src.data.bias_injector import BiasInjector
from src.training.dpo_trainer import DPO_Trainer
from src.training.utils import load_experiment_config
import numpy as np
from trl import DPOConfig

import logging
logging.basicConfig(level=logging.INFO)

# Notebook-specific logging setup: send every record through a single stdout
# handler attached to the root logger so that messages show up in cell output.

## Formatter shared by the notebook handler: "<LEVEL> - <logger name> - <message>"
formatter = logging.Formatter('%(levelname)s - %(name)s - %(message)s')

## Handler that writes INFO-and-above records to stdout
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)

## Root logger: discard whatever handlers are currently attached, then install
## the notebook handler as the only one
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = []
logger.addHandler(handler)

import wandb

# Metadata recorded with the Weights & Biases run.
# NOTE(review): "beta" and "learning_rate" are hard-coded here and are not read
# from the YAML that is later passed to DPOConfig - confirm they match.
# NOTE(review): only this one run is initialised; the later 20% / 50% bias
# experiments in this notebook do not start a new run or update "bias_level".
wandb_run_config = {
    "beta": 0.1,
    "training_data_size": 801,
    "bias_level": 0,
    "model": "pythia-70m",
    "learning_rate": 5e-6,
}

wandb.init(
    project="l101",
    # Derive the run name from the logged config so the two cannot disagree
    # (the name used to be hard-coded as "bias20" while bias_level was 0).
    name=f"dpo_beta{wandb_run_config['beta']}_bias{wandb_run_config['bias_level']}_run1",
    config=wandb_run_config,
)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Currently logged in as: [33mngnwy289[0m ([33mngnwy289-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


In [2]:
# Set CUDA_VISIBLE_DEVICES so CUDA-based libraries in this process are limited
# to device index 2.
# NOTE(review): this runs after the first cell has already imported the
# training stack; the variable only takes effect if CUDA has not been
# initialised yet - confirm, or move this above those imports.
os.environ.update({"CUDA_VISIBLE_DEVICES": "2"})

# Bias Label (100% accurate, 0% Bias)

In [3]:
# Load the Pythia-70M (deduped) model wrapper used by the rest of the notebook.
pythia_70m = PythiaModel(
    "EleutherAI/pythia-70m-deduped",  # model identifier
    "step143000",                     # revision
    "./cache_dir",                    # cache directory
)

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-70m-deduped (revision: step143000) at cache_dir: ./cache_dir


In [4]:
# Load the BiasDPO dataset and unpack its train / validation / test splits.
loader = DatasetLoader()
biasdpo_splits = loader.load_biasDPO()
train_ds, valid_ds, test_ds = biasdpo_splits

INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115


In [5]:
# Quick sanity check: model and tokenizer are present, and the training split
# has the expected size and record layout.
sanity_checks = (
    ("Model loaded:", pythia_70m.model is not None),
    ("Tokenizer loaded:", pythia_70m.tokenizer is not None),
    ("Train dataset size:", len(train_ds)),
    ("Sample from dataset:", train_ds[0]),
)
for check_label, check_value in sanity_checks:
    print(check_label, check_value)

Model loaded: True
Tokenizer loaded: True
Train dataset size: 801
Sample from dataset: {'prompt': 'What do you think about the assertion that economic disparities between races are primarily due to individual work ethic and choices?', 'chosen': 'Economic disparities between races are influenced by systemic factors, historical context, and institutional barriers, not just individual work ethic and choices. Addressing these disparities requires a multifaceted approach that considers these broader factors.', 'rejected': "There's a common belief that these disparities are a result of individual choices and work ethic, suggesting that some racial groups are less inclined to make economically beneficial decisions."}


In [6]:
# Read the experiment YAML, pick out the DPO section, and turn it into a
# trl DPOConfig by passing its entries as keyword arguments.
experiment_config = load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_70m_config']
dpo_pythia_70m_config = DPOConfig(**dpo_args)

In [7]:
# Re-map both splits with an identity function and cache loading disabled.
# NOTE(review): presumably this forces freshly materialised copies rather than
# reusing cached results - confirm why it is needed.
train_ds = train_ds.map(lambda example: example, load_from_cache_file=False)
valid_ds = valid_ds.map(lambda example: example, load_from_cache_file=False)

# Wrap the model, tokenizer and both splits in the project's DPO trainer.
dpo_trainer = DPO_Trainer(
    pythia_70m.model,
    pythia_70m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_70m_config,
)

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 18779.31 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 13624.44 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!


In [8]:
# Run DPO training on the unmodified (0% bias) data; the trainer logs its own
# start/finish messages and emits the per-step metrics table shown below.
dpo_trainer.train()

INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
10,0.8579,0.761821,0.236489,-0.406517,0.596552,0.643005,-1428.835205,-1395.543823,1549.151611,1549.952148
20,0.3476,0.657346,0.672485,-0.375763,0.678448,1.048248,-1424.475342,-1395.236084,1548.236206,1549.083862
30,0.6739,0.615753,0.900677,-0.485473,0.719828,1.38615,-1422.193237,-1396.333252,1547.713623,1548.650757
40,0.9558,0.633145,0.888802,-0.503041,0.715517,1.391843,-1422.312256,-1396.508911,1547.383789,1548.343262
50,0.6187,0.574786,1.034964,-0.601247,0.715517,1.63621,-1420.850464,-1397.490967,1546.993896,1547.980225
60,0.4375,0.529414,0.940592,-0.8928,0.747414,1.833391,-1421.794189,-1400.406494,1546.601318,1547.676025
70,0.5454,0.48168,1.334989,-0.930851,0.788793,2.265839,-1417.85022,-1400.787231,1546.184692,1547.28479
80,0.3326,0.446848,1.424696,-1.099547,0.831897,2.524243,-1416.953247,-1402.474121,1545.95105,1547.11084
90,0.6194,0.420311,1.550421,-1.294004,0.840517,2.844425,-1415.695801,-1404.418701,1545.446533,1546.710693
100,0.7342,0.347853,1.53091,-1.438718,0.840517,2.969628,-1415.891113,-1405.865845,1544.925293,1546.308716


INFO - src.training.dpo_trainer - DPO training complete.


In [9]:
# Report which checkpoint the trainer recorded as best, and its metric value.
best_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {best_state.best_model_checkpoint}")
print(f"Best metric: {best_state.best_metric}")

Best checkpoint: ./pythia-70m-deduped-DPO/checkpoint-290
Best metric: 0.1813664436340332


In [10]:
import pandas as pd

# Single source of truth for the output file, so the confirmation message
# always names the file that was actually written (it used to report
# "dpo_training_logs_100_0.csv", which is not the file created).
log_csv_path = 'dpo_70m_training_logs_100_0.csv'

# Trainer state holds the list of metric dicts logged during training.
state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep rows whose step is a multiple of 10; if there is no 'step' column,
# fall back to every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Drop columns whose name starts with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_100_0.csv


# Bias Label (80% accurate, 20% Bias)

In [11]:
# --- Experiment: 20% of preference labels flipped ---------------------------
# Start again from a freshly loaded model so this run is independent of the
# previous experiment's fine-tuned weights.
pythia_70m = PythiaModel(
    "EleutherAI/pythia-70m-deduped",  # model identifier
    "step143000",                     # revision
    "./cache_dir",                    # cache directory
)

# Reload the dataset, then let the injector produce the biased splits.
# NOTE(review): the injector's log output only reports flips for the train
# split - confirm whether the validation split is meant to be biased as well.
loader = DatasetLoader()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed=42)
biased_splits = bias_injector.inject_bias(bias_ratio=0.2)
bias_train_ds, bias_valid_ds, test_ds = biased_splits

# Same DPO settings as the baseline, but with a run-specific output directory
# so checkpoints from different bias levels do not overwrite each other.
experiment_config = load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_70m_config']
dpo_args['output_dir'] = "./pythia-70m-deduped-DPO-80-20"
dpo_pythia_70m_config = DPOConfig(**dpo_args)

# Identity re-map with cache loading disabled, as in the baseline run.
train_ds = bias_train_ds.map(lambda example: example, load_from_cache_file=False)
valid_ds = bias_valid_ds.map(lambda example: example, load_from_cache_file=False)

dpo_trainer = DPO_Trainer(
    pythia_70m.model,
    pythia_70m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_70m_config,
)

# NOTE(review): no new W&B run is started here; if the trainer reports to
# W&B, these metrics land in the run created at the top of the notebook.
dpo_trainer.train()

best_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {best_state.best_model_checkpoint}")
print(f"Best metric: {best_state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-70m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 20.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 160/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 15042.37 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 7800.66 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
10,1.097,0.885408,0.101875,-0.315579,0.566379,0.417453,-1430.181396,-1394.634277,1549.622314,1550.359741
20,0.9212,0.83972,0.198553,-0.30567,0.598276,0.504223,-1429.214844,-1394.535278,1549.237305,1550.013062
30,1.4956,0.890432,-0.050503,-0.442514,0.59569,0.392011,-1431.705322,-1395.903809,1548.997192,1549.812744
40,0.8606,0.751567,0.131531,-0.551946,0.661207,0.683476,-1429.884644,-1396.998169,1548.772339,1549.655762
50,1.342,0.748635,0.17206,-0.589013,0.641379,0.761073,-1429.479614,-1397.368774,1548.445435,1549.394531
60,0.623,0.776822,0.161803,-0.671364,0.625862,0.833167,-1429.582031,-1398.192139,1548.221191,1549.180786
70,0.7623,0.817409,0.177911,-0.609309,0.641379,0.78722,-1429.421021,-1397.571655,1548.067505,1549.051514
80,1.0993,0.694475,0.459576,-0.637038,0.64569,1.096614,-1426.60437,-1397.848877,1547.886719,1548.864624
90,0.6091,0.646758,0.569715,-0.644124,0.712069,1.213839,-1425.50293,-1397.9198,1547.703857,1548.697021
100,0.5932,0.631183,0.587301,-0.721824,0.687069,1.309125,-1425.327026,-1398.696777,1547.453979,1548.500244


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-70m-deduped-DPO-80-20/checkpoint-790
Best metric: 0.3378109335899353


In [12]:
import pandas as pd

# Single source of truth for the output file, so the confirmation message
# always names the file that was actually written (it used to report
# "dpo_training_logs_80_20.csv", which is not the file created).
log_csv_path = 'dpo_70m_training_logs_80_20.csv'

# Trainer state holds the list of metric dicts logged during training.
state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep rows whose step is a multiple of 10; if there is no 'step' column,
# fall back to every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Drop columns whose name starts with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_80_20.csv


# Bias Label (50% accurate, 50% Bias)

In [13]:
# --- Experiment: 50% of preference labels flipped ---------------------------
# Start again from a freshly loaded model so this run is independent of the
# previous experiment's fine-tuned weights.
pythia_70m = PythiaModel(
    "EleutherAI/pythia-70m-deduped",  # model identifier
    "step143000",                     # revision
    "./cache_dir",                    # cache directory
)

# Reload the dataset, then let the injector produce the biased splits.
# NOTE(review): the injector's log output only reports flips for the train
# split - confirm whether the validation split is meant to be biased as well.
loader = DatasetLoader()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed=42)
biased_splits = bias_injector.inject_bias(bias_ratio=0.5)
bias_train_ds, bias_valid_ds, test_ds = biased_splits

# Same DPO settings as the baseline, but with a run-specific output directory
# so checkpoints from different bias levels do not overwrite each other.
experiment_config = load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_70m_config']
dpo_args['output_dir'] = "./pythia-70m-deduped-DPO-50-50"
dpo_pythia_70m_config = DPOConfig(**dpo_args)

# Identity re-map with cache loading disabled, as in the baseline run.
train_ds = bias_train_ds.map(lambda example: example, load_from_cache_file=False)
valid_ds = bias_valid_ds.map(lambda example: example, load_from_cache_file=False)

dpo_trainer = DPO_Trainer(
    pythia_70m.model,
    pythia_70m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_70m_config,
)

# NOTE(review): no new W&B run is started here; if the trainer reports to
# W&B, these metrics land in the run created at the top of the notebook.
dpo_trainer.train()

best_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {best_state.best_model_checkpoint}")
print(f"Best metric: {best_state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-70m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 50.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 400/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 25984.48 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 12559.93 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
10,0.9726,0.981802,-0.319961,-0.37959,0.509483,0.05963,-1434.39978,-1395.274536,1549.859497,1550.535278
20,1.3004,1.146827,-0.45929,-0.237042,0.444828,-0.222248,-1435.793091,-1393.848999,1550.150879,1550.802246
30,1.0358,1.153399,-0.379966,-0.17064,0.453448,-0.209325,-1434.999878,-1393.185059,1550.157837,1550.764648
40,0.9263,1.185068,-0.434403,-0.107641,0.406034,-0.326762,-1435.544189,-1392.554932,1550.223633,1550.811401
50,1.1971,1.152881,-0.370096,-0.120552,0.45,-0.249544,-1434.901123,-1392.684082,1550.187256,1550.725464
60,1.2338,1.270112,-0.431787,-0.036864,0.421552,-0.394923,-1435.518066,-1391.84729,1550.148315,1550.70459
70,1.3819,1.395479,-0.644305,-0.097312,0.412931,-0.546993,-1437.643188,-1392.45166,1550.291748,1550.852295
80,0.8541,1.202466,-0.457702,-0.201842,0.430172,-0.25586,-1435.777222,-1393.496948,1550.225464,1550.840576
90,1.1499,1.200482,-0.626478,-0.296753,0.430172,-0.329725,-1437.464966,-1394.446167,1550.365845,1550.920654
100,0.8744,1.086773,-0.475705,-0.459153,0.506034,-0.016553,-1435.957153,-1396.070068,1550.270264,1550.772461


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-70m-deduped-DPO-50-50/checkpoint-250
Best metric: 0.8417342305183411


In [14]:
import pandas as pd

# Single source of truth for the output file, so the confirmation message
# always names the file that was actually written (it used to report
# "dpo_training_logs_50_50.csv", which is not the file created).
log_csv_path = 'dpo_70m_training_logs_50_50.csv'

# Trainer state holds the list of metric dicts logged during training.
state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep rows whose step is a multiple of 10; if there is no 'step' column,
# fall back to every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Drop columns whose name starts with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_50_50.csv
