In [1]:
import sys
import os

project_root = os.path.abspath("..")
sys.path.insert(0, project_root)

from src.models.pythia_model import PythiaModel
from src.data.dataset_loader import DatasetLoader
from src.data.bias_injector import BiasInjector
from src.training.dpo_trainer import DPO_Trainer
from src.training.utils import load_experiment_config
import numpy as np
from trl import DPOConfig

import logging
# Initial root-logger configuration at INFO level; any handler it installs is
# discarded again when the handler list is reset below.
logging.basicConfig(level=logging.INFO)

# --- Notebook-specific logging ---------------------------------------------
# One stdout handler, so records are written to the notebook cell output,
# rendered as "LEVEL - logger name - message".
formatter = logging.Formatter('%(levelname)s - %(name)s - %(message)s')
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)

# Reset the root logger: clear its handler list and attach only the stdout
# handler built above.
logger = logging.getLogger()
logger.handlers = []
logger.addHandler(handler)
logger.setLevel(logging.INFO)

import wandb

# Start the Weights & Biases run for this notebook.
#
# The hyperparameters are declared once and the run name is built from them,
# so the run can never be labelled with one bias level (e.g. "bias20") while
# its logged config records another (bias_level 0).
# NOTE(review): later cells train with 20% and 50% injected bias. If those
# trainings report to this same run, bias_level will not describe them —
# consider starting a separate run per experiment.
wandb_config = {
    "beta": 0.1,
    "training_data_size": 801,
    "bias_level": 0,
    "model": "pythia-160m",
    "learning_rate": 5e-6,
}

wandb.init(
    project="l101",
    name=f"dpo_beta{wandb_config['beta']}_bias{wandb_config['bias_level']}_run1",
    config=wandb_config,
)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Currently logged in as: [33mngnwy289[0m ([33mngnwy289-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


In [2]:
# Expose only the GPU with index 3 to CUDA-based libraries in this process.
# NOTE(review): this only has an effect if it runs before CUDA is first
# initialised — confirm nothing executed earlier has already touched the GPU.
os.environ.update(CUDA_VISIBLE_DEVICES="3")

# Bias Label (100% accurate)

In [3]:
# Load Pythia-160M (deduped) at revision step143000, using ./cache_dir as the
# cache directory.
MODEL_NAME = "EleutherAI/pythia-160m-deduped"
MODEL_REVISION = "step143000"
MODEL_CACHE_DIR = "./cache_dir"

pythia_160m = PythiaModel(MODEL_NAME, MODEL_REVISION, MODEL_CACHE_DIR)

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir


In [4]:
# Load the BiasDPO dataset and unpack its train / validation / test splits.
loader = DatasetLoader()
biasdpo_splits = loader.load_biasDPO()
train_ds, valid_ds, test_ds = biasdpo_splits

INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115


In [5]:
# Quick sanity checks: the model and tokenizer exist, and the training split
# has the expected size and record layout.
print(f"Model loaded: {pythia_160m.model is not None}")
print(f"Tokenizer loaded: {pythia_160m.tokenizer is not None}")
print(f"Train dataset size: {len(train_ds)}")
print(f"Sample from dataset: {train_ds[0]}")

Model loaded: True
Tokenizer loaded: True
Train dataset size: 801
Sample from dataset: {'prompt': 'What do you think about the assertion that economic disparities between races are primarily due to individual work ethic and choices?', 'chosen': 'Economic disparities between races are influenced by systemic factors, historical context, and institutional barriers, not just individual work ethic and choices. Addressing these disparities requires a multifaceted approach that considers these broader factors.', 'rejected': "There's a common belief that these disparities are a result of individual choices and work ethic, suggesting that some racial groups are less inclined to make economically beneficial decisions."}


In [6]:
# Read the experiment YAML, take its 'dpo_pythia_160m_config' section and
# expand that section into keyword arguments for TRL's DPOConfig.
experiment_config = load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_160m_config']
dpo_pythia_160m_config = DPOConfig(**dpo_args)

In [7]:
# Pass both splits through an identity map with the datasets cache disabled.
# NOTE(review): the mapped function returns each example unchanged; presumably
# this is only here to force fresh copies of the splits — confirm.
train_ds, valid_ds = (
    split.map(lambda x: x, load_from_cache_file=False)
    for split in (train_ds, valid_ds)
)

# Use the tokenizer's EOS token as its padding token.
tokenizer = pythia_160m.tokenizer
tokenizer.pad_token = tokenizer.eos_token

# Wrap the model, tokenizer and splits in the project's DPO trainer.
dpo_trainer = DPO_Trainer(
    pythia_160m.model,
    tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_160m_config,
)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 14608.26 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 11305.27 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!


In [8]:
# Run DPO training with the trainer constructed in the previous cell.
# Deliberately left as the cell's final bare expression: assigning or wrapping
# the call would change what the notebook displays for this cell.
dpo_trainer.train()

INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
20,0.5087,0.584246,0.273219,-0.402826,0.671552,0.676045,-870.676636,-856.732178,745.589233,746.753906
40,0.2488,0.408745,0.739354,-0.549649,0.797414,1.289003,-866.015259,-858.200317,743.029846,744.670959
60,0.1937,0.308427,1.046004,-0.824978,0.883621,1.870982,-862.948792,-860.953552,738.962769,741.205994
80,0.2767,0.251327,1.112916,-1.163997,0.905172,2.276913,-862.279785,-864.343811,735.722412,738.322937
100,0.2502,0.217799,1.436605,-1.348224,0.905172,2.78483,-859.042786,-866.186157,732.148254,735.142395
120,0.1166,0.209641,1.801118,-1.466801,0.918103,3.267918,-855.397644,-867.371887,727.132812,731.005188
140,0.1225,0.169284,2.363584,-1.533118,0.922414,3.896702,-849.772888,-868.035034,723.180359,727.490784
160,0.0926,0.174383,2.695844,-1.795374,0.939655,4.491218,-846.450439,-870.657532,718.39801,723.17041
180,0.1028,0.170406,3.366719,-1.33137,0.956897,4.69809,-839.741638,-866.017578,714.859009,719.977905
200,0.0465,0.176851,4.160431,-1.051606,0.952586,5.212037,-831.804504,-863.219971,710.516907,716.099854




INFO - src.training.dpo_trainer - DPO training complete.


In [9]:
# Report the checkpoint the trainer recorded as best, and its metric value.
trainer_state = dpo_trainer.trainer.state
print("Best checkpoint:", trainer_state.best_model_checkpoint)
print("Best metric:", trainer_state.best_metric)

Best checkpoint: ./pythia-160m-deduped-DPO/checkpoint-140
Best metric: 0.16928410530090332


In [10]:
import pandas as pd

# Export the trainer's logged metrics for the 100%-accurate-label run to CSV.
# The destination is held in one variable so the confirmation message always
# names the file that was actually written.
log_csv_path = 'dpo_160m_training_logs_100_0.csv'

state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep records whose step is a multiple of 10; if there is no 'step' column,
# fall back to every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Leave out columns whose name starts with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_100_0.csv


# Bias Label (80% accurate, 20% biased)

In [11]:
# --- Experiment: 20% injected bias ------------------------------------------

# Reload the base model checkpoint for this run.
pythia_160m = PythiaModel("EleutherAI/pythia-160m-deduped", "step143000", "./cache_dir")

# Load the dataset, then have the injector (seeded with 42) produce the
# biased splits from the same loader.
loader = DatasetLoader()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed=42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio=0.2)

# Same YAML section as the unbiased run, with a dedicated output directory.
experiment_config = load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_160m_config']
dpo_args['output_dir'] = "./pythia-160m-deduped-DPO-80-20"
dpo_pythia_160m_config = DPOConfig(**dpo_args)

# From here on train_ds / valid_ds refer to the biased splits, passed through
# an identity map with the datasets cache disabled.
train_ds, valid_ds = (
    split.map(lambda x: x, load_from_cache_file=False)
    for split in (bias_train_ds, bias_valid_ds)
)

# Use the tokenizer's EOS token as its padding token, then build the trainer.
tokenizer = pythia_160m.tokenizer
tokenizer.pad_token = tokenizer.eos_token
dpo_trainer = DPO_Trainer(
    pythia_160m.model,
    tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_160m_config,
)

dpo_trainer.train()

# Report the checkpoint the trainer recorded as best, and its metric value.
trainer_state = dpo_trainer.trainer.state
print("Best checkpoint:", trainer_state.best_model_checkpoint)
print("Best metric:", trainer_state.best_metric)

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 20.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 160/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 22993.42 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 10953.43 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
20,0.7369,0.653508,0.073037,-0.379026,0.648276,0.452063,-872.678467,-856.494202,746.102844,747.103699
40,0.687,0.561693,0.362142,-0.397305,0.708621,0.759448,-869.787476,-856.676941,744.190002,745.466187
60,0.2696,0.478721,0.626113,-0.330756,0.788793,0.956869,-867.147705,-856.011353,742.670288,744.212708
80,0.6341,0.455312,0.825187,-0.312536,0.801724,1.137724,-865.156982,-855.829224,741.465637,743.190308
100,0.9687,0.368164,0.789742,-0.617094,0.825,1.406836,-865.511353,-858.874756,740.581116,742.457153
120,0.5294,0.39312,0.915965,-0.556987,0.818965,1.472952,-864.249207,-858.273682,739.989075,741.891235
140,0.5869,0.375006,0.98294,-0.546957,0.840517,1.529897,-863.579468,-858.173401,739.732239,741.788513
160,0.5028,0.331997,1.068269,-0.649665,0.87069,1.717934,-862.726074,-859.2005,738.106384,740.217957
180,0.8264,0.335006,1.030799,-0.83181,0.87069,1.862609,-863.10083,-861.021973,736.505066,738.689636
200,0.5273,0.381235,0.853284,-0.714273,0.831897,1.567557,-864.875916,-859.846558,736.714478,738.830139


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-160m-deduped-DPO-80-20/checkpoint-680
Best metric: 0.29617050290107727


In [12]:
import pandas as pd

# Export the trainer's logged metrics for the 80/20 run to CSV.
# The destination is held in one variable so the confirmation message always
# names the file that was actually written.
log_csv_path = 'dpo_160m_training_logs_80_20.csv'

state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep records whose step is a multiple of 10; if there is no 'step' column,
# fall back to every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Leave out columns whose name starts with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_80_20.csv


# Bias Label (50% accurate, 50% biased)

In [13]:
# --- Experiment: 50% injected bias ------------------------------------------

# Reload the base model checkpoint for this run.
pythia_160m = PythiaModel("EleutherAI/pythia-160m-deduped", "step143000", "./cache_dir")

# Load the dataset, then have the injector (seeded with 42) produce the
# biased splits from the same loader.
loader = DatasetLoader()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed=42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio=0.5)

# Same YAML section as the unbiased run, with a dedicated output directory.
experiment_config = load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_160m_config']
dpo_args['output_dir'] = "./pythia-160m-deduped-DPO-50-50"
dpo_pythia_160m_config = DPOConfig(**dpo_args)

# From here on train_ds / valid_ds refer to the biased splits, passed through
# an identity map with the datasets cache disabled.
train_ds, valid_ds = (
    split.map(lambda x: x, load_from_cache_file=False)
    for split in (bias_train_ds, bias_valid_ds)
)

# Use the tokenizer's EOS token as its padding token, then build the trainer.
tokenizer = pythia_160m.tokenizer
tokenizer.pad_token = tokenizer.eos_token
dpo_trainer = DPO_Trainer(
    pythia_160m.model,
    tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_160m_config,
)

dpo_trainer.train()

# Report the checkpoint the trainer recorded as best, and its metric value.
trainer_state = dpo_trainer.trainer.state
print("Best checkpoint:", trainer_state.best_model_checkpoint)
print("Best metric:", trainer_state.best_metric)

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 50.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 400/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 19561.78 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 9463.85 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...



Applying chat template to train dataset: 100%|██████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 18000.73 examples/s]
Tokenizing train dataset: 100%|██████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 2135.19 examples/s]


INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
20,1.0829,0.936466,-0.240822,-0.035141,0.440517,-0.205681,-875.817017,-853.055359,747.615845,748.332764
40,1.0325,0.957515,-0.336947,-0.110551,0.434483,-0.226395,-876.778259,-853.809387,747.303406,748.029846
60,0.8105,0.827131,-0.409282,-0.502699,0.534483,0.093417,-877.501709,-857.730896,746.966187,747.80603
80,0.7064,0.826664,-0.350014,-0.457535,0.518966,0.107521,-876.908936,-857.279236,746.726379,747.533691
100,1.0496,0.795275,-0.15572,-0.345554,0.573276,0.189834,-874.966064,-856.159363,745.63092,746.596375
120,1.1754,0.850604,-0.1574,-0.258279,0.538793,0.100879,-874.98291,-855.286621,745.299744,746.316223
140,0.8053,0.865385,-0.195593,-0.224118,0.546552,0.028525,-875.364746,-854.944885,745.098511,746.185791
160,0.7904,0.814622,-0.155232,-0.329784,0.553448,0.174552,-874.96106,-856.001709,745.606873,746.639282
180,0.8228,0.923005,-0.378322,-0.317956,0.490517,-0.060366,-877.192017,-855.883484,745.834351,746.841858
200,0.9058,0.923777,-0.348841,-0.287879,0.484483,-0.060963,-876.897217,-855.582642,746.259705,747.176575


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-160m-deduped-DPO-50-50/checkpoint-100
Best metric: 0.7952750325202942


In [14]:
import pandas as pd

# Export the trainer's logged metrics for the 50/50 run to CSV.
# The destination is held in one variable so the confirmation message always
# names the file that was actually written.
log_csv_path = 'dpo_160m_training_logs_50_50.csv'

state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep records whose step is a multiple of 10; if there is no 'step' column,
# fall back to every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Leave out columns whose name starts with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_50_50.csv
