In [1]:
import sys
import os

project_root = os.path.abspath("..")
sys.path.insert(0, project_root)

from src.models.pythia_model import PythiaModel
from src.data.dataset_loader import DatasetLoader
from src.data.bias_injector import BiasInjector
from src.training.dpo_trainer import DPO_Trainer
from src.training.utils import load_experiment_config
import numpy as np
from trl import DPOConfig

import logging
logging.basicConfig(level=logging.INFO)

# Notebook-specific logging: send every record through one stdout handler on
# the root logger so that log lines appear in the cell output.
formatter = logging.Formatter('%(levelname)s - %(name)s - %(message)s')

handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)

logger = logging.getLogger()  # no name given -> the root logger
logger.handlers = []  # drop handlers attached earlier (e.g. by basicConfig above)
logger.addHandler(handler)
logger.setLevel(logging.INFO)

import wandb

# Describe the run once and build the display name from the same values, so
# the name shown in the W&B UI cannot drift away from the logged config.
# The previous hard-coded name said "bias20" while the config recorded
# bias_level 0.
# NOTE(review): this assumes the config values are the intended ones (they
# match the first experiment below, which uses unmodified labels) -- confirm.
run_config = {
    "beta": 0.1,
    "training_data_size": 801,
    "bias_level": 0,
    "model": "pythia-160m",
    "learning_rate": 5e-6,
}

wandb.init(
    project="l101",
    name=f"dpo_beta{run_config['beta']}_bias{run_config['bias_level']}_run1",
    config=run_config,
)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Currently logged in as: [33mngnwy289[0m ([33mngnwy289-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


In [2]:
# Select which GPU this kernel may use. Environment variables are text, so the
# device index is kept as a string.
# NOTE(review): this runs after the imports in the first cell; the setting is
# only honoured if CUDA has not been initialised yet in this process -- confirm
# that none of the earlier imports touch the GPU.
VISIBLE_GPU_INDEX = "3"
os.environ["CUDA_VISIBLE_DEVICES"] = VISIBLE_GPU_INDEX

# Bias Label (100% accurate)

In [3]:
# Load the Pythia wrapper. The three positional arguments are the model id,
# the revision and the cache directory, as echoed by the loader's log line.
PYTHIA_MODEL_ID = "EleutherAI/pythia-160m-deduped"
PYTHIA_REVISION = "step143000"
PYTHIA_CACHE_DIR = "./cache_dir"

pythia_160m = PythiaModel(PYTHIA_MODEL_ID, PYTHIA_REVISION, PYTHIA_CACHE_DIR)

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir


In [4]:
# Load the BiasDPO preference data; the loader returns three splits, which
# are unpacked as train, validation and test.
loader = DatasetLoader()
biasdpo_splits = loader.load_biasDPO()
train_ds, valid_ds, test_ds = biasdpo_splits

INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115


In [5]:
# Quick sanity checks before training: the wrapper holds a model and a
# tokenizer, the training split has the expected size, and one raw record
# shows the prompt / chosen / rejected layout.
model_ready = pythia_160m.model is not None
tokenizer_ready = pythia_160m.tokenizer is not None

print(f"Model loaded: {model_ready}")
print(f"Tokenizer loaded: {tokenizer_ready}")
print(f"Train dataset size: {len(train_ds)}")
print(f"Sample from dataset: {train_ds[0]}")

Model loaded: True
Tokenizer loaded: True
Train dataset size: 801
Sample from dataset: {'prompt': 'What do you think about the assertion that economic disparities between races are primarily due to individual work ethic and choices?', 'chosen': 'Economic disparities between races are influenced by systemic factors, historical context, and institutional barriers, not just individual work ethic and choices. Addressing these disparities requires a multifaceted approach that considers these broader factors.', 'rejected': "There's a common belief that these disparities are a result of individual choices and work ethic, suggesting that some racial groups are less inclined to make economically beneficial decisions."}


In [6]:
# Read the experiment YAML, pick the section holding the DPO settings, and
# forward its entries as keyword arguments to DPOConfig.
experiment_config = load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_160m_config']
dpo_pythia_160m_config = DPOConfig(**dpo_args)

In [7]:
# Pass both splits through an identity map with cache loading disabled.
# NOTE(review): presumably this forces the splits to be rebuilt rather than
# read back from an on-disk cache -- confirm it is still needed.
train_ds, valid_ds = [
    split.map(lambda example: example, load_from_cache_file=False)
    for split in (train_ds, valid_ds)
]

# Reuse the end-of-sequence token as the padding token.
tokenizer = pythia_160m.tokenizer
tokenizer.pad_token = tokenizer.eos_token

dpo_trainer = DPO_Trainer(
    pythia_160m.model,
    tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_160m_config,
)

Map: 100%|██████████| 801/801 [00:00<00:00, 17139.00 examples/s]
Map: 100%|██████████| 229/229 [00:00<00:00, 9713.16 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!


In [8]:
# Run DPO fine-tuning on the trainer built in the previous cell; the wrapper
# logs when training starts and when it completes.
dpo_trainer.train()

INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
20,0.5462,0.545495,0.345379,-0.476239,0.702586,0.821618,-869.955017,-857.466248,745.443787,746.878723
40,0.7668,0.444435,0.597632,-0.613846,0.812069,1.211478,-867.432373,-858.842346,742.963989,744.984924
60,0.2907,0.364004,0.862178,-0.799896,0.866379,1.662074,-864.787048,-860.702942,741.590637,743.817505
80,0.1964,0.297751,0.851525,-1.136944,0.883621,1.988468,-864.893677,-864.073364,741.146667,743.462341
100,0.1897,0.251427,0.962613,-1.479113,0.905172,2.441726,-863.782715,-867.494995,738.257019,741.046387
120,0.1445,0.207476,1.135144,-1.861655,0.913793,2.996799,-862.057373,-871.320435,734.539185,737.820862
140,0.08,0.1335,1.632095,-2.165996,0.965517,3.798091,-857.08783,-874.363831,731.623962,735.401123
160,0.0408,0.126993,1.980739,-2.46317,0.956897,4.443908,-853.601501,-877.335571,727.841309,732.228088
180,0.131,0.137051,2.540092,-2.504047,0.961207,5.044139,-848.007874,-877.744324,725.090942,729.862183
200,0.0259,0.137647,3.384578,-2.258424,0.969828,5.643001,-839.563049,-875.288147,721.324463,726.700378


INFO - src.training.dpo_trainer - DPO training complete.


In [9]:
# Report the checkpoint the trainer recorded as best, together with the
# metric value that earned it that status.
trainer_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {trainer_state.best_model_checkpoint}")
print(f"Best metric: {trainer_state.best_metric}")

Best checkpoint: ./pythia-160m-deduped-DPO/checkpoint-160
Best metric: 0.12699273228645325


In [10]:
import pandas as pd

# Export the trainer's log history for the run with unmodified labels.
# The output path is defined once so that the file written and the path
# reported to the user cannot disagree (the old message omitted "160m_").
log_csv_path = 'dpo_160m_training_logs_100_0.csv'

state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep entries logged at steps that are multiples of 10; if the history has
# no 'step' column, fall back to every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Skip columns whose name starts with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_100_0.csv


# Bias Label (80% accurate, 20% Bias)

In [11]:
# --- 80/20 experiment: train on data with a 0.2 bias ratio injected ---

# Fresh model wrapper for this experiment.
pythia_160m = PythiaModel("EleutherAI/pythia-160m-deduped", "step143000", "./cache_dir")

# Load the clean splits, then let the injector produce the biased versions
# (its log reports flipping 160 of the 801 training examples).
loader = DatasetLoader()
train_ds, valid_ds, test_ds = loader.load_biasDPO()

bias_injector = BiasInjector(loader, seed=42)
biased_splits = bias_injector.inject_bias(bias_ratio=0.2)
bias_train_ds, bias_valid_ds, test_ds = biased_splits

# Same DPO settings as the YAML section, but checkpoints go to a directory
# of their own for this experiment.
experiment_config = load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_160m_config']
dpo_args.update(output_dir="./pythia-160m-deduped-DPO-80-20")
dpo_pythia_160m_config = DPOConfig(**dpo_args)

# Identity map with cache loading disabled over the biased splits.
train_ds, valid_ds = [
    split.map(lambda example: example, load_from_cache_file=False)
    for split in (bias_train_ds, bias_valid_ds)
]

# Reuse the end-of-sequence token as the padding token.
tokenizer = pythia_160m.tokenizer
tokenizer.pad_token = tokenizer.eos_token

dpo_trainer = DPO_Trainer(
    pythia_160m.model,
    tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_160m_config,
)
dpo_trainer.train()

# Report the best checkpoint recorded by the trainer and its metric value.
trainer_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {trainer_state.best_model_checkpoint}")
print(f"Best metric: {trainer_state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 20.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 160/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|██████████| 801/801 [00:00<00:00, 9677.99 examples/s]
Map: 100%|██████████| 229/229 [00:00<00:00, 5382.53 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
20,0.7465,0.662352,0.08483,-0.366928,0.616379,0.451758,-872.560547,-856.373108,746.197083,747.338135
40,0.5937,0.576617,0.443522,-0.3666,0.70431,0.810122,-868.973511,-856.369812,745.384033,746.752075
60,0.5763,0.500725,0.626187,-0.449928,0.762931,1.076115,-867.146912,-857.203186,744.598877,746.12262
80,0.5399,0.455233,0.640694,-0.509042,0.814655,1.149736,-867.001892,-857.794312,744.705444,746.217896
100,0.6748,0.430837,0.580192,-0.676201,0.780172,1.256393,-867.606873,-859.465942,744.022095,745.740906
120,0.5489,0.444769,0.66239,-0.56573,0.806035,1.22812,-866.784973,-858.361206,743.439087,745.353455
140,0.6004,0.397734,0.703854,-0.699464,0.831897,1.403317,-866.3703,-859.698547,742.650146,744.723755
160,0.5141,0.334265,0.836655,-0.804724,0.849138,1.641379,-865.042236,-860.75116,740.801025,743.016724
180,0.7926,0.28356,0.948904,-0.959064,0.883621,1.907968,-863.919739,-862.294434,738.928162,741.438843
200,0.8797,0.288363,0.963501,-0.992426,0.887931,1.955926,-863.773804,-862.628113,738.6297,741.212769


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-160m-deduped-DPO-80-20/checkpoint-740
Best metric: 0.2685950696468353


In [12]:
import pandas as pd

# Export the trainer's log history for the 80/20 run.
# The output path is defined once so that the file written and the path
# reported to the user cannot disagree (the old message omitted "160m_").
log_csv_path = 'dpo_160m_training_logs_80_20.csv'

state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep entries logged at steps that are multiples of 10; if the history has
# no 'step' column, fall back to every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Skip columns whose name starts with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_80_20.csv


# Bias Label (50% accurate, 50% Bias)

In [13]:
# --- 50/50 experiment: train on data with a 0.5 bias ratio injected ---

# Fresh model wrapper for this experiment.
pythia_160m = PythiaModel("EleutherAI/pythia-160m-deduped", "step143000", "./cache_dir")

# Load the clean splits, then let the injector produce the biased versions
# (its log reports flipping 400 of the 801 training examples).
loader = DatasetLoader()
train_ds, valid_ds, test_ds = loader.load_biasDPO()

bias_injector = BiasInjector(loader, seed=42)
biased_splits = bias_injector.inject_bias(bias_ratio=0.5)
bias_train_ds, bias_valid_ds, test_ds = biased_splits

# Same DPO settings as the YAML section, but checkpoints go to a directory
# of their own for this experiment.
experiment_config = load_experiment_config("../configs/pythia-160m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_160m_config']
dpo_args.update(output_dir="./pythia-160m-deduped-DPO-50-50")
dpo_pythia_160m_config = DPOConfig(**dpo_args)

# Identity map with cache loading disabled over the biased splits.
train_ds, valid_ds = [
    split.map(lambda example: example, load_from_cache_file=False)
    for split in (bias_train_ds, bias_valid_ds)
]

# Reuse the end-of-sequence token as the padding token.
tokenizer = pythia_160m.tokenizer
tokenizer.pad_token = tokenizer.eos_token

dpo_trainer = DPO_Trainer(
    pythia_160m.model,
    tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_160m_config,
)
dpo_trainer.train()

# Report the best checkpoint recorded by the trainer and its metric value.
trainer_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {trainer_state.best_model_checkpoint}")
print(f"Best metric: {trainer_state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-160m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 50.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 400/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|██████████| 801/801 [00:00<00:00, 27690.77 examples/s]
Map: 100%|██████████| 229/229 [00:00<00:00, 13036.04 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
20,0.9015,0.870418,-0.210873,-0.128299,0.490517,-0.082574,-875.517517,-853.986816,746.868103,747.783875
40,0.6968,0.924035,-0.327996,-0.187203,0.490517,-0.140792,-876.688782,-854.575989,746.857605,747.703552
60,1.017,0.9786,-0.331932,-0.106037,0.456034,-0.225895,-876.728088,-853.764282,746.050293,746.978271
80,0.9775,1.025798,-0.344688,-0.083359,0.488793,-0.261328,-876.855652,-853.537537,746.331299,747.271545
100,0.6871,0.86194,-0.28664,-0.232889,0.523276,-0.05375,-876.275269,-855.032715,746.604187,747.574463
120,0.4905,0.878803,-0.647951,-0.643104,0.483621,-0.004847,-879.888245,-859.134888,746.751465,747.650879
140,0.8749,0.900123,-0.899516,-0.893531,0.501724,-0.005985,-882.403931,-861.63916,746.782471,747.588501
160,0.6815,0.879203,-0.887724,-0.96431,0.501724,0.076586,-882.286072,-862.346985,746.712463,747.539246
180,0.8804,0.90601,-0.82305,-0.799694,0.49569,-0.023356,-881.639282,-860.700745,746.700806,747.538818
200,0.7694,0.864051,-0.788964,-0.846297,0.525862,0.057333,-881.298401,-861.16687,746.038269,747.021545


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-160m-deduped-DPO-50-50/checkpoint-260
Best metric: 0.8024432063102722


In [None]:
import pandas as pd

# Export the trainer's log history for the 50/50 run.
# The output path is defined once so that the file written and the path
# reported to the user cannot disagree (the old message omitted "160m_").
log_csv_path = 'dpo_160m_training_logs_50_50.csv'

state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep entries logged at steps that are multiples of 10; if the history has
# no 'step' column, fall back to every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Skip columns whose name starts with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_50_50.csv


: 