In [1]:
import sys
import os

# Make the parent of the current working directory importable so the
# `src.*` packages imported below resolve.
project_root = os.path.abspath("..")
# Remove any earlier copies first: re-running this cell would otherwise
# stack a new duplicate entry onto sys.path every time.
sys.path[:] = [entry for entry in sys.path if entry != project_root]
sys.path.insert(0, project_root)

from src.models.pythia_model import PythiaModel
from src.data.dataset_loader import DatasetLoader
from src.data.bias_injector import BiasInjector
from src.training.dpo_trainer import DPO_Trainer
from src.training.utils import load_experiment_config
import numpy as np
from trl import DPOConfig

import logging
# Notebook-specific logging setup: send every record at INFO or above to
# stdout so it shows up in the cell output.
#
# No logging.basicConfig() call is needed here: the root logger's handlers
# are replaced below, so anything basicConfig installed would be discarded.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Detach existing handlers through the logging API (instead of assigning to
# `logger.handlers`) so re-running the cell never duplicates output lines.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)

## Formatter: "<LEVEL> - <logger name> - <message>"
formatter = logging.Formatter('%(levelname)s - %(name)s - %(message)s')

## Handler that writes to the notebook's stdout
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)

## Attach the single stdout handler to the root logger
logger.addHandler(handler)

import wandb

# Hyperparameters recorded with the Weights & Biases run.
wandb_config = {
    "beta": 0.1,
    "training_data_size": 801,
    "bias_level": 0,
    "model": "pythia-70m",
    "learning_rate": 5e-6,
}

# Start the W&B run. The run name is derived from the config values so the
# name and the logged config always agree on beta and bias level.
# NOTE(review): this notebook later trains with bias ratios 0.2 and 0.5 as
# well; confirm whether those runs should be logged under this same run.
wandb.init(
    project="l101",
    name=f"dpo_beta{wandb_config['beta']}_bias{wandb_config['bias_level']}_run1",
    config=wandb_config,
)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Currently logged in as: [33mngnwy289[0m ([33mngnwy289-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


In [2]:
# Restrict CUDA to a single physical GPU for this notebook.
# NOTE(review): this only takes effect if it runs before CUDA is initialised
# in this process — confirm nothing in the first cell initialises CUDA.
CUDA_DEVICE_INDEX = "7"
os.environ["CUDA_VISIBLE_DEVICES"] = CUDA_DEVICE_INDEX

# Baseline: 100% accurate preference labels (0% flipped)

In [3]:
# Load the deduped Pythia-70m model at revision "step143000", using
# ./cache_dir as the cache directory.
pythia_70m = PythiaModel(
    "EleutherAI/pythia-70m-deduped",
    "step143000",
    "./cache_dir",
)

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-70m-deduped (revision: step143000) at cache_dir: ./cache_dir


In [4]:
# Load the BiasDPO dataset and unpack its train / validation / test splits
# (the recorded run reports 801 / 229 / 115 samples respectively).
loader = DatasetLoader()
biasdpo_splits = loader.load_biasDPO()
train_ds, valid_ds, test_ds = biasdpo_splits

INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115


In [5]:
# Sanity checks before training: confirm the model and tokenizer objects
# exist, then show the training-set size and one example record.
model_loaded = pythia_70m.model is not None
print("Model loaded:", model_loaded)

tokenizer_loaded = pythia_70m.tokenizer is not None
print("Tokenizer loaded:", tokenizer_loaded)

train_size = len(train_ds)
print("Train dataset size:", train_size)

first_example = train_ds[0]
print("Sample from dataset:", first_example)

Model loaded: True
Tokenizer loaded: True
Train dataset size: 801
Sample from dataset: {'prompt': 'What do you think about the assertion that economic disparities between races are primarily due to individual work ethic and choices?', 'chosen': 'Economic disparities between races are influenced by systemic factors, historical context, and institutional barriers, not just individual work ethic and choices. Addressing these disparities requires a multifaceted approach that considers these broader factors.', 'rejected': "There's a common belief that these disparities are a result of individual choices and work ethic, suggesting that some racial groups are less inclined to make economically beneficial decisions."}


In [6]:
# Read the experiment YAML and build the TRL DPOConfig from its
# 'dpo_pythia_70m_config' section.
experiment_config = load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_70m_config']
dpo_pythia_70m_config = DPOConfig(**dpo_args)

In [7]:
# Use the EOS token as the padding token.
pythia_70m.tokenizer.pad_token = pythia_70m.tokenizer.eos_token

# Identity map over both splits with dataset caching disabled.
# NOTE(review): presumably this forces the datasets to be rebuilt rather than
# served from a cached result — confirm why it is needed.
train_ds, valid_ds = (
    split.map(lambda example: example, load_from_cache_file=False)
    for split in (train_ds, valid_ds)
)

# Wrap model, tokenizer and both splits in the project's DPO trainer.
dpo_trainer = DPO_Trainer(
    pythia_70m.model,
    pythia_70m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_70m_config,
)

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 20581.73 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 14687.15 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!


In [8]:
# Run DPO training with the trainer built in the previous cell.
dpo_trainer.train()

INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
20,0.6866,0.849758,0.077995,-0.526351,0.593966,0.604347,-1430.420166,-1396.742065,1548.89978,1549.72998
40,0.7492,0.630291,0.426139,-0.808938,0.669828,1.235077,-1426.938721,-1399.567993,1548.016602,1548.984497
60,0.3934,0.511765,0.887009,-0.983971,0.793103,1.87098,-1422.329956,-1401.318237,1546.906372,1548.070557
80,0.7865,0.427121,1.103781,-1.389395,0.82931,2.493176,-1420.162231,-1405.372559,1545.766846,1547.045654
100,0.2858,0.387143,1.193748,-1.447493,0.813793,2.641241,-1419.262695,-1405.953491,1544.674072,1546.048706
120,0.378,0.320779,1.401106,-1.706933,0.872414,3.10804,-1417.188965,-1408.547852,1543.675415,1545.177856
140,0.4699,0.369192,1.263437,-1.961014,0.82931,3.224451,-1418.565674,-1411.088623,1542.839844,1544.474243
160,0.3492,0.269184,1.231798,-2.321583,0.883621,3.55338,-1418.882324,-1414.694458,1542.279907,1543.993286
180,0.1434,0.308632,0.846112,-2.733035,0.863793,3.579146,-1422.739136,-1418.808838,1541.526611,1543.364136
200,0.3239,0.244669,0.919965,-3.165405,0.918103,4.08537,-1422.000488,-1423.132568,1540.341553,1542.307251




INFO - src.training.dpo_trainer - DPO training complete.


In [9]:
# Report the checkpoint the trainer recorded as best, and its metric value.
trainer_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {trainer_state.best_model_checkpoint}")
print(f"Best metric: {trainer_state.best_metric}")

Best checkpoint: ./pythia-70m-deduped-DPO/checkpoint-420
Best metric: 0.1823609471321106


In [10]:
import pandas as pd

# Destination CSV for the exported training log. The confirmation message
# below reuses this value so it always names the file actually written.
log_csv_path = 'dpo_70m_training_logs_100_0.csv'

# Collect the trainer's logged history into a DataFrame.
state = dpo_trainer.trainer.state
logs = state.log_history
df = pd.DataFrame(logs)

# Keep rows whose step is a multiple of 10; if there is no 'step' column,
# fall back to every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Drop columns whose names start with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_100_0.csv


# Biased labels: 80% accurate, 20% flipped

In [11]:
# Experiment: DPO training on BiasDPO with bias injected at ratio 0.2.

# Reload the model at revision "step143000" and use EOS as the pad token.
pythia_70m = PythiaModel(
    "EleutherAI/pythia-70m-deduped",
    "step143000",
    "./cache_dir",
)
pythia_70m.tokenizer.pad_token = pythia_70m.tokenizer.eos_token

# Load the dataset splits, then inject bias with a fixed seed
# (the recorded run logged 160/801 training examples flipped).
loader = DatasetLoader()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed=42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio=0.2)

# Same YAML section as the baseline, with a dedicated output directory.
experiment_config = load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_70m_config']
dpo_args['output_dir'] = "./pythia-70m-deduped-DPO-80-20"
dpo_pythia_70m_config = DPOConfig(**dpo_args)

# Identity map over the biased splits with dataset caching disabled; the
# results replace train_ds / valid_ds.
train_ds, valid_ds = (
    split.map(lambda example: example, load_from_cache_file=False)
    for split in (bias_train_ds, bias_valid_ds)
)

dpo_trainer = DPO_Trainer(
    pythia_70m.model,
    pythia_70m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_70m_config,
)

dpo_trainer.train()

# Report the checkpoint the trainer recorded as best, and its metric value.
trainer_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {trainer_state.best_model_checkpoint}")
print(f"Best metric: {trainer_state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-70m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 20.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 160/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 24600.11 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 11171.28 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
20,0.6387,0.762281,0.251241,-0.334503,0.591379,0.585744,-1428.687744,-1394.82373,1549.195679,1549.968262
40,0.6788,0.643601,0.632749,-0.487599,0.671552,1.120349,-1424.872559,-1396.354614,1548.318481,1549.188721
60,0.5346,0.737652,0.663663,-0.510261,0.668103,1.173923,-1424.563599,-1396.581177,1547.787476,1548.763184
80,0.7336,0.53098,1.114465,-0.817508,0.773276,1.931974,-1420.055542,-1399.653687,1547.099365,1548.185913
100,1.5017,0.524051,1.005356,-0.933921,0.760345,1.939277,-1421.146729,-1400.817749,1546.755981,1547.935303
120,1.173,0.488797,1.09185,-1.012107,0.747414,2.103957,-1420.281494,-1401.599487,1546.661743,1547.819214
140,1.1105,0.530913,0.971263,-1.150806,0.781897,2.122068,-1421.487549,-1402.986694,1546.367432,1547.557861
160,0.8529,0.477129,1.086464,-1.18675,0.760345,2.273214,-1420.335571,-1403.345947,1546.018066,1547.29248
180,1.0452,0.460646,0.849157,-1.385387,0.788793,2.234545,-1422.708496,-1405.33252,1545.779053,1547.055908
200,0.9524,0.523094,0.746271,-1.398807,0.762931,2.145077,-1423.737427,-1405.466553,1546.031494,1547.232178


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-70m-deduped-DPO-80-20/checkpoint-740
Best metric: 0.2909352779388428


In [12]:
import pandas as pd

# Destination CSV for the exported training log. The confirmation message
# below reuses this value so it always names the file actually written.
log_csv_path = 'dpo_70m_training_logs_80_20.csv'

# Collect the trainer's logged history into a DataFrame.
state = dpo_trainer.trainer.state
logs = state.log_history
df = pd.DataFrame(logs)

# Keep rows whose step is a multiple of 10; if there is no 'step' column,
# fall back to every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Drop columns whose names start with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_80_20.csv


# Biased labels: 50% accurate, 50% flipped

In [13]:
# Experiment: DPO training on BiasDPO with bias injected at ratio 0.5.

# Reload the model at revision "step143000" and use EOS as the pad token.
pythia_70m = PythiaModel(
    "EleutherAI/pythia-70m-deduped",
    "step143000",
    "./cache_dir",
)
pythia_70m.tokenizer.pad_token = pythia_70m.tokenizer.eos_token

# Load the dataset splits, then inject bias with a fixed seed
# (the recorded run logged 400/801 training examples flipped).
loader = DatasetLoader()
train_ds, valid_ds, test_ds = loader.load_biasDPO()
bias_injector = BiasInjector(loader, seed=42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio=0.5)

# Same YAML section as the baseline, with a dedicated output directory.
experiment_config = load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_70m_config']
dpo_args['output_dir'] = "./pythia-70m-deduped-DPO-50-50"
dpo_pythia_70m_config = DPOConfig(**dpo_args)

# Identity map over the biased splits with dataset caching disabled; the
# results replace train_ds / valid_ds.
train_ds, valid_ds = (
    split.map(lambda example: example, load_from_cache_file=False)
    for split in (bias_train_ds, bias_valid_ds)
)

dpo_trainer = DPO_Trainer(
    pythia_70m.model,
    pythia_70m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_70m_config,
)

dpo_trainer.train()

# Report the checkpoint the trainer recorded as best, and its metric value.
trainer_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {trainer_state.best_model_checkpoint}")
print(f"Best metric: {trainer_state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-70m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 50.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 400/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 23232.56 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 10989.78 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...



Applying chat template to train dataset: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 19182.80 examples/s]
Tokenizing train dataset: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 2687.68 examples/s]


INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
20,1.019,1.202556,-0.607906,-0.290616,0.463793,-0.31729,-1437.279053,-1394.384521,1550.152344,1550.767456
40,1.2423,1.332881,-0.903834,-0.467989,0.421552,-0.435845,-1440.238647,-1396.158447,1550.185059,1550.810059
60,0.9826,1.003226,-0.649334,-0.703188,0.493966,0.053853,-1437.693604,-1398.510498,1549.921631,1550.631836
80,1.2856,1.069601,-0.604023,-0.599649,0.507759,-0.004374,-1437.240356,-1397.475098,1549.889771,1550.567993
100,1.4759,1.005988,-0.61005,-0.885598,0.543966,0.275548,-1437.300781,-1400.334595,1549.702881,1550.349976
120,1.6539,1.036243,-0.742423,-0.892102,0.505172,0.149679,-1438.624512,-1400.399536,1549.497314,1550.136963
140,1.013,0.894487,-0.470476,-0.808929,0.542241,0.338453,-1435.904907,-1399.567993,1549.338379,1549.998413
160,1.1056,0.958999,-0.465016,-0.795879,0.576724,0.330863,-1435.850342,-1399.4375,1549.137207,1549.772217
180,1.0553,1.039521,-0.594532,-0.682146,0.503448,0.087614,-1437.145508,-1398.299927,1549.284058,1549.846436
200,1.0292,1.040061,-0.76209,-0.865624,0.563793,0.103534,-1438.821167,-1400.134888,1549.49707,1550.156616


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-70m-deduped-DPO-50-50/checkpoint-140
Best metric: 0.8944867253303528


In [14]:
import pandas as pd

# Destination CSV for the exported training log. The confirmation message
# below reuses this value so it always names the file actually written.
log_csv_path = 'dpo_70m_training_logs_50_50.csv'

# Collect the trainer's logged history into a DataFrame.
state = dpo_trainer.trainer.state
logs = state.log_history
df = pd.DataFrame(logs)

# Keep rows whose step is a multiple of 10; if there is no 'step' column,
# fall back to every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Drop columns whose names start with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_50_50.csv
