In [1]:
import sys
import os

project_root = os.path.abspath("..")
sys.path.insert(0, project_root)

from src.models.pythia_model import PythiaModel
from src.data.dataset_loader import DatasetLoader
from src.data.bias_injector import BiasInjector
from src.training.dpo_trainer import DPO_Trainer
from src.training.utils import load_experiment_config
import numpy as np
from trl import DPOConfig

import logging
logging.basicConfig(level=logging.INFO)

# Notebook-specific logging: route every record to stdout so it shows up in
# the cell output, using a compact "<LEVEL> - <logger name> - <message>" layout.
## Formatter shared by the notebook handler
formatter = logging.Formatter('%(levelname)s - %(name)s - %(message)s')

## Handler that writes to the notebook's stdout
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)

## Root logger: discard whatever handlers are currently installed (including
## any added by basicConfig above) so the stdout handler is the only one.
logger = logging.getLogger()
logger.handlers = []
logger.addHandler(handler)
logger.setLevel(logging.INFO)

import wandb

# Hyperparameters recorded with the Weights & Biases run.
# The run name is built from these values so the name and the logged config
# cannot drift apart (the previous hard-coded name said "bias20" while
# "bias_level" was 0).
# NOTE(review): this is the only wandb.init call visible in the notebook, so the
# later 80/20 and 50/50 trainings presumably log into this same run - confirm.
run_config = {
    "beta": 0.1,
    "training_data_size": 801,
    "bias_level": 0,
    "model": "pythia-70m",
    "learning_rate": 5e-6,
}

wandb.init(
    project="l101",
    name=f"dpo_beta{run_config['beta']}_bias{run_config['bias_level']}_run1",
    config=run_config,
)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Currently logged in as: [33mngnwy289[0m ([33mngnwy289-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


In [2]:
# Restrict this kernel to the GPU with index 2.
# NOTE(review): this cell runs after the import cell; the variable is only
# honoured if CUDA has not been initialised yet - confirm, or move this
# assignment above the imports.
os.environ.update({"CUDA_VISIBLE_DEVICES": "2"})

# Bias Label (100% accurate)

In [3]:
# Load the deduplicated Pythia-70M model; per the loader's log line the three
# positional arguments are the model id, the revision and the cache directory.
pythia_70m = PythiaModel(
    "EleutherAI/pythia-70m-deduped",
    "step143000",
    "./cache_dir",
)

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-70m-deduped (revision: step143000) at cache_dir: ./cache_dir


In [4]:
# Load the BiasDPO preference dataset and unpack its train / validation / test splits.
loader = DatasetLoader()
(train_ds, valid_ds, test_ds) = loader.load_biasDPO()

INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115


In [5]:
# Quick sanity checks: model and tokenizer are present, and the training split
# has the expected size and record layout.
for label, value in (
    ("Model loaded:", pythia_70m.model is not None),
    ("Tokenizer loaded:", pythia_70m.tokenizer is not None),
    ("Train dataset size:", len(train_ds)),
    ("Sample from dataset:", train_ds[0]),
):
    print(label, value)

Model loaded: True
Tokenizer loaded: True
Train dataset size: 801
Sample from dataset: {'prompt': 'What do you think about the assertion that economic disparities between races are primarily due to individual work ethic and choices?', 'chosen': 'Economic disparities between races are influenced by systemic factors, historical context, and institutional barriers, not just individual work ethic and choices. Addressing these disparities requires a multifaceted approach that considers these broader factors.', 'rejected': "There's a common belief that these disparities are a result of individual choices and work ethic, suggesting that some racial groups are less inclined to make economically beneficial decisions."}


In [6]:
# Read the experiment YAML, pick the DPO section and turn it into a TRL DPOConfig.
experiment_config = load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_70m_config']
dpo_pythia_70m_config = DPOConfig(**dpo_args)

In [7]:
# Reuse the EOS token as the padding token.
pythia_70m.tokenizer.pad_token = pythia_70m.tokenizer.eos_token

# Identity map with the datasets cache bypassed.
# NOTE(review): presumably done to hand the trainer freshly materialised splits - confirm.
train_ds = train_ds.map(lambda x: x, load_from_cache_file=False)
valid_ds = valid_ds.map(lambda x: x, load_from_cache_file=False)

# Wrap model, tokenizer and both splits in the project's DPO trainer.
dpo_trainer = DPO_Trainer(
    pythia_70m.model,
    pythia_70m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_70m_config,
)

Map: 100%|██████████| 801/801 [00:00<00:00, 10271.58 examples/s]
Map: 100%|██████████| 229/229 [00:00<00:00, 7980.85 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!


In [8]:
# Run DPO fine-tuning on the unmodified (0% flipped) BiasDPO splits loaded above.
dpo_trainer.train()

INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
10,0.8406,0.861801,0.410029,-0.066077,0.576724,0.476107,-1427.099731,-1392.139404,1549.290405,1550.035278
20,0.6219,0.720414,0.662631,-0.240051,0.646552,0.902682,-1424.573853,-1393.879028,1548.763428,1549.637573
30,0.6576,0.63641,0.857819,-0.328806,0.728448,1.186624,-1422.621948,-1394.766724,1548.364502,1549.320068
40,0.8783,0.617956,0.783569,-0.470261,0.701724,1.25383,-1423.364502,-1396.181274,1547.964478,1548.983521
50,0.6616,0.550728,1.079257,-0.522883,0.730172,1.60214,-1420.407593,-1396.707397,1547.407593,1548.52771
60,0.6617,0.553339,1.140716,-0.769771,0.760345,1.910487,-1419.793091,-1399.17627,1546.797363,1548.01123
70,0.4853,0.463534,1.345025,-0.963129,0.783621,2.308154,-1417.749878,-1401.109863,1546.077881,1547.416992
80,0.7373,0.458599,1.457919,-1.076983,0.773276,2.534902,-1416.620972,-1402.248413,1545.483398,1546.889038
90,0.497,0.387468,1.590663,-1.18345,0.837931,2.774113,-1415.293457,-1403.313232,1544.890381,1546.340942
100,0.5602,0.363454,1.544168,-1.284243,0.837931,2.828411,-1415.758545,-1404.321045,1544.522583,1546.049194


INFO - src.training.dpo_trainer - DPO training complete.


In [9]:
# Report which checkpoint the underlying trainer recorded as best, and its metric value.
trainer_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {trainer_state.best_model_checkpoint}")
print(f"Best metric: {trainer_state.best_metric}")

Best checkpoint: ./pythia-70m-deduped-DPO/checkpoint-260
Best metric: 0.1812429279088974


In [10]:
import pandas as pd

# Export the trainer's log history for the 100/0 run to CSV.
state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)

# Keep one row per 10 steps when a 'step' column exists; otherwise fall back
# to every 10th logged row.
if 'step' in df.columns:
    df_every_10 = df[df['step'] % 10 == 0]
else:
    df_every_10 = df.iloc[::10]

# Drop columns whose name starts with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# Single source of truth for the output path, so the confirmation message
# always names the file that was actually written (the old message omitted
# the "70m_" part of the filename).
output_csv = 'dpo_70m_training_logs_100_0.csv'
df_every_10[relevant_cols].to_csv(output_csv, index=False)
print(f"\nLogs saved to {output_csv}")


Logs saved to dpo_training_logs_100_0.csv


# Bias Label (80% accurate, 20% Bias)

In [11]:
# --- Model: start again from a freshly loaded Pythia-70M ---
pythia_70m = PythiaModel(
    "EleutherAI/pythia-70m-deduped",
    "step143000",
    "./cache_dir",
)

# --- Data: load BiasDPO, then inject bias at a 0.2 ratio ---
loader = DatasetLoader()
(train_ds, valid_ds, test_ds) = loader.load_biasDPO()

bias_injector = BiasInjector(loader, seed=42)
(bias_train_ds, bias_valid_ds, test_ds) = bias_injector.inject_bias(bias_ratio=0.2)

# --- Config: same YAML section as the baseline, with a run-specific output directory ---
experiment_config = load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_70m_config']
dpo_args['output_dir'] = "./pythia-70m-deduped-DPO-80-20"
dpo_pythia_70m_config = DPOConfig(**dpo_args)

# --- Trainer: identity-map the biased splits with the cache bypassed, reuse EOS as pad ---
train_ds = bias_train_ds.map(lambda x: x, load_from_cache_file=False)
valid_ds = bias_valid_ds.map(lambda x: x, load_from_cache_file=False)
pythia_70m.tokenizer.pad_token = pythia_70m.tokenizer.eos_token
dpo_trainer = DPO_Trainer(
    pythia_70m.model,
    pythia_70m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_70m_config,
)

# --- Train and report the best checkpoint ---
dpo_trainer.train()

trainer_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {trainer_state.best_model_checkpoint}")
print(f"Best metric: {trainer_state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-70m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 20.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 160/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|██████████| 801/801 [00:00<00:00, 24384.43 examples/s]
Map: 100%|██████████| 229/229 [00:00<00:00, 10995.57 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
10,1.4513,0.936157,-0.109459,-0.266699,0.550862,0.15724,-1432.2948,-1394.145508,1549.447754,1550.130371
20,1.3504,0.856937,0.188226,-0.216555,0.568103,0.40478,-1429.317993,-1393.644287,1549.031982,1549.792236
30,0.6581,0.881634,0.49714,0.02307,0.563793,0.474069,-1426.22876,-1391.247803,1548.543823,1549.322754
40,1.4749,0.794015,0.649145,-0.154966,0.619828,0.804111,-1424.708618,-1393.02832,1548.24707,1549.054688
50,1.5655,0.76028,0.684256,-0.201497,0.637069,0.885753,-1424.357544,-1393.49353,1548.005249,1548.868408
60,1.1509,0.658311,0.783269,-0.314692,0.708621,1.097961,-1423.367432,-1394.625488,1547.868652,1548.690186
70,1.1664,0.696175,0.888228,-0.218237,0.675862,1.106465,-1422.317871,-1393.660889,1547.548462,1548.434082
80,1.346,0.708779,0.856497,-0.202878,0.637069,1.059375,-1422.635254,-1393.507446,1547.32666,1548.224976
90,0.8405,0.631101,0.963175,-0.378418,0.678448,1.341592,-1421.568481,-1395.262695,1546.836182,1547.78479
100,0.9667,0.696513,1.017933,-0.400677,0.706897,1.41861,-1421.020874,-1395.485474,1546.614136,1547.64624


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-70m-deduped-DPO-80-20/checkpoint-720
Best metric: 0.29617393016815186


In [12]:
import pandas as pd

# Export the trainer's log history for the 80/20 run to CSV.
state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)

# Keep one row per 10 steps when a 'step' column exists; otherwise fall back
# to every 10th logged row.
if 'step' in df.columns:
    df_every_10 = df[df['step'] % 10 == 0]
else:
    df_every_10 = df.iloc[::10]

# Drop columns whose name starts with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# Single source of truth for the output path, so the confirmation message
# always names the file that was actually written (the old message omitted
# the "70m_" part of the filename).
output_csv = 'dpo_70m_training_logs_80_20.csv'
df_every_10[relevant_cols].to_csv(output_csv, index=False)
print(f"\nLogs saved to {output_csv}")


Logs saved to dpo_training_logs_80_20.csv


# Bias Label (50% accurate, 50% Bias)

In [13]:
# --- Model: start again from a freshly loaded Pythia-70M ---
pythia_70m = PythiaModel(
    "EleutherAI/pythia-70m-deduped",
    "step143000",
    "./cache_dir",
)

# --- Data: load BiasDPO, then inject bias at a 0.5 ratio ---
loader = DatasetLoader()
(train_ds, valid_ds, test_ds) = loader.load_biasDPO()

bias_injector = BiasInjector(loader, seed=42)
(bias_train_ds, bias_valid_ds, test_ds) = bias_injector.inject_bias(bias_ratio=0.5)

# --- Config: same YAML section as the baseline, with a run-specific output directory ---
experiment_config = load_experiment_config("../configs/pythia-70m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_70m_config']
dpo_args['output_dir'] = "./pythia-70m-deduped-DPO-50-50"
dpo_pythia_70m_config = DPOConfig(**dpo_args)

# --- Trainer: identity-map the biased splits with the cache bypassed, reuse EOS as pad ---
train_ds = bias_train_ds.map(lambda x: x, load_from_cache_file=False)
valid_ds = bias_valid_ds.map(lambda x: x, load_from_cache_file=False)
pythia_70m.tokenizer.pad_token = pythia_70m.tokenizer.eos_token
dpo_trainer = DPO_Trainer(
    pythia_70m.model,
    pythia_70m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_70m_config,
)

# --- Train and report the best checkpoint ---
dpo_trainer.train()

trainer_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {trainer_state.best_model_checkpoint}")
print(f"Best metric: {trainer_state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-70m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 50.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 400/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|██████████| 801/801 [00:00<00:00, 29578.70 examples/s]
Map: 100%|██████████| 229/229 [00:00<00:00, 13387.44 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
10,1.1626,1.246428,-0.533322,-0.145953,0.440517,-0.387369,-1436.533447,-1392.938354,1550.109131,1550.796143
20,1.0192,1.320363,-0.584111,-0.18049,0.468103,-0.403621,-1437.04126,-1393.283447,1550.184814,1550.86438
30,0.8799,1.154329,-0.241108,-0.061738,0.487931,-0.17937,-1433.611084,-1392.095947,1550.070068,1550.764282
40,0.6413,1.153396,-0.258077,-0.0815,0.457759,-0.176577,-1433.781006,-1392.293701,1549.891113,1550.565674
50,1.0302,1.160302,-0.354914,-0.109078,0.430172,-0.245836,-1434.749268,-1392.569214,1549.713989,1550.397217
60,1.0819,1.283464,-0.364977,-0.141216,0.466379,-0.223761,-1434.849976,-1392.890747,1549.654907,1550.332275
70,1.114,1.259509,-0.722138,-0.367216,0.440517,-0.354922,-1438.421509,-1395.150757,1549.610474,1550.291504
80,1.4135,1.081678,-0.569319,-0.557773,0.477586,-0.011546,-1436.893433,-1397.056274,1549.529053,1550.225098
90,1.3083,0.954912,-0.239242,-0.489315,0.539655,0.250074,-1433.592529,-1396.371582,1549.463623,1550.180176
100,0.9434,0.926386,-0.050187,-0.38383,0.543966,0.333643,-1431.701782,-1395.316895,1549.573975,1550.255005


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-70m-deduped-DPO-50-50/checkpoint-160
Best metric: 0.847788393497467


In [None]:
import pandas as pd

# Export the trainer's log history for the 50/50 run to CSV.
state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)

# Keep one row per 10 steps when a 'step' column exists; otherwise fall back
# to every 10th logged row.
if 'step' in df.columns:
    df_every_10 = df[df['step'] % 10 == 0]
else:
    df_every_10 = df.iloc[::10]

# Drop columns whose name starts with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# Single source of truth for the output path, so the confirmation message
# always names the file that was actually written (the old message omitted
# the "70m_" part of the filename).
output_csv = 'dpo_70m_training_logs_50_50.csv'
df_every_10[relevant_cols].to_csv(output_csv, index=False)
print(f"\nLogs saved to {output_csv}")


Logs saved to dpo_training_logs_50_50.csv


: 