In [None]:
import sys
import os

project_root = os.path.abspath("..")
sys.path.insert(0, project_root)

from src.models.pythia_model import PythiaModel
from src.data.dataset_loader import DatasetLoader
from src.data.bias_injector import BiasInjector
from src.training.dpo_trainer import DPO_Trainer
from src.training.utils import load_experiment_config
import numpy as np
from trl import DPOConfig

import logging
logging.basicConfig(level=logging.INFO)

# Notebook-specific logging: route every record through a single stdout
# handler so messages appear in the cell output with a compact format.
formatter = logging.Formatter('%(levelname)s - %(name)s - %(message)s')

handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)

# Configure the root logger: discard whatever handlers are already attached
# (including the one basicConfig may have installed) and attach only ours.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = []
logger.addHandler(handler)

import wandb

# Hyperparameters recorded with the W&B run. Kept in one dict so the run name
# below is derived from the same values that are logged as config.
run_config = {
    "beta": 0.1,
    "training_data_size": 801,
    "bias_level": 0,
    "model": "pythia-31m",
    "learning_rate": 5e-6,
}

# Start the W&B run for this notebook.
# The name was previously hard-coded as "..._bias20_..." while the config
# logged bias_level=0; building it from run_config keeps the two in agreement.
# NOTE(review): this assumes bias_level=0 is the intended value (the first
# experiment below is the unbiased one) - confirm.
wandb.init(
    project="l101",
    name=f"dpo_beta{run_config['beta']}_bias{run_config['bias_level']}_run1",
    config=run_config,
)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Currently logged in as: [33mngnwy289[0m ([33mngnwy289-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


In [2]:
# Restrict this kernel to a single physical GPU. CUDA reads this variable when
# it is first initialised in the process, so run this cell before any GPU work.
VISIBLE_GPU = "5"
os.environ["CUDA_VISIBLE_DEVICES"] = VISIBLE_GPU

# Bias Label (100% accurate)

In [None]:
# Load the Pythia checkpoint that is later handed to the DPO trainer.
MODEL_NAME = "EleutherAI/pythia-31m-deduped"
MODEL_REVISION = "step143000"
MODEL_CACHE_DIR = "./cache_dir"
pythia_31m = PythiaModel(MODEL_NAME, MODEL_REVISION, MODEL_CACHE_DIR)

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-70m-deduped (revision: step143000) at cache_dir: ./cache_dir


In [4]:
# Load the BiasDPO dataset and unpack its three splits.
loader = DatasetLoader()
biasdpo_splits = loader.load_biasDPO()
train_ds, valid_ds, test_ds = biasdpo_splits

INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115


In [None]:
# Quick sanity check: the model and tokenizer exist and the training split
# has the expected size and record layout.
print(f"Model loaded: {pythia_31m.model is not None}")
print(f"Tokenizer loaded: {pythia_31m.tokenizer is not None}")
print(f"Train dataset size: {len(train_ds)}")
print(f"Sample from dataset: {train_ds[0]}")

Model loaded: True
Tokenizer loaded: True
Train dataset size: 801
Sample from dataset: {'prompt': 'What do you think about the assertion that economic disparities between races are primarily due to individual work ethic and choices?', 'chosen': 'Economic disparities between races are influenced by systemic factors, historical context, and institutional barriers, not just individual work ethic and choices. Addressing these disparities requires a multifaceted approach that considers these broader factors.', 'rejected': "There's a common belief that these disparities are a result of individual choices and work ethic, suggesting that some racial groups are less inclined to make economically beneficial decisions."}


In [None]:
# Read the DPO hyperparameters from the experiment YAML and turn them into a
# TRL DPOConfig.
experiment_config = load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_31m_config']
dpo_pythia_31m_config = DPOConfig(**dpo_args)

In [None]:
# Pass both splits through an identity map with cache loading disabled, so
# the datasets are rebuilt rather than read back from an earlier cache file.
no_cache = {"load_from_cache_file": False}
train_ds = train_ds.map(lambda x: x, **no_cache)
valid_ds = valid_ds.map(lambda x: x, **no_cache)

# Wire model, tokenizer, data and config into the project's DPO trainer wrapper.
dpo_trainer = DPO_Trainer(
    pythia_31m.model,
    pythia_31m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_31m_config,
)

Map: 100%|██████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 14568.92 examples/s]
Map: 100%|██████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 11823.23 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!


In [8]:
# Run DPO training for the unbiased (100% accurate labels) experiment using
# the trainer built in the previous cell.
dpo_trainer.train()

INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
10,0.7382,0.670924,0.511626,-0.267158,0.653448,0.778784,-1426.084106,-1394.150146,1549.563721,1550.285645
20,0.7761,0.65548,0.647861,-0.301708,0.70431,0.949569,-1424.721558,-1394.495728,1549.133789,1549.986206
30,0.7391,0.586942,0.926794,-0.435074,0.697414,1.361867,-1421.932251,-1395.829346,1548.532959,1549.500244
40,0.429,0.571364,0.834376,-0.698419,0.760345,1.532794,-1422.856445,-1398.462769,1548.187988,1549.245728
50,0.6012,0.510746,0.891129,-0.982653,0.764655,1.873782,-1422.288818,-1401.305176,1547.827393,1548.925903
60,0.9897,0.462328,1.084011,-1.02133,0.777586,2.105341,-1420.360107,-1401.691895,1547.477295,1548.632446
70,0.2325,0.450406,1.003111,-1.292595,0.806035,2.295705,-1421.169067,-1404.404663,1547.120361,1548.350342
80,1.065,0.413733,1.047098,-1.532702,0.818965,2.5798,-1420.729126,-1406.805542,1546.769531,1548.022461
90,0.4872,0.363951,1.189052,-1.731176,0.853448,2.920228,-1419.30957,-1408.790283,1546.18103,1547.496582
100,0.3364,0.3587,0.882992,-2.012658,0.835345,2.89565,-1422.370117,-1411.605225,1545.408447,1546.872437


INFO - src.training.dpo_trainer - DPO training complete.


In [9]:
# Report which checkpoint the trainer recorded as best, and its metric value.
trainer_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {trainer_state.best_model_checkpoint}")
print(f"Best metric: {trainer_state.best_metric}")

Best checkpoint: ./pythia-70m-deduped-DPO/checkpoint-390
Best metric: 0.160310298204422


In [None]:
import pandas as pd

# Export the trainer's log history for the 100/0 run to CSV.
# A single path variable feeds both the write and the confirmation message;
# previously the message named a different file than the one written.
log_csv_path = 'dpo_31m_training_logs_100_0.csv'

state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep entries logged at steps that are multiples of 10; if the history has no
# 'step' column, fall back to every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Drop columns whose names start with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_100_0.csv


# Bias Label (80% accurate, 20% Bias)

In [None]:
# 80/20 experiment: reload the model and data, flip 20% of the examples via
# the bias injector, then train and report the best checkpoint.
pythia_31m = PythiaModel("EleutherAI/pythia-31m-deduped", "step143000", "./cache_dir")

loader = DatasetLoader()
train_ds, valid_ds, test_ds = loader.load_biasDPO()

# Fixed seed - presumably so the same examples are flipped on every run.
bias_injector = BiasInjector(loader, seed=42)
biased_splits = bias_injector.inject_bias(bias_ratio=0.2)
bias_train_ds, bias_valid_ds, test_ds = biased_splits

experiment_config = load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_31m_config']
# Give this run its own checkpoint directory.
dpo_args['output_dir'] = "./pythia-31m-deduped-DPO-80-20"
dpo_pythia_31m_config = DPOConfig(**dpo_args)

# Identity map with cache loading disabled, so the biased splits are rebuilt
# rather than read back from an earlier cache file.
no_cache = {"load_from_cache_file": False}
train_ds = bias_train_ds.map(lambda x: x, **no_cache)
valid_ds = bias_valid_ds.map(lambda x: x, **no_cache)

dpo_trainer = DPO_Trainer(
    pythia_31m.model,
    pythia_31m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_31m_config,
)

dpo_trainer.train()

trainer_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {trainer_state.best_model_checkpoint}")
print(f"Best metric: {trainer_state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-70m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 20.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 160/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|██████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 29816.27 examples/s]
Map: 100%|██████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 15131.40 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
10,1.0184,0.952154,-0.207576,-0.388851,0.556897,0.181275,-1433.275879,-1395.367188,1549.980591,1550.719727
20,1.3532,0.782857,-0.201315,-0.682143,0.576724,0.480829,-1433.213379,-1398.299927,1549.906006,1550.685669
30,1.1619,0.728793,0.173021,-0.621324,0.65431,0.794345,-1429.469971,-1397.691895,1549.488403,1550.354248
40,0.7799,0.645864,0.356706,-0.584882,0.701724,0.941588,-1427.633057,-1397.327271,1549.187256,1549.989014
50,0.9816,0.669903,0.377688,-0.668258,0.693103,1.045946,-1427.423218,-1398.161133,1549.019165,1549.871216
60,1.1508,0.60098,0.506312,-0.719959,0.721552,1.226271,-1426.136963,-1398.678345,1548.898193,1549.794189
70,1.0243,0.599327,0.501168,-0.782795,0.727586,1.283962,-1426.188477,-1399.306519,1548.841064,1549.73938
80,0.9651,0.507447,0.671724,-0.909995,0.756034,1.581719,-1424.48291,-1400.578491,1548.605225,1549.572144
90,0.731,0.513206,0.591827,-1.096004,0.747414,1.687831,-1425.281738,-1402.438721,1548.267212,1549.272461
100,0.7507,0.541186,0.556059,-1.084973,0.717241,1.641032,-1425.639404,-1402.328369,1547.939941,1548.96521


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-70m-deduped-DPO-80-20/checkpoint-830
Best metric: 0.31215763092041016


In [None]:
import pandas as pd

# Export the trainer's log history for the 80/20 run to CSV.
# A single path variable feeds both the write and the confirmation message;
# previously the message named a different file than the one written.
log_csv_path = 'dpo_31m_training_logs_80_20.csv'

state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep entries logged at steps that are multiples of 10; if the history has no
# 'step' column, fall back to every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Drop columns whose names start with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_80_20.csv


# Bias Label (50% accurate, 50% Bias)

In [None]:
# 50/50 experiment: reload the model and data, flip 50% of the examples via
# the bias injector, then train and report the best checkpoint.
pythia_31m = PythiaModel("EleutherAI/pythia-31m-deduped", "step143000", "./cache_dir")

loader = DatasetLoader()
train_ds, valid_ds, test_ds = loader.load_biasDPO()

# Fixed seed - presumably so the same examples are flipped on every run.
bias_injector = BiasInjector(loader, seed=42)
biased_splits = bias_injector.inject_bias(bias_ratio=0.5)
bias_train_ds, bias_valid_ds, test_ds = biased_splits

experiment_config = load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_31m_config']
# Give this run its own checkpoint directory.
dpo_args['output_dir'] = "./pythia-31m-deduped-DPO-50-50"
dpo_pythia_31m_config = DPOConfig(**dpo_args)

# Identity map with cache loading disabled, so the biased splits are rebuilt
# rather than read back from an earlier cache file.
no_cache = {"load_from_cache_file": False}
train_ds = bias_train_ds.map(lambda x: x, **no_cache)
valid_ds = bias_valid_ds.map(lambda x: x, **no_cache)

dpo_trainer = DPO_Trainer(
    pythia_31m.model,
    pythia_31m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_31m_config,
)

dpo_trainer.train()

trainer_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {trainer_state.best_model_checkpoint}")
print(f"Best metric: {trainer_state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-70m-deduped (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 50.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 400/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|██████████████████████████████████████████████████████████| 801/801 [00:00<00:00, 23859.20 examples/s]
Map: 100%|██████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 11026.49 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...





INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
10,0.925,1.2184,-0.41324,-0.143369,0.45431,-0.269871,-1435.33252,-1392.912231,1550.192139,1550.838379
20,1.0966,1.100209,-0.207511,-0.032463,0.475,-0.175048,-1433.275269,-1391.803101,1549.983276,1550.631958
30,1.3222,1.059029,-0.284717,-0.160249,0.463793,-0.124468,-1434.047363,-1393.080933,1549.789917,1550.484741
40,1.0665,1.127598,-0.358311,-0.241299,0.47931,-0.117012,-1434.783325,-1393.891602,1549.606445,1550.304199
50,1.2122,1.099751,-0.132172,-0.163625,0.471552,0.031453,-1432.521851,-1393.114868,1549.478638,1550.221924
60,0.9161,1.004192,-0.066025,-0.301821,0.525,0.235796,-1431.860352,-1394.496826,1549.213379,1549.929443
70,0.4878,0.995864,-0.220833,-0.513288,0.549138,0.292455,-1433.408569,-1396.61145,1549.140869,1549.905884
80,1.386,1.027239,-0.220388,-0.496296,0.537069,0.275908,-1433.403931,-1396.44165,1549.106567,1549.958008
90,1.3682,1.069754,-0.293003,-0.456401,0.518103,0.163398,-1434.130249,-1396.04248,1549.331909,1550.096069
100,0.9374,1.074325,-0.346088,-0.528522,0.509483,0.182434,-1434.661011,-1396.763794,1549.38501,1550.201294


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-70m-deduped-DPO-50-50/checkpoint-210
Best metric: 0.8114463686943054


In [None]:
import pandas as pd

# Export the trainer's log history for the 50/50 run to CSV.
# A single path variable feeds both the write and the confirmation message;
# previously the message named a different file than the one written.
log_csv_path = 'dpo_31m_training_logs_50_50.csv'

state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep entries logged at steps that are multiples of 10; if the history has no
# 'step' column, fall back to every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Drop columns whose names start with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

df_every_10[relevant_cols].to_csv(log_csv_path, index=False)
print(f"\nLogs saved to {log_csv_path}")


Logs saved to dpo_training_logs_50_50.csv
