In [1]:
import sys
import os

# Put the parent directory of the notebook's working directory at the front
# of the import path so that the `src.*` packages resolve.
project_root = os.path.abspath(os.pardir)
sys.path.insert(0, project_root)

from src.models.pythia_model import PythiaModel
from src.data.dataset_loader import DatasetLoader
from src.data.bias_injector import BiasInjector
from src.training.dpo_trainer import DPO_Trainer
from src.training.utils import load_experiment_config
import numpy as np
from trl import DPOConfig

import logging
# Code specific to Jupyter Notebook: send every log record to the notebook's
# stdout using a compact "LEVEL - logger name - message" layout.
## Create formatter
formatter = logging.Formatter('%(levelname)s - %(name)s - %(message)s')
## Create handler that outputs to notebook
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)
## Install it as the only root handler. `force=True` removes and closes any
## handlers already attached to the root logger, so re-running this cell never
## stacks duplicate handlers or leaves old ones open.
logging.basicConfig(level=logging.INFO, handlers=[handler], force=True)
logger = logging.getLogger()

import wandb

# Hyperparameters recorded with the Weights & Biases run. The run name is
# derived from these values so the name and the logged config cannot disagree
# (the name previously said "bias20" while bias_level was 0).
wandb_run_config = {
    "beta": 0.1,
    "training_data_size": 801,
    "bias_level": 0,
    "model": "pythia-31m",
    "learning_rate": 5e-6,
}

wandb.init(
    project="l101",
    name=f"dpo_beta{wandb_run_config['beta']}_bias{wandb_run_config['bias_level']}_run1",
    config=wandb_run_config,
)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Currently logged in as: [33mngnwy289[0m ([33mngnwy289-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


In [2]:
# Restrict this process to GPU index 5.
# NOTE(review): presumably this has to run before anything initialises CUDA
# for it to take effect — confirm, since the model libraries are imported above.
os.environ.update(CUDA_VISIBLE_DEVICES="5")

# Bias Label (100% accurate, 0% Bias)

In [3]:
# Load Pythia-31M at revision "step143000", using ./cache_dir as the cache.
pythia_31m = PythiaModel(
    "EleutherAI/pythia-31m",
    "step143000",
    "./cache_dir",
)

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-31m (revision: step143000) at cache_dir: ./cache_dir


In [4]:
# Load the BiasDPO dataset and unpack its train / validation / test splits.
loader = DatasetLoader()
biasdpo_splits = loader.load_biasDPO()
train_ds, valid_ds, test_ds = biasdpo_splits

INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115


In [5]:
# Quick sanity check: the model and tokenizer exist and the training split
# has the expected size and record layout.
sanity_report = [
    ("Model loaded:", pythia_31m.model is not None),
    ("Tokenizer loaded:", pythia_31m.tokenizer is not None),
    ("Train dataset size:", len(train_ds)),
    ("Sample from dataset:", train_ds[0]),
]
for label, value in sanity_report:
    print(label, value)

Model loaded: True
Tokenizer loaded: True
Train dataset size: 801
Sample from dataset: {'prompt': 'What do you think about the assertion that economic disparities between races are primarily due to individual work ethic and choices?', 'chosen': 'Economic disparities between races are influenced by systemic factors, historical context, and institutional barriers, not just individual work ethic and choices. Addressing these disparities requires a multifaceted approach that considers these broader factors.', 'rejected': "There's a common belief that these disparities are a result of individual choices and work ethic, suggesting that some racial groups are less inclined to make economically beneficial decisions."}


In [6]:
# Read the experiment YAML and build the DPO training configuration from its
# 'dpo_pythia_31m_config' section.
experiment_config = load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_31m_config']
dpo_pythia_31m_config = DPOConfig(**dpo_args)

In [7]:
# Pass both splits through an identity map with cache loading disabled, so
# the datasets handed to the trainer are rebuilt rather than read from cache.
refreshed_splits = [
    split.map(lambda example: example, load_from_cache_file=False)
    for split in (train_ds, valid_ds)
]
train_ds, valid_ds = refreshed_splits

# Wire the model, tokenizer, data and DPO config into the project's trainer.
dpo_trainer = DPO_Trainer(
    pythia_31m.model,
    pythia_31m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_31m_config,
)

Map: 100%|██████████| 801/801 [00:00<00:00, 13739.72 examples/s]
Map: 100%|██████████| 229/229 [00:00<00:00, 13239.81 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...



Applying chat template to train dataset: 100%|██████████| 801/801 [00:00<00:00, 16844.17 examples/s]
Tokenizing train dataset: 100%|██████████| 801/801 [00:00<00:00, 2194.36 examples/s]
Applying chat template to eval dataset: 100%|██████████| 229/229 [00:00<00:00, 12088.55 examples/s]
Tokenizing eval dataset: 100%|██████████| 229/229 [00:00<00:00, 2105.56 examples/s]


INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!


In [8]:
# Run DPO training with the trainer constructed in the previous cell.
dpo_trainer.train()

INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
10,0.5572,0.782293,0.198106,-0.490591,0.665517,0.688698,-213.133835,-179.039749,1975.21875,1975.857178
20,1.1117,0.752036,-0.052788,-0.869461,0.64569,0.816673,-215.642776,-182.828461,1974.963013,1975.581299
30,0.6493,0.787487,-0.172282,-0.990885,0.669828,0.818603,-216.837738,-184.042694,1974.709229,1975.399902
40,0.8711,0.67814,0.286314,-1.024609,0.72069,1.310922,-212.25177,-184.379929,1974.620239,1975.358154
50,1.0126,0.634758,0.357481,-1.11388,0.756034,1.471361,-211.5401,-185.272629,1974.32019,1975.105591
60,0.6077,0.588454,0.50286,-1.202891,0.727586,1.705751,-210.086319,-186.162766,1973.924683,1974.758545
70,0.7605,0.518483,0.489153,-1.457805,0.764655,1.946957,-210.223373,-188.711899,1973.571899,1974.428101
80,0.6302,0.599332,0.350566,-1.549747,0.764655,1.900313,-211.609238,-189.631302,1973.365723,1974.29126
90,0.7359,0.545645,0.391439,-1.632763,0.747414,2.024201,-211.200516,-190.461472,1972.950317,1973.913574
100,0.2815,0.531334,0.281852,-1.895387,0.790517,2.177239,-212.296387,-193.087708,1972.55896,1973.54126


INFO - src.training.dpo_trainer - DPO training complete.


In [9]:
# Report which checkpoint the trainer recorded as best, and its metric value.
trainer_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {trainer_state.best_model_checkpoint}")
print(f"Best metric: {trainer_state.best_metric}")

Best checkpoint: ./pythia-31m-DPO/checkpoint-520
Best metric: 0.18518182635307312


In [10]:
import pandas as pd

# Export the trainer's log history for the 0%-bias run to CSV.
state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep rows logged on multiples of 10 steps; if there is no 'step' column,
# fall back to every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Drop columns whose names start with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# One variable for the path, so the confirmation message always names the
# file that was actually written (the message used to omit the "31m_" part).
logs_csv_path = 'dpo_31m_training_logs_100_0.csv'
df_every_10[relevant_cols].to_csv(logs_csv_path, index=False)
print(f"\nLogs saved to {logs_csv_path}")


Logs saved to dpo_training_logs_100_0.csv


# Bias Label (80% accurate, 20% Bias)

In [11]:
# --- 80/20 run: reload the model and train on data with bias_ratio = 0.2 ---
pythia_31m = PythiaModel(
    "EleutherAI/pythia-31m",
    "step143000",
    "./cache_dir",
)

# Load the clean splits, then let the injector produce the biased versions.
loader = DatasetLoader()
train_ds, valid_ds, test_ds = loader.load_biasDPO()

bias_injector = BiasInjector(loader, seed=42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio=0.2)

# Same YAML section as the baseline run, with a run-specific output directory.
experiment_config = load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_31m_config']
dpo_args['output_dir'] = "./pythia-31m-DPO-80-20"
dpo_pythia_31m_config = DPOConfig(**dpo_args)

# Identity map with cache loading disabled, applied to the biased splits.
train_ds = bias_train_ds.map(lambda example: example, load_from_cache_file=False)
valid_ds = bias_valid_ds.map(lambda example: example, load_from_cache_file=False)

dpo_trainer = DPO_Trainer(
    pythia_31m.model,
    pythia_31m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_31m_config,
)
dpo_trainer.train()

trainer_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {trainer_state.best_model_checkpoint}")
print(f"Best metric: {trainer_state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-31m (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 20.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 160/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|██████████| 801/801 [00:00<00:00, 29489.39 examples/s]
Map: 100%|██████████| 229/229 [00:00<00:00, 16851.39 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...



Applying chat template to train dataset: 100%|██████████| 801/801 [00:00<00:00, 12681.18 examples/s]
Tokenizing train dataset: 100%|██████████| 801/801 [00:00<00:00, 1709.24 examples/s]
Applying chat template to eval dataset: 100%|██████████| 229/229 [00:00<00:00, 7331.92 examples/s]
Tokenizing eval dataset: 100%|██████████| 229/229 [00:00<00:00, 2039.33 examples/s]


INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
10,0.5455,1.086494,-0.723606,-0.847624,0.57069,0.124018,-222.350967,-182.610062,1975.744873,1976.298706
20,1.0411,1.013354,-0.455043,-0.802773,0.594828,0.34773,-219.665329,-182.161591,1975.390991,1975.976807
30,0.9171,0.898615,-0.262184,-0.845374,0.599138,0.583189,-217.736755,-182.58757,1975.037354,1975.658569
40,1.0724,0.81742,-0.1181,-0.793934,0.647414,0.675835,-216.295914,-182.073196,1974.728149,1975.326782
50,1.1399,0.785163,0.035096,-0.916682,0.618103,0.951778,-214.763962,-183.300659,1974.269043,1974.98938
60,1.0462,0.757334,-0.003664,-1.048219,0.674138,1.044555,-215.15155,-184.616043,1974.070312,1974.838501
70,0.6698,0.760567,0.101148,-1.12281,0.669828,1.223958,-214.103409,-185.361954,1973.639648,1974.463501
80,0.969,0.791451,-0.099077,-1.183823,0.619828,1.084745,-216.105682,-185.972061,1973.266113,1974.129395
90,1.3768,0.68714,0.076254,-1.35938,0.682759,1.435634,-214.352356,-187.727646,1973.018188,1973.918213
100,0.8246,0.672581,0.032037,-1.432572,0.706034,1.464609,-214.79454,-188.459564,1972.575806,1973.514404


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-31m-DPO-80-20/checkpoint-640
Best metric: 0.4082452952861786


In [12]:
import pandas as pd

# Export the trainer's log history for the 80/20 run to CSV.
state = dpo_trainer.trainer.state
logs = state.log_history
df = pd.DataFrame(logs)

# Keep rows logged on multiples of 10 steps; without a 'step' column, take
# every 10th row instead.
if 'step' in df.columns:
    on_tenth_step = (df['step'] % 10) == 0
    df_every_10 = df[on_tenth_step]
else:
    df_every_10 = df.iloc[::10]

# Drop columns whose names start with an underscore.
relevant_cols = [name for name in df_every_10.columns if not name.startswith('_')]

df_every_10[relevant_cols].to_csv('dpo_31m_training_logs_80_20.csv', index=False)
print("\nLogs saved to dpo_31m_training_logs_80_20.csv")


Logs saved to dpo_31m_training_logs_80_20.csv


# Bias Label (50% accurate, 50% Bias)

In [13]:
# --- 50/50 run: reload the model and train on data with bias_ratio = 0.5 ---
pythia_31m = PythiaModel(
    "EleutherAI/pythia-31m",
    "step143000",
    "./cache_dir",
)

# Load the clean splits, then let the injector produce the biased versions.
loader = DatasetLoader()
train_ds, valid_ds, test_ds = loader.load_biasDPO()

bias_injector = BiasInjector(loader, seed=42)
bias_train_ds, bias_valid_ds, test_ds = bias_injector.inject_bias(bias_ratio=0.5)

# Same YAML section as the baseline run, with a run-specific output directory.
experiment_config = load_experiment_config("../configs/pythia-31m-rlhf-dpo.yaml")
dpo_args = experiment_config['dpo_pythia_31m_config']
dpo_args['output_dir'] = "./pythia-31m-DPO-50-50"
dpo_pythia_31m_config = DPOConfig(**dpo_args)

# Identity map with cache loading disabled, applied to the biased splits.
train_ds = bias_train_ds.map(lambda example: example, load_from_cache_file=False)
valid_ds = bias_valid_ds.map(lambda example: example, load_from_cache_file=False)

dpo_trainer = DPO_Trainer(
    pythia_31m.model,
    pythia_31m.tokenizer,
    train_ds,
    valid_ds,
    args=dpo_pythia_31m_config,
)
dpo_trainer.train()

trainer_state = dpo_trainer.trainer.state
print(f"Best checkpoint: {trainer_state.best_model_checkpoint}")
print(f"Best metric: {trainer_state.best_metric}")

INFO - src.models.pythia_model - Loading model EleutherAI/pythia-31m (revision: step143000) at cache_dir: ./cache_dir
INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115
INFO - src.data.bias_injector - Injecting 50.0% bias:
INFO - src.data.bias_injector -   - Train: flipping 400/801 examples
INFO - src.data.bias_injector - Bias injection complete


Map: 100%|██████████| 801/801 [00:00<00:00, 24423.96 examples/s]
Map: 100%|██████████| 229/229 [00:00<00:00, 11607.06 examples/s]

INFO - src.training.dpo_trainer - Initializing DPOTrainer...



Applying chat template to train dataset: 100%|██████████| 801/801 [00:00<00:00, 17978.86 examples/s]
Tokenizing train dataset: 100%|██████████| 801/801 [00:00<00:00, 2175.85 examples/s]


INFO - src.training.dpo_trainer - DPOTrainer initialized successfully!
INFO - src.training.dpo_trainer - Starting DPO training...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
10,0.9512,1.352689,-1.198194,-0.681303,0.418103,-0.516891,-227.096848,-180.946869,1976.122681,1976.639526
20,1.0357,1.341466,-1.188855,-0.715299,0.466379,-0.473557,-227.003464,-181.286804,1975.964966,1976.536255
30,0.9934,1.223235,-1.140342,-0.804353,0.441379,-0.335989,-226.518341,-182.177353,1976.018555,1976.598633
40,0.9665,1.275943,-1.130923,-0.753959,0.42069,-0.376964,-226.424149,-181.673447,1975.890503,1976.471924
50,1.2711,1.295228,-1.035904,-0.712528,0.451724,-0.323376,-225.473953,-181.259125,1975.658813,1976.249512
60,0.6912,1.158666,-0.776302,-0.718455,0.481897,-0.057847,-222.87796,-181.31839,1975.359619,1975.984131
70,1.3996,1.300559,-0.968245,-0.704894,0.47931,-0.263351,-224.797363,-181.182785,1975.256226,1975.844482
80,1.2603,1.276586,-0.919712,-0.69741,0.510345,-0.222302,-224.312027,-181.107925,1975.027466,1975.665649
90,0.8782,1.214942,-0.612611,-0.45346,0.477586,-0.159151,-221.241013,-178.668457,1974.72583,1975.439819
100,1.2765,1.211702,-0.58661,-0.507094,0.507759,-0.079515,-220.981003,-179.204773,1974.523926,1975.264648


INFO - src.training.dpo_trainer - DPO training complete.
Best checkpoint: ./pythia-31m-DPO-50-50/checkpoint-430
Best metric: 0.9501743316650391


In [14]:
import pandas as pd

# Export the trainer's log history for the 50/50 run to CSV.
state = dpo_trainer.trainer.state
logs = state.log_history

df = pd.DataFrame(logs)
# Keep rows logged on multiples of 10 steps; if there is no 'step' column,
# fall back to every 10th row.
df_every_10 = df[df['step'] % 10 == 0] if 'step' in df.columns else df.iloc[::10]
# Drop columns whose names start with an underscore.
relevant_cols = [col for col in df_every_10.columns if not col.startswith('_')]

# One variable for the path, so the confirmation message always names the
# file that was actually written (the message used to omit the "31m_" part).
logs_csv_path = 'dpo_31m_training_logs_50_50.csv'
df_every_10[relevant_cols].to_csv(logs_csv_path, index=False)
print(f"\nLogs saved to {logs_csv_path}")


Logs saved to dpo_training_logs_50_50.csv
