In [10]:
import sys
import os

# Make the parent of the current working directory importable so that the
# project-local `src` package can be imported from this notebook.
project_root = os.path.abspath("..")
# Remove any existing copies first: re-running this cell must not keep
# stacking duplicate entries onto sys.path. Then insert at the front so the
# project checkout takes priority over any installed package of the same name.
sys.path[:] = [entry for entry in sys.path if entry != project_root]
sys.path.insert(0, project_root)

from src.data.dataset_loader import DatasetLoader
import numpy as np

import logging
# Logging setup specific to Jupyter Notebook: send every record at INFO or
# above to stdout so it shows up in the cell output.
# `logging.basicConfig` is deliberately not used here: any handler it
# attached would be thrown away by the clean-up loop below.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
## Detach and close handlers already on the root logger, so re-running this
## cell never produces duplicated log lines or leaves stale handlers open
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()
## Create handler that outputs to notebook
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
## Create formatter
formatter = logging.Formatter('%(levelname)s - %(name)s - %(message)s')
handler.setFormatter(formatter)
## Add handler to logger
logger.addHandler(handler)

In [2]:
# Build the project's dataset loader, then unpack the three splits returned
# by `load_biasDPO()` (train / validation / test, in that order).
loader = DatasetLoader()

biasdpo_splits = loader.load_biasDPO()
train_ds, valid_ds, test_ds = biasdpo_splits

INFO - src.data.dataset_loader - Loading BiasDPO dataset
INFO - src.data.dataset_loader - Total samples: 1145
INFO - src.data.dataset_loader - Train samples: 801
INFO - src.data.dataset_loader - Validation samples: 229
INFO - src.data.dataset_loader - Test samples: 115


In [5]:
# Report the number of examples in each split, one line per split.
split_sizes = {
    "Train": len(train_ds),
    "Valid": len(valid_ds),
    "Test": len(test_ds),
}
print("\nDataset sizes:")
for split_name, n_examples in split_sizes.items():
    print(f"  {split_name}: {n_examples}")


Dataset sizes:
  Train: 801
  Valid: 229
  Test: 115


In [17]:
def field_word_counts(dataset, field):
    """Return an array with the word count of `field` for every example.

    Words are counted with `str.split()`, i.e. by splitting on runs of
    whitespace.

    Args:
        dataset: Iterable of examples that can be indexed by `field`.
            Assumes each `example[field]` is a string -- TODO confirm
            against the dataset schema.
        field: Key of the text column to measure.

    Returns:
        A NumPy array with one word count per example, in iteration order.
    """
    return np.array([len(example[field].split()) for example in dataset])


# Word-length distributions over the full (unsplit) dataset.
prompt_lengths = field_word_counts(loader.full_dataset, 'prompt')
chosen_lengths = field_word_counts(loader.full_dataset, 'chosen')
rejected_lengths = field_word_counts(loader.full_dataset, 'rejected')

# One header line plus the mean length for each text field.
for section_title, lengths in (
    ("Prompt", prompt_lengths),
    ("Chosen responses", chosen_lengths),
    ("Rejected responses", rejected_lengths),
):
    print(f"{section_title}:")
    print(f"  Mean:   {np.mean(lengths):.2f}")

Prompt:
  Mean:   15.03
Chosen responses:
  Mean:   28.23
Rejected responses:
  Mean:   21.99
