In [1]:
!pip install transformers pandas tqdm

import json
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer




In [2]:
import pandas as pd

# Relative paths of the preference splits inside the nvidia/HelpSteer3 dataset repo.
# Accessing the dataset may require logging in first, e.g. `huggingface-cli login`.
splits = dict(
    train='preference/train.jsonl.gz',
    validation='preference/validation.jsonl.gz',
)

# Read the training split as JSON Lines, then keep only the rows whose domain is
# neither "code" nor "multilingual".
train_path = "hf://datasets/nvidia/HelpSteer3/" + splits["train"]
df = pd.read_json(train_path, lines=True)
excluded_domain = df["domain"].isin(["code", "multilingual"])
df = df[~excluded_domain]


In [3]:
# Extract reasoning from individual preferences
def extract_reasoning(entry):
  """Collect the ``"reasoning"`` value of every individual preference in ``entry``.

  Args:
    entry: Mapping-like row (e.g. a dict or a pandas Series) whose
      ``"individual_preference"`` value is an iterable of mappings, each
      holding a ``"reasoning"`` key.

  Returns:
    list: The ``"reasoning"`` values, in the same order as the preferences.

  Raises:
    KeyError: If ``"individual_preference"`` or a ``"reasoning"`` key is missing.
  """
  # Iterate over the items directly instead of indexing through range(len(...)).
  return [preference["reasoning"] for preference in entry["individual_preference"]]

def extract_context(entry):
  """Collect the ``"content"`` value of every item in ``entry["context"]``.

  Args:
    entry: Mapping-like row (e.g. a dict or a pandas Series) whose
      ``"context"`` value is an iterable of mappings, each holding a
      ``"content"`` key.

  Returns:
    list: The ``"content"`` values, in the same order as the context items.

  Raises:
    KeyError: If ``"context"`` or a ``"content"`` key is missing.
  """
  # Iterate over the items directly instead of indexing through range(len(...)).
  return [turn["content"] for turn in entry["context"]]

# Build one list per row for each derived column by running the extractors row-wise.
df["reasoning_list"] = df.apply(extract_reasoning, axis=1)
df["context_list"] = df.apply(extract_context, axis=1)

# Show both derived lists for the second row as a quick sanity check.
sample_row = df.iloc[1]
print("\nExtracted Reasoning Samples:\n", sample_row["reasoning_list"])
print("\nExtracted Context Samples:\n", sample_row["context_list"])



Extracted Reasoning Samples:
 ['@Response 2 is better than @Response 1. @Response 1 misses one of the prompt instructions. @Response 2 follows all instructions and is a well-written response.', '@Response 2 is better than @Response 1. @Response 2 follows the instruction more closely; it is missing one instruction, while @Response 1 is missing two.', "@Response 2 is better than @Response 1 as it provides a more complete and satisfactory story that fits all of the user's detailed requests within the narrative."]

Extracted Context Samples:
 ['please write a detailed and comedic fictional Hades story about Zagreus waking up in the morning and going to a river for a bath. He goes au naturel (compliment his butt, face, physique) and enters the water. Describe how he looks as he happily scrubs himself clean. Suddenly he feels a fart coming on. describe how he looks as he relieves his gut pain by forcing out a fart into the water, enjoying the relief. The water around him now smells so foul 

In [4]:
# Load the tokenizer of the "distilbert-base-uncased" checkpoint, asking for the fast implementation.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast=True)

def tokenize_text(texts, max_length=512):
  """Tokenize ``texts`` with the module-level ``tokenizer``.

  The call enables truncation, pads with the ``"max_length"`` strategy and
  requests PyTorch (``"pt"``) tensors.

  Args:
    texts: Input forwarded unchanged to ``tokenizer`` as its first argument.
    max_length: Length limit forwarded to the tokenizer for both truncation
      and padding. Defaults to 512, the value that was previously hard-coded.

  Returns:
    Whatever ``tokenizer(...)`` returns for these arguments.
  """
  return tokenizer(
    texts,
    truncation=True,
    padding="max_length",
    max_length=max_length,
    return_tensors="pt",
  )


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [5]:
# Upper bound on the number of rows that get tokenized; df.head() simply returns
# fewer rows when the frame is shorter than this.
MAX_ROWS = 5000

df_subset = df.head(MAX_ROWS)

# One list per output field; position i in every list belongs to row i of df_subset.
tokenized_data = {
    "contexts": [],
    "response1": [],
    "response2": [],
    "overall_preference": [],
    "reasonings": []
}

# Walk the needed columns in lockstep instead of df.iterrows(), which builds a
# new Series object for every row.
row_fields = zip(
    df_subset["context_list"],
    df_subset["reasoning_list"],
    df_subset["response1"],
    df_subset["response2"],
    df_subset["overall_preference"],
)

for contexts, reasonings, response1, response2, preference in tqdm(row_fields, total=len(df_subset)):
  # Every context item and every reasoning string is tokenized on its own,
  # so these two fields hold a list of tokenizer outputs per row.
  tokenized_data["contexts"].append([tokenize_text(ctx) for ctx in contexts])
  tokenized_data["response1"].append(tokenize_text(response1))
  tokenized_data["response2"].append(tokenize_text(response2))
  # The preference value is stored as-is, without tokenization.
  tokenized_data["overall_preference"].append(preference)
  tokenized_data["reasonings"].append([tokenize_text(reason) for reason in reasonings])

# Convert to DataFrame
tokenized_df = pd.DataFrame(tokenized_data)

# Show sample
tokenized_df.head()


100%|██████████| 5000/5000 [01:14<00:00, 66.84it/s] 


Unnamed: 0,contexts,response1,response2,overall_preference,reasonings
0,"[[input_ids, attention_mask], [input_ids, atte...","[input_ids, attention_mask]","[input_ids, attention_mask]",2,"[[input_ids, attention_mask], [input_ids, atte..."
1,"[[input_ids, attention_mask]]","[input_ids, attention_mask]","[input_ids, attention_mask]",2,"[[input_ids, attention_mask], [input_ids, atte..."
2,"[[input_ids, attention_mask]]","[input_ids, attention_mask]","[input_ids, attention_mask]",2,"[[input_ids, attention_mask], [input_ids, atte..."
3,"[[input_ids, attention_mask]]","[input_ids, attention_mask]","[input_ids, attention_mask]",-3,"[[input_ids, attention_mask], [input_ids, atte..."
4,"[[input_ids, attention_mask], [input_ids, atte...","[input_ids, attention_mask]","[input_ids, attention_mask]",2,"[[input_ids, attention_mask], [input_ids, atte..."


In [6]:
import torch

# Serialize the complete tokenized_data dictionary into a single file; the
# relative path is resolved against the current working directory.
output_path = "tokenized_helpsteer3.pt"
torch.save(tokenized_data, output_path)
