# Preprocessing the HelpSteer dataset for training

### Dataset Loading and Analysis:

Reads the JSON Lines file "cleaned_helpsteer_subset.json" (one record per line) and summarizes it:

1.   Loads data from "cleaned_helpsteer_subset.json"
2.   Analyzes preference distributions to understand how often response 1 or 2 is preferred
3. Computes a reasoning score for each example by averaging the `score` values of its individual annotations (0 when no annotation has a score)


### Tokenization and Formatting:


1.   Uses DistilBERT tokenizer to prepare the text data
2.   Structures context with proper separators
3. Tokenizes contexts with 512 max tokens and responses with 384 max tokens
4. Converts preference values to labels (0 when response 1 is preferred, 1 when response 2 is preferred); ties (preference 0) are stored as 0.5, so the labels are not strictly binary


### Train/Validation Split:

Creates a stratified 80/20 split to ensure balanced distribution of preferences

Saves the tokenized data ("tokenized_helpsteer_improved.pt") and the train/validation indices ("helpsteer_indices.pt") to disk for later use

In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import AutoTokenizer

# Fast DistilBERT tokenizer; the tokenization helpers below read this global.
TOKENIZER_CHECKPOINT = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT, use_fast=True)

# Read the preference data. lines=True makes pandas parse the file as
# JSON Lines (one JSON record per line) rather than a single JSON document.
DATASET_PATH = "cleaned_helpsteer_subset.json"
print("Loading dataset...")
df = pd.read_json(DATASET_PATH, lines=True)

# Report how often each preference value occurs, first per value and then
# grouped by direction (negative / positive / zero).
print("Analyzing label distribution...")
total_examples = len(df)
preference_counts = df['overall_preference'].value_counts()
print(f"Label distribution:\n{preference_counts}")
print("Preference distribution (negative = response 1, positive = response 2):")
for val in sorted(preference_counts.index):
    count_for_val = preference_counts[val]
    print(f"  {val}: {count_for_val} examples ({count_for_val/total_examples*100:.2f}%)")


def _count_for(values):
    """Sum the counts of the given preference values; absent values count as 0."""
    return sum(preference_counts.get(value, 0) for value in values)


# Calculate overall preference direction
neg_count = _count_for([-3, -2, -1])
pos_count = _count_for([1, 2, 3])
neutral_count = preference_counts.get(0, 0)
print(f"\nOverall: {neg_count} response_1 preferred ({neg_count/total_examples*100:.2f}%)")
print(f"Overall: {pos_count} response_2 preferred ({pos_count/total_examples*100:.2f}%)")
print(f"Overall: {neutral_count} neutral ({neutral_count/total_examples*100:.2f}%)")

# Extract and analyze reasoning scores
print("Analyzing reasoning scores...")
def get_reasoning_score(row):
    """Return the mean of the annotation scores attached to one example.

    Args:
        row: Mapping-like record whose 'individual_preference' entry is an
            iterable of annotation mappings. Annotations without a 'score'
            key are skipped; scores are converted with int() before averaging.

    Returns:
        The numpy mean of the collected scores, or the integer 0 when no
        annotation carries a score.
    """
    collected = []
    for annotation in row['individual_preference']:
        if 'score' in annotation:
            collected.append(int(annotation['score']))
    if not collected:
        return 0
    return np.mean(collected)

# Store the per-example mean annotation score as a new column, then print
# its mean and extremes as a quick sanity check.
df['reasoning_score'] = df.apply(get_reasoning_score, axis=1)
score_column = df['reasoning_score']
print(f"Mean reasoning score: {score_column.mean()}")
print(f"Min reasoning score: {score_column.min()}")
print(f"Max reasoning score: {score_column.max()}")

# Function to tokenize text with improved formatting
def tokenize_text(text, max_length=384):
    """Encode `text` with the notebook-level `tokenizer`.

    The input is truncated and padded so the encoding is exactly `max_length`
    tokens long, and PyTorch tensors are requested from the tokenizer.

    Args:
        text: Text to encode.
        max_length: Target length in tokens (defaults to 384).

    Returns:
        Whatever the tokenizer call returns for these options.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(text, **encode_options)

# Function to combine context in a more structured way
def format_context(context_list):
    """Join the context pieces into one string.

    The tokenizer's own separator token is placed between consecutive pieces
    (no extra whitespace is added around it).

    Args:
        context_list: Iterable of strings, in the order they should appear.

    Returns:
        The single joined string.
    """
    separator = tokenizer.sep_token
    return separator.join(context_list)

# Prepare improved tokenized data
print("Tokenizing data with improved formatting...")

# Token budgets: the context gets more room than either response.
CONTEXT_MAX_TOKENS = 512
RESPONSE_MAX_TOKENS = 384


def _preference_to_label(preference_value):
    """Map a signed preference value to the stored training label.

    Negative values (response 1 preferred) become 0, positive values
    (response 2 preferred) become 1, and an exact 0 (tie) becomes 0.5.
    A value for which all three comparisons are false (e.g. NaN) falls
    through to 0.
    """
    if preference_value < 0:
        return 0
    if preference_value > 0:
        return 1
    if preference_value == 0:
        return 0.5
    return 0


# One list per field; position i in every list refers to the same example.
tokenized_data = {
    field: []
    for field in (
        "response1",
        "response2",
        "contexts",
        "overall_preference",
        "reasoning_scores",
        "reasoning_text",
    )
}

for _, row in tqdm(df.iterrows(), total=len(df)):
    # Context pieces are merged by format_context before being encoded.
    joined_context = format_context(row["context_list"])
    tokenized_data["contexts"].append(
        tokenize_text(joined_context, max_length=CONTEXT_MAX_TOKENS)
    )
    for response_key in ("response1", "response2"):
        tokenized_data[response_key].append(
            tokenize_text(row[response_key], max_length=RESPONSE_MAX_TOKENS)
        )

    # Label derived from the sign of the overall preference (ties -> 0.5).
    tokenized_data["overall_preference"].append(
        _preference_to_label(row["overall_preference"])
    )

    # Keep the annotators' free-text reasoning (space-joined) and the
    # per-example score alongside the encodings.
    annotator_reasons = [
        entry["reasoning"]
        for entry in row["individual_preference"]
        if "reasoning" in entry
    ]
    tokenized_data["reasoning_text"].append(" ".join(annotator_reasons))
    tokenized_data["reasoning_scores"].append(row.get('reasoning_score', 0))

# Create train/val splits with stratification
print("Creating train/validation splits...")
labels = tokenized_data["overall_preference"]
indices = list(range(len(tokenized_data["contexts"])))

# 80/20 split over example positions. Stratifying on the labels keeps the
# label proportions the same in both parts; the fixed seed makes the split
# repeatable.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": labels,
}
train_indices, val_indices = train_test_split(indices, **split_options)

# Save train/val indices for later use
indices_data = dict(train_indices=train_indices, val_indices=val_indices)

# Save the tokenized data
print("Saving tokenized data...")
for payload, destination in (
    (tokenized_data, "tokenized_helpsteer_improved.pt"),
    (indices_data, "helpsteer_indices.pt"),
):
    torch.save(payload, destination)

print("Data preparation complete!")

# Print some statistics about the processed data
total_count = len(tokenized_data['contexts'])
train_count = len(train_indices)
val_count = len(val_indices)
print(f"Total examples: {total_count}")
print(f"Training examples: {train_count}")
print(f"Validation examples: {val_count}")

# Check a sample to verify tokenization: decode the first stored context,
# keeping special tokens visible, and show at most its first 500 characters.
sample_idx = 0
sample_ids = tokenized_data["contexts"][sample_idx]["input_ids"][0]
sample_context = tokenizer.decode(sample_ids, skip_special_tokens=False)
print("\nSample tokenized context (with special tokens):")
if len(sample_context) > 500:
    print(sample_context[:500] + "...")
else:
    print(sample_context)

Loading dataset...
Analyzing label distribution...
Label distribution:
overall_preference
-2    4560
 2    4537
 1    3084
-1    2848
 3    2323
-3    2271
 0     377
Name: count, dtype: int64
Preference distribution (negative = response 1, positive = response 2):
  -3: 2271 examples (11.36%)
  -2: 4560 examples (22.80%)
  -1: 2848 examples (14.24%)
  0: 377 examples (1.88%)
  1: 3084 examples (15.42%)
  2: 4537 examples (22.68%)
  3: 2323 examples (11.62%)

Overall: 9679 response_1 preferred (48.39%)
Overall: 9944 response_2 preferred (49.72%)
Overall: 377 neutral (1.88%)
Analyzing reasoning scores...
Mean reasoning score: 0.02179166666666666
Min reasoning score: -3.0
Max reasoning score: 3.0
Tokenizing data with improved formatting...


100%|██████████| 20000/20000 [01:36<00:00, 206.68it/s]


Creating train/validation splits...
Saving tokenized data...
Data preparation complete!
Total examples: 20000
Training examples: 16000
Validation examples: 4000

Sample tokenized context (with special tokens):
[CLS] rephrase “ a word about robot software “ [SEP] let ' s discuss the topic of robot software. [SEP] rephrase the future of robotics and robots [SEP] the potential developments and advancements in the field of robotics and robots. [SEP] make it simple and short [SEP] the future of robotics and robots. [SEP] another one [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [P...
