In [None]:
import os
from itertools import islice

from datasets import load_dataset
from huggingface_hub import login

# --- Authentication ---
# SECURITY: an access token used to be hardcoded on this line. Treat that token
# as leaked: revoke it in the Hugging Face account settings and create a new
# one. The token is now read from the HF_TOKEN environment variable; when the
# variable is unset, `login()` falls back to its interactive prompt, so no
# secret is ever stored in the notebook.
login(token=os.environ.get("HF_TOKEN"))

# --- Sampling configuration ---
DATASET_NAME = "TheBlueScrubs/TheBlueScrubs-v1"
SHUFFLE_SEED = 42
SHUFFLE_BUFFER_SIZE = 10_000
NUM_SAMPLES = 50_000

# Load with streaming so the full dataset is never downloaded up front
dataset = load_dataset(DATASET_NAME, split="train", streaming=True)

# Shuffle the stream before sampling. A streaming shuffle draws from a rolling
# buffer of SHUFFLE_BUFFER_SIZE examples, so it is an approximate shuffle.
dataset = dataset.shuffle(seed=SHUFFLE_SEED, buffer_size=SHUFFLE_BUFFER_SIZE)

# Take the first NUM_SAMPLES examples of the shuffled stream. No class or
# source balancing is performed here. `islice` stops right after the last
# needed example instead of fetching one more just to hit a `break`.
samples = list(islice(dataset, NUM_SAMPLES))

print("✅ Collected", len(samples), "examples")
if samples:
    # Show one record to eyeball the schema; guarded so an empty stream does
    # not raise IndexError.
    print(samples[0])


In [None]:
import json
import random
from google.colab import files

# Shuffle a *copy* with a fixed seed: the split is then reproducible, and
# re-running this cell does not keep re-permuting `samples` in place.
SPLIT_SEED = 42
shuffled_samples = list(samples)
random.Random(SPLIT_SEED).shuffle(shuffled_samples)

# Train/val/test split (80% / 10% / remainder); the test split absorbs any
# rounding leftover so no example is dropped.
train_size = int(0.8 * len(shuffled_samples))
val_size = int(0.1 * len(shuffled_samples))

train_data = shuffled_samples[:train_size]
val_data = shuffled_samples[train_size:train_size + val_size]
test_data = shuffled_samples[train_size + val_size:]

# Sanity check: the three splits partition the collected samples.
assert len(train_data) + len(val_data) + len(test_data) == len(samples)

# Write records in JSON Lines format (one JSON document per line)
def save_json(filename, data):
    """Serialize every item of ``data`` to ``filename`` as JSON Lines.

    Args:
        filename: Path of the output file; it is created or overwritten and
            written with UTF-8 encoding.
        data: Iterable of JSON-serializable objects; each one becomes a single
            line terminated by a newline.
    """
    with open(filename, "w", encoding="utf-8") as out:
        out.writelines(json.dumps(record) + "\n" for record in data)

# Output file for each split (dict order: train, val, test)
split_files = {
    "bluescrubs_train.jsonl": train_data,
    "bluescrubs_val.jsonl": val_data,
    "bluescrubs_test.jsonl": test_data,
}

# Write every split to disk first
for split_path, split_records in split_files.items():
    save_json(split_path, split_records)

print("✅ Saved splits:", len(train_data), "train /", len(val_data), "val /", len(test_data), "test")

# Then trigger a download for each written file, in the same order
for split_path in split_files:
    files.download(split_path)


In [None]:
import json
from collections import Counter
import statistics

# Pick which file to test (train/val/test)
file_path = "bluescrubs_train.jsonl"   # or "bluescrubs_val.jsonl" / "bluescrubs_test.jsonl"

# Load the JSONL file; blank lines (e.g. a trailing newline added by an editor)
# are skipped because json.loads() would raise on them.
records = []
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            records.append(json.loads(line))

print(f"✅ Loaded {len(records)} records from {file_path}")

# --- Check unique sources ---
# NOTE(review): assumes each record is a dict whose "meta" value is a dict (or
# missing/null) that may carry a "source" key — confirm against the dataset schema.
sources = [(ex.get("meta") or {}).get("source", "unknown") for ex in records]
print("Top sources:", Counter(sources).most_common(10))

# --- Check probability distribution ---
# Records without a probability are excluded and counted, rather than being
# treated as 0, which would silently drag the mean and minimum down.
probs = []
for ex in records:
    raw_prob = (ex.get("meta") or {}).get("probability")
    if raw_prob is not None:
        probs.append(float(raw_prob))

missing_probs = len(records) - len(probs)
if missing_probs:
    print(f"⚠️ {missing_probs} records have no probability and were excluded")

if probs:
    print("Mean:", statistics.mean(probs), "Min:", min(probs), "Max:", max(probs))
else:
    # statistics.mean / min / max all raise on an empty sequence.
    print("No probability values found")
