In [3]:
from datasets import load_dataset
import pandas as pd
import random
import os
import json

# Load the CNN/DailyMail dataset (config "3.0.0"; its "highlights" field is used as the summary)
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Parameters
NUM_TRAIN = 128  # Keep this small
NUM_TEST = 32
SEED = 42  # Single shuffle seed shared by both splits so the subsample is reproducible
OUTPUT_DIR = "subsampled_cnn_dailymail"

# Create output folder
os.makedirs(OUTPUT_DIR, exist_ok=True)


def _subsample(split, num_examples, seed=SEED):
    """Return the first `num_examples` rows of `split` after a seeded shuffle.

    The count is capped at `len(split)` so that asking for more rows than the
    split holds yields the whole (shuffled) split instead of an index error.
    """
    count = min(num_examples, len(split))
    return split.shuffle(seed=seed).select(range(count))


def _to_records(subset):
    """Convert examples into a list of {"article", "summary"} dicts.

    The "summary" value is taken from each example's "highlights" field.
    """
    return [{"article": ex["article"], "summary": ex["highlights"]} for ex in subset]


def _write_json(records, path):
    """Write `records` to `path` as indented UTF-8 JSON, keeping non-ASCII text unescaped."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)


# Subsample
train_subset = _subsample(dataset["train"], NUM_TRAIN)
test_subset = _subsample(dataset["test"], NUM_TEST)

# Save to JSON (list of dicts)
train_data = _to_records(train_subset)
test_data = _to_records(test_subset)

_write_json(train_data, f"{OUTPUT_DIR}/train_sample.json")
_write_json(test_data, f"{OUTPUT_DIR}/test_sample.json")

print(f"Saved subsampled train and test sets to: {OUTPUT_DIR}/")


Saved subsampled train and test sets to: subsampled_cnn_dailymail/
