In [1]:
import json
import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def prepare_daily_mail_cnn_data(
    max_samples=10000,
    output_path='daily_mail_cnn_train.json',
    max_article_words=1500,
    min_article_words=50,
    min_summary_words=10,
    dataset=None,
):
    """
    Prepare Daily Mail and CNN data for fine-tuning.

    Walks the ``'train'`` split in order, strips each article/summary,
    drops pairs that are too short, truncates long articles to a word
    budget, and writes the kept examples to a JSON file.

    Called with no arguments it behaves like the original version:
    load the dataset from HuggingFace, keep the first 10000 usable
    examples and write them to ``daily_mail_cnn_train.json``.

    Args:
        max_samples: Maximum number of examples to keep. ``None`` keeps
            every usable example; ``0`` (or a negative value) keeps none.
        output_path: Path of the JSON file to write (overwritten if present).
        max_article_words: Articles with more whitespace-separated words
            than this are truncated to their first ``max_article_words``
            words (re-joined with single spaces) to leave room for the
            summary in the context window.
        min_article_words: Articles with fewer words than this are skipped.
        min_summary_words: Summaries with fewer words than this are skipped.
        dataset: Optional pre-loaded mapping with a ``'train'`` iterable of
            records that have ``'article'`` and ``'highlights'`` string
            fields. When ``None`` the dataset is loaded via ``load_dataset``.

    Returns:
        list[dict]: The kept examples, each ``{'article': str, 'summary': str}``,
        in dataset order.
    """
    if dataset is None:
        # Option 1: Load from HuggingFace (recommended)
        # NOTE(review): the dataset card lists versioned configs such as
        # "3.0.0"; confirm that "default" resolves with the installed
        # `datasets` version.
        print("Loading CNN/Daily Mail dataset...")
        dataset = load_dataset("cnn_dailymail", "default")

    train_data = []
    for example in dataset['train']:
        # Stop as soon as the quota is filled instead of cleaning the whole
        # split and slicing afterwards; the kept examples are the same.
        if max_samples is not None and len(train_data) >= max_samples:
            break

        # Clean and prepare the data
        article = example['article'].strip()
        summary = example['highlights'].strip()
        article_words = article.split()  # split once, reused below

        # Skip very short articles or summaries
        if (len(article_words) < min_article_words
                or len(summary.split()) < min_summary_words):
            continue

        # Limit article length to fit in context window (leave room for summary)
        if len(article_words) > max_article_words:
            article = ' '.join(article_words[:max_article_words])

        train_data.append({
            'article': article,
            'summary': summary
        })

    # Save to JSON. json.dump escapes non-ASCII by default, so the explicit
    # encoding only removes the dependency on the platform default.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(train_data, f, indent=2)

    print(f"Prepared {len(train_data)} training examples")
    # Guard the sample report: indexing [0] would raise IndexError when
    # every example was filtered out (or max_samples is 0).
    if train_data:
        print("Sample:")
        print(f"Article length: {len(train_data[0]['article'].split())} words")
        print(f"Summary length: {len(train_data[0]['summary'].split())} words")

    return train_data

In [3]:
# Entry point for this cell. The guard is true when the code runs as the main
# module (directly in the notebook kernel or as a script) and false when the
# code is imported, so an import does not trigger the data preparation.
if __name__ == "__main__":
    # Use HuggingFace dataset: the function loads CNN/Daily Mail itself,
    # writes the JSON training file, and returns the kept examples,
    # which stay available to later cells as `data`.
    data = prepare_daily_mail_cnn_data()

Loading CNN/Daily Mail dataset...
Prepared 10000 training examples
Sample:
Article length: 198 words
Summary length: 37 words
