## Text Summarization

In [1]:
from datasets import load_dataset, Dataset

# Open each language's train split lazily and keep only its first 20_000 records,
# so the full corpus is never downloaded.
spanish_stream, english_stream = (
    load_dataset("intfloat/multilingual_cc_news", lang, split="train", streaming=True).take(20000)
    for lang in ("es", "en")
)

# Materialize both capped streams as regular in-memory datasets (Spanish first, then English)
spanish_dataset, english_dataset = (
    Dataset.from_list(list(stream)) for stream in (spanish_stream, english_stream)
)

english_dataset

README.md: 0.00B [00:00, ?B/s]

multilingual_cc_news.py: 0.00B [00:00, ?B/s]

Dataset({
    features: ['title', 'maintext', 'url', 'date_publish'],
    num_rows: 20000
})

In [2]:
# Convert to DatasetDict
from datasets import DatasetDict

def split_dataset(dataset, train_size=15_000, val_size=2_500, test_size=2_500, seed=42):
    """Split a dataset into train / validation / test partitions.

    The dataset is first divided with ``dataset.train_test_split``; the
    validation and test partitions are then taken as consecutive slices of
    the held-out ``'test'`` portion. Held-out rows beyond
    ``val_size + test_size`` are not included in any partition.

    Args:
        dataset: Object exposing ``train_test_split(train_size=..., seed=...)``
            whose result maps ``'train'`` and ``'test'`` to datasets that
            support ``len()`` and ``select()``.
        train_size: Passed through to ``train_test_split`` as ``train_size``.
        val_size: Number of held-out rows used for validation.
        test_size: Number of held-out rows used for testing.
        seed: Seed passed to ``train_test_split`` (defaults to 42, the value
            that was previously hard-coded).

    Returns:
        DatasetDict with the keys ``'train'``, ``'validation'`` and ``'test'``.

    Raises:
        ValueError: If the held-out portion has fewer than
            ``val_size + test_size`` rows.
    """
    parts = dataset.train_test_split(
        train_size=train_size,
        seed=seed)
    held_out = parts['test']

    # Fail early with a readable message instead of an out-of-range error from select()
    needed = val_size + test_size
    if len(held_out) < needed:
        raise ValueError(
            f"Held-out split has {len(held_out)} rows but val_size + test_size "
            f"requires {needed}; reduce the split sizes or provide more data."
        )

    return DatasetDict({
        'train': parts['train'],
        'validation': held_out.select(range(val_size)),
        'test': held_out.select(range(val_size, val_size + test_size)),
    })
    
# Replace each flat language dataset with its train/validation/test split dictionary.
# NOTE(review): the same names are rebound, so re-running this cell on its own would
# hand split_dataset its own output — presumably an error; re-run the loading cell first.
spanish_dataset, english_dataset = (
    split_dataset(corpus) for corpus in (spanish_dataset, english_dataset)
)
english_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'maintext', 'url', 'date_publish'],
        num_rows: 15000
    })
    validation: Dataset({
        features: ['title', 'maintext', 'url', 'date_publish'],
        num_rows: 2500
    })
    test: Dataset({
        features: ['title', 'maintext', 'url', 'date_publish'],
        num_rows: 2500
    })
})

In [7]:
# Show a few samples
def show_samples(dataset, num_samples=3, seed=42):
    """Print the title and the start of the main text for a few training examples.

    The ``'train'`` split is shuffled with ``seed`` and the first
    ``num_samples`` rows of the shuffled result are printed.

    Args:
        dataset: Mapping with a ``'train'`` entry that supports
            ``shuffle(seed=...)``, ``len()`` and ``select()``; each row must
            provide ``'title'`` and ``'maintext'``.
        num_samples: Maximum number of examples to print. If the train split
            has fewer rows, all of them are printed.
        seed: Seed forwarded to ``shuffle``.
    """
    shuffled = dataset['train'].shuffle(seed=seed)
    # Clamp so a split smaller than num_samples prints what it has instead of failing in select()
    count = min(num_samples, len(shuffled))
    for item in shuffled.select(range(count)):
        print(f"Title: {item['title']}")
        # The slice is by characters, not words, so the label says so
        print(f"Maintext (first 200 characters): {item['maintext'][:200]}\n")
# Preview shuffled training examples from the English splits (default sample count and seed)
show_samples(english_dataset)

Title: Victim of attempted abduction near RAF Marham to relive ordeal for Crimewatch
Maintext (first 200 words): A serviceman at the centre of an attempted abduction near RAF Marham is to relive his ordeal on tomorrow night’s BBC Crimewatch programme.
The victim, a married airman in his late 20s, has revealed he

Title: Saipan delegation OKs San Roque rezoning
Maintext (first 200 words): AFTER a lengthy discussion, the Saipan and Northern Islands Legislative Delegation on Tuesday passed a local measure amending the Saipan Zoning Law to rezone parts of San Roque.
Fifteen members voted 

Title: An unusual new late-night competitor for ESPN
Maintext (first 200 words): NEW YORK (AP) — To a certain segment of the population, Scott Van Pelt is a more popular late-night television star than Jimmy Fallon, Jimmy Kimmel and Stephen Colbert.
That segment — young men aged 1



In [8]:
# Concatenate the English and Spanish datasets above; the result is named books_dataset to match the name used in the LLM course
from datasets import concatenate_datasets, DatasetDict
books_dataset = DatasetDict()

# For every split: English rows followed by Spanish rows, then shuffled with a fixed seed
for split in english_dataset:
    merged = concatenate_datasets([english_dataset[split], spanish_dataset[split]])
    books_dataset[split] = merged.shuffle(seed=42)

show_samples(books_dataset)

Title: Declararán el 21 de octubre como Día de Héctor Espino
Maintext (first 200 words): HERMOSILLO, Sonora(GH)
En la sesiÃ³n ordinaria correspondiente a septiembre, el Cabildo aprobÃ³ por unanimidad la propuesta de declarar el 21 de octubre como el DÃ­a de HÃ©ctor Espino, en honor al leg

Title: Tropical Storm Lisa forms over eastern tropical Atlantic
Maintext (first 200 words): 

Title: Earthquake felt from Nebraska to Texas
Maintext (first 200 words): PAWNEE, Okla. (AP) — One of Oklahoma’s largest earthquakes on record rattled other parts of the Midwest on Saturday from Nebraska to North Texas, and likely will turn new attention to the practice of 



In [9]:
# Filter out samples with empty title or maintext
def filter_empty_samples(dataset):
    """Keep only rows whose title and main text both contain non-whitespace text.

    Args:
        dataset: Object exposing ``filter(callable)``; each row passed to the
            callable must provide ``'title'`` and ``'maintext'``.

    Returns:
        Whatever ``dataset.filter`` returns for the predicate described above.
    """
    def _has_text(example):
        # A missing (None) field is treated as empty rather than crashing on .strip()
        return all((example[field] or "").strip() for field in ("title", "maintext"))

    return dataset.filter(_has_text)
# Rebuild the split dictionary with the empty-title / empty-text rows removed from each split
books_dataset = DatasetDict({
    name: filter_empty_samples(subset)
    for name, subset in books_dataset.items()
})
books_dataset

Filter:   0%|          | 0/30000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['title', 'maintext', 'url', 'date_publish'],
        num_rows: 28241
    })
    validation: Dataset({
        features: ['title', 'maintext', 'url', 'date_publish'],
        num_rows: 4701
    })
    test: Dataset({
        features: ['title', 'maintext', 'url', 'date_publish'],
        num_rows: 4699
    })
})

In [10]:
# Preview training examples again now that books_dataset holds the filtered splits
show_samples(books_dataset)

Title: Shimon Peres witnessed Israel’s history, and shaped it
Maintext (first 200 words): JERUSALEM (AP) — At every corner of Israel’s tumultuous history, Shimon Peres was there.
He was a young aide to the nation’s founding fathers when the country declared independence in 1948, and he pla

Title: Police: Boy, 2, dies after accidentally shooting self in chest
Maintext (first 200 words): QUAKERTOWN, Pa. — State police say a 2-year-old boy accidentally shot himself to death in a Pennsylvania home.
Authorities say the boy was pronounced dead Monday shortly after suffering a single gunsh

Title: Chiefs' Jamaal Charles doubtful for Week 2 at Houston
Maintext (first 200 words): Chiefs running back Jamaal Charles is doubtful for Sunday's game at Houston as he continues to work his way back from surgery last season to repair the torn ACL in his right knee.

