In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m542.7/547.8 kB[0m [31m17.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (6

In [None]:
from datasets import load_dataset, load_metric
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pandas as pd

# Load the CNN/DailyMail corpus (config "3.0.0"). The structure printed further
# down shows train/validation/test splits with 'article', 'highlights', 'id' columns.
dataset = load_dataset("cnn_dailymail", "3.0.0")

In [None]:
# Tokenizer for the pretrained 't5-small' checkpoint; used by both the
# preprocessing function and the summary-generation helpers.
tokenizer = T5Tokenizer.from_pretrained('t5-small')

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Show the split layout of the loaded corpus under a one-line heading.
print("The Structure of the dataset:", dataset, sep="\n")

The Structure of the dataset:
DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})


In [None]:
def preprocess_function(examples):
    """Tokenize a batch of articles and reference summaries for T5.

    Every article is prefixed with the task prompt ``"summarize: "`` and
    truncated/padded to 512 tokens. Every highlight is truncated/padded to
    150 tokens and stored under ``"labels"``.

    Relies on the module-level ``tokenizer``.

    Args:
        examples: Batch mapping with ``"article"`` and ``"highlights"`` keys,
            each holding a list of strings (the shape ``Dataset.map`` passes
            when called with ``batched=True``).

    Returns:
        The tokenizer output for the articles, with an added ``"labels"``
        entry: one list of target token ids per example, in which padding
        positions are replaced by ``-100``.
    """
    inputs = ["summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Tokenize summaries. `text_target=` replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["highlights"],
        max_length=150,
        truncation=True,
        padding="max_length",
    )

    # Mask label padding with -100, the index the cross-entropy loss ignores;
    # leaving the pad id in place would train the model to predict padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token if token != pad_id else -100) for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

In [None]:
# Apply the preprocessing to the dataset in batches (the progress output shows
# one pass per split).
# NOTE(review): `tokenized_datasets` is not referenced by any later cell visible here.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]



Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [None]:
# Load the pretrained 't5-small' weights. The cells shown in this notebook use
# the model as-is for generation; no training step is visible here.
model = T5ForConditionalGeneration.from_pretrained('t5-small')

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
def summarize_text(text, model, tokenizer, max_length=150, min_length=40, num_beams=4):
    """Generate a summary of ``text`` with beam search.

    The text is prefixed with ``"summarize: "`` and truncated to 512 tokens
    before being passed to ``model.generate``.

    Args:
        text: The document to summarize.
        model: Object exposing ``generate(...)``; if it also exposes a
            ``device`` attribute, the encoded input is moved to that device.
        tokenizer: Object exposing ``encode(...)`` and ``decode(...)``.
        max_length: Upper bound forwarded to ``generate`` for the summary length.
        min_length: Lower bound forwarded to ``generate`` for the summary length.
        num_beams: Beam width forwarded to ``generate``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)

    # Keep the input ids on the same device as the model; without this,
    # generation raises a device-mismatch error once the model lives on a GPU.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = inputs.to(device)

    summary_ids = model.generate(
        inputs,
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=num_beams,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
# Take the first five training examples. Slicing the split first reads only
# those rows; indexing the column first (dataset['train']['article'][:5]) can
# materialize the entire column in memory just to keep five entries.
train_head = dataset['train'][:5]
articles = train_head['article']
highlights = train_head['highlights']

In [None]:
# Column name -> (initially empty) list of values for the comparison table
data = {
    column: []
    for column in ('Original Article', 'Original Highlights', 'Generated Summary')
}

In [None]:
# Build the comparison columns from the (article, highlight) pairs in one step.
# `data` is assigned afresh instead of appended to, so re-running this cell no
# longer duplicates rows in the table.
pairs = list(zip(articles, highlights))
data = {
    'Original Article': [article for article, _ in pairs],
    'Original Highlights': [highlight for _, highlight in pairs],
    'Generated Summary': [summarize_text(article, model, tokenizer) for article, _ in pairs],
}

In [None]:
# Create a DataFrame from the data: one row per article, with the reference
# highlights and the generated summary in adjacent columns.
df_results = pd.DataFrame(data)

In [34]:
# Display the article / reference highlights / generated summary table.
df_results

Unnamed: 0,Original Article,Original Highlights,Generated Summary
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...,the young actor says he has no plans to fritte...
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...,inmates with most severe mental illnesses are ...
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa...","survivor: ""i probably had a 30-, 35-foot free ..."
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non...",the polyps were removed and sent to the nation...
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical...","new: ""your admitted conduct was not only illeg..."


In [None]:
# Function to summarize text for evaluation
def summarize_text_eval(text, model, tokenizer, max_length=300, min_length=40):
    """Summarize ``text`` with the evaluation defaults.

    Thin wrapper around ``summarize_text`` so the generation logic lives in
    one place; the only difference is the longer default ``max_length`` (300)
    and a fixed beam width of 4.

    Args:
        text: The document to summarize.
        model: Model passed through to ``summarize_text``.
        tokenizer: Tokenizer passed through to ``summarize_text``.
        max_length: Upper bound on the generated summary length.
        min_length: Lower bound on the generated summary length.

    Returns:
        The summary string returned by ``summarize_text``.
    """
    return summarize_text(
        text,
        model,
        tokenizer,
        max_length=max_length,
        min_length=min_length,
        num_beams=4,
    )

In [28]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/84.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.2


In [31]:
import evaluate
def evaluate_model(model, tokenizer, dataset, num_samples=50):
    """Score generated summaries against reference highlights with ROUGE.

    Summarizes the first ``num_samples`` articles of the validation split
    with ``summarize_text_eval`` and compares them to the matching
    ``highlights`` using the ``rouge`` metric loaded through ``evaluate``.

    Args:
        model: Model forwarded to ``summarize_text_eval``.
        tokenizer: Tokenizer forwarded to ``summarize_text_eval``.
        dataset: Mapping with a ``'validation'`` split that supports row
            slicing and yields ``'article'`` and ``'highlights'`` columns.
        num_samples: Number of leading validation examples to evaluate.

    Returns:
        Whatever ``rouge.compute`` returns for the generated summaries and
        their references.
    """
    # Slice rows first so only `num_samples` examples are read; indexing the
    # column first can materialize the whole validation column in memory.
    validation_head = dataset['validation'][:num_samples]
    articles = validation_head['article']
    references = validation_head['highlights']

    summaries = [summarize_text_eval(article, model, tokenizer) for article in articles]

    # Load ROUGE metric
    rouge = evaluate.load('rouge')

    # Compute ROUGE scores
    return rouge.compute(predictions=summaries, references=references)

In [33]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=fc2bfd3dde028242609176380917b989ad0f5375340715f7d683681fe70e2b1a
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [37]:
import pandas as pd
import evaluate
# Compute ROUGE for summaries of the first 50 validation articles.
results = evaluate_model(model, tokenizer, dataset, num_samples=50)

In [43]:
def format_results(results):
    """Turn a ROUGE result mapping into a two-column ``Metric``/``Score`` table.

    Each of ``results['rouge1']``, ``results['rouge2']`` and
    ``results['rougeL']`` may be given in one of three layouts:

    * a plain number, used as-is;
    * a nested dict, read as ``value['mid']['fmeasure']``;
    * an aggregate object exposing ``.mid.fmeasure`` (named-tuple style),
      which previously fell through and ended up in the table as an object
      instead of a number.

    Args:
        results: Mapping containing the ``'rouge1'``, ``'rouge2'`` and
            ``'rougeL'`` keys.

    Returns:
        A pandas DataFrame with rows ROUGE-1, ROUGE-2, ROUGE-L and columns
        ``'Metric'`` and ``'Score'``.

    Raises:
        KeyError: If one of the three keys is missing from ``results``.
    """

    def _fmeasure(value):
        # Normalize one metric entry to its mid F-measure (or the raw value).
        if isinstance(value, dict):
            return value['mid']['fmeasure']
        mid = getattr(value, 'mid', None)
        if mid is not None:
            return mid.fmeasure
        return value

    # Display label -> key in `results`; dict order fixes the row order.
    metric_keys = {'ROUGE-1': 'rouge1', 'ROUGE-2': 'rouge2', 'ROUGE-L': 'rougeL'}

    # Create a DataFrame with the appropriate structure
    return pd.DataFrame({
        'Metric': list(metric_keys),
        'Score': [_fmeasure(results[key]) for key in metric_keys.values()],
    })



In [44]:
# Format the ROUGE scores into a Metric/Score DataFrame and print its
# plain-text rendering.
df_eval_results = format_results(results)
print(df_eval_results)

    Metric     Score
0  ROUGE-1  0.306527
1  ROUGE-2  0.142991
2  ROUGE-L  0.242000


In [45]:
# Display the same ROUGE table using the notebook's rich DataFrame rendering.
df_eval_results

Unnamed: 0,Metric,Score
0,ROUGE-1,0.306527
1,ROUGE-2,0.142991
2,ROUGE-L,0.242
