# **Building the State of the art models to perform analysis.**

# **Installing the transformers to build models**

In [1]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

[K     |████████████████████████████████| 5.5 MB 31.7 MB/s 
[K     |████████████████████████████████| 451 kB 61.2 MB/s 
[K     |████████████████████████████████| 118 kB 81.0 MB/s 
[K     |████████████████████████████████| 65 kB 4.8 MB/s 
[K     |████████████████████████████████| 182 kB 65.0 MB/s 
[K     |████████████████████████████████| 212 kB 3.6 MB/s 
[K     |████████████████████████████████| 132 kB 54.1 MB/s 
[K     |████████████████████████████████| 127 kB 74.1 MB/s 
[K     |████████████████████████████████| 50 kB 6.6 MB/s 
[K     |████████████████████████████████| 378 kB 66.3 MB/s 
[K     |████████████████████████████████| 139 kB 83.3 MB/s 
[K     |████████████████████████████████| 94 kB 3.5 MB/s 
[K     |████████████████████████████████| 357 kB 77.3 MB/s 
[K     |████████████████████████████████| 2.3 MB 64.5 MB/s 
[K     |████████████████████████████████| 7.6 MB 66.7 MB/s 
[K     |████████████████████████████████| 1.3 MB 61.3 MB/s 
[?25h  Building wheel for roug

In [2]:
from transformers import pipeline, set_seed

import matplotlib.pyplot as plt

import pandas as pd
from datasets import load_dataset, load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In the next step we will be downloading the dataset and will see the accuracy and the pre-trained models working on this dataset.

In [3]:
from datasets import load_dataset

dataset = load_dataset("cnn_dailymail", version="3.0.0")

print(f"Features in cnn_dailymail : {dataset['train'].column_names}")

Downloading builder script:   0%|          | 0.00/8.33k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/9.88k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

Downloading and preparing dataset cnn_dailymail/default to /root/.cache/huggingface/datasets/cnn_dailymail/default/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to /root/.cache/huggingface/datasets/cnn_dailymail/default/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Features in cnn_dailymail : ['article', 'highlights', 'id']


In [4]:
#In this step we will be printing the sample text data and the lenght of the text. 
sample = dataset["train"][1]
print(f"""
Article (excerpt of 500 characters, total length: {len(sample["article"])}):
""")
print(sample["article"][:500])
print(f'\nSummary (length: {len(sample["highlights"])}):')
print(sample["highlights"])


Article (excerpt of 500 characters, total length: 4051):

Editor's note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial. MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor." Here, inmates with the most s

Summary (length: 281):
Mentally ill inmates in Miami are housed on the "forgotten floor"
Judge Steven Leifman says most are there as a result of "avoidable felonies"
While CNN tours facility, patient shouts: "I am the son of the president"
Leifman says the system is unjust and he's fighting for change .


In [5]:
sample_text = dataset["train"][1]["article"][:1000]

# We'll collect the generated summaries of each model in a dictionary
summaries = {}

In [6]:
# creating a function to combine the three line in a text data and returns the combined text
def baseline_summary_three_sent(text):
    return "\n".join(sent_tokenize(text)[:3])

In [7]:
summaries['baseline'] = baseline_summary_three_sent(sample_text)

summaries['baseline']

'Editor\'s note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events.\nHere, Soledad O\'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial.\nMIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor."'

# **GPT-2 Summarization**

By appending the TL;DR at the end of the input text to the GPT-2, it will help in generating the summarized texts.

Here, TL;DR is "too long; didnt read". 


So, to perform the GPT-2 summarization firts we will create a text generation pipelineand then load GPT-2 model into the pipeline. 

In [8]:
from transformers import pipeline, set_seed

set_seed(42)

pipe = pipeline('text-generation', model = 'gpt2-medium' )

gpt2_query = sample_text + "\nTL;DR:\n"

pipe_out = pipe(gpt2_query, max_length = 512, clean_up_tokenization_spaces = True)

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [9]:
#printing the given text in the dataset
pipe_out

[{'generated_text': 'Editor\'s note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O\'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial. MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor." Here, inmates with the most severe mental illnesses are incarcerated until they\'re ready to appear in court. Most often, they face drug charges or charges of assaulting an officer --charges that Judge Steven Leifman says are usually "avoidable felonies." He says the arrests often result from confrontations with police. Mentally ill people often won\'t do what they\'re told when police arrive on the scene -- confrontation seems to exacerbate their illness and they become more paranoid, delusional, and

In [10]:
#printing the summarized text in the dataset. 
pipe_out[0]["generated_text"][len(gpt2_query) :]

'MIAMI-DADE, Florida | April 13, 2012 -- Some inmates are locked up in solitary confinement, where they must be isolated from the world for six months before they\'re released onto the general jail population to participate in other inmates. Others are housed in "theforgotten floor," the top floor of the pretrial facility that is also used by criminal offenders. What makes these inmates separate is their medical needs. They\'re given special medications to treat their mental illness, but it\'s only in those cases where necessary to take certain medications themselves for the very dangerous condition. These medications can lead to anaphylaxis or dangerous reactions if taken by people who are on them alone. A mentally ill person like John Brown, a convicted felon with severe mental illness. He spends four months in the "forgotten floor" and is charged with being a felon, disorderly conduct, assault with a deadly weapon (a gun), possession of child pornography, resisting arrest, and posse

In [11]:
summaries['gpt2'] = "\n".join(sent_tokenize(pipe_out[0]["generated_text"][len(gpt2_query) :]))

# **T-5 summarization**

T5 is said to be Text-To-Transfer-Transformer. T5 transform is a pretarined model which takes text as input and return modefied text as the output, whereas BERT model can onlu output a class label or a span of input. 

T5 model is used in performing Summarization, Question-Answering, Machine Translation and classifiaction problems.

In [12]:
#downloading the T-5 model and adding the model to the pipeline. 

pipe = pipeline('summarization', model = 't5-small' )

#Printing the sample text data in the dataset
pipe_out = pipe(sample_text)

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [13]:
#Printing the summarized text
pipe_out

[{'summary_text': "inmates with the most severe mental illnesses are incarcerated until they're ready to appear in court . most often, they face drug charges or charges of assaulting an officer . mentally ill people become more paranoid, delusional, and less likely to follow dir ."}]

In [14]:
summaries['t5'] = 'n'.join(sent_tokenize(pipe_out[0]['summary_text']))

# **BART Summarization**

BART is a denoising autoencoder for pretraining sequence-to-sequence models. It is trained by :

1) Corrupting text with an arbitrary noising function. 

2) LEarning a model to reconstruct teh original text.

BART uses a standard seq2aeq/NMT architecture with a biderectional encoder.

This model can be used for question-answering, machine translation, text summarization, sequence classification. 

In [15]:
# Addinh the model to the pipeline 
pipe = pipeline("summarization", model="facebook/bart-large-cnn")
pipe_out = pipe(sample_text)

Downloading:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [16]:
# printing the summarized text 
pipe_out

[{'summary_text': 'Miami-Dade pretrial detention facility is dubbed the "forgotten floor" Here, inmates with the most severe mental illnesses are incarcerated. Most often, they face drug charges or charges of assaulting an officer. Judge Steven Leifman says the arrests often result from confrontations with police.'}]

In [17]:
summaries["bart"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

In [18]:
summaries["bart"]

'Miami-Dade pretrial detention facility is dubbed the "forgotten floor" Here, inmates with the most severe mental illnesses are incarcerated.\nMost often, they face drug charges or charges of assaulting an officer.\nJudge Steven Leifman says the arrests often result from confrontations with police.'

# **Pegasus**

Pegasus is a pretarined model which released by GOOGLE. This model pretraining is very similar to summarization, that is, important sentences are removed and marked from an input, and later generated together as one output sequence from the remaining sentences. 

In [19]:
# Adding the Pegasus model to the pipe line and printing the sample text
pipe = pipeline('summarization', model="google/pegasus-cnn_dailymail"  )

pipe_out = pipe(sample_text)

Downloading:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

In [20]:
pipe_out

[{'summary_text': 'Mentally ill inmates are housed on the "forgotten floor" of a Miami jail .<n>Judge Steven Leifman says the charges are usually "avoidable felonies"<n>He says the arrests often result from confrontations with police .<n>Mentally ill people often won\'t do what they\'re told when police arrive on the scene .'}]

In [21]:
summaries["pegasus"] = pipe_out[0]["summary_text"].replace(" .", ".\n")

**Comparing different summaries**

In [22]:
# Here we will be comparing the different pretrained models that 
# we added to the pipeline and their outputs
print("GROUND TRUTH")

print(dataset['train'][1]['highlights'])


for model_name in summaries:
    print(model_name.upper())
    print(summaries[model_name])


GROUND TRUTH
Mentally ill inmates in Miami are housed on the "forgotten floor"
Judge Steven Leifman says most are there as a result of "avoidable felonies"
While CNN tours facility, patient shouts: "I am the son of the president"
Leifman says the system is unjust and he's fighting for change .
BASELINE
Editor's note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events.
Here, Soledad O'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial.
MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor."
GPT2
MIAMI-DADE, Florida | April 13, 2012 -- Some inmates are locked up in solitary confinement, where they must be isolated from the world for six months before they're released onto the general jail population to participat

# **Calculating the performance of the models added in the pipeline**


# **SacreBLEU**

In [23]:
#Downloading the SacreBleu metric and loading the metric in a variable. 
from datasets import load_metric

bleu_metric = load_metric("sacrebleu")

  bleu_metric = load_metric("sacrebleu")


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [24]:
# Predicting the scores for pegasus model
bleu_metric.add(prediction = [summaries["pegasus"]], reference = [dataset['train'][1]['highlights'] ])

results = bleu_metric.compute(smooth_method = 'floor', smooth_value = 0 )

results['precision'] = [np.round(p , 2) for p in results['precisions'] ]

pd.DataFrame.from_dict(results, orient = 'index', columns = ['Value'] )

Unnamed: 0,Value
score,18.456308
counts,"[29, 16, 11, 7]"
totals,"[76, 75, 74, 73]"
precisions,"[38.1578947368421, 21.333333333333332, 14.8648..."
bp,1.0
sys_len,76
ref_len,57
precision,"[38.16, 21.33, 14.86, 9.59]"


# **ROUGE**

# **ROUGE vs BLEU**

BLEU is used to measure the precision of the model, whereas Rouge is used to measure the recall of the model.

Precision is the number of words in the machine generated summaries that are appeared in the human reference summaries. 

Recall is the number of words in the human reference summaries that appeared in the machine generated summaries. 




In [25]:
#Downloading the Rouge metric and loading the metric in a variable. 
rouge_metric = load_metric('rouge')

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [26]:
# Predicting the rouge scores for all the models
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

reference = dataset['train'][1]['highlights']

records = []

for model_name in summaries:
    rouge_metric.add(prediction = summaries[model_name], reference = reference )
    score = rouge_metric.compute()
    rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )
    print('rouge_dict ', rouge_dict )
    records.append(rouge_dict)

pd.DataFrame.from_records(records, index = summaries.keys() )

rouge_dict  {'rouge1': 0.365079365079365, 'rouge2': 0.14516129032258066, 'rougeL': 0.20634920634920634, 'rougeLsum': 0.2857142857142857}
rouge_dict  {'rouge1': 0.1836734693877551, 'rouge2': 0.0410958904109589, 'rougeL': 0.10204081632653061, 'rougeLsum': 0.17006802721088435}
rouge_dict  {'rouge1': 0.1758241758241758, 'rouge2': 0.0, 'rougeL': 0.13186813186813187, 'rougeLsum': 0.15384615384615383}
rouge_dict  {'rouge1': 0.3655913978494624, 'rouge2': 0.13186813186813184, 'rougeL': 0.2150537634408602, 'rougeLsum': 0.3225806451612903}
rouge_dict  {'rouge1': 0.49019607843137253, 'rouge2': 0.24000000000000002, 'rougeL': 0.3529411764705882, 'rougeLsum': 0.4509803921568628}


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
baseline,0.365079,0.145161,0.206349,0.285714
gpt2,0.183673,0.041096,0.102041,0.170068
t5,0.175824,0.0,0.131868,0.153846
bart,0.365591,0.131868,0.215054,0.322581
pegasus,0.490196,0.24,0.352941,0.45098


# **Evaluating on the TEST set of the Dataset**. 

In [27]:
# building a function to process the rouge scores of the baseline model for the test dataset. 
def calculate_metric_on_baseline_test_ds(dataset, metric, column_text = 'article', column_summary = 'highlights' ):
    summaries = [baseline_summary_three_sent(text) for text in dataset[column_text] ]

    metric.add_batch(predictions = summaries, references = dataset[column_summary] )

    score = metric.compute()
    return score

In [28]:
test_sampled = dataset['train'].shuffle(seed = 42).select(range(1000))

score = calculate_metric_on_baseline_test_ds(test_sampled, rouge_metric )

rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

pd.DataFrame.from_dict(rouge_dict, orient = 'index' , columns = ['baseline'] ).T

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
baseline,0.253995,0.100642,0.165754,0.231571


# **Strategy to calculate the ROUGE Metric on test dataset for the other models**

In [29]:
from tqdm import tqdm
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

def generate_batch_sized_chunks(list_of_elements, batch_size):
    """split the dataset into smaller batches that we can process simultaneously
    Yield successive batch-sized chunks from list_of_elements."""
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]

def calculate_metric_on_test_ds(dataset, metric, model, tokenizer, 
                               batch_size=16, device=device, 
                               column_text="article", 
                               column_summary="highlights"):
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):
        
        inputs = tokenizer(article_batch, max_length=1024,  truncation=True, 
                        padding="max_length", return_tensors="pt")
        
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                         attention_mask=inputs["attention_mask"].to(device), 
                         length_penalty=0.8, num_beams=8, max_length=128)
        ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''
        
        # Finally, we decode the generated texts, 
        # replace the  token, and add the decoded texts with the references to the metric.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, 
                                clean_up_tokenization_spaces=True) 
               for s in summaries]      
        
        decoded_summaries = [d.replace("", " ") for d in decoded_summaries]
        
        
        metric.add_batch(predictions=decoded_summaries, references=target_batch)
        
    #  Finally compute and return the ROUGE scores.
    score = metric.compute()
    return score

In [30]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_ckpt = "google/pegasus-cnn_dailymail"

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

score = calculate_metric_on_test_ds(test_sampled, rouge_metric, 
                                   model_pegasus, tokenizer, batch_size=8)

rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)

# At the end, we compute and return the ROUGE scores.
pd.DataFrame(rouge_dict, index=["pegasus"])

100%|██████████| 125/125 [25:28<00:00, 12.23s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.013243,0.000691,0.013145,0.013185
