In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'tokenizer:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4854037%2F8195139%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240423%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240423T154520Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D80301151d9332d00c238469b5ae4be241b13f241d222895eac81365e424e32eca181c8f7eb536ec7b4005c53ebc6f1c8022f04d46462eb555b1077a7784e1d9f4c6594101447edad9c11f78003707308f063ee2c5a09aa65a9fe356351a8e7b42e98ac0ac69cceb4566fc0f22f96c94b68f5735c9fddf6fdbafb3580e3e27edf97a555be7457e6ffe3fe92e291497bda86b59f905ea3e0f72256cf7f0b170b39d0b22fdc4d0117882130a15993e1654e3bb024fe53b6be630c25261efd5f265aaeaa2cea1e8997a0a7bbba38da7324e49ab099309287f97117356ac8a5a01728a112fcd44605c3ff1cde21b1971cbda8306b25e1fa630d3081a87c167041a58a,cnn3000:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4854264%2F8195441%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240423%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240423T154520Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D3f17747749ab14a25de36ae7b9736e5cd7f79a8f2788c86826d8a4100098a62476f251014e2376167792e283e9a9128b624656ce5b10c58f1ce1778d16884daf0633f5a6e77b9b33c55267b9102b872955a66a1427bafaf9881cfa01dd96e8bb51707f2554be03d7d3c85276efd087558b43132eee95715d67e1a5b557ed596d120589dc3e7d3567dc3f3abdd780e6e9748c74c0937dd82155318e2eaf2ae4bf71050fdfe95405d2b8c4640f23a6f4300a1bd0057f6d66c8079731efeaffc3a90fce7a4249e0874722b84cdf52e27fdb63fa72ff4b2c3e1dccbc15cfa334204b11cad761c5b43ff9f1c57482c0e9cb0eb0871d0f0c40f73c31fb459785d6ffa4,pegasus_model_2/other/peagasus_model/1:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-models-data%2F30566%2F36279%2Fbundle%2Farchive.tar.gz%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%
26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240423%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240423T154520Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D4e45d5887c143461aefb5cb40528d180ea54bf7b4bffcf5d987c36af89be22ba3b7c9f821093bd793dad10be925892a8c6a7520071c86b41adfb335197af4ffeaf10c0eb425c56234f2df5eaf5fa7bd3e6225048daa74ebe510882e71066dd0dc3428020a4bb32c6ddd0802e19ea0160951ad2b26e16f528ed9ef00a27eafc3310309cae89ead366b66a4f300cba4ad0033a57dc99840d27df5029ad15b15756ce942ba4552e7527414cbcc60ffe97bc002546f744a90d5a4d7835b4a04e5eb29b278253af579012b2d139f5135dd032df3ea8d47408034725ce3a9243def896b75d03d2767c63d2d318807ea0df5ef9d7930f71cf15dbfde60268b58b43979c'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<percent-encoded URL>" entry of DATA_SOURCE_MAPPING
# and unpack it under KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header can be missing; in that case the
            # progress bar stays empty instead of raising on int(None).
            expected_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / expected_bytes)) if expected_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # tarfile.open() below reopens the file by name, so buffered
            # bytes must be on disk first or the archive looks truncated.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here
                # would overwrite the imported module.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is caught before the more general OSError.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q


In [None]:

from transformers import pipeline, set_seed

import matplotlib.pyplot as plt
from datasets import load_dataset
import pandas as pd
from datasets import load_dataset, load_metric

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

nltk.download("punkt")

2024-04-23 12:39:05.327489: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-23 12:39:05.327589: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-23 12:39:05.439301: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Bare expression: displays the chosen device as the cell output.
device

'cuda'

In [None]:
# Checkpoint identifier used for both the tokenizer and the model below.
model_ckpt = "google/pegasus-cnn_dailymail"

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Load the seq2seq model and move it to the device chosen in `device`.
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.

To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.

You will be able to reuse this secret in all of your notebooks.

Please note that authentication is recommended but still optional to access public models or datasets.


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# from datasets import load_dataset

# Load the "samsum" dataset; the cells below read its 'train', 'validation'
# and 'test' splits and the 'dialogue' / 'summary' columns.
dataset = load_dataset("samsum")

Downloading data: 100%|██████████| 6.06M/6.06M [00:00<00:00, 24.3MB/s]
Downloading data: 100%|██████████| 347k/347k [00:00<00:00, 2.38MB/s]
Downloading data: 100%|██████████| 335k/335k [00:00<00:00, 3.69MB/s]


Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

In [None]:
def convert_examples_to_features(example_batch):
    """Tokenize a batch of examples into model inputs and labels.

    Args:
        example_batch: batch mapping that provides 'dialogue' (source texts)
            and 'summary' (target texts) entries.

    Returns:
        dict with 'input_ids' and 'attention_mask' taken from the tokenized
        dialogues and 'labels' holding the token ids of the summaries.
    """
    # Dialogues are truncated to 1024 tokens, summaries to 128.
    input_encodings = tokenizer(example_batch['dialogue'], max_length=1024, truncation=True)

    # `text_target=` tokenizes the summaries as targets; it replaces the
    # deprecated `with tokenizer.as_target_tokenizer():` context manager.
    target_encodings = tokenizer(text_target=example_batch['summary'], max_length=128, truncation=True)

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }

# Apply the conversion to every split, one batch of examples at a time.
dataset_samsum_pt = dataset.map(convert_examples_to_features, batched=True)

In [None]:
dataset_samsum_pt['test']

Dataset({
    features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 819
})

In [None]:
# Keep only the first 8500 examples of the tokenized training split.
train_dataset=dataset_samsum_pt["train"].select(range(8500))
print(train_dataset)

Dataset({

    features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],

    num_rows: 8500

})


In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator built from the tokenizer and the model; it is handed to the Trainer
# as `data_collator` further down.
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

In [None]:
!pip install transformer -U
!pip install accelerate -U

[31mERROR: Could not find a version that satisfies the requirement transformer (from versions: none)[0m[31m

[0m[31mERROR: No matching distribution found for transformer[0m[31m





































In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration. With one example per device per step and 8
# accumulation steps, each optimizer update covers 1 * 8 = 8 examples per device.
trainer_args = TrainingArguments(
    output_dir='pegasus-samsum',
    num_train_epochs=2,
    warmup_steps=500,
    per_device_train_batch_size=1,   # batch size 1, kept small to fit a 15 GB GPU
    per_device_eval_batch_size=1,    # same batch size for evaluation
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=1_000_000,            # integer step count (was the float 1e6)
    gradient_accumulation_steps=8,   # accumulate gradients over 8 batches
)


In [None]:

# Train on `train_dataset`, the 8500-example subset already built from the
# tokenized training split, so the subset size is defined in one place only.
# Evaluation uses the full validation split.
trainer = Trainer(model=model_pegasus, args=trainer_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=train_dataset,
                  eval_dataset=dataset_samsum_pt["validation"])


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)



In [None]:
# Run fine-tuning with the arguments in `trainer_args`; the returned
# TrainOutput is displayed as the cell result.
trainer.train()


Step,Training Loss,Validation Loss
500,1.8436,1.525926
1000,1.6151,1.435444
1500,1.5522,1.415576
2000,1.459,1.399975


TrainOutput(global_step=2124, training_loss=1.6569541726408705, metrics={'train_runtime': 3361.121, 'train_samples_per_second': 5.058, 'train_steps_per_second': 0.632, 'total_flos': 6347380160544768.0, 'train_loss': 1.6569541726408705, 'epoch': 2.0})

In [None]:
from datasets import load_metric

# Load the Rouge metric.
# `trust_remote_code=True` is passed explicitly: the loader warns that this
# argument will become mandatory for this metric in the next major release
# of `datasets`.
rouge_metric = load_metric("rouge", trust_remote_code=True)


  rouge_metric = load_metric("rouge")


You can avoid this message in future by passing the argument `trust_remote_code=True`.

Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.



Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [None]:
en_paragraph="""LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details of how he\'ll mark his landmark birthday are under wraps. His agent and publicist had no comment on his plans. "I\'ll definitely have some sort of party," he said in an interview. "Hopefully none of you will be reading about it." Radcliffe\'s earnings from the first five Potter films have been held in a trust fund which he has not been able to touch. Despite his growing fame and riches, the actor says he is keeping his feet firmly on the ground. "People are always looking to say \'kid star goes off the rails,\'" he told reporters last month. "But I try very hard not to go that way because it would be too easy for them." His latest outing as the boy wizard in "Harry Potter and the Order of the Phoenix" is breaking records on both sides of the Atlantic and he will reprise the role in the last two films.  Watch I-Reporter give her review of Potter\'s latest » . There is life beyond Potter, however. 
The Londoner has filmed a TV movie called "My Boy Jack," about author Rudyard Kipling and his son, due for release later this year. He will also appear in "December Boys," an Australian film about four boys who escape an orphanage. Earlier this year, he made his stage debut playing a tortured teenager in Peter Shaffer\'s "Equus." Meanwhile, he is braced for even closer media scrutiny now that he\'s legally an adult: "I just think I\'m going to be more sort of fair game," he told Reuters. E-mail to a friend . Copyright 2007 Reuters. All rights reserved.This material may not be published, broadcast, rewritten, or redistributed."""

In [None]:
# Preview a handful of test dialogues instead of dumping the whole column
# into the cell output.
dataset_samsum_pt['test']['dialogue'][:5]

In [None]:
# #Tokenize the input text
# inputs = tokenizer(dataset_samsum_pt['test']['dialogue'], return_tensors="pt", max_length=1024, truncation=True)
# inputs = {key: value.to(model_pegasus.device) for key, value in inputs.items()}
# # Generate summary
# summary_ids = model_pegasus.generate(inputs["input_ids"], max_length=150, num_beams=8, length_penalty=0.0, early_stopping=True)

# summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
# # print(inputs)
# # print(summary_ids)
# print("Input Text:", en_paragraph)
# print("Summary:", summary)

In [None]:
# Summarise the first 100 test dialogues with the fine-tuned model.
final_summary = []

# Switch to inference mode so dropout is disabled during generation.
# NOTE(review): the model is presumably still in training mode after
# trainer.train() - confirm.
model_pegasus.eval()

for dialogue in dataset_samsum_pt['test'][:100]['dialogue']:
    # Tokenize the input text (truncated to 1024 tokens) and move the tensors
    # to the device the model lives on.
    inputs = tokenizer(dialogue, return_tensors="pt", max_length=1024, truncation=True)
    inputs = {key: value.to(model_pegasus.device) for key, value in inputs.items()}

    # Generate the summary; the attention mask produced by the tokenizer is
    # forwarded together with the input ids.
    summary_ids = model_pegasus.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150,
        num_beams=8,
        length_penalty=0.0,
        early_stopping=True,
    )

    # Decode the first returned sequence and collect it.
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    final_summary.append(summary)


In [None]:
final_summary


["Hannah has Betty's number. She doesn't know Larry well. Larry called Betty last time they were at the park together. Amanda will text Larry.",
 "Eric, Rob and Rob are going to watch some of Rob's stand-ups on Youtube. Eric likes the train part and Rob likes the machine.",
 'Lenny is looking for a pair of trousers. Bob has four pairs. Lenny already has purple trousers. Lenny will buy the first pair or the third pair.',
 'Emma and Will are going to have dinner tonight. Emma will be home soon. She will tell Will when she gets home that she will pick him up.',
 'Jane is in Warsaw. She lost her calendar, but she will see Ollie on Friday. They will have lunch together. Ollie will bring some sun with him.',
 'Benjamin, Elliot, Hilary and Daniel are meeting at La Cantina at 2 pm. Hilary is meeting French people who work on the history of food in colonial Mexico.',
 "Payton likes to buy clothes and books. Max likes shopping. Payton likes shopping, but he doesn't always buy what he likes. Max 

In [None]:
import evaluate

In [None]:
meteor = evaluate.load('meteor')

# Score the generated summaries against the reference summaries of the same
# test examples. The reference slice follows the number of predictions so
# both lists always have the same length.
predictions = final_summary
references = dataset_samsum_pt['test'][:len(predictions)]['summary']
results = meteor.compute(predictions=predictions, references=references)

[nltk_data] Downloading package wordnet to /root/nltk_data...

[nltk_data]   Package wordnet is already up-to-date!

[nltk_data] Downloading package punkt to /root/nltk_data...

[nltk_data]   Package punkt is already up-to-date!

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...

[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
results

{'meteor': 0.47139714569232083}

In [None]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """Lazily cut a sequence into consecutive slices of at most batch_size items.

    Each yielded slice starts where the previous one ended; only the last
    slice may be shorter than batch_size.
    """
    offsets = range(0, len(list_of_elements), batch_size)
    yield from (list_of_elements[offset:offset + batch_size] for offset in offsets)

In [None]:
def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                               batch_size=16, device=device,
                               column_text="article",
                               column_summary="highlights"):
    """Generate summaries for a dataset in batches and score them with `metric`.

    Args:
        dataset: object indexable by column name; `dataset[column_text]` and
            `dataset[column_summary]` must be sequences of equal length.
        metric: object exposing `add_batch(predictions=, references=)` and
            `compute()`.
        model: model whose `generate` method produces the summary token ids.
        tokenizer: tokenizer used to encode the texts and decode the output.
        batch_size: number of texts processed per generation call.
        device: device the input tensors are moved to before generation.
        column_text: name of the column holding the source texts.
        column_summary: name of the column holding the reference summaries.

    Returns:
        Whatever `metric.compute()` returns after all batches were added.
    """
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):

        inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                           padding="max_length", return_tensors="pt")

        # The length penalty keeps the model from generating sequences that
        # are too long.
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                   attention_mask=inputs["attention_mask"].to(device),
                                   length_penalty=0.8, num_beams=8, max_length=128)

        # Decode the generated token ids back into text.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                              clean_up_tokenization_spaces=True)
                             for s in summaries]

        # Replace the "<n>" marker with a space. The previous
        # `replace("", " ")` matched the empty string and therefore inserted
        # a space between every character of every summary.
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    # Finally compute and return the scores.
    score = metric.compute()
    return score

In [None]:
# Score the trainer's model on the raw test split, two dialogues per batch.
score = calculate_metric_on_test_ds(
    dataset['test'],
    rouge_metric,
    trainer.model,
    tokenizer,
    batch_size=2,
    column_text='dialogue',
    column_summary='summary',
)

rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
# Keep the mid F-measure of every ROUGE variant.
rouge_dict = {rouge_name: score[rouge_name].mid.fmeasure for rouge_name in rouge_names}

# One-row table labelled with the model name.
pd.DataFrame(rouge_dict, index=['pegasus'])

100%|██████████| 410/410 [12:19<00:00,  1.80s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.018748,0.000387,0.018638,0.018662


In [None]:
## Save model
# Written to the directory "pegasus-samsum-model", relative to the current
# working directory.
model_pegasus.save_pretrained("pegasus-samsum-model")


Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


In [None]:
import shutil

# Pack the "pegasus-samsum-model" directory into "pegasus-samsum-model.zip";
# the call returns the path of the archive it created.
shutil.make_archive("pegasus-samsum-model", 'zip', "pegasus-samsum-model")

'/content/pegasus-samsum-model.zip'

In [None]:
# NOTE(review): assumes Google Drive is already mounted under /content/drive;
# no mount call is visible in this notebook - confirm.
output_path = "/content/drive/My Drive/Colab Notebooks/"

# make_archive wrote "pegasus-samsum-model.zip"; no file carrying the
# "_8500_2_epochs" suffix is created anywhere, so the archive is renamed as
# part of the move instead of being looked up under a name that does not exist.
shutil.move("pegasus-samsum-model.zip",
            os.path.join(output_path, "pegasus-samsum-model_8500_2_epochs.zip"))


'/content/drive/My Drive/Colab Notebooks/pegasus-samsum-model_8500_2_epochs.zip'

In [None]:
## Save tokenizer
# Written to the directory "tokenizer"; the call returns the paths of the
# files it saved.
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/spiece.model',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [None]:
# Pack the "tokenizer" directory into "tokenizer.zip"; the call returns the
# path of the archive it created.
shutil.make_archive("tokenizer", 'zip', "tokenizer")

'/content/tokenizer.zip'

In [None]:
# NOTE(review): assumes Google Drive is already mounted under /content/drive;
# no mount call is visible in this notebook - confirm.
output_path = "/content/drive/My Drive/Colab Notebooks/"

# make_archive wrote "tokenizer.zip"; no file carrying the "_8500_2_epochs"
# suffix is created anywhere, so the archive is renamed as part of the move.
shutil.move("tokenizer.zip",
            os.path.join(output_path, "tokenizer_8500_2_epochs.zip"))

'/content/drive/My Drive/Colab Notebooks/tokenizer_8500_2_epochs.zip'

The fine-tuned model has been saved to the Hugging Face Hub; the cells below load it from there.

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("nijpadariya/Pegasus_fine_tune")
# Move the model to the selected device. Without this it stays on the CPU,
# and the generation loop, which sends its inputs to `model.device`, would
# run entirely on the CPU.
model = AutoModelForSeq2SeqLM.from_pretrained("nijpadariya/Pegasus_fine_tune").to(device)

In [None]:
# Load the "samsum" dataset again (same call as earlier in this notebook), so
# `dataset` holds the raw, untokenized splits.
dataset = load_dataset("samsum")

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [None]:
# Summarise the first 800 test dialogues with the model loaded from the Hub.
model_summary = []

# Fetch the column once instead of repeating the column lookup on every
# loop iteration.
test_dialogues = dataset['test']['dialogue'][:800]

for dialogue in test_dialogues:
    inputs = tokenizer(dialogue, return_tensors="pt", max_length=1024, truncation=True)
    inputs = {key: value.to(model.device) for key, value in inputs.items()}

    # Generate summary; the tokenizer's attention mask is forwarded together
    # with the input ids.
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150,
        num_beams=8,
        length_penalty=0.0,
        early_stopping=True,
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    model_summary.append(summary)

# Number of summaries produced (replaces the manual per-iteration counter).
count = len(model_summary)


In [None]:
pip install -U sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-2.7.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
from sentence_transformers import SentenceTransformer, util

# The embedding model gets its own name so that `model` (the summarisation
# model) is not overwritten and the generation cell can still be re-run.
similarity_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Compare every generated summary with the reference summary at the same index.
count = len(model_summary)
reference_summaries = list(dataset['test']['summary'][:count])

# Encode each list in a single call rather than one sentence at a time;
# progress bars are disabled to keep the cell output short.
reference_embeddings = similarity_model.encode(
    reference_summaries, convert_to_tensor=True, show_progress_bar=False)
generated_embeddings = similarity_model.encode(
    model_summary, convert_to_tensor=True, show_progress_bar=False)

# The diagonal of the pairwise matrix holds the similarity of each
# reference/generated pair. `final_val` is the SUM over all pairs; the mean is
# computed in the next cell from `final_val` and `count`.
pair_similarities = util.pytorch_cos_sim(reference_embeddings, generated_embeddings).diagonal()
final_val = pair_similarities.sum().item()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Keep the mean under its own name: overwriting `final_val` would divide it by
# `count` again every time this cell is re-run.
mean_similarity = final_val / count
print(mean_similarity)



0.7356697076559067


In [None]:

# sample_text = dataset_samsum["test"]["dialogue"]

# reference = dataset_samsum["test"]["summary"]

In [None]:

# gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128}


In [None]:
# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# model_ckpt = "nijpadariya/Pegasus_fine_tune"
# # model_ckpt2 = "google/pegasus-cnn_dailymail"

# tokenizer = AutoTokenizer.from_pretrained("nijpadariya/Pegasus_fine_tune")
# # tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

In [None]:

# # print("Dialogue:")
# # print(sample_text)


# # print("\nReference Summary:")
# # print(reference)
# # Initialize an empty list to store the generated summaries
# generated_summaries = []

# # Loop over each sample text
# for text in sample_text:
#     # Generate a summary for the current text
#     summary = pipe(text, **gen_kwargs)[0]['generated_text']
#     # Append the generated summary to the list
#     generated_summaries.append(summary)

In [None]:
# print("summary Generated")