In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
from difflib import SequenceMatcher
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

def summarize_with_model(model_name, input_text):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    input_text = "summarize: " + input_text if "t5" not in model_name else input_text
    summary_pipeline = pipeline("summarization", model=model, tokenizer=tokenizer)
    summary = summary_pipeline(input_text, max_length=150, min_length=40, truncation=True)
    return summary[0]['summary_text']

def summarize_with_t5(model_name, input_text):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
    
    input_text = "summarize: " + input_text
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
    
    summary_ids = model.generate(input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    return summary
 
def calculate_cosine_similarity(text1, text2):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SentenceTransformer('all-MiniLM-L6-v2').to(device)

    embeddings1 = model.encode(text1, convert_to_tensor=True)
    embeddings2 = model.encode(text2, convert_to_tensor=True)
    similarity = 1 - cosine(embeddings1.cpu(), embeddings2.cpu())
    return similarity

def calculate_difflib_similarity(text1, text2):
    return SequenceMatcher(None, text1, text2).ratio()







In [2]:
import mysql.connector as msc
conn = msc.connect(
    host="localhost",
    user="root",
    password="1234",
    database="IR_policy"
)


In [3]:
cursor = conn.cursor()

In [4]:
cursor.execute("SELECT * FROM apps_data")
for row in cursor:    
    input_text=row[3]
    model_names = ['facebook/bart-large-cnn','google/pegasus-large']
    model_variants = ['t5-small', 't5-base', 't5-large']
    summaries_t5 = {variant: summarize_with_t5(variant, input_text) for variant in model_variants}
    summaries_new_models={variant: summarize_with_model(variant, input_text) for variant in model_names}
    print("For T-5")
    for model_name, summary in summaries_t5.items():
        print(f"Model: {model_name}")
        print("Summary:", summary)
        difflib_similarity = calculate_difflib_similarity(input_text, summary)
        cosine_similarity = calculate_cosine_similarity(input_text, summary)
        print(f"Difflib Similarity: {difflib_similarity}")
        print(f"Cosine Similarity: {cosine_similarity}\n")
    print("For New Models")
    for model_name, summary in summaries_new_models.items():
        print(f"Model: {model_name}")
        print("Summary:", summary)
        difflib_similarity = calculate_difflib_similarity(input_text, summary)
        cosine_similarity = calculate_cosine_similarity(input_text, summary)
        print(f"Difflib Similarity: {difflib_similarity}")
        print(f"Cosine Similarity: {cosine_similarity}\n")
    

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Special tokens hav

For T-5
Model: t5-small
Summary: WhatsApp Legal Info If you live in the European Region, WhatsApp Ireland Limited provides our Services to you under this Terms of Service and Privacy Policy. our Privacy Policy helps explain our data practices, including the information we process to provide our Services. our Privacy Policy talks about what information we process to provide our Services.
Difflib Similarity: 0.0010576015108593013
Cosine Similarity: 0.8031375408172607

Model: t5-base
Summary: WhatsApp Ireland Limited provides the services to you under this Terms of Service and Privacy Policy. our Privacy Policy helps explain our data practices, including the information we process to provide our Services. Respect for your privacy is coded into our DNA.
Difflib Similarity: 0.004472577038244324
Cosine Similarity: 0.8346625566482544

Model: t5-large
Summary: WhatsApp Ireland Limited provides the services to you under this Terms of Service and Privacy Policy. this Privacy Policy applies to al

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or 

For T-5
Model: t5-small
Summary: this privacy policy applies to you if you are a Weixin user. you are subject to the Weixin Agreement on Software License and Service of Tencent Weixin. if you change your phone number, this Privacy Policy applies to you.
Difflib Similarity: 0.0007284150144544854
Cosine Similarity: 0.7054513692855835

Model: t5-base
Summary: we respect your concerns about privacy and appreciate your trust and confidence in us. this summary is to help you navigate the Privacy Policy and it is not a substitute for reading everything. this Privacy Policy does not apply to you if you are a Weixin user.
Difflib Similarity: 0.0015009551532793597
Cosine Similarity: 0.636496365070343

Model: t5-large
Summary: WeChat respects your concerns about privacy and appreciates your trust and confidence in us. this Privacy Policy only applies to you if you are a WeChat user. you are a Weixin user if you have registered by linking a mobile number that uses international dialing code +86 ("

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


For T-5
Model: t5-small
Summary: our products and services provide fast and fun ways to express yourself, live at the moment, learn about the world and have fun together. we want to be upfront about the information we collect, how we use it, whom we share it with and the controls we give you to access, update and delete your information.
Difflib Similarity: 0.000544489314397205
Cosine Similarity: 0.45220687985420227

Model: t5-base
Summary: Snapchat, Bitmoji, Spectacles advertising, commerce and others link to this Privacy Policy. when you use these services, you’ll share some information with us. we want to be upfront about the information we collect, how we use it, who we share it with and the controls we give you to access, update and delete your information.
Difflib Similarity: 0.025203989120580236
Cosine Similarity: 0.8158561587333679

Model: t5-large
Summary: Snapchat is a camera company. we want to be upfront about the information we collect, how we use it, whom we share it with

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


KeyboardInterrupt: 