<a href="https://colab.research.google.com/github/yuvanshanka4/dataaces_works/blob/main/day5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
!pip install transformers
!pip install multiprocessing
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m41.1 MB/s[0m eta [36m0:00:0

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Score how close each row's 'summary' is to its 'snippet' using TF-IDF cosine
# similarity, and store the result as a 0-100 percentage in a new 'similarity'
# column of a new workbook.
# This cell reads the workbook written by the BERT summary cell of this notebook.
excel_file_path = '/content/Dataset1_with_bert_summaries.xlsx'
df = pd.read_excel(excel_file_path)

summary_column_name = 'summary'
snippet_column_name = 'snippet'

# Missing cells become empty strings and every value is coerced to str, so the
# vectorizer never receives NaN floats or other non-text values.
snippet_texts = df[snippet_column_name].fillna('').astype(str)
summary_texts = df[summary_column_name].fillna('').astype(str)

# The vocabulary and IDF weights are learned from the snippets only; summaries
# are projected into that same space with a single transform call.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(snippet_texts)
summary_matrix = tfidf_vectorizer.transform(summary_texts)

# Rows are addressed by position, not by DataFrame index label, so the pairing
# stays correct even if the frame's index is not 0..n-1.
# TF-IDF weights are non-negative, so cosine similarity lies in [0, 1] and is
# scaled straight to a percentage. An empty summary or snippet yields an
# all-zero vector and therefore a similarity of 0.
similarities = [
    float(cosine_similarity(summary_matrix[position], tfidf_matrix[position])[0, 0]) * 100
    for position in range(len(df))
]

df['similarity'] = similarities

updated_excel_file_path = 'UpdatatedDataset1_full.xlsx'
df.to_excel(updated_excel_file_path, index=False)

print("Similarity percentages added and new Excel file created.")

Similarity percentages added and new Excel file created.


In [9]:
!pip install torch




In [3]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import multiprocessing

# Tunables for this cell: the column holding the text to process and the
# number of worker processes handed to the multiprocessing pool below.
content_column_name = 'content'
num_processes = 4

# Input workbook, read into a DataFrame.
excel_file_path = 'Dataset1_full.xlsx'
df = pd.read_excel(excel_file_path)

# Tokenizer and model are both loaded from the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def generate_summary(content):
    """Return ``content`` cut down to what fits in one BERT input window.

    The text is tokenized with the module-level ``tokenizer``, truncated so the
    sequence (special tokens included) is at most
    ``model.config.max_position_embeddings`` tokens long, and decoded back to a
    string with the special tokens stripped.

    Args:
        content: Text of one row. ``None``/NaN (e.g. an empty cell read by
            pandas) is treated as empty text; any other non-string value is
            converted with ``str``.

    Returns:
        str: The decoded, possibly truncated text; ``""`` for missing input.

    NOTE(review): no model output is used. The result is the tokenizer
    round-trip of the truncated input, not a learned summary. The earlier
    version ran ``model(input_ids)`` and discarded the result, so that forward
    pass was removed. Decoding is not guaranteed to reproduce the input
    byte-for-byte (the uncased vocabulary presumably lowercases it) — confirm
    this is acceptable, or plug in a real summarizer here.
    """
    # Slicing a NaN float would raise TypeError, so missing values are mapped
    # to empty text up front.
    text = "" if pd.isna(content) else str(content)

    # Truncate by tokens, not characters: the limit is a count of positions.
    # The tokenizer adds [CLS]/[SEP] itself, so they are not written into the
    # text by hand (doing both duplicated them).
    input_ids = tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    return tokenizer.decode(input_ids, skip_special_tokens=True)
def process_chunk(chunk):
    """Run ``generate_summary`` over every item of ``chunk``.

    Returns a list with one result per item, in the same order as ``chunk``.
    """
    return list(map(generate_summary, chunk))
# Split the column into at most ``num_processes`` contiguous chunks.
# Ceiling division floored at 1 keeps the slice step positive: a plain
# ``len(df) // num_processes`` is 0 when there are fewer rows than processes,
# and ``range(0, n, 0)`` raises ValueError.
contents = df[content_column_name].values.tolist()  # converted once, not per chunk
chunk_size = max(1, -(-len(contents) // num_processes))
data_chunks = [contents[start:start + chunk_size] for start in range(0, len(contents), chunk_size)]

# pool.map returns results in the order of its inputs, so flattening the
# per-chunk lists below restores the original row order.
with multiprocessing.Pool(processes=num_processes) as pool:
    summarizations_chunks = pool.map(process_chunk, data_chunks)

summarizations = [summary for chunk in summarizations_chunks for summary in chunk]

# Attach the results as a new column and write the frame to a new workbook.
df['summary'] = summarizations
updated_excel_file_path = 'Dataset1_with_bert_summaries.xlsx'
df.to_excel(updated_excel_file_path, index=False)

print("BERT summaries added and new Excel file created.")


BERT summaries added and new Excel file created.
