<a href="https://colab.research.google.com/github/tubagokhan/GrinCH/blob/main/GraphInClusterPubMedTextRank_nli_distilroberta_base_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
!pip install transformers
!pip install datasets
!pip install torch
!pip install scikit-learn
!pip install matplotlib
!pip install sentence_transformers
!pip install py-rouge==1.1



In [9]:
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from nltk import sent_tokenize,word_tokenize

from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

import warnings
warnings.filterwarnings("ignore")
import re

from datasets import load_dataset
from scipy.spatial import distance

import math
from math import*

import rouge

import networkx as nx

import time
from transformers import logging
logging.set_verbosity_error()

import json

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [10]:
# Preprocessing method
def preprocess_corpus(text):
    # Remove special characters and extra whitespaces
    text = re.sub(r"[^a-zA-Z0-9\s.]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text

def textSentenceCount(Text):
    number_of_sentences = sent_tokenize(Text)
    count=(len(number_of_sentences))
    return count

def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

def createSummaryUsingKMeans(corpus, modelName):
    sentences = sent_tokenize(corpus)
    model = SentenceTransformer(modelName)
    sentence_embeddings = model.encode(sentences)

    optimum_clusters = find_optimum_clusters(sentence_embeddings, int(len(sentences) / 3))
    print("Optimum cluster number:", optimum_clusters)

    # Perform kmean clustering
    kmeans = KMeans(n_clusters=optimum_clusters, random_state=0, n_init='auto').fit(sentence_embeddings)

    chosen_sentence_indexes=[]
    cluster_rank=[]
    for cluster_id in range(optimum_clusters):
        cluster_indices = np.where(kmeans.labels_ == cluster_id)[0]
        cluster_weight=clusterWeight(sentence_embeddings, cluster_indices)
        cluster_rank.append(cluster_weight)
        chosen_sentence_index=text_rank(sentence_embeddings, cluster_indices)
        chosen_sentence_indexes.append(chosen_sentence_index)

    if optimum_clusters>10:
      choosen_clusters=k_highest_indices(cluster_rank, 10)
      chosen_sentence_indexes = [chosen_sentence_indexes[i] for i in choosen_clusters]

    sorted_indexes=sorted(chosen_sentence_indexes)

    chosen_sentences = []
    for chosen_sentence_index in sorted_indexes:
        chosen_sentences.append(sentences[chosen_sentence_index])

    summary = " ".join(chosen_sentences)

    return summary

#function calculates the optimal number of clusters using the elbow method. The function plots the elbow curve, which shows the inertia values for different cluster numbers. The user can visually inspect the plot to determine the elbow point, indicating the optimal number of clusters.
def find_optimum_clusters(data, max_clusters):
    inertias = []
    for k in range(1, max_clusters + 1):
        kmeans = KMeans(n_clusters=k, random_state=0).fit(data)
        inertias.append(kmeans.inertia_)

    # Plotting the elbow curve
    #plt.plot(range(1, max_clusters + 1), inertias)
    #plt.xlabel("Number of Clusters")
    #plt.ylabel("Inertia")
    #plt.title("Elbow Curve")
    #plt.show()

    # Calculate the optimal number of clusters using the elbow method
    diff = np.diff(inertias)
    acceleration = np.diff(diff)
    opt_cluster_num = acceleration.argmin() + 2  # Adding 2 to get the index of the minimum acceleration
    return opt_cluster_num

def clusterWeight(sentence_embeddings, cluster_indices):
  length=len(cluster_indices)
  sentenceGraph =np.zeros((length, length))
  for x in range(length):
    for y in range(length):
      if x>y:
        similarity= cosine(sentence_embeddings[cluster_indices[x]],sentence_embeddings[cluster_indices[y]])
        sentenceGraph[x][y]=abs(similarity)
    #print(sentenceGraph)
  SumElement=(np.concatenate(sentenceGraph).sum())
  return round(SumElement,2)

def k_highest_indices(cluster_weight_list, k):
    # Enumerate the list to keep track of original indices
    enumerated_list = list(enumerate(cluster_weight_list))

    # Sort the enumerated list in descending order based on the float values
    sorted_list = sorted(enumerated_list, key=lambda x: x[1], reverse=True)

    # Extract the indices of the k highest elements
    k_highest_indices = [item[0] for item in sorted_list[:k]]

    return k_highest_indices

def text_rank(sentence_embeddings, indexes):
    # Filter embeddings based on the input indexes
    filtered_embeddings = np.array([sentence_embeddings[i] for i in indexes])

    # Calculate cosine similarity between filtered sentence embeddings
    similarity_matrix = cosine_similarity(filtered_embeddings, filtered_embeddings)

    # Create a graph using similarity matrix
    graph = nx.from_numpy_array(similarity_matrix)
    try:
      # Apply TextRank algorithm to rank the sentences
      scores = nx.pagerank(graph, max_iter=1000)
          # Get the index of the highest-ranked sentence
      highest_ranked_index = max(scores, key=scores.get)

      # Get the original index from the list of indexes
      highest_ranked_original_index = indexes[highest_ranked_index]
      return highest_ranked_original_index
    except:
      print("######### Error with PageRank")
      return indexes[0]





In [11]:
#modelName = 'all-mpnet-base-v2'
#modelName ='all-distilroberta-v1'
modelName='nli-distilroberta-base-v2'
dataset = load_dataset("scientific_papers", "pubmed")

startTimeforOverall = time.time()
all_summary = []
all_goldstandart = []

N = 500
startN = 0

for d in range(N):
    startTimeforDocument = time.time()
    print("Document:", startN + d + 1)
    corpus = dataset['train']['article'][startN + d]
    corpus = preprocess_corpus(corpus)
    print("Document sentence number:", textSentenceCount(corpus))


    if textSentenceCount(corpus) > 8:
        summary = createSummaryUsingKMeans(corpus, modelName)
        print("Summary sentence number:", textSentenceCount(summary))
        all_summary.append(summary)
    else:
      all_summary.append(corpus)
      print("Corpus is less than 9 sentence, summarization didn't apply sentence number:", textSentenceCount(corpus))

    all_goldstandart.append(dataset['train']['abstract'][startN + d])
    elapsedTimeforDocument = time.time() - startTimeforDocument
    elapsedTimeforAll = time.time() - startTimeforOverall
    print('Document processing time: '+time.strftime("%M:%S", time.gmtime(elapsedTimeforDocument)))
    print('Total processing time: '+time.strftime("%d:%H:%M:%S", time.gmtime(elapsedTimeforAll)))

    print("----------------------------------")


Document: 1
Document sentence number: 173


Downloading (…)7023f/.gitattributes:   0%|          | 0.00/736 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)433037023f/README.md:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

Downloading (…)3037023f/config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)33037023f/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)7023f/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading (…)33037023f/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)037023f/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Optimum cluster number: 18
Summary sentence number: 10
Document processing time: 01:03
Total processing time: 01:00:01:03
----------------------------------
Document: 2
Document sentence number: 81
Optimum cluster number: 11
Summary sentence number: 10
Document processing time: 00:19
Total processing time: 01:00:01:22
----------------------------------
Document: 3
Document sentence number: 45
Optimum cluster number: 6
Summary sentence number: 6
Document processing time: 00:10
Total processing time: 01:00:01:33
----------------------------------
Document: 4
Document sentence number: 199
Optimum cluster number: 24
Summary sentence number: 10
Document processing time: 01:00
Total processing time: 01:00:02:33
----------------------------------
Document: 5
Document sentence number: 42
Optimum cluster number: 13
Summary sentence number: 10
Document processing time: 00:09
Total processing time: 01:00:02:42
----------------------------------
Document: 6
Document sentence number: 95
Optimum clu

In [12]:
def prepare_results(m, p, r, f):
    return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(m, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)

def rougeEvaluation(all_hypothesis, all_references):

    for aggregator in ['Avg']:
        print('Evaluation with {}'.format(aggregator))
        apply_avg = aggregator == 'Avg'
        apply_best = aggregator == 'Best'

        evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                               max_n=4,
                               limit_length=False,
                               length_limit=1000,
                               length_limit_type='words',
                               apply_avg=apply_avg,
                               apply_best=apply_best,
                               alpha=0.2, # Default F1_score
                               weight_factor=1.2,
                               stemming=True)

        scores = evaluator.get_scores(all_hypothesis, all_references)

        for metric, results in sorted(scores.items(), key=lambda x: x[0]):
            if not apply_avg and not apply_best: # value is a type of list as we evaluate each summary vs each reference
                for hypothesis_id, results_per_ref in enumerate(results):
                    nb_references = len(results_per_ref['p'])
                    for reference_id in range(nb_references):
                        print('\tHypothesis #{} & Reference #{}: '.format(hypothesis_id, reference_id))
                        print('\t' + prepare_results(metric,results_per_ref['p'][reference_id], results_per_ref['r'][reference_id], results_per_ref['f'][reference_id]))
                print()
            else:
                print(prepare_results(metric, results['p'], results['r'], results['f']))
        print()

In [13]:
len(all_goldstandart)

500

In [14]:
rougeEvaluation(all_summary, all_goldstandart)



Evaluation with Avg
	rouge-1:	P: 35.13	R: 49.82	F1: 43.96
	rouge-2:	P: 12.43	R: 16.55	F1: 14.79
	rouge-3:	P:  6.61	R:  8.03	F1:  7.30
	rouge-4:	P:  4.47	R:  5.00	F1:  4.62
	rouge-l:	P: 33.35	R: 44.30	F1: 40.27
	rouge-w:	P: 15.96	R: 11.49	F1: 11.73



In [15]:
# Play an audio beep. Any audio URL will do.
from google.colab import output
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

In [16]:
# Save files

import json
import os

# Define the file path
file_path = "/content/drive/Othercomputers/My Laptop/DriveAccess/GraphInClusterPubMedTextRanksavedDocumentsandSummaries-nli-distilroberta-base-v2.json"  # Replace with the desired file path

# Check if the file already exists
if os.path.exists(file_path):
    # Read the existing data from the JSON file
    with open(file_path, "r") as file:
        existing_data = json.load(file)

    # Combine the existing data with the new data (lists)
    existing_summary = existing_data.get("all_summary", [])
    existing_goldstandart = existing_data.get("all_goldstandart", [])

    # Assuming you have three new lists: new_intermediate_summary, new_summary, new_goldstandart
    # Sample new lists (replace these with your actual lists)
    new_summary = all_summary
    new_goldstandart = all_goldstandart


    # Merge the new lists with the existing ones
    all_summary = existing_summary + new_summary
    all_goldstandart = existing_goldstandart + new_goldstandart

# Combine the lists into a dictionary for easy serialization
data = {
    "all_summary": all_summary,
    "all_goldstandart": all_goldstandart
}

# Save the data to a JSON file
with open(file_path, "w") as file:
    json.dump(data, file)

print("Lists have been saved to", file_path)


Lists have been saved to /content/drive/Othercomputers/My Laptop/DriveAccess/GraphInClusterPubMedTextRanksavedDocumentsandSummaries-nli-distilroberta-base-v2.json


In [17]:
print(len(data['all_summary']))

500


In [18]:
rougeEvaluation(data['all_summary'], data['all_goldstandart'])

Evaluation with Avg
	rouge-1:	P: 35.13	R: 49.82	F1: 43.96
	rouge-2:	P: 12.43	R: 16.55	F1: 14.79
	rouge-3:	P:  6.61	R:  8.03	F1:  7.30
	rouge-4:	P:  4.47	R:  5.00	F1:  4.62
	rouge-l:	P: 33.35	R: 44.30	F1: 40.27
	rouge-w:	P: 15.96	R: 11.49	F1: 11.73



In [19]:
# Play an audio beep. Any audio URL will do.
from google.colab import output
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

In [20]:
sentencenumber=0
for i in range(len(all_summary)):
  sentencenumber=sentencenumber+textSentenceCount(all_summary[i])

avg=sentencenumber/len(all_summary)
avg

8.472

In [21]:
sentencenumber=0
for i in range(len(all_goldstandart)):
  sentencenumber=sentencenumber+textSentenceCount(all_goldstandart[i])

avg=sentencenumber/len(all_goldstandart)
avg

7.474