<a href="https://colab.research.google.com/github/tubagokhan/GrinCH/blob/main/GraphInClusterPubMedTextRank_nli_distilroberta_base_v2_SentenceDistrubtion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
# Mount Google Drive at /content/drive; the input/output paths used later in
# this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
# Install the third-party packages this notebook relies on.
# Only py-rouge is pinned to a version; the other packages resolve to whatever
# release is current when the cell runs.
!pip install transformers
!pip install datasets
!pip install torch
!pip install scikit-learn
!pip install matplotlib
!pip install sentence_transformers
!pip install py-rouge==1.1



In [26]:
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from nltk import sent_tokenize,word_tokenize

from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

import warnings
warnings.filterwarnings("ignore")
import re

from datasets import load_dataset
from scipy.spatial import distance

import math
from math import*

import rouge

import networkx as nx

import time
from transformers import logging
logging.set_verbosity_error()

import json
import os

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [27]:
# Preprocessing method
def preprocess_corpus(text):
    """Clean raw article text before sentence splitting.

    Every character that is not an ASCII letter, a digit, whitespace or a
    period is dropped, then each run of whitespace is collapsed into a single
    space. Returns the cleaned string.
    """
    without_symbols = re.sub(r"[^a-zA-Z0-9\s.]", "", text)
    return re.sub(r"\s+", " ", without_symbols)

def textSentenceCount(Text):
    """Return the number of sentences that `sent_tokenize` finds in `Text`."""
    return len(sent_tokenize(Text))

def cosine(u, v):
    """Cosine similarity of two vectors: dot(u, v) / (||u|| * ||v||)."""
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    return np.dot(u, v) / norm_product

# SentenceTransformer instances keyed by model name, so that summarizing many
# documents in a loop loads each model once instead of once per document.
_SENTENCE_MODEL_CACHE = {}


def _get_sentence_model(modelName):
    """Return the SentenceTransformer for `modelName`, loading it on first use only."""
    if modelName not in _SENTENCE_MODEL_CACHE:
        _SENTENCE_MODEL_CACHE[modelName] = SentenceTransformer(modelName)
    return _SENTENCE_MODEL_CACHE[modelName]


def createSummaryUsingKMeans(corpus, modelName, maxSummarySentences=10, sentDistPath=None):
    """Build an extractive summary: cluster sentences, pick one sentence per cluster.

    Steps:
      1. Split `corpus` into sentences and embed them with the named model.
      2. Choose a cluster count with `find_optimum_clusters` (searching up to
         one third of the sentence count) and run K-Means.
      3. For every cluster, record its weight (`clusterWeight`) and its
         representative sentence (`text_rank`).
      4. If there are more clusters than `maxSummarySentences`, keep only the
         representatives of the highest-weight clusters.
      5. Emit the kept sentences in their original document order.

    Args:
        corpus: Text to summarize.
        modelName: SentenceTransformer model name.
        maxSummarySentences: Upper bound on the number of summary sentences
            (default 10, the value that used to be hard-coded).
        sentDistPath: JSON file that receives the chosen sentence indexes.
            When None, the module-level `file_Path_Sent_Dist` is used, which
            must then be defined before this function is called.

    Returns:
        The summary: the selected sentences joined by single spaces.

    Side effects:
        Prints the chosen cluster count and appends a record to the
        sentence-distribution JSON file.
    """
    sentences = sent_tokenize(corpus)
    sentence_embeddings = _get_sentence_model(modelName).encode(sentences)

    optimum_clusters = find_optimum_clusters(sentence_embeddings, len(sentences) // 3)
    print("Optimum cluster number:", optimum_clusters)

    # Perform k-means clustering with the selected number of clusters.
    kmeans = KMeans(n_clusters=optimum_clusters, random_state=0, n_init='auto').fit(sentence_embeddings)

    chosen_sentence_indexes = []
    cluster_rank = []
    for cluster_id in range(optimum_clusters):
        cluster_indices = np.where(kmeans.labels_ == cluster_id)[0]
        cluster_rank.append(clusterWeight(sentence_embeddings, cluster_indices))
        chosen_sentence_indexes.append(text_rank(sentence_embeddings, cluster_indices))

    # Too many clusters: keep only the representatives of the heaviest ones.
    if optimum_clusters > maxSummarySentences:
        kept_clusters = k_highest_indices(cluster_rank, maxSummarySentences)
        chosen_sentence_indexes = [chosen_sentence_indexes[i] for i in kept_clusters]

    # Restore document order so the summary reads in the original sequence.
    sorted_indexes = sorted(chosen_sentence_indexes)

    if sentDistPath is None:
        sentDistPath = file_Path_Sent_Dist
    saveSummariesinJsonFile(len(sentences), sorted_indexes, sentDistPath)

    return " ".join(sentences[index] for index in sorted_indexes)

def convert_int64_to_int(obj):
    """`json.dump(default=...)` hook that converts integer objects to built-in int.

    The json module only calls the `default` hook for objects it cannot
    serialize itself, so the values that actually reach this function are
    NumPy integer scalars such as `np.int64`. Those are not instances of the
    built-in `int`, so they must be matched explicitly via `np.integer`.

    Args:
        obj: The object json could not serialize.

    Returns:
        `int(obj)` for built-in or NumPy integers.

    Raises:
        TypeError: For any other type, as the json `default` protocol requires.
    """
    if isinstance(obj, (int, np.integer)):
        return int(obj)
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

def saveSummariesinJsonFile(num_sents, sorted_indexes,file_Path_Sent_Dist):
    """Append one document's sentence-selection record to a JSON list file.

    The record stores the document's sentence count and the chosen sentence
    indexes, each wrapped in its own single-element list. Records already in
    the file are kept; a missing file or a file with invalid JSON starts a
    fresh list. The whole list is rewritten on every call.
    """
    record = {
        "num_sents": num_sents,
        "summary": [[int(position)] for position in sorted_indexes]
    }

    # Start from whatever is already stored, falling back to an empty list.
    records = []
    if os.path.exists(file_Path_Sent_Dist):
        try:
            with open(file_Path_Sent_Dist, "r") as handle:
                records = json.load(handle)
        except json.JSONDecodeError:
            records = []

    records.append(record)

    with open(file_Path_Sent_Dist, "w") as handle:
        json.dump(records, handle, indent=2, default=convert_int64_to_int)

def find_optimum_clusters(data, max_clusters):
    """Choose a cluster count from the K-Means inertia curve.

    K-Means is fitted for k = 1 .. max_clusters and the inertia of each fit is
    recorded. The discrete second difference ("acceleration") of that curve is
    computed and the k at its minimum is returned. Nothing is plotted.

    Args:
        data: Sequence/array of samples (one row per sample).
        max_clusters: Largest k to try. Values larger than the number of
            samples are capped, because K-Means cannot fit more clusters than
            samples.

    Returns:
        The selected number of clusters. When fewer than three values of k can
        be tried, no second difference exists, so the (capped) `max_clusters`
        itself is returned, with a floor of 1.
    """
    max_clusters = min(int(max_clusters), len(data))

    # A second difference needs at least three inertia values; without this
    # guard argmin() would be called on an empty array and raise ValueError.
    if max_clusters < 3:
        return max(1, max_clusters)

    inertias = []
    for k in range(1, max_clusters + 1):
        kmeans = KMeans(n_clusters=k, random_state=0).fit(data)
        inertias.append(kmeans.inertia_)

    diff = np.diff(inertias)
    acceleration = np.diff(diff)
    # acceleration[i] is centred on inertias[i + 1], i.e. on k = i + 2.
    # NOTE(review): elbow heuristics usually take the *maximum* second
    # difference; argmin is kept here so existing results do not change --
    # confirm this choice is intentional.
    opt_cluster_num = acceleration.argmin() + 2
    return opt_cluster_num

def clusterWeight(sentence_embeddings, cluster_indices):
  """Sum of absolute pairwise cosine similarities inside one cluster.

  Each unordered pair of cluster members is counted once (strict lower
  triangle of the pairwise matrix). The total is rounded to two decimals.
  """
  size = len(cluster_indices)
  pairwise = np.zeros((size, size))
  for row in range(1, size):
    for col in range(row):
      score = cosine(sentence_embeddings[cluster_indices[row]],
                     sentence_embeddings[cluster_indices[col]])
      pairwise[row][col] = abs(score)
  total = np.concatenate(pairwise).sum()
  return round(total, 2)

def k_highest_indices(cluster_weight_list, k):
    """Return the positions of the k largest values, largest first.

    Equal values keep their original relative order (the sort is stable).
    If k exceeds the list length, every position is returned.
    """
    weights = list(cluster_weight_list)
    ranked_positions = sorted(
        range(len(weights)),
        key=weights.__getitem__,
        reverse=True,
    )
    return ranked_positions[:k]

def text_rank(sentence_embeddings, indexes):
    """Pick the most central sentence of a cluster with PageRank.

    A graph is built from the cosine-similarity matrix of the selected
    embeddings and `nx.pagerank` scores its nodes; the sentence with the
    highest score wins.

    Args:
        sentence_embeddings: Embeddings of all sentences in the document.
        indexes: Positions (into `sentence_embeddings`) of the cluster members.

    Returns:
        The element of `indexes` with the highest PageRank score. If PageRank
        fails, the first element of `indexes` is returned as a best-effort
        fallback and the error is printed.
    """
    # Restrict the embeddings to the cluster members.
    filtered_embeddings = np.array([sentence_embeddings[i] for i in indexes])

    # Pairwise cosine similarity becomes the weighted adjacency matrix.
    similarity_matrix = cosine_similarity(filtered_embeddings, filtered_embeddings)
    graph = nx.from_numpy_array(similarity_matrix)

    try:
        scores = nx.pagerank(graph, max_iter=1000)
    except Exception as error:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop a long run, and report
        # what actually went wrong.
        print("######### Error with PageRank", repr(error))
        return indexes[0]

    # Graph nodes are positions within `indexes`; map the winner back.
    highest_ranked_index = max(scores, key=scores.get)
    return indexes[highest_ranked_index]





In [28]:
def save_data_to_json(file_path, summary, goldstandard, document):
    """Append one (summary, gold standard, document) triple to a JSON file.

    The file holds a single object with three parallel lists:
    "all_summary", "all_goldstandard" and "all_document". Existing content is
    loaded first; a missing file starts from empty lists. The whole object is
    rewritten on every call.

    Args:
        file_path: Path of the JSON file to update.
        summary: Generated summary text.
        goldstandard: Reference summary text.
        document: Source document text.

    Errors are reported with a printed message instead of being raised, so a
    failed save does not abort a long-running batch.
    """
    try:
        data = {}
        try:
            # Read with the same encoding that is used for writing below.
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except FileNotFoundError:
            pass

        # setdefault attaches a new list to `data` when a key is missing.
        # dict.get(key, []) would hand back a detached list, and items
        # appended to it would never reach the file.
        data.setdefault('all_summary', []).append(summary)
        data.setdefault('all_goldstandard', []).append(goldstandard)
        data.setdefault('all_document', []).append(document)

        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump(data, file)

        print("Data saved successfully.")
    except Exception as e:
        print(f"Error: {e}")

In [29]:
# --- Run configuration -------------------------------------------------------
#modelName = 'all-mpnet-base-v2'
modelName ='nli-distilroberta-base-v2'
dataset = load_dataset("scientific_papers", "pubmed")
datasetName='-PubMed'

path='/content/drive/Othercomputers/My Laptop/DriveAccess/'

# Output files
file_path = path+"GrinchAllGoldAndMySummaries-"+modelName+datasetName+".json"  # generated summaries, gold summaries, documents
file_Path_Sent_Dist=path+"GrinchSentenceDistrubtionsOfSummaries-"+modelName+datasetName+".json"  # chosen sentence indexes per summary


startTimeforOverall = time.time()

N = 100         # number of documents to process in this run
startN = 6558   # index of the first document to process

test_split = dataset['test']

for d in range(N):
    startTimeforDocument = time.time()
    print("Document:", startN + d + 1)

    # Fetch a single row. Indexing a whole column first
    # (dataset['test']['article'][i]) reads every article on each iteration.
    record = test_split[startN + d]
    article = record['article']

    corpus = preprocess_corpus(article)
    sentence_count = textSentenceCount(corpus)
    print("Document sentence number:", sentence_count)

    summary=""
    if sentence_count > 8:
        summary = createSummaryUsingKMeans(corpus, modelName)
        print("Summary sentence number:", textSentenceCount(summary))
    else:
        # Short documents are kept whole instead of being summarized.
        summary = corpus
        print("Corpus is less than 9 sentence, summarization didn't apply sentence number:", sentence_count)

    save_data_to_json(file_path, summary, record['abstract'], article)

    elapsedTimeforDocument = time.time() - startTimeforDocument
    elapsedTimeforAll = time.time() - startTimeforOverall
    print('Document processing time: '+time.strftime("%M:%S", time.gmtime(elapsedTimeforDocument)))

    # strftime's %d is the day of the month, so zero elapsed days would be
    # printed as "01". Count whole days explicitly instead.
    elapsedDays, elapsedRemainder = divmod(int(elapsedTimeforAll), 86400)
    print('Total processing time: '+'{:02d}:'.format(elapsedDays)+time.strftime("%H:%M:%S", time.gmtime(elapsedRemainder)))

    print("----------------------------------")


Document: 6559
Document sentence number: 71
Optimum cluster number: 21
Summary sentence number: 10
Data saved successfully.
Document processing time: 00:16
Total processing time: 01:00:00:16
----------------------------------
Document: 6560
Document sentence number: 63
Optimum cluster number: 16
Summary sentence number: 10
Data saved successfully.
Document processing time: 00:13
Total processing time: 01:00:00:29
----------------------------------
Document: 6561
Document sentence number: 117
Optimum cluster number: 4
Summary sentence number: 4
Data saved successfully.
Document processing time: 00:17
Total processing time: 01:00:00:47
----------------------------------
Document: 6562
Document sentence number: 60
Optimum cluster number: 15
Summary sentence number: 10
Data saved successfully.
Document processing time: 00:09
Total processing time: 01:00:00:56
----------------------------------
Document: 6563
Document sentence number: 77
Optimum cluster number: 10
Summary sentence number: 1

In [30]:
def prepare_results(m, p, r, f):
    """Format one metric line: metric name, then P / R / F1 as percentages.

    `p`, `r` and `f` are fractions; they are scaled by 100 and printed with
    two decimals in a minimum width of five characters.
    """
    template = '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'
    precision_pct = 100.0 * p
    recall_pct = 100.0 * r
    f_score_pct = 100.0 * f
    return template.format(m, 'P', precision_pct, 'R', recall_pct, 'F1', f_score_pct)

def rougeEvaluation(all_hypothesis, all_references, alpha=0.2):
    """Print averaged ROUGE-N (n = 1..4), ROUGE-L and ROUGE-W scores.

    Args:
        all_hypothesis: Generated summaries.
        all_references: Reference summaries, parallel to `all_hypothesis`.
        alpha: Precision/recall weighting passed to `rouge.Rouge` for the
            F-score. The default stays at 0.2, the value that used to be
            hard-coded, so previously reported numbers are reproduced.
            NOTE(review): per the py-rouge documentation the score is
            P*R / ((1 - alpha)*P + alpha*R), so only alpha=0.5 gives the
            balanced F1; with 0.2 the printed "F1" column is recall-weighted.
            Confirm this is intended before comparing with other papers.

    Output is printed; nothing is returned.
    """
    for aggregator in ['Avg']:
        print('Evaluation with {}'.format(aggregator))
        apply_avg = aggregator == 'Avg'
        apply_best = aggregator == 'Best'

        evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                               max_n=4,
                               limit_length=False,  # length_limit below is passed but presumably inactive -- TODO confirm
                               length_limit=1000,
                               length_limit_type='words',
                               apply_avg=apply_avg,
                               apply_best=apply_best,
                               alpha=alpha,
                               weight_factor=1.2,
                               stemming=True)

        scores = evaluator.get_scores(all_hypothesis, all_references)

        for metric, results in sorted(scores.items(), key=lambda x: x[0]):
            if not apply_avg and not apply_best:
                # Per-hypothesis mode: `results` is a list with one entry per
                # hypothesis, each holding per-reference p/r/f lists.
                for hypothesis_id, results_per_ref in enumerate(results):
                    nb_references = len(results_per_ref['p'])
                    for reference_id in range(nb_references):
                        print('\tHypothesis #{} & Reference #{}: '.format(hypothesis_id, reference_id))
                        print('\t' + prepare_results(metric,results_per_ref['p'][reference_id], results_per_ref['r'][reference_id], results_per_ref['f'][reference_id]))
                print()
            else:
                # Aggregated mode: one p/r/f triple per metric.
                print(prepare_results(metric, results['p'], results['r'], results['f']))
        print()

In [31]:
#file_path="/content/drive/Othercomputers/My Laptop/DriveAccess/GrinchAllGoldAndMySummaries-nli-distilroberta-base-v2-PubMed.json"

In [32]:
# Load the saved summaries, gold standards and documents from `file_path`.
# On any failure all three lists fall back to empty lists.
try:
  with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)
    mysummaries = data.get('all_summary', [])
    goldstandardsummaries = data.get('all_goldstandard', [])
    documents = data.get('all_document', [])

    print("Data read successfully.")
except FileNotFoundError:
  print(f"Error: File '{file_path}' not found.")
  # Three names need three values; unpacking a single [] raises ValueError.
  mysummaries, goldstandardsummaries, documents = [], [], []
except Exception as e:
  print(f"Error: {e}")
  mysummaries, goldstandardsummaries, documents = [], [], []

Data read successfully.


In [33]:
len(documents)

6661

In [34]:
dataset = load_dataset("scientific_papers", "pubmed")
len(dataset['test']['article'])

6658

In [35]:
rougeEvaluation(mysummaries, goldstandardsummaries)

Evaluation with Avg
	rouge-1:	P: 35.90	R: 52.23	F1: 45.80
	rouge-2:	P: 12.44	R: 17.45	F1: 15.44
	rouge-3:	P:  6.25	R:  8.34	F1:  7.47
	rouge-4:	P:  3.95	R:  5.07	F1:  4.58
	rouge-l:	P: 33.82	R: 46.08	F1: 41.66
	rouge-w:	P: 16.06	R: 12.04	F1: 12.20



In [47]:
def find_duplicates_indexes(lst):
    """Return the positions of every item that occurs more than once.

    Positions are grouped per value, in order of each value's first
    occurrence, and the groups are concatenated into one flat list.
    Items must be hashable.
    """
    positions_by_value = {}
    for position, value in enumerate(lst):
        positions_by_value.setdefault(value, []).append(position)

    return [
        position
        for positions in positions_by_value.values()
        if len(positions) > 1
        for position in positions
    ]



# Positions of gold-standard summaries that appear more than once.
result = find_duplicates_indexes(goldstandardsummaries)
print(result)

# Positions of source documents that appear more than once.
result = find_duplicates_indexes(documents)
print(result)

[609, 5848, 1810, 1813, 1811, 1814, 1812, 1815, 3239, 6646]
[871, 3447, 1810, 1813, 1811, 1814, 1812, 1815]


In [37]:
print(goldstandardsummaries[1810])

 although cortisone acetate is approved worldwide as corticosteroid substitution 
 therapy in congenital adrenal hyperplasia ( 21-hydroxylase deficiency ) , its effectiveness 
 is uncertain since its biologic activity depends on activation by 
 11-hydroxysteroid dehydrogenase ( 11-hsd ) . 
 we sought to 
 compare the effect of cortisone acetate with that of hydrocortisone . in 10 patients with 
 congenital adrenal hyperplasia , 
 cortisone acetate was replaced with hydrocortisone in 
 substitution therapy . during this change , 
 blood concentrations of 17-hydroxy - progesterone , 
 adrenocorticotropin ( acth ) , and requirements for each drug were monitored . 
 concentrations 
 of 17-hydroxyprogesterone decreased ( mean 10.1 vs. 48.6 ng / ml ) , as did those of acth . 
 
 cortisone acetate dose requirements averaged 33.9 mg / m2 , while hydrocortisone 
 dose requirements averaged only 20.3 mg / m2 . 
 in one of the patients resistant to 
 cortisone acetate therapy , dna sequences in t

In [38]:
print(goldstandardsummaries[1813])

 although cortisone acetate is approved worldwide as corticosteroid substitution 
 therapy in congenital adrenal hyperplasia ( 21-hydroxylase deficiency ) , its effectiveness 
 is uncertain since its biologic activity depends on activation by 
 11-hydroxysteroid dehydrogenase ( 11-hsd ) . 
 we sought to 
 compare the effect of cortisone acetate with that of hydrocortisone . in 10 patients with 
 congenital adrenal hyperplasia , 
 cortisone acetate was replaced with hydrocortisone in 
 substitution therapy . during this change , 
 blood concentrations of 17-hydroxy - progesterone , 
 adrenocorticotropin ( acth ) , and requirements for each drug were monitored . 
 concentrations 
 of 17-hydroxyprogesterone decreased ( mean 10.1 vs. 48.6 ng / ml ) , as did those of acth . 
 
 cortisone acetate dose requirements averaged 33.9 mg / m2 , while hydrocortisone 
 dose requirements averaged only 20.3 mg / m2 . 
 in one of the patients resistant to 
 cortisone acetate therapy , dna sequences in t

In [39]:
print(goldstandardsummaries[1811])

 mif-1 ( pro - leu - gly - nh2 ) is a tripeptide for which the therapeutic potential in parkinson 's disease and depression has been indicated by many studies . 
 however , the cellular mechanisms of action of mif-1 are not yet clear . here 
 , we show the specific brain regions responsive to mif-1 treatment by c - fos mapping , and determine the kinetics of cellular signaling by western blotting of perk , pstat3 , and c - fos in cultured neurons . 
 the immunoreactivity of c - fos was increased 4 hours after mif-1 treatment in brain regions critically involved in the regulation of mood , anxiety , depression , and memory . 
 the number of cells activated was greater after peripheral treatment ( intravenous delivery ) than after intracerebroventricular injection . 
 in cultured sh - sy5y neuronal cells , c - fos was induced time- and dose - dependently . 
 the activation of cellular c - fos was preceded by a transient increase of mitogen - activated protein kinase perk but a reduction 

In [40]:
print(goldstandardsummaries[1814])

 mif-1 ( pro - leu - gly - nh2 ) is a tripeptide for which the therapeutic potential in parkinson 's disease and depression has been indicated by many studies . 
 however , the cellular mechanisms of action of mif-1 are not yet clear . here 
 , we show the specific brain regions responsive to mif-1 treatment by c - fos mapping , and determine the kinetics of cellular signaling by western blotting of perk , pstat3 , and c - fos in cultured neurons . 
 the immunoreactivity of c - fos was increased 4 hours after mif-1 treatment in brain regions critically involved in the regulation of mood , anxiety , depression , and memory . 
 the number of cells activated was greater after peripheral treatment ( intravenous delivery ) than after intracerebroventricular injection . 
 in cultured sh - sy5y neuronal cells , c - fos was induced time- and dose - dependently . 
 the activation of cellular c - fos was preceded by a transient increase of mitogen - activated protein kinase perk but a reduction 

In [41]:
print(goldstandardsummaries[1812])

 background : robot - assisted surgery must be evaluated before its acceptance as an option for standard therapy in the pediatric population . 
 our objective is a comparison of results using the robot system with results for the laparoscopic and open approaches.methods:following irb approval , robot - assisted procedures were case - matched with controls , selected from 1994 to 2005 . 
 data for 150 nissen cases were divided equally into 3 groups [ robot ( r ) , laparoscopic ( l ) , and open ( o ) ] , comparing surgical times , length of hospitalization , and outcomes.results:the average age ( r = 11764 months , l = 10771 months , o = 8555 months , p<0.05 ) and weight ( r = 3723 kg , l = 3324 kg , o = 2417 kg , p<0.05 ) of the open group were lower comparatively . 
 robot operative times proved significantly longer compared with laparoscopic and open time ( r = 16061 min , l = 107 + 31 min , o = 7327 min , p<0.05 ) . 
 the robot had 2 conversions ( 2/50 , 4% ) , comparable to the lapa

In [42]:
print(goldstandardsummaries[1815])

 background : robot - assisted surgery must be evaluated before its acceptance as an option for standard therapy in the pediatric population . 
 our objective is a comparison of results using the robot system with results for the laparoscopic and open approaches.methods:following irb approval , robot - assisted procedures were case - matched with controls , selected from 1994 to 2005 . 
 data for 150 nissen cases were divided equally into 3 groups [ robot ( r ) , laparoscopic ( l ) , and open ( o ) ] , comparing surgical times , length of hospitalization , and outcomes.results:the average age ( r = 11764 months , l = 10771 months , o = 8555 months , p<0.05 ) and weight ( r = 3723 kg , l = 3324 kg , o = 2417 kg , p<0.05 ) of the open group were lower comparatively . 
 robot operative times proved significantly longer compared with laparoscopic and open time ( r = 16061 min , l = 107 + 31 min , o = 7327 min , p<0.05 ) . 
 the robot had 2 conversions ( 2/50 , 4% ) , comparable to the lapa

In [43]:
def find_missing_values_indexes(list1, list2):
    """Return the positions in `list1` whose value does not occur in `list2`.

    Membership is tested against a set built from `list2`, so the cost is
    linear instead of one scan of `list2` per element of `list1`. If any value
    involved is unhashable, the function falls back to plain sequence
    membership, which gives the same result more slowly.
    """
    try:
        lookup = set(list2)
        return [index for index, value in enumerate(list1) if value not in lookup]
    except TypeError:
        # Unhashable values (e.g. lists) cannot be put in or looked up in a set.
        return [index for index, value in enumerate(list1) if value not in list2]

# Example usage:
# Check that every test-set article has an entry in the saved documents.
# An empty result means no article is missing.
list1 = dataset['test']['article']
list2 = documents
missing_indexes = find_missing_values_indexes(list1, list2)
print(missing_indexes)
len(missing_indexes)

[]


0

In [44]:
# Mean number of sentences per generated summary.
sentencenumber = sum(textSentenceCount(summary_text) for summary_text in mysummaries)
avg = sentencenumber / len(mysummaries)
avg

8.744632937997297

In [45]:
# Mean number of sentences per gold-standard summary.
sentencenumber = sum(textSentenceCount(gold_text) for gold_text in goldstandardsummaries)
avg = sentencenumber / len(goldstandardsummaries)
avg

7.5874493319321425

In [46]:
# Play an audio beep in the browser by evaluating JavaScript from the Colab
# runtime. Any audio URL will do.
from google.colab import output
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

In [50]:
# Entries 1813-1815 repeat entries 1810-1812 (same gold summaries and
# documents). Deleting only index 1813 removes one of the three repeats, so
# the cell would have to be re-run to clear the rest, and every extra run
# would delete a legitimate entry. Remove the whole repeated run in one step,
# and only while it is still present, so re-running this cell is harmless.
firstCopyIndex=1810
duplicatedItemIndex=1813
duplicatedRunLength=duplicatedItemIndex-firstCopyIndex

firstCopy = slice(firstCopyIndex, duplicatedItemIndex)
repeatedCopy = slice(duplicatedItemIndex, duplicatedItemIndex + duplicatedRunLength)

if (goldstandardsummaries[firstCopy] == goldstandardsummaries[repeatedCopy]
        and documents[firstCopy] == documents[repeatedCopy]):
    del mysummaries[repeatedCopy]
    del goldstandardsummaries[repeatedCopy]
    del documents[repeatedCopy]

# Report the duplicates that remain after the clean-up.
result = find_duplicates_indexes(goldstandardsummaries)
print(result)

result = find_duplicates_indexes(documents)
print(result)

[609, 5845, 3236, 6643]
[871, 3444]


In [53]:
print(len(mysummaries))

6658


In [54]:
rougeEvaluation(mysummaries, goldstandardsummaries)

Evaluation with Avg
	rouge-1:	P: 35.90	R: 52.23	F1: 45.80
	rouge-2:	P: 12.43	R: 17.45	F1: 15.44
	rouge-3:	P:  6.25	R:  8.34	F1:  7.47
	rouge-4:	P:  3.95	R:  5.07	F1:  4.57
	rouge-l:	P: 33.82	R: 46.09	F1: 41.66
	rouge-w:	P: 16.06	R: 12.04	F1: 12.20



In [55]:
file_path_new="/content/drive/Othercomputers/My Laptop/DriveAccess/GrinchAllGoldAndMySummaries-nli-distilroberta-base-v2-PubMed_2.json"

In [56]:

# Bundle the cleaned lists under the same keys used by the original file and
# write them to the new output path.
data = dict(
  all_summary=mysummaries,
  all_goldstandard=goldstandardsummaries,
  all_document=documents,
)

with open(file_path_new, "w") as json_file:
  json_file.write(json.dumps(data))