<a href="https://colab.research.google.com/github/tubagokhan/SummarizationHybrid/blob/main/GraphandClusterPubMedNoWRank.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
!pip install transformers
!pip install datasets
!pip install torch
!pip install scikit-learn
!pip install matplotlib
!pip install sentence_transformers
!pip install py-rouge==1.1



In [19]:
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from nltk import sent_tokenize,word_tokenize

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

import warnings
warnings.filterwarnings("ignore")
import re

from datasets import load_dataset
from scipy.spatial import distance

import math
from math import*

import rouge

import networkx as nx

import time
from transformers import logging
logging.set_verbosity_error()

import json

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [20]:
# Preprocessing method
def preprocess_corpus(text):
    # Remove special characters and extra whitespaces
    text = re.sub(r"[^a-zA-Z0-9\s.]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text

## CLUSTER

In [21]:
#function calculates the optimal number of clusters using the elbow method. The function plots the elbow curve, which shows the inertia values for different cluster numbers. The user can visually inspect the plot to determine the elbow point, indicating the optimal number of clusters.
def find_optimum_clusters(data, max_clusters):
    inertias = []
    for k in range(1, max_clusters + 1):
        kmeans = KMeans(n_clusters=k, random_state=0).fit(data)
        inertias.append(kmeans.inertia_)

    # Plotting the elbow curve
    #plt.plot(range(1, max_clusters + 1), inertias)
    #plt.xlabel("Number of Clusters")
    #plt.ylabel("Inertia")
    #plt.title("Elbow Curve")
    #plt.show()

    # Calculate the optimal number of clusters using the elbow method
    diff = np.diff(inertias)
    acceleration = np.diff(diff)
    opt_cluster_num = acceleration.argmin() + 2  # Adding 2 to get the index of the minimum acceleration
    return opt_cluster_num

In [22]:
def createSummaryUsingKMeans(corpus, modelName):
    sentences = sent_tokenize(corpus)
    model = SentenceTransformer(modelName)
    sentence_embeddings = model.encode(sentences)

    optimum_clusters = find_optimum_clusters(sentence_embeddings, int(len(sentences) / 3))
    print("Optimum cluster number:", optimum_clusters)

    # Perform kmean clustering
    kmeans = KMeans(n_clusters=optimum_clusters, random_state=0, n_init='auto').fit(sentence_embeddings)

    chosen_sentence_indexes=[]
    for cluster_id in range(optimum_clusters):
        cluster_indices = np.where(kmeans.labels_ == cluster_id)[0]
        sorted_cluster_indices = sorted(cluster_indices, key=lambda x: sentences[x])
        chosen_sentence_index = sorted_cluster_indices[0]  # Select the first sentence from the sorted indices
        chosen_sentence_indexes.append(chosen_sentence_index)

    sorted_indexes=sorted(chosen_sentence_indexes)

    chosen_sentences = []
    for chosen_sentence_index in sorted_indexes:
        chosen_sentences.append(sentences[chosen_sentence_index])

    summary = " ".join(chosen_sentences)

    return summary


## GRAPH NODE WEIGHTS

In [23]:
def allCorpusSentenceRanking(tokenizedCorpus,corpus):
    sentenceRankList=[]
    for i in range(len(tokenizedCorpus)):
        value=sentenceRanking(tokenizedCorpus[i],i,corpus)
        value=round(value,5)
        sentenceRankList.append(value)
    return sentenceRankList

def sentenceRanking(sentence,location,corpus):
    value=0
    value=sentencePosition(sentence,location,corpus)
    value=value+sentenceLength(sentence,corpus)
    value=value+properNoun(sentence,corpus)
    value=value+numericalToken(sentence,corpus)
    return value

def textWordCount(Text):
    number_of_words = word_tokenize(Text)
    count=(len(number_of_words))
    return count

def textSentenceCount(Text):
    number_of_sentences = sent_tokenize(Text)
    count=(len(number_of_sentences))
    return count

def longestSentenceLenght(Text):
    text=sent_tokenize(Text)
    temp=0
    for i in range(len(text)):
        if temp<textWordCount(text[i]):
            temp=textWordCount(text[i])
    return temp

def sentencePosition(sentence,location,corpus):
    N=textSentenceCount(corpus)
    if location+1 == N:
        return 1.0
    elif location==0:
        return 1.0
    else:
        value=(N-location)/N
        return value

def sentenceLength(sentence,corpus):
    return textWordCount(sentence)/longestSentenceLenght(corpus)

def properNoun(sentence,corpus):
    text = nltk.word_tokenize(sentence)
    tagged=nltk.pos_tag(text)
    noProperNoun=0
    #print(tagged)
    for word in tagged:
        if word[1]=='NNP':
            noProperNoun=noProperNoun+1
    #print(noProperNoun)
    return noProperNoun/len(text)

def numericalToken(sentence,corpus):
    text = nltk.word_tokenize(sentence)
    tagged=nltk.pos_tag(text)
    noNumericalToken=0
    #print(tagged)
    for word in tagged:
        if word[1]=='CD':
            noNumericalToken=noNumericalToken+1
  #print(noProperNoun)
    return 1-(noNumericalToken/len(text))

## GRAPH

In [24]:
def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

def createGraph(sentences, modelName):
    model = SentenceTransformer(modelName)
    sentence_embeddings = model.encode(sentences)
    sentenceGraph =np.zeros((len(sentences), len(sentences)))
    temp = np.arange(len(sentences))
    for x in range(len(sentences)):
        newTemp= np.delete(temp, x)
        for y in newTemp:
            similarity= cosine(sentence_embeddings[x],sentence_embeddings[y]) # You can change the vector similarity measurement method used when creating graphs. Cosine, euclidean, manhattan and minkowski methods are defined.
            sentenceGraph[x][y]=abs(similarity)
    return sentenceGraph

def findHighestSimilarityRank(similarityMatrix, initialRank):
    newRank=[0] * len(similarityMatrix)
    temp=0
    for i in range(len(similarityMatrix)):
        for j in range(len(similarityMatrix)):
            temp=temp+similarityMatrix[i][j] # sum of total similarity of sentences
        newRank[i]=temp*initialRank[i]
        temp=0

    return newRank

def findHighestSimilarityRankNoWRANK(similarityMatrix, initialRank):

    nodeWeightLSARanks = initialRank

    for x in range(len(initialRank)):
        for y in range(len(initialRank)):
            temp=initialRank[x]+initialRank[y]
            similarityMatrix[x][y]=similarityMatrix[x][y]+temp*0.01
    G = nx.Graph()
    for x in range(len(initialRank)):
        for y in range(len(initialRank)):
            G.add_node(x)
            G.add_edge(x, y, weight=similarityMatrix[x,y] )
    eigenVectorCentrality = nx.eigenvector_centrality(G, max_iter=100, tol=1.0e-6, nstart=None, weight='weight')

    edgeWeightEigenVectorRanks=[0]*len(eigenVectorCentrality)
    for i in range (len(eigenVectorCentrality)):
        edgeWeightEigenVectorRanks[i]=eigenVectorCentrality[i]

    return edgeWeightEigenVectorRanks

def createSummaryUsingGraph(corpus,modelName):
  sentences = sent_tokenize(corpus)
  summaryAmmount= int(len(sentences)*0.2)
  initialRank=allCorpusSentenceRanking(sentences,corpus)
  similarityMatrix=createGraph(sentences,modelName)
  sentencesRank=findHighestSimilarityRankNoWRANK(similarityMatrix, initialRank)

  temp=sorted(sentencesRank)
  threshold=temp[-summaryAmmount]
  summarySentencesIndexes=[]

  for i in range(len(sentencesRank)):
      if sentencesRank[i]>=threshold:
        summarySentencesIndexes.append(i)

  #print(summarySentencesIndexes)
  summary=""
  for i in range(len(summarySentencesIndexes)):
    summary=summary + ' ' +sentences[summarySentencesIndexes[i]]
  return summary


In [25]:
#MAIN
modelName='all-mpnet-base-v2'

#dataset = load_dataset('cnn_dailymail', '3.0.0')
#corpus = dataset['train']['article'][50]


dataset = load_dataset("scientific_papers","pubmed")


startTimeforOverall = time.time()
all_intermediate_summary=[]
all_summary=[]
all_goldstandart=[]

N=500
startN=0

for d in range(N):
    startTimeforDocument = time.time()
    print("Document:",startN+d+1)
    corpus=dataset['train']['article'][startN+d]
    corpus = preprocess_corpus(corpus)
    #print(corpus)
    print("Document sentence number:",textSentenceCount(corpus))
    all_goldstandart.append(dataset['train']['abstract'][startN+d])

    if (textSentenceCount(corpus)>8):
      intermadiatesummary=createSummaryUsingGraph(corpus, modelName)
      print("Intermediate summary sentence number:",textSentenceCount(intermadiatesummary))
      all_intermediate_summary.append(intermadiatesummary)

      if (textSentenceCount(intermadiatesummary)>8):
        summary=createSummaryUsingKMeans(intermadiatesummary, modelName)
        print("Final summary sentence number:",textSentenceCount(summary))
        all_summary.append(summary)

      else:
        print("Intermediate summary is shorter than 9 sentences, Second step didn't applied")
        all_summary.append(intermadiatesummary)


    else:
      print("Document is shorter than 9 sentences, Summarization didnt apply")
      all_intermediate_summary.append(corpus)
      all_summary.append(corpus)



    elapsedTimeforDocument = time.time() - startTimeforDocument
    elapsedTimeforAll = time.time() - startTimeforOverall
    print('Document processing time: '+time.strftime("%M:%S", time.gmtime(elapsedTimeforDocument)))
    print('Total processing time: '+time.strftime("%d:%H:%M:%S", time.gmtime(elapsedTimeforAll)))

    print("----------------------------------")




  0%|          | 0/3 [00:00<?, ?it/s]

Document: 1
Document sentence number: 173
Intermediate summary sentence number: 34
Optimum cluster number: 6
Final summary sentence number: 6
Document processing time: 00:19
Total processing time: 01:00:00:19
----------------------------------
Document: 2
Document sentence number: 81
Intermediate summary sentence number: 16
Optimum cluster number: 4
Final summary sentence number: 4
Document processing time: 00:10
Total processing time: 01:00:00:30
----------------------------------
Document: 3
Document sentence number: 45
Intermediate summary sentence number: 9
Optimum cluster number: 2
Final summary sentence number: 2
Document processing time: 00:06
Total processing time: 01:00:00:36
----------------------------------
Document: 4
Document sentence number: 199
Intermediate summary sentence number: 39
Optimum cluster number: 7
Final summary sentence number: 7
Document processing time: 00:24
Total processing time: 01:00:01:00
----------------------------------
Document: 5
Document senten

In [26]:
def prepare_results(m, p, r, f):
    return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(m, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)

def rougeEvaluation(all_hypothesis, all_references):

    for aggregator in ['Avg']:
        print('Evaluation with {}'.format(aggregator))
        apply_avg = aggregator == 'Avg'
        apply_best = aggregator == 'Best'

        evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                               max_n=4,
                               limit_length=False,
                               length_limit=1000,
                               length_limit_type='words',
                               apply_avg=apply_avg,
                               apply_best=apply_best,
                               alpha=0.2, # Default F1_score
                               weight_factor=1.2,
                               stemming=True)

        scores = evaluator.get_scores(all_hypothesis, all_references)

        for metric, results in sorted(scores.items(), key=lambda x: x[0]):
            if not apply_avg and not apply_best: # value is a type of list as we evaluate each summary vs each reference
                for hypothesis_id, results_per_ref in enumerate(results):
                    nb_references = len(results_per_ref['p'])
                    for reference_id in range(nb_references):
                        print('\tHypothesis #{} & Reference #{}: '.format(hypothesis_id, reference_id))
                        print('\t' + prepare_results(metric,results_per_ref['p'][reference_id], results_per_ref['r'][reference_id], results_per_ref['f'][reference_id]))
                print()
            else:
                print(prepare_results(metric, results['p'], results['r'], results['f']))
        print()

In [27]:
rougeEvaluation(all_intermediate_summary, all_goldstandart)

Evaluation with Avg
	rouge-1:	P: 23.53	R: 65.86	F1: 44.18
	rouge-2:	P: 10.83	R: 29.32	F1: 19.72
	rouge-3:	P:  6.53	R: 16.19	F1: 11.13
	rouge-4:	P:  4.64	R: 10.53	F1:  7.38
	rouge-l:	P: 24.72	R: 58.85	F1: 43.15
	rouge-w:	P: 11.54	R: 16.29	F1: 13.84



In [28]:
rougeEvaluation(all_summary, all_goldstandart)

Evaluation with Avg
	rouge-1:	P: 44.05	R: 37.89	F1: 36.30
	rouge-2:	P: 16.05	R: 13.27	F1: 12.73
	rouge-3:	P:  8.84	R:  6.89	F1:  6.66
	rouge-4:	P:  6.00	R:  4.46	F1:  4.33
	rouge-l:	P: 41.70	R: 36.09	F1: 35.27
	rouge-w:	P: 21.46	R:  9.32	F1:  9.97



In [29]:
# Play an audio beep. Any audio URL will do.
from google.colab import output
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

In [30]:
# Save files

import json
import os

# Define the file path
file_path = "/content/savedDocumentsandSummaries.json"  # Replace with the desired file path

# Check if the file already exists
if os.path.exists(file_path):
    # Read the existing data from the JSON file
    with open(file_path, "r") as file:
        existing_data = json.load(file)

    # Combine the existing data with the new data (lists)
    existing_intermediate_summary = existing_data.get("all_intermediate_summary", [])
    existing_summary = existing_data.get("all_summary", [])
    existing_goldstandart = existing_data.get("all_goldstandart", [])

    # Assuming you have three new lists: new_intermediate_summary, new_summary, new_goldstandart
    # Sample new lists (replace these with your actual lists)
    new_intermediate_summary = all_intermediate_summary
    new_summary = all_summary
    new_goldstandart = all_goldstandart


    # Merge the new lists with the existing ones
    all_intermediate_summary = existing_intermediate_summary + new_intermediate_summary
    all_summary = existing_summary + new_summary
    all_goldstandart = existing_goldstandart + new_goldstandart

# Combine the lists into a dictionary for easy serialization
data = {
    "all_intermediate_summary": all_intermediate_summary,
    "all_summary": all_summary,
    "all_goldstandart": all_goldstandart
}

# Save the data to a JSON file
with open(file_path, "w") as file:
    json.dump(data, file)

print("Lists have been saved to", file_path)


Lists have been saved to /content/savedDocumentsandSummaries.json


In [31]:
print(len(data['all_summary']))

500


In [32]:
rougeEvaluation(data['all_intermediate_summary'], data['all_goldstandart'])

Evaluation with Avg
	rouge-1:	P: 23.53	R: 65.86	F1: 44.18
	rouge-2:	P: 10.83	R: 29.32	F1: 19.72
	rouge-3:	P:  6.53	R: 16.19	F1: 11.13
	rouge-4:	P:  4.64	R: 10.53	F1:  7.38
	rouge-l:	P: 24.72	R: 58.85	F1: 43.15
	rouge-w:	P: 11.54	R: 16.29	F1: 13.84



In [33]:
rougeEvaluation(data['all_summary'], data['all_goldstandart'])

Evaluation with Avg
	rouge-1:	P: 44.05	R: 37.89	F1: 36.30
	rouge-2:	P: 16.05	R: 13.27	F1: 12.73
	rouge-3:	P:  8.84	R:  6.89	F1:  6.66
	rouge-4:	P:  6.00	R:  4.46	F1:  4.33
	rouge-l:	P: 41.70	R: 36.09	F1: 35.27
	rouge-w:	P: 21.46	R:  9.32	F1:  9.97



In [34]:
# Play an audio beep. Any audio URL will do.
from google.colab import output
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

In [35]:
sentencenumber=0
for i in range(len(all_summary)):
  sentencenumber=sentencenumber+textSentenceCount(all_summary[i])

avg=sentencenumber/len(all_summary)

In [36]:
avg

4.862