<a href="https://colab.research.google.com/github/tubagokhan/GrinCH/blob/main/OnlySimpleClusteringOptimumClusterPubMed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
!pip install datasets
!pip install py-rouge==1.1



In [18]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt')
import re
from datasets import load_dataset
import time
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
def textSentenceCount(Text):
    number_of_sentences = sent_tokenize(Text)
    count=(len(number_of_sentences))
    return count

def preprocess_text(text):

    # Remove special characters and extra whitespaces
    text = re.sub(r"[^a-zA-Z0-9\s.]", "", text)
    text = re.sub(r"\s+", " ", text)

    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    return sentences

def find_optimum_clusters(data, max_clusters):
    inertias = []
    for k in range(1, max_clusters + 1):
        kmeans = KMeans(n_clusters=k, random_state=0).fit(data)
        inertias.append(kmeans.inertia_)

    # Plotting the elbow curve
    #plt.plot(range(1, max_clusters + 1), inertias)
    #plt.xlabel("Number of Clusters")
    #plt.ylabel("Inertia")
    #plt.title("Elbow Curve")
    #plt.show()

    # Calculate the optimal number of clusters using the elbow method
    diff = np.diff(inertias)
    acceleration = np.diff(diff)
    opt_cluster_num = acceleration.argmin() + 2  # Adding 2 to get the index of the minimum acceleration
    return opt_cluster_num

def generate_summary(text):
    # Preprocess the text and tokenize it into sentences
    sentences = preprocess_text(text)

    # Initialize the TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(sentences)

    # Determine the optimal number of clusters using the elbow method
    max_clusters = textSentenceCount(text)
    opt_num_clusters = optimum_clusters = find_optimum_clusters(X, int(len(sentences) / 3))

    # Initialize K-Means with the optimal number of clusters
    kmeans = KMeans(n_clusters=opt_num_clusters, n_init='auto', random_state=42)
    kmeans.fit(X)

    # Get the centroids of the clusters
    centroids = kmeans.cluster_centers_

    # Sort the centroids based on their distances to the data points
    sorted_centroids_indices = np.argsort(kmeans.transform(X).sum(axis=1).ravel())

    # Get the sentences that are closest to the centroids (i.e., representative sentences)
    representative_sentences = [sentences[i] for i in sorted_centroids_indices[:opt_num_clusters]]

    # Combine the representative sentences to form the summary
    summary = ' '.join(representative_sentences)
    return summary



In [20]:
dataset = load_dataset("scientific_papers", "pubmed")

startTimeforOverall = time.time()
all_summary = []
all_goldstandart = []

N = 500
startN = 0

for d in range(N):
    startTimeforDocument = time.time()
    print("Document:", startN + d + 1)
    corpus = dataset['train']['article'][startN + d]
    print("Document sentence number:", textSentenceCount(corpus))

    if textSentenceCount(corpus) > 8:
        summary=generate_summary(corpus)
        print("Summary sentence number:", textSentenceCount(summary))
        all_summary.append(summary)
    else:
      all_summary.append(corpus)
      print("Corpus is less than 9 sentence, summarization didn't apply sentence number:", textSentenceCount(corpus))

    all_goldstandart.append(dataset['train']['abstract'][startN + d])
    elapsedTimeforDocument = time.time() - startTimeforDocument
    elapsedTimeforAll = time.time() - startTimeforOverall
    print('Document processing time: '+time.strftime("%M:%S", time.gmtime(elapsedTimeforDocument)))
    print('Total processing time: '+time.strftime("%d:%H:%M:%S", time.gmtime(elapsedTimeforAll)))

    print("----------------------------------")


Document: 1
Document sentence number: 173
Summary sentence number: 47
Document processing time: 00:34
Total processing time: 01:00:00:34
----------------------------------
Document: 2
Document sentence number: 81
Summary sentence number: 15
Document processing time: 00:04
Total processing time: 01:00:00:39
----------------------------------
Document: 3
Document sentence number: 45
Summary sentence number: 7
Document processing time: 00:02
Total processing time: 01:00:00:41
----------------------------------
Document: 4
Document sentence number: 199
Summary sentence number: 64
Document processing time: 00:27
Total processing time: 01:00:01:08
----------------------------------
Document: 5
Document sentence number: 42
Summary sentence number: 7
Document processing time: 00:02
Total processing time: 01:00:01:11
----------------------------------
Document: 6
Document sentence number: 95
Summary sentence number: 7
Document processing time: 00:07
Total processing time: 01:00:01:19
----------

In [21]:
import rouge

def prepare_results(m, p, r, f):
    return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(m, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)

def rougeEvaluation(all_hypothesis, all_references):

    for aggregator in ['Avg']:
        print('Evaluation with {}'.format(aggregator))
        apply_avg = aggregator == 'Avg'
        apply_best = aggregator == 'Best'

        evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                               max_n=4,
                               limit_length=False,
                               length_limit=1000,
                               length_limit_type='words',
                               apply_avg=apply_avg,
                               apply_best=apply_best,
                               alpha=0.2, # Default F1_score
                               weight_factor=1.2,
                               stemming=True)

        scores = evaluator.get_scores(all_hypothesis, all_references)

        for metric, results in sorted(scores.items(), key=lambda x: x[0]):
            if not apply_avg and not apply_best: # value is a type of list as we evaluate each summary vs each reference
                for hypothesis_id, results_per_ref in enumerate(results):
                    nb_references = len(results_per_ref['p'])
                    for reference_id in range(nb_references):
                        print('\tHypothesis #{} & Reference #{}: '.format(hypothesis_id, reference_id))
                        print('\t' + prepare_results(metric,results_per_ref['p'][reference_id], results_per_ref['r'][reference_id], results_per_ref['f'][reference_id]))
                print()
            else:
                print(prepare_results(metric, results['p'], results['r'], results['f']))
        print()



In [22]:
len(all_goldstandart)

500

In [23]:
rougeEvaluation(all_summary, all_goldstandart)

Evaluation with Avg
	rouge-1:	P: 25.81	R: 53.97	F1: 38.63
	rouge-2:	P: 10.08	R: 21.02	F1: 14.92
	rouge-3:	P:  5.73	R: 10.82	F1:  7.86
	rouge-4:	P:  3.97	R:  6.71	F1:  5.00
	rouge-l:	P: 26.60	R: 49.26	F1: 38.14
	rouge-w:	P: 12.73	R: 13.26	F1: 11.72



In [24]:

# Play an audio beep. Any audio URL will do.
from google.colab import output
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

In [25]:
# Save files

import json
import os


# Define the file path
file_path = "/content/drive/Othercomputers/My Laptop/DriveAccess/OnlySimpleClusteringOptimumCluster-PubMed.json"  # Replace with the desired file path

# Check if the file already exists
if os.path.exists(file_path):
    # Read the existing data from the JSON file
    with open(file_path, "r") as file:
        existing_data = json.load(file)

    # Combine the existing data with the new data (lists)
    existing_summary = existing_data.get("all_summary", [])
    existing_goldstandart = existing_data.get("all_goldstandart", [])

    # Assuming you have three new lists: new_intermediate_summary, new_summary, new_goldstandart
    # Sample new lists (replace these with your actual lists)
    new_summary = all_summary
    new_goldstandart = all_goldstandart


    # Merge the new lists with the existing ones
    all_summary = existing_summary + new_summary
    all_goldstandart = existing_goldstandart + new_goldstandart

# Combine the lists into a dictionary for easy serialization
data = {
    "all_summary": all_summary,
    "all_goldstandart": all_goldstandart
}

# Save the data to a JSON file
with open(file_path, "w") as file:
    json.dump(data, file)

print("Lists have been saved to", file_path)

Lists have been saved to /content/drive/Othercomputers/My Laptop/DriveAccess/OnlySimpleClusteringOptimumCluster-PubMed.json


In [26]:
print(len(data['all_summary']))

500


In [27]:
rougeEvaluation(data['all_summary'], data['all_goldstandart'])

Evaluation with Avg
	rouge-1:	P: 25.81	R: 53.97	F1: 38.63
	rouge-2:	P: 10.08	R: 21.02	F1: 14.92
	rouge-3:	P:  5.73	R: 10.82	F1:  7.86
	rouge-4:	P:  3.97	R:  6.71	F1:  5.00
	rouge-l:	P: 26.60	R: 49.26	F1: 38.14
	rouge-w:	P: 12.73	R: 13.26	F1: 11.72



In [28]:
# Play an audio beep. Any audio URL will do.
from google.colab import output
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

In [29]:

sentencenumber=0
for i in range(len(all_summary)):
  sentencenumber=sentencenumber+textSentenceCount(all_summary[i])

avg=sentencenumber/len(all_summary)
avg

18.468

In [30]:

sentencenumber=0
for i in range(len(all_goldstandart)):
  sentencenumber=sentencenumber+textSentenceCount(all_goldstandart[i])

avg=sentencenumber/len(all_goldstandart)
avg

7.474