<a href="https://colab.research.google.com/github/tubagokhan/GrinCH/blob/main/GraphInClusterArxivTextRank_nli_distilroberta_base_v2_SentenceDistrubtion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
!pip install transformers
!pip install datasets
!pip install torch
!pip install scikit-learn
!pip install matplotlib
!pip install sentence_transformers
!pip install py-rouge==1.1



In [24]:
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from nltk import sent_tokenize,word_tokenize

from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

import warnings
warnings.filterwarnings("ignore")
import re

from datasets import load_dataset
from scipy.spatial import distance

import math
from math import*

import rouge

import networkx as nx

import time
from transformers import logging
logging.set_verbosity_error()

import json
import os

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [25]:
# Preprocessing method
def preprocess_corpus(text):
    # Remove special characters and extra whitespaces
    text = re.sub(r"[^a-zA-Z0-9\s.]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text

def textSentenceCount(Text):
    number_of_sentences = sent_tokenize(Text)
    count=(len(number_of_sentences))
    return count

def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

def createSummaryUsingKMeans(corpus, modelName):
    sentences = sent_tokenize(corpus)
    model = SentenceTransformer(modelName)
    sentence_embeddings = model.encode(sentences)

    if len(sentences) >1000:
      optimum_clusters = find_optimum_clusters(sentence_embeddings, 100)
      print("Optimum cluster number:", optimum_clusters)
    else:
      optimum_clusters = find_optimum_clusters(sentence_embeddings, int(len(sentences) / 3))
      print("Optimum cluster number:", optimum_clusters)

    # Perform kmean clustering
    kmeans = KMeans(n_clusters=optimum_clusters, random_state=0, n_init='auto').fit(sentence_embeddings)

    chosen_sentence_indexes=[]
    cluster_rank=[]
    for cluster_id in range(optimum_clusters):
        cluster_indices = np.where(kmeans.labels_ == cluster_id)[0]
        cluster_weight=clusterWeight(sentence_embeddings, cluster_indices)
        cluster_rank.append(cluster_weight)
        chosen_sentence_index=text_rank(sentence_embeddings, cluster_indices)
        chosen_sentence_indexes.append(chosen_sentence_index)

    if optimum_clusters>9:
      choosen_clusters=k_highest_indices(cluster_rank, 9)
      chosen_sentence_indexes = [chosen_sentence_indexes[i] for i in choosen_clusters]

    sorted_indexes=sorted(chosen_sentence_indexes)
    saveSummariesinJsonFile(len(sentences), sorted_indexes,file_Path_Sent_Dist)

    chosen_sentences = []
    for chosen_sentence_index in sorted_indexes:
        chosen_sentences.append(sentences[chosen_sentence_index])

    summary = " ".join(chosen_sentences)

    return summary

def convert_int64_to_int(obj):
    if isinstance(obj, int):
        return int(obj)
    raise TypeError

def saveSummariesinJsonFile(num_sents, sorted_indexes,file_Path_Sent_Dist):
    filePath=file_Path_Sent_Dist
    # Create a dictionary with the required data structure
    data = {
        "num_sents": num_sents,
        "summary": [[int(index)] for index in sorted_indexes]
    }

    # Check if the file path exists
    if os.path.exists(filePath):
        try:
            # If the file exists and is not empty, read the existing data
            with open(filePath, "r") as file:
                existing_data = json.load(file)
        except json.JSONDecodeError:
            # If the file exists but contains invalid JSON data, initialize with an empty list
            existing_data = []
    else:
        # If the file does not exist, create a new file with the data
        existing_data = []

    # Append the new data to the existing data
    existing_data.append(data)

    # Write the updated data to the file
    with open(filePath, "w") as file:
        json.dump(existing_data, file, indent=2, default=convert_int64_to_int)

#function calculates the optimal number of clusters using the elbow method. The function plots the elbow curve, which shows the inertia values for different cluster numbers. The user can visually inspect the plot to determine the elbow point, indicating the optimal number of clusters.
def find_optimum_clusters(data, max_clusters):
    inertias = []
    for k in range(1, max_clusters + 1):
        kmeans = KMeans(n_clusters=k, random_state=0).fit(data)
        inertias.append(kmeans.inertia_)

    # Plotting the elbow curve
    #plt.plot(range(1, max_clusters + 1), inertias)
    #plt.xlabel("Number of Clusters")
    #plt.ylabel("Inertia")
    #plt.title("Elbow Curve")
    #plt.show()

    # Calculate the optimal number of clusters using the elbow method
    diff = np.diff(inertias)
    acceleration = np.diff(diff)
    opt_cluster_num = acceleration.argmin() + 2  # Adding 2 to get the index of the minimum acceleration
    return opt_cluster_num

def clusterWeight(sentence_embeddings, cluster_indices):
  length=len(cluster_indices)
  sentenceGraph =np.zeros((length, length))
  for x in range(length):
    for y in range(length):
      if x>y:
        similarity= cosine(sentence_embeddings[cluster_indices[x]],sentence_embeddings[cluster_indices[y]])
        sentenceGraph[x][y]=abs(similarity)
    #print(sentenceGraph)
  SumElement=(np.concatenate(sentenceGraph).sum())
  return round(SumElement,2)

def k_highest_indices(cluster_weight_list, k):
    # Enumerate the list to keep track of original indices
    enumerated_list = list(enumerate(cluster_weight_list))

    # Sort the enumerated list in descending order based on the float values
    sorted_list = sorted(enumerated_list, key=lambda x: x[1], reverse=True)

    # Extract the indices of the k highest elements
    k_highest_indices = [item[0] for item in sorted_list[:k]]

    return k_highest_indices

def text_rank(sentence_embeddings, indexes):
    # Filter embeddings based on the input indexes
    filtered_embeddings = np.array([sentence_embeddings[i] for i in indexes])

    # Calculate cosine similarity between filtered sentence embeddings
    similarity_matrix = cosine_similarity(filtered_embeddings, filtered_embeddings)

    # Create a graph using similarity matrix
    graph = nx.from_numpy_array(similarity_matrix)
    try:
      # Apply TextRank algorithm to rank the sentences
      scores = nx.pagerank(graph, max_iter=1000)
          # Get the index of the highest-ranked sentence
      highest_ranked_index = max(scores, key=scores.get)

      # Get the original index from the list of indexes
      highest_ranked_original_index = indexes[highest_ranked_index]
      return highest_ranked_original_index
    except:
      print("######### Error with PageRank")
      return indexes[0]





In [26]:
def save_data_to_json(file_path, summary, goldstandard, document):
    try:
        data = {
            "all_summary": [],
            "all_goldstandard": [],
            "all_document": []
        }

        try:
            with open(file_path, 'r') as file:
                data = json.load(file)
        except FileNotFoundError:
            pass

        all_summary = data.get('all_summary', [])
        all_goldstandard = data.get('all_goldstandard', [])
        all_document = data.get('all_document', [])

        all_summary.append(summary)
        all_goldstandard.append(goldstandard)
        all_document.append(document)

        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump(data, file)

        print("Data saved successfully.")
    except Exception as e:
        print(f"Error: {e}")

In [27]:
#modelName = 'all-mpnet-base-v2'
modelName ='nli-distilroberta-base-v2'
dataset = load_dataset("scientific_papers", "arxiv")
datasetName='-Arxiv'

path='/content/drive/Othercomputers/My Laptop/DriveAccess/'

# Define the file path
file_path = path+"GrinchAllGoldAndMySummaries-"+modelName+datasetName+".json"  # My Summaries, Gold Summaires, Documents
file_Path_Sent_Dist=path+"GrinchSentenceDistrubtionsOfSummaries-"+modelName+datasetName+".json"  # Sentence Distrubition Summaries, Summary Sentence Indexes


startTimeforOverall = time.time()
# document size 6440
# 3245, 3595 have problem iinclude 3500 sentneces
N = 1 # HOW MANY DOCUMENT MALCOLLLMMMMMMM
startN = 3245 # Which document we will start

for d in range(N):
    startTimeforDocument = time.time()
    print("Document:", startN + d + 1)
    corpus = dataset['test']['article'][startN + d]
    corpus = preprocess_corpus(corpus)
    print("Document sentence number:", textSentenceCount(corpus))

    summary=""
    if textSentenceCount(corpus) > 8:
        summary = createSummaryUsingKMeans(corpus, modelName)
        print("Summary sentence number:", textSentenceCount(summary))
    else:
      summary=corpus
      print("Corpus is less than 9 sentence, summarization didn't apply sentence number:", textSentenceCount(corpus))


    save_data_to_json(file_path, summary, dataset['test']['abstract'][startN + d], dataset['test']['article'][startN + d])

    elapsedTimeforDocument = time.time() - startTimeforDocument
    elapsedTimeforAll = time.time() - startTimeforOverall
    print('Document processing time: '+time.strftime("%M:%S", time.gmtime(elapsedTimeforDocument)))
    print('Total processing time: '+time.strftime("%d:%H:%M:%S", time.gmtime(elapsedTimeforAll)))

    print("----------------------------------")


Document: 3246
Document sentence number: 3461
Optimum cluster number: 87
Summary sentence number: 9
Data saved successfully.
Document processing time: 14:31
Total processing time: 01:00:14:31
----------------------------------


In [28]:
def prepare_results(m, p, r, f):
    return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(m, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)

def rougeEvaluation(all_hypothesis, all_references):

    for aggregator in ['Avg']:
        print('Evaluation with {}'.format(aggregator))
        apply_avg = aggregator == 'Avg'
        apply_best = aggregator == 'Best'

        evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                               max_n=4,
                               limit_length=False,
                               length_limit=1000,
                               length_limit_type='words',
                               apply_avg=apply_avg,
                               apply_best=apply_best,
                               alpha=0.2, # Default F1_score
                               weight_factor=1.2,
                               stemming=True)

        scores = evaluator.get_scores(all_hypothesis, all_references)

        for metric, results in sorted(scores.items(), key=lambda x: x[0]):
            if not apply_avg and not apply_best: # value is a type of list as we evaluate each summary vs each reference
                for hypothesis_id, results_per_ref in enumerate(results):
                    nb_references = len(results_per_ref['p'])
                    for reference_id in range(nb_references):
                        print('\tHypothesis #{} & Reference #{}: '.format(hypothesis_id, reference_id))
                        print('\t' + prepare_results(metric,results_per_ref['p'][reference_id], results_per_ref['r'][reference_id], results_per_ref['f'][reference_id]))
                print()
            else:
                print(prepare_results(metric, results['p'], results['r'], results['f']))
        print()

In [29]:
file_path="/content/drive/Othercomputers/My Laptop/DriveAccess/GrinchAllGoldAndMySummaries-nli-distilroberta-base-v2-Arxiv.json"

In [30]:
try:
  with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)
    mysummaries = data.get('all_summary', [])
    goldstandardsummaries = data.get('all_goldstandard', [])
    documents = data.get('all_document', [])

    print("Data read successfully.")
except FileNotFoundError:
  print(f"Error: File '{file_path}' not found.")
  mysummaries, goldstandardsummaries, documents  = []
except Exception as e:
  print(f"Error: {e}")
  mysummaries, goldstandardsummaries, documents  = []

Data read successfully.


In [31]:
len(documents)

6440

In [32]:
dataset = load_dataset("scientific_papers", "arxiv")
len(dataset['test']['article'])

6440

In [33]:
rougeEvaluation(mysummaries, goldstandardsummaries)

Evaluation with Avg
	rouge-1:	P: 29.37	R: 51.05	F1: 42.70
	rouge-2:	P:  8.49	R: 14.90	F1: 12.41
	rouge-3:	P:  3.07	R:  5.37	F1:  4.47
	rouge-4:	P:  1.47	R:  2.55	F1:  2.13
	rouge-l:	P: 28.02	R: 44.36	F1: 38.62
	rouge-w:	P: 12.94	R: 11.59	F1: 11.44



In [34]:
def find_duplicates_indexes(lst):
    duplicates = {}
    for i, item in enumerate(lst):
        if item in duplicates:
            duplicates[item].append(i)
        else:
            duplicates[item] = [i]

    # Extract index lists for duplicate values
    duplicate_indexes = [indexes for indexes in duplicates.values() if len(indexes) > 1]

    # Flatten the list of index lists
    flattened_indexes = [index for indexes in duplicate_indexes for index in indexes]

    return flattened_indexes



result = find_duplicates_indexes(goldstandardsummaries)
print(result)

result = find_duplicates_indexes(documents)
print(result)

[]
[]


In [35]:
def find_missing_values_indexes(list1, list2):
    missing_indexes = [index for index, value in enumerate(list1) if value not in list2]
    return missing_indexes

# Example usage:
list1 = dataset['test']['article']
list2 = documents
missing_indexes = find_missing_values_indexes(list1, list2)
print(missing_indexes)
len(missing_indexes)

[]


0

In [36]:
sentencenumber=0
for i in range(len(mysummaries)):
  sentencenumber=sentencenumber+textSentenceCount(mysummaries[i])
avg=sentencenumber/len(mysummaries)
avg

8.755124223602484

In [37]:
sentencenumber=0
for i in range(len(goldstandardsummaries)):
  sentencenumber=sentencenumber+textSentenceCount(goldstandardsummaries[i])
avg=sentencenumber/len(goldstandardsummaries)
avg

6.226708074534161

In [38]:
# Play an audio beep. Any audio URL will do.
from google.colab import output
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

In [39]:
print(len(mysummaries))

6440


In [40]:
'''for i in range (1):
  duplicatedItemIndex=5437
  del mysummaries[duplicatedItemIndex]
  del goldstandardsummaries[duplicatedItemIndex]
  del documents[duplicatedItemIndex]

result = find_duplicates_indexes(goldstandardsummaries)
print(result)

result = find_duplicates_indexes(documents)
print(result)
print(len(result))'''

'for i in range (1):\n  duplicatedItemIndex=5437\n  del mysummaries[duplicatedItemIndex]\n  del goldstandardsummaries[duplicatedItemIndex]\n  del documents[duplicatedItemIndex]\n\nresult = find_duplicates_indexes(goldstandardsummaries)\nprint(result)\n\nresult = find_duplicates_indexes(documents)\nprint(result)\nprint(len(result))'

In [41]:
#file_path_new="/content/drive/Othercomputers/My Laptop/DriveAccess/GrinchAllGoldAndMySummaries-nli-distilroberta-base-v2-Arxiv2.json"

In [42]:
'''data = {
  "all_summary": mysummaries,
  "all_goldstandard": goldstandardsummaries,
  "all_document": documents
}

with open(file_path_new, "w") as json_file:
  json.dump(data, json_file)'''

'data = {\n  "all_summary": mysummaries,\n  "all_goldstandard": goldstandardsummaries,\n  "all_document": documents\n}\n\nwith open(file_path_new, "w") as json_file:\n  json.dump(data, json_file)'