<a href="https://colab.research.google.com/github/tubagokhan/GrinCH/blob/main/OnlyTextRank.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
!pip install datasets
!pip install py-rouge==1.1



In [36]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import re
from datasets import load_dataset
import time
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import networkx as nx

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
def textSentenceCount(Text):
    number_of_sentences = sent_tokenize(Text)
    count=(len(number_of_sentences))
    return count

def preprocess_text(text):

    # Tokenize the text into sentences and words
    sentences = sent_tokenize(text)
    words = [word_tokenize(sent) for sent in sentences]

    # Convert words to lowercase
    words = [[word.lower() for word in word_list] for word_list in words]

    # Remove stopwords and punctuation
    stop_words = set(stopwords.words("english"))
    words = [
        [word for word in word_list if word.isalnum() and word not in stop_words]
        for word_list in words
    ]

    return words, sentences

def sentence_similarity(sentence1, sentence2):
    # Calculate the similarity between two sentences using cosine distance
    all_words = list(set(sentence1 + sentence2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)

    for word in sentence1:
        vector1[all_words.index(word)] += 1

    for word in sentence2:
        vector2[all_words.index(word)] += 1

    return 1 - cosine_distance(vector1, vector2)

def build_similarity_matrix(sentences):
    # Build similarity matrix based on sentence similarities
    similarity_matrix = np.zeros((len(sentences), len(sentences)))

    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                similarity_matrix[i][j] = sentence_similarity(sentences[i], sentences[j])

    return similarity_matrix

def textrank_summarize(text, num_sentences):
    # Preprocess the text
    words, sentences = preprocess_text(text)

    # Build similarity matrix
    similarity_matrix = build_similarity_matrix(words)

    # Apply PageRank algorithm
    nx_graph = nx.from_numpy_array(similarity_matrix)
    try:
        scores = nx.pagerank(nx_graph, max_iter=1000, tol=1e-6)
    except nx.PowerIterationFailedConvergence:
        print("######### Error with PageRank")
        print(sentences[0])
        summary=sentences[0]
        return summary

    # Rank sentences based on their scores
    ranked_sentences = sorted(((scores[i], sentence) for i, sentence in enumerate(sentences)), reverse=True)

    # Select the top N sentences for the summary
    summary_sentences = [sentence for score, sentence in ranked_sentences[:num_sentences]]
    summary = " ".join(summary_sentences)

    return summary


In [38]:
dataset = load_dataset("scientific_papers", "pubmed")

startTimeforOverall = time.time()
all_summary = []
all_goldstandart = []

N = 500
startN = 0

for d in range(N):
    startTimeforDocument = time.time()
    print("Document:", startN + d + 1)
    corpus = dataset['train']['article'][startN + d]
    print("Document sentence number:", textSentenceCount(corpus))

    if textSentenceCount(corpus) > 8:
        summary=textrank_summarize(corpus, 8)
        print("Summary sentence number:", textSentenceCount(summary))
        all_summary.append(summary)
    else:
      all_summary.append(corpus)
      print("Corpus is less than 9 sentence, summarization didn't apply sentence number:", textSentenceCount(corpus))

    all_goldstandart.append(dataset['train']['abstract'][startN + d])
    elapsedTimeforDocument = time.time() - startTimeforDocument
    elapsedTimeforAll = time.time() - startTimeforOverall
    print('Document processing time: '+time.strftime("%M:%S", time.gmtime(elapsedTimeforDocument)))
    print('Total processing time: '+time.strftime("%d:%H:%M:%S", time.gmtime(elapsedTimeforAll)))

    print("----------------------------------")


Document: 1
Document sentence number: 173
Summary sentence number: 8
Document processing time: 00:04
Total processing time: 01:00:00:04
----------------------------------
Document: 2
Document sentence number: 81
Summary sentence number: 8
Document processing time: 00:03
Total processing time: 01:00:00:07
----------------------------------
Document: 3
Document sentence number: 45
######### Error with PageRank
tardive dystonia ( td ) , a rarer side effect after longer exposure to antipsychotics , is characterized by local or general , sustained , involuntary contraction of a muscle or muscle group , with twisting movements , generally slow , which may affect the limbs , trunk , neck , or face .
Summary sentence number: 1
Document processing time: 00:03
Total processing time: 01:00:00:11
----------------------------------
Document: 4
Document sentence number: 199
Summary sentence number: 8
Document processing time: 00:11
Total processing time: 01:00:00:22
---------------------------------

In [39]:
import rouge

def prepare_results(m, p, r, f):
    return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(m, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)

def rougeEvaluation(all_hypothesis, all_references):

    for aggregator in ['Avg']:
        print('Evaluation with {}'.format(aggregator))
        apply_avg = aggregator == 'Avg'
        apply_best = aggregator == 'Best'

        evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                               max_n=4,
                               limit_length=False,
                               length_limit=1000,
                               length_limit_type='words',
                               apply_avg=apply_avg,
                               apply_best=apply_best,
                               alpha=0.2, # Default F1_score
                               weight_factor=1.2,
                               stemming=True)

        scores = evaluator.get_scores(all_hypothesis, all_references)

        for metric, results in sorted(scores.items(), key=lambda x: x[0]):
            if not apply_avg and not apply_best: # value is a type of list as we evaluate each summary vs each reference
                for hypothesis_id, results_per_ref in enumerate(results):
                    nb_references = len(results_per_ref['p'])
                    for reference_id in range(nb_references):
                        print('\tHypothesis #{} & Reference #{}: '.format(hypothesis_id, reference_id))
                        print('\t' + prepare_results(metric,results_per_ref['p'][reference_id], results_per_ref['r'][reference_id], results_per_ref['f'][reference_id]))
                print()
            else:
                print(prepare_results(metric, results['p'], results['r'], results['f']))
        print()



In [40]:
len(all_goldstandart)

500

In [41]:
rougeEvaluation(all_summary, all_goldstandart)

Evaluation with Avg
	rouge-1:	P: 37.85	R: 41.76	F1: 36.67
	rouge-2:	P: 14.57	R: 16.24	F1: 14.36
	rouge-3:	P:  8.33	R:  8.81	F1:  7.89
	rouge-4:	P:  5.69	R:  5.73	F1:  5.18
	rouge-l:	P: 38.32	R: 40.63	F1: 36.92
	rouge-w:	P: 20.68	R: 11.06	F1: 11.18



In [42]:

# Play an audio beep. Any audio URL will do.
from google.colab import output
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

In [43]:
# Save files

import json
import os


# Define the file path
file_path = "/content/drive/Othercomputers/My Laptop/DriveAccess/OnlySimpleCTextRank-PubMed.json"  # Replace with the desired file path

# Check if the file already exists
if os.path.exists(file_path):
    # Read the existing data from the JSON file
    with open(file_path, "r") as file:
        existing_data = json.load(file)

    # Combine the existing data with the new data (lists)
    existing_summary = existing_data.get("all_summary", [])
    existing_goldstandart = existing_data.get("all_goldstandart", [])

    # Assuming you have three new lists: new_intermediate_summary, new_summary, new_goldstandart
    # Sample new lists (replace these with your actual lists)
    new_summary = all_summary
    new_goldstandart = all_goldstandart


    # Merge the new lists with the existing ones
    all_summary = existing_summary + new_summary
    all_goldstandart = existing_goldstandart + new_goldstandart

# Combine the lists into a dictionary for easy serialization
data = {
    "all_summary": all_summary,
    "all_goldstandart": all_goldstandart
}

# Save the data to a JSON file
with open(file_path, "w") as file:
    json.dump(data, file)

print("Lists have been saved to", file_path)

Lists have been saved to /content/drive/Othercomputers/My Laptop/DriveAccess/OnlySimpleCTextRank-PubMed.json


In [44]:
print(len(data['all_summary']))

500


In [45]:
rougeEvaluation(data['all_summary'], data['all_goldstandart'])

Evaluation with Avg
	rouge-1:	P: 37.85	R: 41.76	F1: 36.67
	rouge-2:	P: 14.57	R: 16.24	F1: 14.36
	rouge-3:	P:  8.33	R:  8.81	F1:  7.89
	rouge-4:	P:  5.69	R:  5.73	F1:  5.18
	rouge-l:	P: 38.32	R: 40.63	F1: 36.92
	rouge-w:	P: 20.68	R: 11.06	F1: 11.18



In [46]:
# Play an audio beep. Any audio URL will do.
from google.colab import output
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

In [47]:

sentencenumber=0
for i in range(len(all_summary)):
  sentencenumber=sentencenumber+textSentenceCount(all_summary[i])

avg=sentencenumber/len(all_summary)
avg

6.3

In [48]:

sentencenumber=0
for i in range(len(all_goldstandart)):
  sentencenumber=sentencenumber+textSentenceCount(all_goldstandart[i])

avg=sentencenumber/len(all_goldstandart)
avg

7.474