<a href="https://colab.research.google.com/github/tubagokhan/GrinCH/blob/main/Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so files stored on Drive can be opened
# by path from this runtime. Colab-only: google.colab is not available elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install py-rouge==1.1
!pip install sentence_transformers
!pip install evaluate
!pip install bert_score



In [3]:
import rouge
from sentence_transformers import SentenceTransformer
import numpy as np
from nltk import sent_tokenize

import nltk
# sent_tokenize needs the Punkt sentence-tokenizer data. NLTK 3.8.2 and later
# look up the 'punkt_tab' resource instead of the pickled 'punkt' package, so
# fetch both: whichever NLTK version the runtime provides will find its data.
# nltk.download reports an unavailable package by printing a message and
# returning False rather than raising, so the extra call is harmless.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## ROUGE

In [4]:
def prepare_results(m, p, r, f):
    """Format one metric's precision, recall and F-score as a tab-separated line.

    Args:
        m: Metric label placed at the start of the line (e.g. 'rouge-1').
        p: Precision as a fraction; printed as a percentage.
        r: Recall as a fraction; printed as a percentage.
        f: F-score as a fraction; printed as a percentage.

    Returns:
        A string of the form '\\t<m>:\\tP: xx.xx\\tR: xx.xx\\tF1: xx.xx', each
        number shown with two decimals in a field at least five characters wide.
    """
    template = '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'
    # Scale the three fractions to percentages in one go.
    precision_pct, recall_pct, f_score_pct = (100.0 * value for value in (p, r, f))
    return template.format(m, 'P', precision_pct, 'R', recall_pct, 'F1', f_score_pct)

def rougeEvaluation(all_hypothesis, all_references):
    """Print ROUGE-N (N = 1..4), ROUGE-L and ROUGE-W scores for a set of summaries.

    Args:
        all_hypothesis: The generated summaries to evaluate.
        all_references: The reference summaries, passed to the evaluator
            alongside `all_hypothesis`.

    Returns:
        None. One line per metric (precision, recall, F-score as percentages)
        is printed for each aggregation mode.
    """
    # Only the averaged mode is enabled; the 'Best' and per-reference paths
    # below are kept so that other modes can be added to this list.
    for aggregator in ['Avg']:
        print('Evaluation with {}'.format(aggregator))
        use_avg = aggregator == 'Avg'
        use_best = aggregator == 'Best'
        is_aggregated = use_avg or use_best

        rouge_options = dict(
            metrics=['rouge-n', 'rouge-l', 'rouge-w'],
            max_n=4,
            limit_length=False,
            length_limit=1000,
            length_limit_type='words',
            apply_avg=use_avg,
            apply_best=use_best,
            # NOTE(review): 0.2 is not an evenly balanced F-measure; py-rouge's
            # documented default is 0.5, so the value printed as "F1" is a
            # weighted F-score. Confirm the weighting direction in py-rouge.
            alpha=0.2,
            weight_factor=1.2,
            stemming=True,
        )
        evaluator = rouge.Rouge(**rouge_options)

        scores = evaluator.get_scores(all_hypothesis, all_references)

        # Report metrics in alphabetical order of their names.
        for metric in sorted(scores):
            results = scores[metric]

            if is_aggregated:
                # A single p/r/f triple per metric.
                print(prepare_results(metric, results['p'], results['r'], results['f']))
                continue

            # Not aggregated: `results` is a list with one entry per hypothesis,
            # each holding parallel per-reference lists under 'p', 'r' and 'f'.
            for hypothesis_id, per_reference in enumerate(results):
                for reference_id in range(len(per_reference['p'])):
                    print('\tHypothesis #{} & Reference #{}: '.format(hypothesis_id, reference_id))
                    print('\t' + prepare_results(metric,
                                                 per_reference['p'][reference_id],
                                                 per_reference['r'][reference_id],
                                                 per_reference['f'][reference_id]))
            print()
        print()

## ReSCORE (redundancy score: mean pairwise sentence similarity)

In [5]:

def cosine(u, v):
    """Return the cosine similarity of vectors `u` and `v`.

    Computed as dot(u, v) / (||u|| * ||v||). No special handling is applied
    when either vector has zero norm.
    """
    inner_product = np.dot(u, v)
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    return inner_product / norm_product

# SentenceTransformer instances keyed by model name. Scoring many documents
# through redundancyMeasure would otherwise rebuild the model on every call.
_SENTENCE_MODEL_CACHE = {}


def _load_sentence_model(model_name):
    """Return the SentenceTransformer for `model_name`, constructing it only once."""
    if model_name not in _SENTENCE_MODEL_CACHE:
        _SENTENCE_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_MODEL_CACHE[model_name]


def redundancyMeasure(corpus, model_name='bert-base-nli-mean-tokens'):
    """Measure how redundant a text is as its mean pairwise sentence similarity.

    The text is split into sentences, each sentence is embedded, and the
    absolute cosine similarity of every unordered sentence pair is averaged.

    Args:
        corpus: The text to score, as a single string.
        model_name: Name of the sentence-transformers model used for the
            embeddings. The default is the model this notebook has always used.

    Returns:
        The mean absolute cosine similarity over all sentence pairs, rounded
        to two decimals, or 0 when the text has fewer than two sentences.
    """
    sentences = sent_tokenize(corpus)
    length = len(sentences)

    # With fewer than two sentences there is no pair to compare.
    if length < 2:
        return 0

    sentence_embeddings = _load_sentence_model(model_name).encode(sentences)

    # Strictly lower-triangular matrix: entry [x][y] with x > y holds the
    # absolute cosine similarity of sentences x and y; all other entries stay 0.
    sentenceGraph = np.zeros((length, length))
    for x in range(1, length):
        for y in range(x):
            similarity = cosine(sentence_embeddings[x], sentence_embeddings[y])
            sentenceGraph[x][y] = abs(similarity)

    # Sum of the filled entries divided by the number of unordered pairs.
    SumElement = np.concatenate(sentenceGraph).sum()
    divide = (length - 1) * length / 2
    similarityScore = SumElement / divide

    return round(similarityScore, 2)

def ReSCORE(corpusList):
  """Return the mean redundancy score of a collection of documents.

  Each document is scored with redundancyMeasure; the arithmetic mean of those
  scores is returned, rounded to two decimals. An empty collection raises
  ZeroDivisionError.
  """
  per_document_scores = [redundancyMeasure(document) for document in corpusList]
  mean_redundancy = sum(per_document_scores) / len(per_document_scores)
  return round(mean_redundancy, 2)

## READ FILES (generated summaries and gold standards)

In [6]:
# Load the generated summaries and their gold-standard summaries from Drive.

import json
import os

# JSON file holding both lists; adjust to your own Drive layout.
file_path = "/content/drive/Othercomputers/My Laptop/DriveAccess/GrinchAllGoldAndMySummaries-nli-distilroberta-base-v2-PubMed.json"

# Fallback values used when the file is missing or a key is absent.
all_summary=[]
all_goldstandart=[]

if os.path.exists(file_path):
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        existing_data = json.load(file)

    all_summary = existing_data.get("all_summary", [])
    # The JSON key is spelled "all_goldstandard"; the notebook variable keeps
    # its historical spelling because later cells refer to it by that name.
    all_goldstandart = existing_data.get("all_goldstandard", [])
else:
    # Make the failure visible here instead of as an obscure error later on.
    print(f"WARNING: {file_path} not found; all_summary and all_goldstandart are empty.")

# The evaluation cells pair the two lists element by element.
if len(all_summary) != len(all_goldstandart):
    print(
        f"WARNING: {len(all_summary)} summaries but "
        f"{len(all_goldstandart)} gold standards were loaded."
    )


## EVALUATION

In [7]:
# Sanity check: number of generated summaries that were loaded.
len(all_summary)

10

In [8]:
# Sanity check: number of gold-standard summaries that were loaded; this
# should equal the number of generated summaries.
len(all_goldstandart)

10

In [9]:
# Print ROUGE scores with the generated summaries as hypotheses and the gold
# standards as references.
rougeEvaluation(all_summary, all_goldstandart)

Evaluation with Avg
	rouge-1:	P: 39.17	R: 60.11	F1: 49.18
	rouge-2:	P: 16.24	R: 22.58	F1: 19.75
	rouge-3:	P:  8.34	R: 10.80	F1:  9.97
	rouge-4:	P:  5.55	R:  7.09	F1:  6.60
	rouge-l:	P: 37.56	R: 52.14	F1: 45.27
	rouge-w:	P: 18.79	R: 14.90	F1: 14.07



In [10]:
# Play an audio beep in the browser, presumably to signal that the preceding
# cell has finished. Any audio URL will do. Colab-only: relies on
# google.colab.output to run JavaScript in the notebook front end.
from google.colab import output
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

In [11]:
# BERTScore between the generated summaries (first argument) and the gold
# standards (second argument), scored with distilbert-base-uncased.
from bert_score import score

# With return_hash=True the call also returns a string identifying the scoring
# configuration; it is printed in front of the results.
(P, R, F), hashname = score(all_summary, all_goldstandart, lang="en", return_hash=True , model_type="distilbert-base-uncased")
# Report the mean of each of P, R and F.
print(
    f"{hashname}: P={P.mean().item():.6f} R={R.mean().item():.6f} F={F.mean().item():.6f}"
)

distilbert-base-uncased_L5_no-idf_version=0.3.12(hug_trans=4.31.0): P=0.782079 R=0.814791 F=0.797278


In [12]:
# Play an audio beep in the browser, presumably to signal that the preceding
# cell has finished. Any audio URL will do. Colab-only: relies on
# google.colab.output to run JavaScript in the notebook front end.
from google.colab import output
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

In [13]:
# Mean redundancy score of the generated summaries.
print(ReSCORE(all_summary))

0.52


In [14]:
# Mean redundancy score of the gold-standard summaries, for comparison with
# the score of the generated summaries.
print(ReSCORE(all_goldstandart))

0.45


In [15]:
# Play an audio beep in the browser, presumably to signal that the preceding
# cell has finished. Any audio URL will do. Colab-only: relies on
# google.colab.output to run JavaScript in the notebook front end.
from google.colab import output
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')