### Functions for Calculating Metrics

In [1]:
# The metric-calculation code is adapted from:
# https://github.com/rxn4chemistry/paragraph2actions/blob/main/src/paragraph2actions/analysis.py
# https://github.com/rxn4chemistry/paragraph2actions/blob/main/src/paragraph2actions/scripts/calculate_metrics.py

import os
import pandas as pd
from typing import Tuple, Sequence, List, Optional, Iterable, Any, Iterator, Callable
import textdistance
from nltk.translate.bleu_score import corpus_bleu
import matplotlib.pyplot as plt

def all_identical(sequence: Sequence[Any]) -> bool:
    """
    Tell whether every element of a sequence equals its first element.

    An empty sequence is considered identical (returns True), since there is
    no element that could differ.

    Args:
        sequence: indexable collection of values to compare.

    Returns:
        True if no element differs from the first one, False otherwise.
    """
    for item in sequence:
        # Compare against the first element; stop at the first mismatch.
        if not (item == sequence[0]):
            return False
    return True


def highlight_differences(source_sentences: List[str], translations: Sequence[List[str]]) -> None:
    """
    Print every source sentence for which the models disagree on the translation.

    For each such sentence, the sample index and source text are printed,
    followed by one numbered line per model (numbering starts at 1) and a
    blank line.

    Args:
        source_sentences: Sentences to translate (length: L)
        translations: One list of translations per model (size: n_models x L)
    """
    n_sentences = len(source_sentences)
    # Every model must provide exactly one translation per source sentence.
    assert all(len(model_output) == n_sentences for model_output in translations)

    for index, source in enumerate(source_sentences):
        candidates = [model_output[index] for model_output in translations]

        # Nothing to report when all models agree.
        if all_identical(candidates):
            continue

        print(f'Sample {index}: {source}')
        for rank, candidate in enumerate(candidates, start=1):
            print(f'{rank}) {candidate}')
        print()


def full_sentence_accuracy(truth: List[str], pred: List[str]) -> float:
    """
    Calculate the fraction of predictions that exactly match the ground truth.

    Args:
        truth: ground truth sentences
        pred: predicted sentences, aligned with `truth` (same length)

    Returns:
        Number of exact matches divided by the number of ground truth sentences.
    """
    assert len(truth) == len(pred)

    exact_matches = 0
    for expected, predicted in zip(truth, pred):
        if expected == predicted:
            exact_matches += 1

    return exact_matches / len(truth)

def modified_bleu(truth: List[str], pred: List[str]) -> float:
    """
    Compute a corpus-level BLEU score that does not punish very short sentences.

    Sentences are split on whitespace, and any sentence with fewer than 4 tokens
    is padded with empty-string tokens up to 4 tokens before scoring. Without
    this padding, even correct translations of very short sentences would get
    a score of zero.

    Args:
        truth: ground truth sentences
        pred: predicted sentences, aligned with `truth`

    Returns:
        value between 0 and 1.
    """
    min_tokens = 4

    def _tokenize_and_pad(sentence: str) -> List[str]:
        tokens = sentence.split()
        missing = max(0, min_tokens - len(tokens))
        return tokens + [''] * missing

    # corpus_bleu accepts several references per candidate, so each single
    # reference is wrapped in its own list.
    wrapped_references = [[_tokenize_and_pad(sentence)] for sentence in truth]
    padded_candidates = [_tokenize_and_pad(sentence) for sentence in pred]

    return corpus_bleu(wrapped_references, padded_candidates)


def original_bleu(truth: List[str], pred: List[str]) -> float:
    """
    Compute the corpus-level BLEU score with nltk's unmodified `corpus_bleu`.

    Sentences are split on whitespace; no padding is applied.

    Args:
        truth: ground truth sentences
        pred: predicted sentences, aligned with `truth`

    Returns:
        value between 0 and 1.
    """
    # Each candidate may have several references in nltk's API; here there is
    # exactly one, hence the extra level of nesting.
    wrapped_references = [[sentence.split()] for sentence in truth]
    tokenized_candidates = [sentence.split() for sentence in pred]

    return corpus_bleu(wrapped_references, tokenized_candidates)


def bleu2(truth: List[str], pred: List[str]) -> float:
    """
    Calculates the BLEU-2 score of a translation, i.e. corpus BLEU with equal
    weights on unigrams and bigrams only.

    Sentences are split on whitespace; no padding is applied.

    Args:
        truth: ground truth sentences
        pred: predicted sentences, aligned with `truth`

    Returns:
        value between 0 and 1.
    """
    references = [sentence.split() for sentence in truth]
    candidates = [sentence.split() for sentence in pred]

    # references must have a larger depth because it supports multiple choices
    refs = [[r] for r in references]
    return corpus_bleu(refs, candidates, weights=(0.5, 0.5))


def levenshtein_similarity(truth: List[str], pred: List[str]) -> float:
    """
    Average the normalized Levenshtein similarity over all sentence pairs.

    Args:
        truth: ground truth sentences
        pred: predicted sentences, aligned with `truth` (same length)

    Returns:
        Sum of the per-pair normalized similarities divided by the number of
        ground truth sentences.
    """
    assert len(truth) == len(pred)

    total = 0
    for expected, predicted in zip(truth, pred):
        total += textdistance.levenshtein.normalized_similarity(expected, predicted)

    return total / len(truth)


def partial_accuracy(truth: List[str], pred: List[str], threshold: float) -> float:
    """
    Compute the fraction of predictions that are "close enough" to the ground truth.

    A prediction counts as a match when its normalized Levenshtein similarity
    to the corresponding ground truth is at least `threshold`.

    For threshold == 1.0, this function is equivalent to full_sentence_accuracy.

    Args:
        truth: ground truth action sequences
        pred: predicted action sequences, aligned with `truth` (same length)
        threshold: minimal similarity to count as a partial match, between 0 and 1

    Returns:
        Number of matches divided by the number of ground truth sequences.
    """
    assert len(truth) == len(pred)

    matches = 0
    for expected, predicted in zip(truth, pred):
        similarity = textdistance.levenshtein.normalized_similarity(expected, predicted)
        if similarity >= threshold:
            matches += 1

    return matches / len(truth)

### Calculating Metrics

In [2]:
"""Calculate metrics for predictions generated by one or several translation models"""

# Read the model predictions together with their ground truth from the results CSV.
# NOTE(review): a `.fillna("0")` call was left disabled on this line; the BLEU
# helpers call `.split()` on every entry, so both columns are presumably free of
# missing values -- TODO confirm.
df = pd.read_csv("results/predictions/finetuned_gpt3.5_hand_annotated_train_augmented_unique_5_epoch.csv")

ground_truth = list(df['Actual Text'])
prediction = list(df['Generated Text'])

# Each entry: printed label, metric function, extra positional arguments
# passed after (ground_truth, prediction).
_metric_table = (
    ('Modified BLEU, pr:', modified_bleu, ()),
    ('BLEU-2, pr:', bleu2, ()),
    ('Levenshtein, pr:', levenshtein_similarity, ()),
    ('100% accuracy, pr:', partial_accuracy, (1.0,)),
    ('90% accuracy, pr:', partial_accuracy, (0.9,)),
    ('75% accuracy, pr:', partial_accuracy, (0.75,)),
)

# Evaluate and report the metrics one after the other, in table order.
for _label, _metric, _extra_args in _metric_table:
    print(_label, _metric(ground_truth, prediction, *_extra_args))
print()

Modified BLEU, pr: 0.864221426194361
BLEU-2, pr: 0.8859320523310793
Levenshtein, pr: 0.8991550985984681
100% accuracy, pr: 0.6903409090909091
90% accuracy, pr: 0.78125
75% accuracy, pr: 0.8693181818181818

