In [2]:
import json

with open('../reviews/notes.jsonl') as f:
    reviews = [json.loads(line) for line in f]

In [3]:
from typing import Dict, List, Any

def extract_review_messages(review: Dict[str, Any]) -> List[str]:
    """
    Extracts the review messages from the given review data structure.

    The function navigates through the nested structure of the input dictionary,
    specifically looking for a 'reviews_msg' key. It then iterates over its contents,
    extracting the 'review' field from each 'content' sub-dictionary.

    Args:
        review (Dict[str, Any]): A dictionary containing review information, 
        where the key 'reviews_msg' is expected to be a list of dictionaries
        with a 'content' key, which in turn should contain a 'review' key with the actual message.

    Returns:
        List[str]: A list of review messages extracted from the input dictionary. 
        If no relevant messages are found, returns an empty list.
    """
    review_messages = []
    if 'reviews_msg' in review:
        for message in review['reviews_msg']:
            if 'content' in message and 'review' in message['content']:
                review_messages.append(message['content']['review'])

    return review_messages


In [4]:
review_messages = extract_review_messages(reviews[0])

In [4]:
print("Length of review messages:", len(review_messages))

Length of review messages: 3


# Evaluation

In [5]:
human_review_messages = review_messages[:2]
generated_review_messages = review_messages[2:]

## ROUGE-L and Cosine Similarity

In [7]:
import numpy as np
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


ROUGEL_SCORER = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
COSINE_SIM_EMBED = SentenceTransformer('all-MiniLM-L6-v2')

In [11]:
def calculate_rouge_l(human_reviews: List[str], generated_reviews: List[str]) -> float:
    """
    Calculates the ROUGE-L score between two sets of reviews, typically human-written and generated.

    ROUGE-L is a measure of the longest common subsequence and is used to assess the quality
    of generated text in comparison to reference text. This function computes the ROUGE-L
    fmeasure for each pair of human and generated reviews and returns the average score.

    Args:
        human_reviews (List[str]): A list of reference reviews written by humans.
        generated_reviews (List[str]): A list of reviews generated by a model.

    Returns:
        float: The average ROUGE-L fmeasure score across all pairs of human and generated reviews.
    """
    scores = []
    for h_review in human_reviews:
        for g_review in generated_reviews:
            scores.append(ROUGEL_SCORER.score(h_review, g_review)['rougeL'].fmeasure)

    return np.mean(scores)

def calculate_cosine_similarity(human_reviews: List[str], generated_reviews: List[str]) -> float:
    """
    Calculates the cosine similarity between the embeddings of two sets of reviews.

    This function first encodes the reviews into embeddings using a predefined embedding model,
    then calculates the cosine similarity between each pair of human and generated review embeddings,
    and finally returns the average similarity score.

    Args:
        human_reviews (List[str]): A list of reference reviews written by humans.
        generated_reviews (List[str]): A list of reviews generated by a model.

    Returns:
        float: The average cosine similarity score between the embeddings of human and generated reviews.
    """
    human_reviews_embeddings = COSINE_SIM_EMBED.encode(human_reviews)
    generated_reviews_embeddings = COSINE_SIM_EMBED.encode(generated_reviews)

    scores = []
    for h_review, g_review in zip(human_reviews_embeddings, generated_reviews_embeddings):
        scores.append(cosine_similarity([h_review], [g_review])[0][0])

    return np.mean(scores)

In [12]:
rouge_score = calculate_rouge_l(human_review_messages, generated_review_messages)
print("Rouge-L Score:", rouge_score)

Rouge-L Score: 0.1803172009459371


In [13]:
cosine_similarity_score = calculate_cosine_similarity(human_review_messages, generated_review_messages)
print("Cosine Similarity Score:", cosine_similarity_score)

Cosine Similarity Score: 0.5914192


## LLM Evaluation

In [1]:
from openai import OpenAI

api_key = "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"
# Initialize the OpenAI client
client = OpenAI(api_key=api_key)

In [None]:
system_prompt_template = """
Your task is to carefully analyze and accurately match the key concerns raised in two reviews, ensuring a strong correspondence between the matched points. Examine the verbatim closely.

=====Review A: 
{Review_A}

===== 

=====Review B: 
{Review_B}

===== 

Please follow the example JSON format below for matching points. For instance, if point from review A is nearly identical to point from review B, it should look like this:
{{ 
"A3-B2": {{
    "rationale": "/explain why A3 and B2 are nearly identical/",
    "similarity": "/5-10, only an integer/"
}},
...
}}

**Note that you should only match points with a significant degree of similarity in their concerns. Refrain from matching points with only superficial similarities or weak connections.** For each matched pair, rate the similarity on a scale of 5-10:
- 5 Somewhat Related: Points address similar themes but from different angles.
- 6 Moderately Related: Points share a common theme but with different perspectives or suggestions.
- 7 Strongly Related: Points are largely aligned but differ in some details or nuances.
- 8 Very Strongly Related: Points offer similar suggestions or concerns, with slight differences.
- 9 Almost Identical: Points are nearly the same, with minor differences in wording or presentation.
- 10 Identical: Points are exactly the same in terms of concerns, suggestions, or praises.

If no match is found, output an empty JSON object. Provide your output as JSON only.
"""

In [7]:
import json
import re
from typing import List, Tuple

def concatenate_reviews(reviews: List[str], review_type: str) -> str:
    """
    Concatenates a list of reviews into a single string, with each review prefixed by its type and index.

    Args:
        reviews (List[str]): A list of review strings.
        review_type (str): A character ('A' or 'B') indicating the type of the reviews.

    Returns:
        str: A single string containing all reviews, each prefixed by its type and index.

    Raises:
        ValueError: If the review type is not 'A' or 'B'.
    """
    if review_type not in ['A', 'B']:
        raise ValueError("type must be either 'A' or 'B'")
    
    concatenate_msg = ""
    for i, review in enumerate(reviews):
        concatenate_msg += f"{review_type}{i+1}: {review}\n"
    
    return concatenate_msg

def does_key_match_pattern(key: str) -> bool:
    """
    Checks if a given key matches the pattern 'A<integer>-B<integer>' or 'B<integer>-A<integer>'.

    Args:
        key (str): The key string to check.

    Returns:
        bool: True if the key matches the pattern, False otherwise.
    """
    pattern = r'\b(A\d+-B\d+|B\d+-A\d+)\b$'
    return bool(re.fullmatch(pattern, key))

def count_hit(output_content: str) -> int:
    """
    Counts the number of keys in a JSON string that match a specific pattern and have a 'similarity' score within a certain range.

    Args:
        output_content (str): A JSON string containing the output data.

    Returns:
        int: The count of keys that meet the criteria.
    """
    content = json.loads(output_content)
    hit_cnt = 0
    for key in content.keys():
        if does_key_match_pattern(key) and 'similarity' in content[key]:
            score = int(content[key]['similarity'])
            if 7 <= score <= 10:
                hit_cnt += 1
    return hit_cnt

def calculate_hit_rate(human_review_messages: List[str], generated_review_messages: List[str]) -> Tuple[float, float, float, float]:
    """
    Calculates various statistical measures based on the similarity scores between human and generated review messages.

    Args:
        human_review_messages (List[str]): A list of human-written review messages.
        generated_review_messages (List[str]): A list of review messages generated by a model.

    Returns:
        Tuple[float, float, float, float]: A tuple containing the hit rate, Szymkiewicz–Simpson coefficient, 
        Jaccard index, and Sørensen–Dice coefficient.
    """
    reviews_A = concatenate_reviews(human_review_messages, "A")
    reviews_B = concatenate_reviews(generated_review_messages, "B")
    
    system_prompt = system_prompt_template.format(Review_A=reviews_A, Review_B=reviews_B)

    completion = client.chat.completions.create(
        model="gpt-4-0125-preview",
        messages=[
            {"role": "system", "content": system_prompt}
        ]
    )

    hit_cnt = count_hit(completion.choices[0].message)
    total_human_reviews = len(human_review_messages)
    total_generated_reviews = len(generated_review_messages)

    hit_rate = hit_cnt / total_human_reviews
    szymkiewicz_simpson_coefficient = hit_cnt / min(total_human_reviews, total_generated_reviews)
    jaccard_index = hit_cnt / (total_human_reviews + total_generated_reviews)
    sorensen_dice_coefficient = 2 * hit_cnt / (total_human_reviews + total_generated_reviews)

    return hit_rate, szymkiewicz_simpson_coefficient, jaccard_index, sorensen_dice_coefficient

In [12]:
hit_rate, szymkiewicz_simpson_coefficient, jaccard_index, sorensen_dice_coefficient = calculate_hit_rate(human_review_messages, generated_review_messages)
print("Hit Rate:", hit_rate)
print("Szymkiewicz–Simpson Overlap Coefficient:", szymkiewicz_simpson_coefficient)
print("Jaccard Index:", jaccard_index)
print("Sørensen–Dice Coefficient:", sorensen_dice_coefficient)

Hit Rate: 0.5
Szymkiewicz–Simpson Overlap Coefficient: 1.0
Jaccard Index: 0.3333333333333333
Sørensen–Dice Coefficient: 0.6666666666666666
