In [5]:
import json

# Load the JSON Lines file: one JSON document per line.
# - encoding is pinned so decoding does not depend on the platform default;
# - whitespace-only lines (e.g. a trailing newline at end of file) are skipped
#   instead of making json.loads raise.
with open('../data/paper_review_data_longqlora_10pct.jsonl', encoding='utf-8') as f:
    reviews = [json.loads(line) for line in f if line.strip()]

In [3]:
from typing import Dict, List, Any

def extract_review_messages(review_data: Dict[str, List[Dict[str, Dict[str, Any]]]]) -> List[Dict[str, Any]]:
    """
    Collect the title, review text and rating of every complete review message.

    Args:
        review_data (Dict[str, List[Dict[str, Dict[str, Any]]]]): A record whose optional
        'reviews_msg' entry is a list of messages. Each message may carry a 'content'
        dictionary holding 'title', 'review' and 'rating'.

    Returns:
        List[Dict[str, Any]]: One dictionary per message that has all three fields, with
        exactly the keys 'title', 'review' and 'rating' (in that order). Messages missing
        'content' or any of the three fields are skipped. An absent 'reviews_msg' yields
        an empty list.
    """
    wanted_fields = ('title', 'review', 'rating')
    extracted = []

    for entry in review_data.get('reviews_msg', []):
        body = entry.get('content', {})
        # Guard clause: drop messages that lack any of the required fields.
        if any(field not in body for field in wanted_fields):
            continue
        # Copy only the wanted fields so extra keys in `content` are not carried over.
        extracted.append({field: body[field] for field in wanted_fields})

    return extracted

In [4]:
review_messages = extract_review_messages(reviews[0])

In [4]:
print("Length of review messages:", len(review_messages))

Length of review messages: 3


# Evaluation Dataset

In [48]:
import pandas as pd 
import json

In [3]:
# Load the toy evaluation set. The path is relative to the notebook's working directory.
# Later cells read the 'title', 'review', 'gpt-3.5-turbo-reviews' and
# 'gpt-4-full-reviews' columns from this frame.
df = pd.read_csv('../toy_data_w_gpt_reviews.csv')
# Preview the first rows only; the full frame is not displayed.
df.head()

Unnamed: 0,title,url,abstract,authors,review_title,review,rating,gpt-3.5-turbo-reviews,gpt-4-full-reviews
0,Pre-Training by Completing Point Clouds,https://openreview.net/pdf?id=jPSYH47QSZL,There has recently been a flurry of exciting a...,"Hanchen Wang,Qi Liu,Xiangyu Yue,Joan Lasenby,M...",A good one,This paper proposes a better pre-trained prior...,"7: Good paper, accept","{\n ""Significance and novelty"": {\n ...","{\n ""Significance and novelty"": {\n ""Intro..."
1,Pre-Training by Completing Point Clouds,https://openreview.net/pdf?id=jPSYH47QSZL,There has recently been a flurry of exciting a...,"Hanchen Wang,Qi Liu,Xiangyu Yue,Joan Lasenby,M...",paper shows promising results using point clou...,The paper considers the problem of training ne...,"7: Good paper, accept","{\n""Significance and novelty"": {\n ""Use of ...","{\n ""Significance and novelty"": {\n ""Intro..."
2,Pre-Training by Completing Point Clouds,https://openreview.net/pdf?id=jPSYH47QSZL,There has recently been a flurry of exciting a...,"Hanchen Wang,Qi Liu,Xiangyu Yue,Joan Lasenby,M...",Limited novelty and weak improvements,The authors propose completing an occluded poi...,4: Ok but not good enough - rejection,"{\n ""Significance and novelty"": {\n ...","{\n ""Significance and novelty"": {\n ""Novel..."
3,Pre-Training by Completing Point Clouds,https://openreview.net/pdf?id=jPSYH47QSZL,There has recently been a flurry of exciting a...,"Hanchen Wang,Qi Liu,Xiangyu Yue,Joan Lasenby,M...","Since the idea itself is simple enough, the re...",The idea of this paper is simple but fascinati...,5: Marginally below acceptance threshold,"{\n""Significance and novelty"": {\n ""Mask-ba...","{\n ""Significance and novelty"": {\n ""Innov..."
4,GamePad: A Learning Environment for Theorem Pr...,https://openreview.net/pdf?id=r1xwKoR9Y7,"In this paper, we introduce a system called Ga...","Daniel Huang,Prafulla Dhariwal,Dawn Song,Ilya ...",An intriguing integration of ML and automated ...,Summary: This paper mixes automated theorem pr...,"7: Good paper, accept","{\n ""Significance and novelty"": {\n ""Explo...","{\n ""Significance and novelty"": {\n ""Intro..."


In [6]:
from collections import defaultdict

# Group the review columns by paper title. Each title maps to three lists that grow
# together, one element per CSV row, so index i of every list comes from the same row.
# NOTE(review): this rebinds `reviews`, which an earlier cell used for the JSONL
# records (a list) — the two uses are unrelated.
reviews = defaultdict(
    lambda: {
        bucket: []
        for bucket in ('human_reviews', 'gpt-3.5-abstract-reviews', 'gpt-4-full-reviews')
    }
)

# Walk the four relevant columns in lockstep instead of materialising a Series per row.
for title, human_text, gpt35_text, gpt4_text in zip(
    df['title'],
    df['review'],
    df['gpt-3.5-turbo-reviews'],
    df['gpt-4-full-reviews'],
):
    per_title = reviews[title]
    per_title['human_reviews'].append(human_text)
    per_title['gpt-3.5-abstract-reviews'].append(gpt35_text)
    per_title['gpt-4-full-reviews'].append(gpt4_text)

In [9]:
titles = list(reviews.keys())
print("Number of titles:", len(titles))

Number of titles: 7


In [10]:
reviews[titles[0]]

{'human_reviews': ["This paper proposes a better pre-trained prior for a variety of downstream applications in point cloud analysis. The workflow of the pre-training mechanism is to first 1) generate occluded points that result from view occlusion and then 2) optimize the encoder to learn how to complete the occluded points from the partial point cloud. In downstream applications, the obtained encoder will be used as the initial weights in the network training. Empirical experiments have shown that such a pre-train mechanism can improve initialization over prior baselines and benefit a variety of tasks even with a large domain gap.\n\nPros:\n1. The experimental results have shown a steady improvement in performance by using the proposed pre-training approach in different encoder architectures and different downstream applications. That provides strong support for validating the effectiveness of the proposed approach.\n2. I also like the result that the initialization is only pre-traine

# LLM Evaluation

In [16]:
from openai import OpenAI

import os

# SECURITY: never store the API key in the notebook. A key that was committed or
# shared in an earlier revision must be treated as leaked and revoked.
# The key is read from the environment so it stays out of the file and its outputs.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this notebook."
    )

# Initialize the OpenAI client
client = OpenAI(api_key=api_key)

## Review Summary

In [30]:
# Prompt template used by summary_reviews. It is filled in with str.format, which
# supplies the two placeholders {Title} and {Review_Text}. The doubled braces
# ({{ and }}) in the example JSON are str.format escapes for literal { and }.
SUMMARY_PROMPT = """
Your goal is to identify the key concerns raised in the review, focusing only on potential
reasons for rejection.

Please provide your analysis in JSON format, including a concise summary, and the exact
wording from the review. 
    
Submission Title: {Title}

=====Review:
```
{Review_Text}
```
=====

Example JSON format:
{{
    "1": {{"summary": "<your concise summary>", "verbatim": "<concise, copy the exact
    wording in the review>"}},
    "2": ... 
}}

Analyze the review and provide the key concerns in the format specified above. Ignore minor
    issues like typos and clarifications. Output only json.
"""

In [64]:
from typing import List

def clean_json_output(output: str) -> str:
    """
    Strips a Markdown code fence (and its optional ``json`` language tag) from model output.

    Models often wrap JSON in a fenced block such as::

        ```json
        {...}
        ```

    This function removes surrounding whitespace and backticks, then drops the language
    tag only when it is the first line of the fenced block. Text inside the payload is
    never modified, so the result can be handed to ``json.loads`` unchanged.

    Args:
        output: The raw text returned by the model.

    Returns:
        The text with the fence, language tag and surrounding whitespace removed. Input
        that carries no fence is returned with only surrounding whitespace stripped.
    """
    # Whitespace must go first: a trailing newline after the closing fence would
    # otherwise shield the backticks from being stripped.
    cleaned_output = output.strip().strip('`').strip()

    # Only the opening fence line can hold the language tag; compare it
    # case-insensitively and leave every later line untouched.
    first_line, newline, remainder = cleaned_output.partition('\n')
    if newline and first_line.strip().lower() == 'json':
        cleaned_output = remainder.strip()

    return cleaned_output


def summary_reviews(reviews: List[str], title: str) -> str:
    """
    Ask the chat model for a JSON summary of the key concerns in a set of reviews.

    All reviews are concatenated into one text block, each wrapped in
    ``<|startofreview|>`` / ``<|endofreview|>`` markers, substituted into
    ``SUMMARY_PROMPT`` together with the title, and sent as a single system message.

    Args:
        reviews: The review texts to summarize.
        title: The submission title inserted into the prompt.

    Returns:
        The model's reply after it has been passed through ``clean_json_output``.
    """
    review_open = "\n<|startofreview|>\n"
    review_close = "\n<|endofreview|>\n"

    # Every review gets an opening marker; joining on the closing marker and
    # appending one more closes the last review as well.
    wrapped_reviews = [review_open + text for text in reviews]
    review_block = review_close.join(wrapped_reviews) + review_close

    chat_messages = [
        {
            "role": "system",
            "content": SUMMARY_PROMPT.format(Title=title, Review_Text=review_block),
        }
    ]
    completion = client.chat.completions.create(
        model="gpt-4-0125-preview",
        messages=chat_messages,
    )

    return clean_json_output(completion.choices[0].message.content)

In [65]:
title = titles[0]
human_reviews = reviews[title]['human_reviews']
gpt_35_reviews = reviews[title]['gpt-3.5-abstract-reviews']
gpt_4_reviews = reviews[title]['gpt-4-full-reviews']

In [66]:
human_reviews_summary = summary_reviews(human_reviews, title)
gpt_35_reviews_summary = summary_reviews(gpt_35_reviews, title)
gpt_4_reviews_summary = summary_reviews(gpt_4_reviews, title)

In [76]:
print(gpt_4_reviews)

['{\n  "Significance and novelty": {\n    "Introduction of Occlusion Completion (OcCo) for point cloud pre-training": "The paper introduces a novel pre-training method named Occlusion Completion (OcCo), which enhances point cloud models by learning to reconstruct occluded parts of point clouds. This approach is inspired by mask-based pre-training in natural language processing and adapts it innovatively for 3D point cloud data. The novelty lies in using occlusion as a mechanism for pre-training, aiming to improve semantic understandings and generalization on downstream tasks.",\n    "Leveraging occluded point clouds for learning structural information": "The method uniquely leverages the natural occurrence of occlusions in point clouds obtained from different camera viewpoints to generate occluded point clouds. This approach forces the model to learn structural and contextual information about the objects, which is a novel strategy for pre-training point cloud models."\n  },\n  "Potent

In [71]:
print("Human Reviews Summary: \n" + human_reviews_summary)

Human Reviews Summary: 
{
    "1": {
        "summary": "Incremental improvements and lack of in-depth analysis on the effectiveness of the pre-training mechanism.",
        "verbatim": "The improvement, as shown in the statistics, is very incremental in most cases. [...] I would appreciate it if a more in-depth analysis of why such a pre-training mechanism could work is provided."
    },
    "2": {
        "summary": "Overselling the novelty, inadequate experimental details, and missing crucial information for understanding the results.",
        "verbatim": "I feel that the paper oversells the novelty of the OcCo task [...] misses relevant work [...] and misses crucial details necessary to understand the experimental results."
    },
    "3": {
        "summary": "Limited novelty and experimental reliability.",
        "verbatim": "The novelty of the paper seems rather weak. [...] The experimental results seem weak as well."
    },
    "4": {
        "summary": "Need for deeper analy

In [70]:
print("GPT-3.5 Reviews Summary: \n" + gpt_35_reviews_summary)

GPT-3.5 Reviews Summary: 
{
    "1": {
        "summary": "The paper lacks a detailed comparative analysis with existing methods and needs more extensive experimental results.",
        "verbatim": "The paper should provide a more detailed comparative analysis with existing methods in the field to further validate the effectiveness of the OcCo method. More extensive experimental results and evaluation on various datasets are needed to fully demonstrate the effectiveness of the proposed method."
    },
    "2": {
        "summary": "The paper may benefit from further experimental validation and lacks comparison with state-of-the-art methods.",
        "verbatim": "The paper may benefit from further experimental validation to ensure the effectiveness of the proposed method. Comparative analysis with existing state-of-the-art methods could strengthen the paper."
    }
}


In [74]:
print("GPT-4 Reviews Summary: \n" + gpt_4_reviews_summary)

GPT-4 Reviews Summary: 
{
    "1": {
        "summary": "The paper lacks a detailed theoretical analysis explaining why OcCo pre-training improves generalization and performance on downstream tasks.",
        "verbatim": "The paper primarily focuses on empirical results and may lack a detailed theoretical analysis explaining why OcCo pre-training leads to better generalization and performance on downstream tasks."
    },
    "2": {
        "summary": "The paper might not compare its approach against very recent or concurrent works in point cloud processing.",
        "verbatim": "Depending on the timing and scope of the review, the paper might not compare its approach against the very latest or concurrent works in point cloud processing, which could be seen as a limitation."
    },
    "3": {
        "summary": "Lacks a robust theoretical framework explaining why OcCo works effectively.",
        "verbatim": "The paper's strong empirical results might be seen as lacking a robust theore

## Compare Review

In [43]:
# Prompt template used by compare_reviews. It is filled in with str.format, which
# supplies the placeholders {Review_A} and {Review_B}. Doubled braces are str.format
# escapes for literal { and }.
# NOTE(review): the identifier is misspelled ("RPOMPT"); it is left as-is here because
# compare_reviews refers to it by this exact name.
REVIEW_COMPARISON_RPOMPT = """
Your task is to carefully analyze and accurately match the key concerns raised in two reviews, 
ensuring a strong correspondence between the matched points. Examine the verbatim closely.

=====Review A: 
{Review_A}

===== 

=====Review B: 
{Review_B}

===== 

Please follow the example JSON format below for matching points. For instance, if point from review A is nearly identical to point from review B, it should look like this:
{{ 
    "A3-B2": {{"rationale": "<explain why A3 and B2 are nearly identical>","similarity": "<5-10, only an integer>"}},
    ...
}}

**Note that you should only match points with a significant degree of similarity in their concerns. Refrain from matching points with only superficial similarities or weak connections.** For each matched pair, rate the similarity on a scale of 5-10:
- 5 Somewhat Related: Points address similar themes but from different angles.
- 6 Moderately Related: Points share a common theme but with different perspectives or suggestions.
- 7 Strongly Related: Points are largely aligned but differ in some details or nuances.
- 8 Very Strongly Related: Points offer similar suggestions or concerns, with slight differences.
- 9 Almost Identical: Points are nearly the same, with minor differences in wording or presentation.
- 10 Identical: Points are exactly the same in terms of concerns, suggestions, or praises.

If no match is found, output an empty JSON object. Provide your output as JSON only.
"""

In [82]:
def compare_reviews(human_reviews: str, gpt_reviews: str) -> str:
    """
    Ask the chat model to match the concerns of two review summaries.

    The first summary is inserted into ``REVIEW_COMPARISON_RPOMPT`` as "Review A" and the
    second as "Review B"; the filled prompt is sent as a single system message.

    Args:
        human_reviews: JSON summary text used as Review A.
        gpt_reviews: JSON summary text used as Review B.

    Returns:
        The model's reply after it has been passed through ``clean_json_output``.
    """
    chat_messages = [
        {
            "role": "system",
            "content": REVIEW_COMPARISON_RPOMPT.format(
                Review_A=human_reviews,
                Review_B=gpt_reviews,
            ),
        }
    ]

    completion = client.chat.completions.create(
        model="gpt-4-0125-preview",
        messages=chat_messages,
    )

    return clean_json_output(completion.choices[0].message.content)

In [83]:
human_vs_gpt_35 = compare_reviews(human_reviews_summary, gpt_35_reviews_summary)

In [84]:
print("Human vs. GPT-3.5 Reviews Comparison: \n" + human_vs_gpt_35)

Human vs. GPT-3.5 Reviews Comparison: 
{
    "A1-B1": {
        "rationale": "Both points express concerns about the lack of extensive experimental validation and comparative analysis with other methods to demonstrate the effectiveness of the OcCo method. While A1 emphasizes the need for a deeper analysis on the effectiveness of the pre-training mechanism, B1 specifically mentions the need for more detailed comparative analysis and extensive experimental results. The core concern in both cases is the necessity to better establish the effectiveness of the method through comparison and analysis.",
        "similarity": "7"
    },
    "A2-B2": {
        "rationale": "Both points critique the paper for not providing enough experimental validation and lack of comparison with state-of-the-art methods. A2 discusses overselling the novelty and missing details necessary for understanding the experimental results, while B2 emphasizes the need for further experimental validation and comparative a

In [85]:
human_vs_gpt_4 = compare_reviews(human_reviews_summary, gpt_4_reviews_summary)
print("Human vs. GPT-4 Reviews Comparison: \n" + human_vs_gpt_4)

Human vs. GPT-4 Reviews Comparison: 
{
    "A1-B1": {
        "rationale": "Both points criticize the lack of detailed theoretical analysis to explain why the OcCo pre-training mechanism improves generalization and performance. While A focuses on the incremental improvements and seeks more in-depth analysis, B emphasizes the absence of a detailed theoretical groundwork that explains the performance improvements.",
        "similarity": "7"
    },
    "A1-B3": {
        "rationale": "Both points share concerns about the lack of theoretical framework or detailed analysis explaining the effectiveness of the OcCo pre-training mechanism. The essence of both criticisms is the need for a deeper understanding or explanation beyond empirical results.",
        "similarity": "8"
    },
    "A1-B5": {
        "rationale": "Both points express a desire for a solid theoretical foundation to explain the effectiveness of the OcCo method. They both highlight a gap in understanding 'why' the method wor

In [96]:
def count_hits(reviews_comparison: str, threshold: int=7) -> int:
    """
    Counts the distinct Review-A points that have at least one high-similarity match.

    The comparison is a JSON object whose keys look like ``"A3-B2"`` and whose values
    carry a ``"similarity"`` score. A Review-A point is counted once, no matter how many
    Review-B points it matches, as soon as one of its matches scores at or above the
    threshold.

    The Review-A point is identified by the letter ``a`` plus *all* digits that follow
    it, so ``A1`` and ``A10`` are different points. Keys that start with ``a`` but have
    no digits fall back to their first two characters.

    Args:
        reviews_comparison: A string containing the JSON-formatted comparison data.
        threshold: An integer representing the minimum similarity score for a hit. Default is 7.

    Returns:
        The number of distinct Review-A points with a match scoring >= threshold.

    Raises:
        json.JSONDecodeError: If ``reviews_comparison`` is not valid JSON.
        KeyError: If an entry has no ``"similarity"`` field.
        ValueError: If a similarity value cannot be converted to an integer.
    """
    comparison = json.loads(reviews_comparison)
    matched_points = set()

    for key, value in comparison.items():
        # The score may arrive as "7" or 7; int() accepts both.
        similarity = int(value['similarity'])
        label = key.lower()
        if similarity < threshold or not label.startswith('a'):
            continue

        # Consume every digit after the leading 'a' so multi-digit indices stay distinct.
        end = 1
        while end < len(label) and label[end].isdigit():
            end += 1
        point = label[:end] if end > 1 else label[:2]

        matched_points.add(point)

    return len(matched_points)

In [97]:
gpt35_hit = count_hits(human_vs_gpt_35)
print("Number of hits between human and GPT-3.5 reviews:", gpt35_hit)

Number of hits between human and GPT-3.5 reviews: 3


In [98]:
gpt4_hit = count_hits(human_vs_gpt_4)
print("Number of hits between human and GPT-4 reviews:", gpt4_hit)

Number of hits between human and GPT-4 reviews: 2


In [102]:
from typing import Tuple

def calculate_hit_rates(title: str, human_reviews: List[str], gpt_reviews: List[str]) -> Tuple[int, float]:
    """
    Summarize both review sets, compare the summaries, and derive a hit count and rate.

    Steps: ``summary_reviews`` is called on the human reviews and then on the GPT
    reviews, the two summaries are matched by ``compare_reviews``, and ``count_hits``
    (with its default threshold) counts the hits.

    The rate divides the hit count by the number of *original* human reviews, not by the
    number of concerns in the human summary.
    NOTE(review): nothing here caps the hit count at the number of reviews, so the rate
    is not guaranteed to stay within [0, 1] — confirm this is intended.

    Args:
        title: The title of the paper being reviewed.
        human_reviews: Human-written review texts.
        gpt_reviews: GPT-generated review texts.

    Returns:
        A tuple ``(hit_count, hit_rate)``. The rate is 0 when ``human_reviews`` is empty.
    """
    human_summary = summary_reviews(human_reviews, title)
    gpt_summary = summary_reviews(gpt_reviews, title)

    hit_cnt = count_hits(compare_reviews(human_summary, gpt_summary))

    # Denominator is the raw review count, deliberately not the summary length.
    n_human_reviews = len(human_reviews)
    if n_human_reviews > 0:
        return hit_cnt, hit_cnt / n_human_reviews
    return hit_cnt, 0

# Evaluate

In [101]:
titles 

['Pre-Training by Completing Point Clouds',
 'GamePad: A Learning Environment for Theorem Proving',
 'Generalisation and the Geometry of Class Separability',
 'PERIL: Probabilistic Embeddings for hybrid Meta-Reinforcement and Imitation Learning',
 'C-Learning: Learning to Achieve Goals via Recursive Classification',
 'Unsupervised Learning of Node Embeddings by Detecting Communities']

In [103]:
# Score every paper: compare its human reviews against the GPT-3.5 reviews and the
# GPT-4 reviews, keep one record per paper, and print the numbers as they arrive.
# Each calculate_hit_rates call issues model requests, so this cell is slow to re-run.
eval_result = []

for title in titles:
    per_title = reviews[title]
    human_reviews = per_title['human_reviews']
    gpt_35_reviews = per_title['gpt-3.5-abstract-reviews']
    gpt_4_reviews = per_title['gpt-4-full-reviews']

    hit_cnt_35, hit_rate_35 = calculate_hit_rates(title, human_reviews, gpt_35_reviews)
    hit_cnt_4, hit_rate_4 = calculate_hit_rates(title, human_reviews, gpt_4_reviews)

    # Key order here becomes the column order of the results DataFrame built later.
    record = dict(
        title=title,
        hit_cnt_35=hit_cnt_35,
        hit_rate_35=hit_rate_35,
        hit_cnt_4=hit_cnt_4,
        hit_rate_4=hit_rate_4,
    )
    eval_result.append(record)

    print(
        f"Title: {title}",
        f"GPT-3.5 Hit Count: {hit_cnt_35}, Hit Rate: {hit_rate_35}",
        f"GPT-4 Hit Count: {hit_cnt_4}, Hit Rate: {hit_rate_4}",
        "=====================================",
        sep="\n",
    )

Title: Pre-Training by Completing Point Clouds
GPT-3.5 Hit Count: 2, Hit Rate: 0.5
GPT-4 Hit Count: 1, Hit Rate: 0.25
Title: GamePad: A Learning Environment for Theorem Proving
GPT-3.5 Hit Count: 2, Hit Rate: 0.6666666666666666
GPT-4 Hit Count: 1, Hit Rate: 0.3333333333333333
Title: Generalisation and the Geometry of Class Separability
GPT-3.5 Hit Count: 1, Hit Rate: 0.5
GPT-4 Hit Count: 1, Hit Rate: 0.5
GPT-3.5 Hit Count: 0, Hit Rate: 0.0
GPT-4 Hit Count: 0, Hit Rate: 0.0
Title: PERIL: Probabilistic Embeddings for hybrid Meta-Reinforcement and Imitation Learning
GPT-3.5 Hit Count: 4, Hit Rate: 1.0
GPT-4 Hit Count: 2, Hit Rate: 0.5
Title: C-Learning: Learning to Achieve Goals via Recursive Classification
GPT-3.5 Hit Count: 0, Hit Rate: 0.0
GPT-4 Hit Count: 2, Hit Rate: 0.4
Title: Unsupervised Learning of Node Embeddings by Detecting Communities
GPT-3.5 Hit Count: 1, Hit Rate: 0.3333333333333333
GPT-4 Hit Count: 1, Hit Rate: 0.3333333333333333


In [110]:
eval_result_df = pd.DataFrame(eval_result)
eval_result_df

Unnamed: 0,title,hit_cnt_35,hit_rate_35,hit_cnt_4,hit_rate_4
0,Pre-Training by Completing Point Clouds,2,0.5,1,0.25
1,GamePad: A Learning Environment for Theorem Pr...,2,0.666667,1,0.333333
2,Generalisation and the Geometry of Class Separ...,1,0.5,1,0.5
3,"Using Anomaly Feature Vectors for Detecting, C...",0,0.0,0,0.0
4,PERIL: Probabilistic Embeddings for hybrid Met...,4,1.0,2,0.5
5,C-Learning: Learning to Achieve Goals via Recu...,0,0.0,2,0.4
6,Unsupervised Learning of Node Embeddings by De...,1,0.333333,1,0.333333


In [111]:
# Add the number of human reviews per paper and give the columns display names.
# Rows of eval_result_df follow the order of `titles` (the records were appended while
# iterating over it), so a positional list of counts lines up with the rows.
# len() always returns an int, so no null handling is needed for the counts.
human_review_counts = [len(reviews[title]['human_reviews']) for title in titles]

# Rename by explicit mapping rather than by position, so a change in the record key
# order cannot silently attach the wrong label to a column. Re-running the cell is
# safe: rename ignores names that are already replaced.
# NOTE(review): hit_cnt_35 / hit_rate_35 come from the GPT-3.5 reviews but are labelled
# "GPT-4 Abstract" below. The labels are kept because a later cell reads these exact
# column names — confirm which model the label should name.
eval_result_df = (
    eval_result_df
    .assign(**{'Number of Human Reviews': human_review_counts})
    .rename(columns={
        'title': 'Paper Title',
        'hit_cnt_35': 'GPT-4 Abstract Hit Count',
        'hit_rate_35': 'GPT-4 Abstract Hit Rate',
        'hit_cnt_4': 'GPT-4 Full Content Hit Count',
        'hit_rate_4': 'GPT-4 Full Content Hit Rate',
    })
)
eval_result_df

Unnamed: 0,Paper Title,GPT-4 Abstract Hit Count,GPT-4 Abstract Hit Rate,GPT-4 Full Content Hit Count,GPT-4 Full Content Hit Rate,Number of Human Reviews
0,Pre-Training by Completing Point Clouds,2,0.5,1,0.25,4
1,GamePad: A Learning Environment for Theorem Pr...,2,0.666667,1,0.333333,3
2,Generalisation and the Geometry of Class Separ...,1,0.5,1,0.5,2
3,"Using Anomaly Feature Vectors for Detecting, C...",0,0.0,0,0.0,1
4,PERIL: Probabilistic Embeddings for hybrid Met...,4,1.0,2,0.5,4
5,C-Learning: Learning to Achieve Goals via Recu...,0,0.0,2,0.4,5
6,Unsupervised Learning of Node Embeddings by De...,1,0.333333,1,0.333333,3


In [112]:
print('Average GPT-4 Abstract Hit Rate:', eval_result_df['GPT-4 Abstract Hit Rate'].mean())
print('Average GPT-4 Full Content Hit Rate:', eval_result_df['GPT-4 Full Content Hit Rate'].mean())

Average GPT-4 Abstract Hit Rate: 0.42857142857142855
Average GPT-4 Full Content Hit Rate: 0.330952380952381
