# Initialize

In [None]:
# Install the third-party packages imported later in this notebook
# (`openai`, `sqlitedict`); `-q` keeps pip's output short.
!pip install openai -q
!pip install sqlitedict -q

# Colab-only: mount Google Drive at /content/drive so the working
# directories configured in the next cell are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# @title  {"form-width":"20%"}
task = "iclr23" # @param ["iclr23", "peer_grading"]

import os

# Root folder on the mounted Drive; every other location is derived from it.
working_dir = "/content/drive/MyDrive/llmserver/"
# Per-task folder (datasets, prompts and results of the selected task).
task_dir = f"/content/drive/MyDrive/llmserver/task_{task}/"

# Caches are shared by all tasks, so they live directly under the root.
llmgen_cache_dir = f"{working_dir}cache_llmgen/"
logprobs_cache_dir = f"{working_dir}cache_logprobs/"
finetune_cache_dir = f"{working_dir}cache_finetune/"

# Task-specific inputs and outputs.
dataset_dir = f"{task_dir}dataset/"
result_dir = f"{task_dir}result/"

current_directory = os.getcwd()
print(f"current_directory: {current_directory}")
print(f"task_directory: {task_dir}")

# Utils

In [None]:
import os
import sys
import openai
import csv
import numpy as np
import json
import hashlib
import requests
import argparse
import io
import pickle
import unicodedata
import threading
import datetime

# Credentials are read from the environment when available so real keys never
# have to be pasted into (and saved with) the notebook. The literal
# placeholders are kept as a fallback, which preserves the previous behaviour
# when no environment variable is set.
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"

openrouter_client = openai.OpenAI(
    base_url=OPENROUTER_BASE_URL,
    api_key=os.environ.get("OPENROUTER_API_KEY", "API_KEY"),
)
# Default chat-completions endpoint (selected with use_additional_key == -1).
openrouter_chat = openrouter_client.chat.completions


# Extra keys, looked up as OPENROUTER_API_KEY0 .. OPENROUTER_API_KEY7.
# Callers in this notebook index the derived list with `mod16 % 8`, so it
# must keep at least eight entries.
additional_key = [
    os.environ.get(f"OPENROUTER_API_KEY{i}", f"API_KEY{i}") for i in range(8)
]
# One chat-completions endpoint per extra key; built from the key list itself
# so both lists always have the same length.
additional_openrouter_chat = [
    openai.OpenAI(base_url=OPENROUTER_BASE_URL, api_key=key).chat.completions
    for key in additional_key
]

cache_lock = threading.Lock()  # lock to keep the cache operations safe

def generate_cache_key(params):
    '''
    # build a deterministic cache key for a set of request parameters
    ## the key is the SHA-256 hex digest of the JSON dump with sorted keys,
    ## so two dicts with the same content give the same key regardless of
    ## insertion order
    params: dict, request parameters (must be JSON-serializable)
    return: str, hexadecimal digest
    '''
    canonical_bytes = json.dumps(params, sort_keys=True).encode('utf-8')
    hasher = hashlib.sha256()
    hasher.update(canonical_bytes)
    return hasher.hexdigest()

def call_api(compl, params, cache_db, use_cache=True):
    '''
    # run one completion request, going through the response cache
    ## cache reads and writes are guarded by cache_lock; the request itself
    ## runs outside the lock, so identical concurrent requests may both be sent
    compl: object exposing create(**params), e.g. a chat-completions endpoint
    params: dict, the request parameters (also used to derive the cache key)
    cache_db: dict-like cache or None to disable caching entirely
    use_cache: bool, False skips the lookup but still stores the fresh response
    return: the cached or freshly created response
    '''

    key = generate_cache_key(params)
    has_cache = cache_db is not None

    # Lookup phase.
    with cache_lock:
        hit = has_cache and use_cache and (key in cache_db)
        if hit:
            return cache_db[key]

    # Miss (or lookup disabled): perform the real request without holding the lock.
    response = compl.create(**params)

    # Store phase: always overwrite, so a forced refresh replaces a stale entry.
    with cache_lock:
        if has_cache:
            cache_db[key] = response

    return response


def simple_call_api(system_text, user_text, model, cache_db, max_tokens=4000, temperature=0, use_additional_key=-1):
    '''
    # simplified api call: optional system message + one user message
    ## cache access is serialized inside call_api
    system_text: str, system prompt; "" sends the user message only
    user_text: str, user message
    model: str, model identifier forwarded in the request
    cache_db: dict-like response cache or None
    max_tokens / temperature: forwarded unchanged in the request
    use_additional_key: int, -1 selects the default endpoint, otherwise the
        index into additional_openrouter_chat
    return: str, content of the first choice
    raise: Exception("Error while calling api") if the response still has no
        choices after the (optional) retry
    '''

    compl = openrouter_chat if use_additional_key == -1 else additional_openrouter_chat[use_additional_key]

    messages = [{
        "role": "user",
        "content": user_text,
    }]
    if system_text != "":
        messages.insert(0, {
            "role": "system",
            "content": system_text,
        })

    params = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
        "frequency_penalty": 0,
        "presence_penalty": 0
    }

    response = call_api(compl, params, cache_db)
    if response.choices is None:  # an error occurs (with high probability, the error is caused by censorship)
        # Read the error code defensively: a missing/odd `error` payload must
        # not crash the error path itself before the retry gets a chance.
        error = getattr(response, "error", None)
        error_code = error.get("code") if isinstance(error, dict) else None
        # %s instead of %d so a non-integer (or missing) code is still printable.
        print("Error occurs (%s)" % (error_code,))
        print("Cache key:", generate_cache_key(params))
        if error_code != 403:  # 403 is treated as censorship and is not retried
            print("Try again!")
            # Bypass the cache lookup; the fresh response overwrites the cached error.
            response = call_api(compl, params, cache_db, use_cache = False)
        if response.choices is None:
            print(response)
            raise Exception("Error while calling api")
        else:
            print("Success")
    return response.choices[0].message.content

# Generate LLM Review

In [None]:
# @title  {"form-width":"20%"}
import pandas as pd
import json
import re
import math
import argparse
from scipy import stats
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm
import time
import threading
import os
import sqlitedict

# Aliases of the review sources to process, in run order. Each alias is
# translated through `model_names` at the top of the loop below.
# Parenthesized so no line depends on a trailing backslash (the previous form
# ended with a dangling `\` that only parsed because a blank line followed).
models = (
    ["review1", "review3"]
    + ["gpt4omini", "gpt4o", "gpt4turbo", "gpt35turbo"]
    + ["claude2", "claude3haiku", "claude3sonnet", "claude3opus"]
    + ["geminipro1", "geminipro15"]
    + ["llama3-8b", "llama3-70b"]
    + ["llama31-8b", "llama31-70b", "llama31-405b"]
    + ["mixtral-7b", "mixtral-8x7b", "mixtral-8x22b"]
)

# Upper bound on the number of papers processed (capped by the dataset size).
prefix = 300 # @param integer

# When True, the extra review variants (elongation, restatement, ...) are generated too.
extend = False # @param {type:"boolean"}

for model in models:

    # Alias -> identifier table. An alias that is not listed is used as-is.
    model_names = {
        # fixed-text baselines (handled without any model call in gen_dataset)
        "nothing": "nothing",
        "appreciate": "appreciate",
        "criticize": "criticize",
        "both": "both",
        # human reviews: the value is the dataset column holding the review
        "review1": "human_review1",
        "review2": "human_review2",
        "review3": "human_review3",
        # OpenAI
        "gpt4omini": "openai/gpt-4o-mini",
        "gpt4o": "openai/gpt-4o",
        "gpt4turbo": "openai/gpt-4-turbo",
        "gpt35turbo": "openai/gpt-3.5-turbo-0125",
        # Anthropic
        "claude2": "anthropic/claude-2",
        "claude3haiku": "anthropic/claude-3-haiku",
        "claude3sonnet": "anthropic/claude-3-sonnet",
        "claude3opus": "anthropic/claude-3-opus",
        # Google
        "geminipro1": "google/gemini-pro",
        "geminipro15": "google/gemini-pro-1.5",
        # Meta Llama 2 / 3
        "llama2-70b": "meta-llama/llama-2-70b-chat",
        "llama3-8b": "meta-llama/llama-3-8b-instruct",
        "llama3-70b": "meta-llama/llama-3-70b-instruct",
        # Mistral
        "mixtral-7b": "mistralai/mistral-7b-instruct:nitro",
        "mixtral-8x7b": "mistralai/mixtral-8x7b-instruct",
        "mixtral-8x22b": "mistralai/mixtral-8x22b-instruct",
        # Microsoft WizardLM
        # NOTE(review): alias "wizardlm2-7b" points at the 8x7b id - confirm this is intended.
        "wizardlm2-7b": "microsoft/wizardlm-2-8x7b",
        "wizardlm2-8x22b": "microsoft/wizardlm-2-8x22b",
        # OpenAI o1
        "o1": "openai/o1-preview",
        "o1mini": "openai/o1-mini",
        # Meta Llama 3.1
        "llama31-8b": "meta-llama/llama-3.1-8b-instruct",
        "llama31-70b": "meta-llama/llama-3.1-70b-instruct",
        "llama31-405b": "meta-llama/llama-3.1-405b-instruct",
    }
    # Rebind the loop variable to the resolved identifier (unchanged if unknown).
    model = model_names.get(model, model)

    # One SQLite-backed response cache per model; "/" in the identifier is
    # replaced by "--" for the file name.
    # NOTE(review): the handle is not closed anywhere in the visible code.
    cache_path = llmgen_cache_dir + "llmgen_" + model.replace('/', '--') + ".sqlite"
    llmgen_db = sqlitedict.SqliteDict(cache_path, autocommit=True)
    dataset_paper = pd.read_parquet(f"{dataset_dir}dataset_paper.parquet")

    print(f"model: {model}")
    print(f"prefix: {min(prefix,len(dataset_paper))}/{len(dataset_paper)}")

    # Load every "<name>.txt" in the task's prompt folder into prompts["<name>"].
    prompts = {}
    prompt_dir = task_dir + "prompt/"
    for prompt_file in os.listdir(prompt_dir):
        if not prompt_file.endswith('.txt'):
            continue
        with open(prompt_dir + prompt_file, 'r', encoding='utf-8') as handle:
            prompts[prompt_file[:-4]] = handle.read()

    def get_llm_review(paper_text, model, use_key = -1):
        """Ask `model` to review `paper_text` using the "get_llm_review" prompt.

        `use_key` selects the API endpoint (-1 = default); responses go
        through the per-model cache `llmgen_db`.
        """
        system_prompt = prompts["get_llm_review"]
        return simple_call_api(
            system_prompt,
            paper_text,
            model,
            cache_db=llmgen_db,
            use_additional_key=use_key,
        )

    def get_review_summary(review_text, use_key = -1):
        """Summarize `review_text` with "openai/gpt-4o" and the "get_review_summary" prompt.

        `use_key` selects the API endpoint (-1 = default).
        """
        system_prompt = prompts["get_review_summary"]
        return simple_call_api(
            system_prompt,
            review_text,
            "openai/gpt-4o",
            cache_db=llmgen_db,
            use_additional_key=use_key,
        )

    def get_review_elongation(review_text, use_key = -1):
        """Rewrite `review_text` with "anthropic/claude-3-sonnet" and the "elongate_review" prompt.

        `use_key` selects the API endpoint (-1 = default).
        """
        system_prompt = prompts["elongate_review"]
        return simple_call_api(
            system_prompt,
            review_text,
            "anthropic/claude-3-sonnet",
            cache_db=llmgen_db,
            use_additional_key=use_key,
        )

    def get_review_restatement(review_text, use_key = -1):
        """Rewrite `review_text` with "openai/gpt-4o" and the "restate_review" prompt.

        `use_key` selects the API endpoint (-1 = default).
        """
        system_prompt = prompts["restate_review"]
        return simple_call_api(
            system_prompt,
            review_text,
            "openai/gpt-4o",
            cache_db=llmgen_db,
            use_additional_key=use_key,
        )

    def get_llama_restatement(review_text, use_key = -1):
        """Rewrite `review_text` with "meta-llama/llama-3-70b-instruct" and the "restate_review" prompt.

        `use_key` selects the API endpoint (-1 = default).
        """
        system_prompt = prompts["restate_review"]
        return simple_call_api(
            system_prompt,
            review_text,
            "meta-llama/llama-3-70b-instruct",
            cache_db=llmgen_db,
            use_additional_key=use_key,
        )

    def get_abstract_review(paper_text, use_key = -1):
        """Send `paper_text` to "openai/gpt-4o" with the "get_abstract_review" prompt.

        `use_key` selects the API endpoint (-1 = default).
        """
        system_prompt = prompts["get_abstract_review"]
        return simple_call_api(
            system_prompt,
            paper_text,
            "openai/gpt-4o",
            cache_db=llmgen_db,
            use_additional_key=use_key,
        )

    def get_stupid_review_elongation(review_text):
        """Lengthen a review by inserting fixed filler text after each known section header.

        For every header literal found in `review_text` (first occurrence
        only), the matching filler paragraph is inserted immediately after
        the header. Headers that are absent are skipped, so text without any
        of them is returned unchanged. No model call is involved.
        """
        extra_comments = {
            "Summary Of The Paper:\n\n": "The authors present a study that aims to address an important issue in the field. Their approach involves a combination of methods to investigate the research question. The paper covers several aspects of the topic and provides insights into the subject matter.\n\n",
            "Strength And Weaknesses:\n\n": "The study demonstrates both positive aspects and areas for potential improvement. The strengths of the paper contribute to its overall value, while addressing the weaknesses could enhance the quality of the research. It's important to consider both elements when evaluating the work.\n\n",
            "Clarity, Quality, Novelty And Reproducibility:\n\n": "The presentation of the research is a crucial aspect of any scientific paper. The clarity of exposition, overall quality of the work, novelty of the approach, and potential for reproducibility are all important factors to consider when assessing the contribution of the study to the field.\n\n",
            "Summary Of The Review:\n\n": "In conclusion, this review aims to provide a balanced assessment of the paper, taking into account its various aspects. The feedback provided is intended to be constructive and to assist in improving the overall quality of the research. It is hoped that this review will be helpful to both the authors and the editorial team in making decisions about the manuscript.\n\n"
        }

        for section, extra_comment in extra_comments.items():
            section_start = review_text.find(section)
            if section_start == -1:
                continue
            # Each header literal ends with a blank line, so the insertion
            # point is simply the end of the matched header. The previous
            # search for the next blank line always landed on the header's
            # own one, which made its "not found" fallback unreachable.
            insert_at = section_start + len(section)
            review_text = review_text[:insert_at] + extra_comment + review_text[insert_at:]

        return review_text

    def get_stupid_review_elongation_v2(review_text):
        """Lengthen a review with generic lead-in text after each known section header.

        Two "openai/gpt-4o" calls (prompts "get_review_strength" and
        "get_review_weakness", default API endpoint) produce short summaries
        that are embedded in the lead-in for the strengths/weaknesses section.
        For every header literal found in `review_text` (first occurrence
        only) the lead-in is inserted right after the header, in front of the
        section's original content.
        """
        summary_model = "openai/gpt-4o"
        strength_summary = simple_call_api(prompts["get_review_strength"], review_text, summary_model, cache_db = llmgen_db, use_additional_key = -1)
        weakness_summary = simple_call_api(prompts["get_review_weakness"], review_text, summary_model, cache_db = llmgen_db, use_additional_key = -1)

        extra_comments = {
            "Summary Of The Paper:\n\n": "This section provides an overview of the key contributions, methodologies, and findings presented in the paper. It summarizes the main arguments and highlights the scope and significance of the research conducted. Specifically, ",
            "Strength And Weaknesses:\n\n": f"In this section, I will discuss the strengths and weaknesses of the paper, focusing on its contributions to the field, methodological rigor, and areas where improvements could be made. The aim is to provide constructive feedback that can help enhance the quality of the work. This paper has several notable strengths, including {strength_summary}. However, there are some areas that could be improved, such as {weakness_summary}. Specifically, I discuss the strengths and weaknesses of the paper as following.\n",
            "Clarity, Quality, Novelty And Reproducibility:\n\n": "I will evaluate the clarity of the paper’s presentation, the quality of the research, the novelty of the findings, and the reproducibility of the experiments. This assessment will address whether the research is presented in a clear and coherent manner, offers new insights, and can be replicated based on the provided information. ",
            "Summary Of The Review:\n\n": "This summary encapsulates the main points of the review, reflecting on the paper’s contributions, strengths, and areas for improvement. It provides a balanced overview of the paper’s impact and offers final recommendations for the authors. "
        }

        for section, extra_comment in extra_comments.items():
            section_start = review_text.find(section)
            if section_start == -1:
                continue
            # Each header literal ends with a blank line, so the insertion
            # point is the end of the matched header. The previous search for
            # the next blank line always hit the header's own one, leaving
            # its "not found" fallback unreachable.
            insert_at = section_start + len(section)
            review_text = review_text[:insert_at] + extra_comment + review_text[insert_at:]

        return review_text

    def get_review_degradation(review_text):
        """Degrade a review by dropping every second sentence of each section.

        The text is cut after each '.', '?', '!' or newline. A piece that
        contains one of the known section titles (case-insensitive) is kept,
        wrapped in newlines, and restarts the counter. Of the remaining
        pieces, the 1st, 3rd, 5th, ... after a title are kept and the others
        are dropped. Kept pieces are joined with single spaces.

        NOTE(review): text after the last terminator is never emitted;
        behaviour is kept as-is.
        """
        section_titles = (
            "summary of the paper",
            "summary of the review",
            "strength and weaknesses",
            "clarity, quality, novelty and reproducibility",
        )
        kept_pieces = []
        sentence = ""
        body_count = 0  # non-title sentences seen since the last title
        for char in review_text:
            sentence += char
            if char not in ".?!\n":
                continue
            if sentence.strip():  # ignore pieces that are only whitespace
                sentence = sentence.strip(" ")  # spaces only: newlines are preserved
                lowered = sentence.lower()
                if any(title in lowered for title in section_titles):
                    kept_pieces.append(f"\n{sentence}\n")
                    body_count = 0
                else:
                    body_count += 1
                    if body_count % 2 == 1:
                        kept_pieces.append(sentence)
            sentence = ""
        return " ".join(kept_pieces)

    def get_review_degradation_v2(review_text, use_key = -1):
        """Drop every second sentence of a review, then ask a model to complete it.

        The text is cut after each '.', '?', '!' or newline. A piece that
        contains a known section title (case-insensitive) or "**" is kept,
        wrapped in newlines, and restarts the counter. Of the remaining
        pieces, odd-numbered ones are kept and even-numbered ones are
        replaced by the marker "[There is one missing sentence]" (with a
        trailing newline if the dropped piece ended with one). The result is
        sent to "openai/gpt-4o" with the "complete_review" prompt and the
        model's answer is returned. `use_key` selects the API endpoint
        (-1 = default).

        NOTE(review): text after the last terminator is never emitted;
        behaviour is kept as-is.
        """
        section_titles = (
            "summary of the paper",
            "summary of the review",
            "strength and weaknesses",
            "clarity, quality, novelty and reproducibility",
        )
        missing_marker = "[There is one missing sentence]"

        kept_pieces = []
        sentence = ""
        body_count = 0  # non-title sentences seen since the last title
        for char in review_text:
            sentence += char
            if char not in ".?!\n":
                continue
            if sentence.strip():  # ignore pieces that are only whitespace
                sentence = sentence.strip(" ")  # spaces only: newlines are preserved
                lowered = sentence.lower()
                is_title = any(title in lowered for title in section_titles) or "**" in sentence
                if is_title:
                    kept_pieces.append(f"\n{sentence}\n")
                    body_count = 0
                else:
                    body_count += 1
                    if body_count % 2 == 1:
                        kept_pieces.append(sentence)
                    elif sentence.endswith("\n"):
                        kept_pieces.append(missing_marker + "\n")
                    else:
                        kept_pieces.append(missing_marker)
            sentence = ""
        degraded_text = " ".join(kept_pieces)

        model="openai/gpt-4o"
        return simple_call_api(prompts["complete_review"], degraded_text, model, cache_db = llmgen_db, use_additional_key = use_key)

    def gen_dataset(mod16, progress_bar=None):
        """Generate (and optionally collect) reviews and summaries for the current `model`.

        mod16: int. A value in 0..15 processes only the papers whose index i
            satisfies i % 16 == mod16 and discards the results; it is only
            useful for filling the response cache from a worker thread.
            -1 processes every paper, collects the rows and writes
            "dataset_<model>.parquet" into `dataset_dir`.
        progress_bar: optional object with update(n) / close(); advanced once
            per processed paper and closed at the end.

        Only the first min(prefix, len(dataset_paper)) papers are considered.
        Returns None.

        NOTE(review): the API key index is `mod16 % 8`, which is 7 for the
        collecting pass (mod16 == -1) because Python's % is non-negative.
        NOTE(review): rows are fetched with `.loc[i]`, i.e. by index label;
        this assumes `dataset_paper` has a default 0..n-1 index - confirm.
        """
        columns = ["paper_id", "model", "review", "review_summary"]
        # Extra variants produced when `extend` is on; each contributes the
        # columns "<name>" and "<name>_summary", in this order.
        variant_columns = [
            "review_elongation",
            "review_restatement",
            "review_degradation",
            "test",
            "meaningless_elongation",
            "llama3_restatement",
            "degradation_v2",
            "abstract_review",
        ]
        if extend:
            for variant in variant_columns:
                columns += [variant, f"{variant}_summary"]
        dataset_llm_review = pd.DataFrame(columns = columns)

        collect = (mod16 == -1)
        use_key = mod16 % 8

        # Fixed-text baselines that need no model call (hoisted out of the loop).
        non_llm = {
            "nothing": "Not available\n",
            "appreciate": "The reviewer appreciates the overall quality.\n",
            "criticize": "The reviewer criticize the overall quality.\n",
            "both": "The reviewer appreciates the overall quality.\nThe reviewer criticize the overall quality.\n",
        }

        def mean_length(series):
            # Missing reviews are stored as None; skip them instead of crashing in len().
            lengths = series.dropna().map(len)
            return lengths.mean() if len(lengths) else float("nan")

        for i in range(min(prefix,len(dataset_paper))):
            if not collect and i % 16 != mod16: continue

            row = dataset_paper.loc[i]
            paper_id = row["paper_id"]
            paper_abstract = row["abstract"]
            paper_text = row["parsed_text"]

            # Human review missing for this paper: emit an all-empty row.
            if model.startswith("human_review") and pd.isnull(row[model]):
                if collect:
                    # Pad to the real column count. The previous fixed padding
                    # of 6 did not match the 16 extra columns and made the
                    # row assignment fail when `extend` was on.
                    dataset_llm_review.loc[len(dataset_llm_review)] = [paper_id, model] + [None] * (len(columns) - 2)
                if progress_bar is not None: progress_bar.update(1)
                continue

            if model.startswith("human_review"):
                llm_review = row[model]
                review_summary = get_review_summary(llm_review, use_key)
            elif model in non_llm:
                llm_review = non_llm[model]
                review_summary = llm_review
            else:
                llm_review = get_llm_review(paper_text, model, use_key)
                review_summary = get_review_summary(llm_review, use_key)

            cells = [paper_id, model, llm_review, review_summary]
            if extend:
                # Same order as `variant_columns`; each variant is summarized
                # right after it is produced, as before.
                variant_builders = (
                    lambda: get_review_elongation(llm_review),
                    lambda: get_review_restatement(llm_review),
                    lambda: get_review_degradation(llm_review),
                    lambda: get_stupid_review_elongation(llm_review),
                    lambda: get_stupid_review_elongation_v2(llm_review),
                    lambda: get_llama_restatement(llm_review, use_key),
                    lambda: get_review_degradation_v2(llm_review, use_key),
                    lambda: get_abstract_review(paper_abstract, use_key),
                )
                for build_variant in variant_builders:
                    variant_text = build_variant()
                    cells += [variant_text, get_review_summary(variant_text, use_key)]

            if collect:
                dataset_llm_review.loc[len(dataset_llm_review)] = cells
            if progress_bar is not None: progress_bar.update(1)

        if collect:
            print("Average length (raw)= %.1f"%(mean_length(dataset_llm_review["review"])))
            print("Average length (summary)= %.1f"%(mean_length(dataset_llm_review["review_summary"])))
            dataset_llm_review.to_parquet(f"{dataset_dir}dataset_{model.replace('/', '--')}.parquet", index=False)
        if progress_bar is not None:
            progress_bar.close()
        return

    # Serializes progress-bar construction across the worker threads.
    tmp_lock = threading.Lock()

    def run_worker_with_progress(mod16):
        """Run gen_dataset for one residue class with its own progress bar.

        The bar's total is the number of indices i in
        [0, min(prefix, len(dataset_paper))) with i % 16 == mod16; the bar is
        labelled "Thread <mod16>" and placed at row `mod16`.
        Returns whatever gen_dataset returns.
        """
        paper_count = min(prefix, len(dataset_paper))
        share = (paper_count + 15 - mod16) // 16
        with tmp_lock:
            progress_bar = tqdm(total=share, desc=f"Thread {mod16}", position=mod16)
        return gen_dataset(mod16, progress_bar)

    # Phase 1 (optional): 16 workers, one per residue class of the paper
    # index, issue the model calls in parallel; their results are discarded
    # and only serve to fill the response cache.
    use_thread = True
    if use_thread:
        with ThreadPoolExecutor(max_workers=16) as executor:
            pending = [
                executor.submit(run_worker_with_progress, worker_id)
                for worker_id in range(16)
            ]
            # result() re-raises any exception that occurred inside a worker.
            for task in pending:
                task.result()

    # Phase 2: single collecting pass over all papers; writes the parquet file.
    gen_dataset(-1)

    print(model, "done!")
