# Initialize

In [None]:
# Install the third-party packages (IPython/Colab shell escapes, not plain Python).
!pip install openai
!pip install PyPDF2

# Mount Google Drive so that the /content/drive paths below resolve.
from google.colab import drive
drive.mount('/content/drive')

# Both directories end with "/" because the later cells build file paths by
# plain string concatenation (e.g. working_dir + "ICLR2023paper_raw.json").
working_dir = "/content/drive/MyDrive/llmserver/task_iclr23/raw/"
dataset_dir = "/content/drive/MyDrive/llmserver/task_iclr23/dataset/"

# Crawl Raw ICLR Data

In [None]:
import requests
import json
import csv

BASE_URL = "https://api.openreview.net/notes?invitation=ICLR.cc/2023/Conference/-/Blind_Submission&details=directReplies"


def _extract_paper(note):
    """Pick the paper-level fields kept in the raw dump out of one API note."""
    content = note['content']
    return {
        'uid': note['id'],
        'number': note['number'],
        'title': content['title'],
        'authors': content['authors'],
        'abstract': content['abstract'],
        'pdf': content['pdf'],
        'keywords': content['keywords'],
    }


def _extract_reply(reply, paper):
    """Flatten one direct reply into a record that links back to its paper."""
    record = {
        'uid': reply['id'],
        'paper_uid': paper['uid'],
        'paper_title': paper['title'],
    }
    # The reply's own content fields are merged in last, as top-level keys.
    record.update(reply['content'])
    return record


def _dump_json(records, filename):
    """Write records as JSON to filename; a filename of None skips the write."""
    if filename is None:
        return
    with open(filename, 'w', encoding="utf-8") as f:
        json.dump(records, f)


def get_iclr_reviews(paper_info_filename=None, review_info_filename=None, decision_info_filename=None,
                     request_timeout=120):
    """Crawl ICLR 2023 submissions with their reviews and decisions from OpenReview.

    The API is paged through 100 notes per request until it returns an empty
    page or a non-200 status. On a non-200 status the crawl stops early and
    whatever was collected so far is still written and returned (best effort).
    Notes without a ``details.directReplies`` entry are not recorded at all.

    Args:
        paper_info_filename: JSON output path for the paper records, or None
            to skip writing them.
        review_info_filename: JSON output path for the replies whose
            invitation ends with "Official_Review", or None to skip.
        decision_info_filename: JSON output path for the replies whose
            invitation ends with "Decision", or None to skip.
        request_timeout: Seconds passed as ``timeout`` to every HTTP request,
            so that a stalled connection raises instead of hanging forever.

    Returns:
        A tuple ``(paper_list, review_list, decision_list)`` of lists of dicts.
    """
    offset = 0
    limit = 100  # number of paper per request
    max_limit = float('inf')  # no upper bound: paging ends on the first empty page
    paper_list = []
    review_list = []
    decision_list = []

    while offset + limit <= max_limit:
        print(f'Scraping review data: {offset}/{max_limit}')
        print("start request")
        response = requests.get(
            BASE_URL,
            params={"offset": offset, "limit": limit},
            timeout=request_timeout,
        )
        print("end request")
        if response.status_code != 200:
            print("Error:", response.status_code)
            break

        data = response.json()
        if not data or 'notes' not in data or not data['notes']:
            break

        for note in data['notes']:
            paper = _extract_paper(note)

            if 'details' not in note or 'directReplies' not in note['details']:
                continue

            paper_list.append(paper)
            for reply in note['details']['directReplies']:
                invitation = reply['invitation']
                if invitation.endswith("Official_Review"):
                    review_list.append(_extract_reply(reply, paper))
                elif invitation.endswith("Decision"):
                    decision_list.append(_extract_reply(reply, paper))

        offset += limit

    _dump_json(paper_list, paper_info_filename)
    _dump_json(review_list, review_info_filename)
    _dump_json(decision_list, decision_info_filename)

    return paper_list, review_list, decision_list


if __name__ == "__main__":
    # Crawl OpenReview once and store the three raw dumps under working_dir.
    get_iclr_reviews(
        decision_info_filename=working_dir + "ICLR2023decision_raw.json",
        review_info_filename=working_dir + "ICLR2023review_raw.json",
        paper_info_filename=working_dir + "ICLR2023paper_raw.json",
    )

    print("Finished scraping ICLR 2023 reviews!")


# Build ICLR Raw Dataset

In [None]:
import os
import csv
import random
import json
import argparse
import unicodedata


def to_ascii(input_str):
    """Return an ASCII-only version of ``input_str``.

    The text is NFKD-normalized first, so accented letters and compatibility
    characters are split into their base characters; every double quote is
    then turned into a single quote (to avoid trouble in JSON), and whatever
    is still non-ASCII is dropped.
    """
    assert type(input_str) is str
    decomposed = unicodedata.normalize("NFKD", input_str).replace('"', "'")
    # Encoding with errors="ignore" discards exactly the non-ASCII characters.
    return decomposed.encode("ascii", "ignore").decode("ascii")


# Notebook-style CLI: the options are declared as for a script, but the
# argument list is passed explicitly to parse_args.
parser = argparse.ArgumentParser()
parser.add_argument("--topic", default="", type=str)
parser.add_argument("--max_count", default=1_000_000_000, type=int)
parser.add_argument("--input_paper", default=working_dir + "ICLR2023paper_raw.json", type=str)
parser.add_argument("--input_review", default=working_dir + "ICLR2023review_raw.json", type=str)
parser.add_argument("--dataset_name", required=True, type=str)
args = parser.parse_args(["--max_count", "2000", "--dataset_name", "rand2000"])

# Fixed seed so that the shuffle that picks the subset is reproducible.
random.seed(19260817)

# Load the raw paper dump, rename each record's "uid" key to "id", and reduce
# title / keywords / abstract to ASCII strings.
with open(args.input_paper, mode="r", encoding="utf-8") as json_file:
    papers = json.load(json_file)

review_cnt = {}  # paper id -> number of reviews (counted when the reviews are read)
for paper in papers:
    paper["id"] = paper.pop("uid")  # rename "uid" -> "id"
    paper.update(
        title=to_ascii(str(paper["title"])),
        keywords=to_ascii(str(paper["keywords"])),
        abstract=to_ascii(str(paper["abstract"])),
    )
    review_cnt[paper["id"]] = 0

# Load the raw review dump. Each review gets a "belong_id" key (replacing
# "paper_uid") and a "review" key holding its four text sections merged into
# one string.
with open(args.input_review, mode="r", encoding="utf-8") as json_file:
    reviews = json.load(json_file)

review_format_str = '''Summary Of The Paper:

{}

Strength And Weaknesses:

{}

Clarity, Quality, Novelty And Reproducibility:

{}

Summary Of The Review:

{}
'''

for review in reviews:
    review["belong_id"] = review.pop("paper_uid")  # rename "paper_uid" -> "belong_id"
    # The sections are filled into the template in the order they appear in it.
    review["review"] = review_format_str.format(*[
        review[section]
        for section in (
            "summary_of_the_paper",
            "strength_and_weaknesses",
            "clarity,_quality,_novelty_and_reproducibility",
            "summary_of_the_review",
        )
    ])
    review_cnt[review["belong_id"]] = review_cnt[review["belong_id"]] + 1

def contain_str(source, targets):
    """Return True if any comma-separated target is a substring of source.

    Matching is case-insensitive. Spaces in ``targets`` are dropped and an
    underscore stands for a space, e.g. "Food,Restaurants". An empty
    ``targets`` string yields the single target "", which is a substring of
    every string, so every source matches.
    """
    _source = source.lower()
    _targets = targets.lower().replace(" ", "").replace("_", " ").split(",")
    return any(_target in _source for _target in _targets)


def legal(paper):
    """Return True if the paper has at least 3 reviews and matches args.topic."""
    if review_cnt[paper["id"]] < 3:
        return False
    return contain_str(paper["title"] + paper["keywords"], args.topic)


# The two helpers are defined once, ahead of the loop, instead of being
# re-created for every paper.
paper_selected = []
has = {}  # id of every eligible paper -> list that later collects its review indices
for paper in papers:
    if legal(paper):
        paper_selected.append(paper)
        has[paper["id"]] = []

# Draw the random subset: shuffle, then keep at most args.max_count papers.
random.shuffle(paper_selected)
del paper_selected[args.max_count:]

print("# items =", len(paper_selected))
with open(working_dir+"paper_"+args.dataset_name+".json","w",encoding="utf-8") as json_file:
    json.dump(paper_selected, json_file)

# Record, per eligible paper, the positions of its reviews in `reviews`.
for idx, review in enumerate(reviews):
    bucket = has.get(review["belong_id"])
    if bucket is not None:
        bucket.append(idx)

# Collect the reviews of the selected papers, in paper order, with the merged
# review text reduced to ASCII.
review_selected = []
for paper in paper_selected:
    for idx in has[paper["id"]]:
        picked = reviews[idx]
        picked["review"] = to_ascii(picked["review"])
        review_selected.append(picked)

print("# reviews =", len(review_selected))
with open(working_dir+"review_"+args.dataset_name+".json","w",encoding="utf-8") as json_file:
    json.dump(review_selected, json_file)


# Dataset Refinement


In [None]:
# @title Download Papers
import argparse
import threading
import time
import sys
import pandas as pd
import json
import os
import requests
import json
from tqdm.notebook import tqdm

# Notebook-style CLI: the argument list is passed explicitly to parse_args.
parser = argparse.ArgumentParser()
# --context: see folder '../openreview/' for openreview and '../yelp/' for yelp
parser.add_argument("--context", default="openreview", type=str)
# --dataset: picks the paper_<dataset>.json / review_<dataset>.json pair to load
parser.add_argument("--dataset", required=True, type=str)
args = parser.parse_args(["--dataset", "rand2000"])

# Load the selected papers and their reviews from working_dir.
with open(working_dir + "paper_" + args.dataset + ".json", "r") as json_file:
    papers = json.loads(json_file.read())
with open(working_dir + "review_" + args.dataset + ".json", "r") as json_file:
    reviews = json.loads(json_file.read())

def is_file_larger_than(path, lim):
    """Tell whether the file at ``path`` is strictly larger than ``lim`` bytes.

    A path that does not exist counts as "not larger" and yields False.
    """
    try:
        n_bytes = os.stat(path).st_size
    except FileNotFoundError:
        return False
    return n_bytes > lim

if __name__ == "__main__":

    pdf_dir = working_dir + "files/paper_pdf/"
    # Make sure the target folder exists; otherwise every open() below would
    # fail with FileNotFoundError.
    os.makedirs(pdf_dir, exist_ok=True)

    for i, paper in enumerate(tqdm(papers, desc="Downloading Papers")):

        path_pdf = pdf_dir + paper["id"] + ".pdf"

        # A file of more than 10 KiB counts as already downloaded; anything
        # smaller (or missing) is fetched again, which makes the loop resumable.
        if is_file_larger_than(path_pdf, 10*1024):
            continue

        time.sleep(0.5)  # small pause between downloads
        url = "https://openreview.net" + paper["pdf"]
        print(i, url)

        # Two attempts in total: the second one follows 2 s after a non-200 reply.
        for attempt in range(2):
            # timeout: a stalled connection raises instead of blocking forever.
            response = requests.get(url, proxies={"http": None, "https": None}, timeout=60)
            if response.status_code == 200:
                with open(path_pdf, "wb") as file:
                    file.write(response.content)
                break
            if attempt == 0:
                print("Failed to download the file. Status code:", response.status_code)
                time.sleep(2)
        else:
            # Both attempts returned a non-200 status: abort the whole run.
            print("Failed again:", response.status_code)
            raise RuntimeError(f"could not download {url} (status {response.status_code})")

In [None]:
# @title Zip all the pdfs (to local, no need)
import zipfile
import os


path_folder = working_dir + "files/paper_pdf/"
path_zip = path_folder + "paper.zip"

# Collect every file below path_folder. The archive is written into this same
# folder, so a zip that is already there would get packed into the new one:
# abort instead.
file_paths = []
for root, _, files in os.walk(path_folder):
    for file in files:
        if file.endswith("zip"):
            print(file)
            raise RuntimeError(f"found an existing zip archive in {root}: {file}")
        # os.walk also descends into sub-folders, so the path has to be built
        # from the directory currently visited, not from the top-level folder.
        file_paths.append(os.path.join(root, file))

# Each file is stored under the path it was collected with (no arcname given).
with zipfile.ZipFile(path_zip, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for file in tqdm(file_paths, desc="Zipping files"):
        zipf.write(file)



In [None]:
# @title Parse Papers
import argparse
import threading
import time
import sys
import pandas as pd
import json
import os
import requests

import PyPDF2
import zipfile

parser = argparse.ArgumentParser()
# which context, see folder '../openreview/' for openreview and '../yelp/' for yelp
parser.add_argument("--context", type=str, default="openreview")
# which dataset: picks the paper_<dataset>.json / review_<dataset>.json pair to load
parser.add_argument("--dataset", type=str, required=True)
# "rand2000" is the dataset name that the build, download and refinement cells
# of this notebook use; this cell previously asked for "rand1000".
args = parser.parse_args(["--dataset", "rand2000"])

# Load the selected papers and their reviews from working_dir.
with open(working_dir+"paper_"+args.dataset+".json", "r") as json_file:
    papers = json.load(json_file)
with open(working_dir+"review_"+args.dataset+".json", "r") as json_file:
    reviews = json.load(json_file)

if __name__ == "__main__":

    # DO NOT USE PYPDF TO PARSE PAPERS
    # USE SCIENCEBEAM INSTEAD
    # (This is because pypdf returns dummy texts, which will lead the raw paper to be too long)
    if False:

        def parse_pdf(path_pdf):
            """Concatenate the text PyPDF2 extracts from every page of the PDF."""
            with open(path_pdf, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                return "".join(page.extract_text() for page in pdf_reader.pages)

        # Disabled PyPDF2 path: only the first six papers are handled.
        for paper in papers[:6]:

            path_pdf = working_dir + "files/paper_pdf/" + paper["id"] + ".pdf"
            path_raw = working_dir + "files/paper_raw/" + paper["id"] + ".txt"

            if not os.path.exists(path_pdf):
                print("error")
                exit(0)

            text = parse_pdf(path_pdf)
            with open(path_raw, "w") as file:
                file.write(text)

    else:
        # Active path: unpack the archive of already-parsed paper texts into
        # the paper_raw folder.
        path_zip = working_dir + "files/paper_raw.zip"
        path_folder = working_dir + "files/paper_raw/"
        with zipfile.ZipFile(path_zip, 'r') as zip_ref:
            zip_ref.extractall(path=path_folder)

In [None]:
# @title Build Refined Dataset
import json
import random
import argparse
import pandas as pd
from tqdm.notebook import tqdm

# Notebook-style CLI: the argument list is passed explicitly to parse_args.
parser = argparse.ArgumentParser()
# --context: see folder '../openreview/' for openreview and '../yelp/' for yelp
parser.add_argument("--context", default="openreview", type=str)
# --dataset: picks the paper_<dataset>.json / review_<dataset>.json pair to load
parser.add_argument("--dataset", required=True, type=str)
args = parser.parse_args(["--dataset", "rand2000"])

# Load the selected papers and their reviews from working_dir.
with open(working_dir + "paper_" + args.dataset + ".json", mode="r") as json_file:
    papers = json.load(json_file)
with open(working_dir + "review_" + args.dataset + ".json", mode="r") as json_file:
    reviews = json.load(json_file)

# Group the review texts by the id of the paper they belong to.
mp = {}
for review in reviews:
    mp.setdefault(review["belong_id"], []).append(review["review"])

# Every loaded paper must have reviews. The count is compared with the paper
# list rather than a fixed number, so datasets of other sizes pass as well.
assert len(mp) == len(papers)

# Shuffle each paper's reviews with its own seed (base seed + position in mp),
# so the order is reproducible. The loop variable is deliberately not called
# `reviews`, which would overwrite the full review list loaded in this cell.
for cnt, paper_reviews in enumerate(mp.values()):
    assert len(paper_reviews) >= 3
    random.seed(19260817 + cnt)
    random.shuffle(paper_reviews)

# Collect one row per paper and build the DataFrame once at the end; growing
# the frame row by row re-allocates it on every insertion.
columns = ["paper_id", "pdf_url", "abstract", "parsed_text","human_review1","human_review2","human_review3"]
rows = []
for paper in tqdm(papers, desc="Processing Papers"):
    pdf_url = "https://openreview.net" + paper["pdf"]
    with open(working_dir + "files/paper_raw/" + paper["id"] + ".txt", "r", encoding="utf-8") as f:
        parsed_text = f.read()
    # Only the first three (already shuffled) reviews of the paper are kept.
    paper_reviews = mp[paper["id"]]
    rows.append([paper["id"], pdf_url, paper["abstract"], parsed_text,
                 paper_reviews[0], paper_reviews[1], paper_reviews[2]])
df = pd.DataFrame(rows, columns=columns)
# df.to_json(dataset_dir + 'dataset_paper.json', index=False)
df.to_parquet(dataset_dir + 'dataset_paper.parquet', index=False)

In [None]:
import pandas as pd
dataset_paper = pd.read_parquet(dataset_dir + "dataset_paper.parquet")

# Report row label, paper id and text length for every paper whose parsed
# text is shorter than 1000 characters.
for i in range(len(dataset_paper)):
    paper_id, parsed_text = dataset_paper.loc[i, ["paper_id", "parsed_text"]]
    n_chars = len(parsed_text)
    if n_chars < 1000:
        print(i, paper_id, n_chars)