In [2]:
from openreview.api import OpenReviewClient
from pathlib import Path
import pandas as pd

# OpenReview venue whose notes this notebook works with.
venue_id = "NeurIPS.cc/2024/Conference"

# Reuse the local pickle when it exists; otherwise query OpenReview once and
# save the result so later runs can skip the network call.
cache_path = Path("submissions.pkl")
if not cache_path.exists():
    client = OpenReviewClient(baseurl="https://api2.openreview.net")
    submissions = client.get_all_notes(content={"venueid": venue_id})
    pd.to_pickle(submissions, cache_path)
else:
    submissions = pd.read_pickle(cache_path)

In [5]:
from collections import Counter

# Total number of notes loaded for the venue.
print(len(submissions))

# Rejected papers are not included: tally the notes by their venue label.
venue_counts = Counter(item.content["venue"]["value"] for item in submissions)
print(venue_counts)

4035
Counter({'NeurIPS 2024 poster': 3648, 'NeurIPS 2024 spotlight': 326, 'NeurIPS 2024 oral': 61})


In [None]:
from pprint import pprint


# Count how many submissions fall under each primary area and pretty-print the tally.
area_counts = Counter(item.content["primary_area"]["value"] for item in submissions)
pprint(area_counts)
# NOTE(review): areas jotted down by the author, presumably the ones of
# interest — natural_language_processing, reinforcement_learning, learning_theory

Counter({'machine_vision': 579,
         'natural_language_processing': 300,
         'reinforcement_learning': 276,
         'learning_theory': 251,
         'diffusion_based_models': 221,
         'generative_models': 204,
         'optimization': 189,
         'deep_learning_architectures': 186,
         'safety_in_machine_learning': 180,
         'other': 147,
         'machine_learning_for_other_sciences_and_fields': 145,
         'probabilistic_methods': 125,
         'optimization_for_deep_networks': 123,
         'graph_neural_networks': 121,
         'interpretability_and_explainability': 121,
         'neuroscience_and_cognitive_science': 107,
         'machine_learning_for_physical_sciences': 87,
         'privacy': 84,
         'causal_inference': 79,
         'machine_learning_for_healthcare': 75,
         'online_learning': 62,
         'bandits': 61,
         'algorithmic_game_theory': 54,
         'fairness': 47,
         'evaluation': 46,
         'robotics': 43,
     

In [7]:
import yaml

# Load the notebook configuration and display the primary areas selected for export.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# locale default (Path.read_text() otherwise falls back to it).
config = yaml.safe_load(Path("config.yaml").read_text(encoding="utf-8"))
config["include"]

['natural_language_processing',
 'reinforcement_learning',
 'diffusion_based_models',
 'deep_learning_architectures',
 'interpretability_and_explainability',
 'neuroscience_and_cognitive_science',
 'privacy',
 'causal_inference',
 'machine_learning_for_healthcare',
 'online_learning',
 'bandits',
 'algorithmic_game_theory',
 'fairness',
 'evaluation',
 'robotics',
 'speech_and_audio',
 'infrastructure',
 'active_learning',
 'human-AI_interaction',
 'machine_learning_for_social_sciences',
 'other']

In [None]:
from tqdm import tqdm
import yaml

# Render every submission whose primary area is listed under `include` in
# config.yaml through template.md, and collect the result in `full_text`.
# Both files are read as UTF-8 explicitly so the result does not depend on the
# platform's locale default encoding.
config = yaml.safe_load(Path("config.yaml").read_text(encoding="utf-8"))
include_set = set(config["include"])  # set for constant-time membership tests

template = Path("template.md").read_text(encoding="utf-8")

# Collect the rendered entries in a list and join once at the end instead of
# growing a string with `+=` on every iteration.
rendered_entries = []
for item in tqdm(submissions):
    content = item.content
    primary_area = content["primary_area"]["value"]
    if primary_area not in include_set:
        continue

    rendered_entries.append(
        template.format(
            title=content["title"]["value"],
            # TLDR is the only field read with a fallback: missing -> empty string.
            tldr=content.get("TLDR", {}).get("value", ""),
            authors=", ".join(content["authors"]["value"]),
            venue=content["venue"]["value"],
            abstract=content["abstract"]["value"],
            primary_area=primary_area,
            keywords=", ".join(content["keywords"]["value"]),
            id=item.id,
        )
    )

full_text = "".join(rendered_entries)

100%|██████████| 4035/4035 [00:07<00:00, 524.02it/s] 


In [14]:
# Write the combined Markdown to disk. The encoding is pinned to UTF-8 because
# Path.write_text() otherwise uses the platform's locale encoding, which can
# fail on non-ASCII characters in the rendered text.
# The cell's displayed value is the number of characters written.
Path("submissions.md").write_text(full_text, encoding="utf-8")

6721286