In [1]:
%matplotlib inline

import os
import json
import time
import operator
import numpy as np
from tqdm.notebook import tqdm

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

## Get submission list

In [None]:
def get_item_url(page_id, driver, url_list):
    """Append the URL of every submission on the currently displayed page.

    Args:
        page_id: Page counter (starts at 0); only used in the progress message.
        driver: Selenium WebDriver whose current page shows the submission list.
        url_list: List that is extended in place with the URLs that are found.

    Raises:
        IndexError: If the page contains no submissions list at all.
    """
    # the first matching <ul> is all submissions, the second is rejected/withdrawn ones
    item_list_parent = driver.find_elements(By.CSS_SELECTOR, "ul[class='list-unstyled submissions-list']")[0]
    item_list = item_list_parent.find_elements(By.CLASS_NAME, 'note')
    print(f'processing page {page_id} | {len(item_list)} items | total: {len(url_list)} items')
    for item in tqdm(item_list):
        # the first <a> is the paper title and url
        links = item.find_elements(By.TAG_NAME, 'a')
        href = links[0].get_attribute('href') if links else None
        if not href:
            # get_attribute returns None when the attribute is missing; skip the
            # entry instead of aborting the rest of the page
            print(f'page {page_id}: skipping an item without a link')
            continue
        url_list.append(href.strip())

In [5]:
# Start a Chrome session through the chromedriver executable at the given relative
# path (a Windows build, judging by the path; adjust it on other platforms) and
# open the ICLR 2022 conference page on OpenReview.
# `driver` is reused by the later cells that walk the pages and parse each item.
s = Service(r'./chromedriver_win32/chromedriver.exe')
driver = webdriver.Chrome(service=s)
driver.get('https://openreview.net/group?id=ICLR.cc/2022/Conference')

In [None]:
# Walk through every page of the submission list and collect the item URLs.
url_list = []
page_id = 0
is_not_end = True
# give up when the "next page" click fails this many times in a row, otherwise a
# persistently failing click would keep the loop running forever
MAX_CONSECUTIVE_FAILURES = 5
num_failures = 0
while is_not_end:
    if page_id == 0:
        # the first page is already displayed: process it directly
        get_item_url(page_id, driver, url_list)
    else:
        try:
            # jump to next page
            next_page_btns[0].find_element(By.TAG_NAME, 'a').click()
            time.sleep(2.5)  # leave the page time to render the new items
            # process current page
            get_item_url(page_id, driver, url_list)
            num_failures = 0
        except Exception as exc:
            # `Exception` (not a bare except) so that interrupting the kernel still works
            num_failures += 1
            print(f'Failed to jump to page {page_id}: {exc!r}')
            if num_failures >= MAX_CONSECUTIVE_FAILURES:
                print(f'Stopping after {num_failures} consecutive failures')
                break
    # the arrows are looked up again after every page change; the list is considered
    # finished as soon as fewer/more than 4 "right-arrow" entries are found
    next_page_btns = driver.find_elements(By.CSS_SELECTOR, "li[class='  right-arrow']")
    is_not_end = len(next_page_btns) == 4
    page_id += 1

In [None]:
# save url list: the first line is the UTC time of the crawl, then one URL per line
os.makedirs('assets', exist_ok=True)  # the output directory may not exist yet
with open('assets/url_list.txt', 'w', encoding='utf-8') as f:
    f.write(time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime()) + '\n')
    f.write('\n'.join(url_list))

## Parse each item

In [2]:
# submission meta data
class AllSubmissions:
    """Collection of scraped submission records, kept in memory and mirrored to disk.

    Every record is a dict with the keys ``url``, ``title``, ``keywords``,
    ``scores`` and ``avg_score``. Each call to :meth:`update` also writes the new
    record as JSON to ``<save_root>/<index>.txt``.
    """

    def __init__(self, save_root: str):
        """Create an empty collection.

        Args:
            save_root: Directory that receives one JSON file per record.
        """
        self.items = []
        self.save_root = save_root

    def update(self, index: int, url: str, title: str, keywords: list, scores: list, avg_score: float = -1.):
        """Add one record and write it to ``<save_root>/<index>.txt``.

        Args:
            index: Number used for the name of the per-record file.
            url: URL of the submission.
            title: Title of the submission.
            keywords: List of keywords.
            scores: List of numeric scores; may be empty.
            avg_score: Average score. When left at the ``-1.`` default and
                ``scores`` is not empty, the mean of ``scores`` is used; with no
                scores the value stays ``-1.``.
        """
        if len(scores) > 0 and avg_score == -1.:
            # plain float so that the stored value does not depend on numpy types
            avg_score = float(np.mean(scores))
        item = {
            'url': url,
            'title': title,
            'keywords': keywords,
            'scores': scores,
            'avg_score': avg_score
        }
        self.items.append(item)
        if self.save_root:
            # the target directory may not exist yet on a fresh run
            os.makedirs(self.save_root, exist_ok=True)
        with open(os.path.join(self.save_root, f'{index}.txt'), 'w') as f:
            json.dump(item, f)

    def get_all_values_by_key(self, key: str = ''):
        """Return the value stored under ``key`` for every record, in insertion order.

        Raises:
            KeyError: If ``key`` is not one of the record keys.
        """
        return list(map(operator.itemgetter(key), self.items))

    def save(self, path: str):
        """Write all records to ``path``, one JSON document per line."""
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(path, 'w') as f:
            for item in self.items:
                f.write(json.dumps(item) + '\n')
        print(f'Saved to {path}')

    def __len__(self):
        """Number of records collected so far."""
        return len(self.items)

In [3]:
# read url list: the first line is the crawl timestamp, every following line is a URL
with open('assets/url_list.txt', 'r') as f:
    url_list = f.read().splitlines()
# strip the header so that the message below does not end with a stray blank line
data_time = url_list[0].strip()
# ignore empty lines so that they are not counted (and later visited) as items
item_list = [line.strip() for line in url_list[1:] if line.strip()]
num_items = len(item_list)
print(f'Total {num_items} items | time: {data_time}')

Total 3328 items | time: 2021-11-09 17:11:59



In [7]:
# Visit every submission page and record its title, keywords and reviewer scores.
# NOTE: this cell relies on `driver`, `item_list` and `num_items` from earlier cells.
MAX_LOAD_TRIES = 1000  # polls (0.5 s apart) before giving up on a page; lower it for faster runs
os.makedirs('assets/data/', exist_ok=True)  # per-item files are written here
all_submissions = AllSubmissions('assets/data/')
for i in tqdm(range(num_items)):
    item_url = item_list[i].strip()
    driver.get(item_url)
    time.sleep(1)
    # wait until at least one top-level comment is rendered, but never forever:
    # a page without any comment would otherwise block the whole run
    num_try = 0
    while True:
        comment_list = driver.find_elements(By.CSS_SELECTOR, "div[class='note_with_children comment-level-odd']")
        if len(comment_list) > 0:
            break
        if num_try > MAX_LOAD_TRIES:
            print(f'Failed to load {item_url} with max tries!')
            break
        time.sleep(.5)
        num_try += 1
    # process comments
    item_scores = []
    for comment in comment_list:
        meta_rows = comment.find_elements(By.CLASS_NAME, 'meta_row')
        meta_spans = meta_rows[0].find_elements(By.TAG_NAME, 'span') if meta_rows else []
        if not meta_spans:
            continue
        signature = meta_spans[0].get_attribute('innerHTML') or ''
        # only keep comments whose first meta span mentions both 'ICLR' and 'Reviewer'
        if 'ICLR' not in signature or 'Reviewer' not in signature:
            continue
        panels = comment.find_elements(By.CSS_SELECTOR, "div[class='note panel']")
        if not panels:
            continue
        contents = panels[0].find_elements(By.CLASS_NAME, 'note_contents')
        if len(contents) < 2:
            continue
        # the recommendation is read from the second-to-last content row
        recommend = contents[-2].find_elements(By.TAG_NAME, 'span')
        if len(recommend) < 2:
            continue
        if recommend[0].get_attribute('innerHTML') == 'Recommendation: ':
            raw_score = recommend[1].get_attribute('innerHTML') or ''
            try:
                # the numeric score is the part before the first ':'
                _score = float(raw_score.split(':')[0])
            except ValueError:
                print(f'Could not parse score {raw_score!r} in {item_url}')
                continue
            item_scores.append(_score)
    # process title
    title_nodes = driver.find_elements(By.CLASS_NAME, 'note_content_title')
    title_links = title_nodes[0].find_elements(By.TAG_NAME, 'a') if title_nodes else []
    if not title_links:
        # without a title the page did not render a submission; skip it instead of crashing
        print(f'Skipping {item_url}: title not found')
        continue
    item_title = (title_links[0].get_attribute('innerHTML') or '').strip()
    # process keywords
    item_keywords = []
    content_nodes = driver.find_elements(By.CLASS_NAME, 'note_contents')
    if content_nodes:
        keyword_spans = content_nodes[0].find_elements(By.TAG_NAME, 'span')
        if len(keyword_spans) > 1 and 'Keywords:' in (keyword_spans[0].get_attribute('innerHTML') or ''):
            raw_keywords = (keyword_spans[1].get_attribute('innerHTML') or '').strip()
            item_keywords = [_k.strip() for _k in raw_keywords.split(',')]
    all_submissions.update(i, item_url, item_title, item_keywords, item_scores)

  0%|          | 0/1 [00:00<?, ?it/s]

https://openreview.net/forum?id=B9LUI0pZFGc
