In [2]:
%load_ext autoreload
%autoreload 2

import re
import os
import json
import time
import sqlite3
import datetime
import operator
import numpy as np
from tqdm.notebook import tqdm, trange

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from funcs import MetaData
# from src import DataBase

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
class DataBase:
    """Small wrapper around a SQLite file holding one row per scraped submission.

    Usage: construct with a path, call `initialize` to open the connection
    (optionally creating the `submissions` table), write rows with
    `write_item_0`, and finish with `close`.
    """

    def __init__(self, db_path: str):
        """Store the database path; no connection is opened until `initialize`.

        Args:
            db_path: Path handed to `sqlite3.connect`.
        """
        self.db_path = db_path
        self.database = None  # sqlite3 connection, set by `initialize`
        self.cursor = None    # cursor on `self.database`, set by `initialize`

    def initialize(self, create: bool = False):
        """Open the SQLite connection and, if requested, create the table.

        Args:
            create: When True, create the `submissions` table if it does not
                exist yet, so re-running the setup cell against an existing
                file does not fail.
        """
        self.database = sqlite3.connect(self.db_path, check_same_thread=False)
        self.cursor = self.database.cursor()

        if create:
            # NOTE(review): the last column is spelled "deision" (sic). It is
            # kept as-is because renaming it would change the on-disk schema
            # that other code may already read - confirm before fixing.
            self.cursor.execute(
                "CREATE TABLE IF NOT EXISTS submissions "
                "(id int, url text, title text, keywords text, "
                "rating_0_cnt int, rating_0_avg float, rating_0_std float, ratings_0 text, "
                "rating_1_cnt int, rating_1_avg float, rating_1_std float, ratings_1 text, "
                "deision text)"
            )

    def write_item_0(self,
                     _id: int, url: str,
                     ratings: list,
                    ):
        """Insert one submission row with its first-round rating statistics.

        Only `id`, `url`, `rating_0_cnt`, `rating_0_avg`, `rating_0_std` and
        `ratings_0` are filled in; the remaining columns stay NULL. The row is
        committed immediately.

        Args:
            _id: Row identifier stored in the `id` column.
            url: Submission URL.
            ratings: Numeric ratings. An empty list stores a count of 0 and
                NULL for the average and standard deviation.
        """
        num_rating = len(ratings)
        if num_rating:
            rating_avg = np.mean(ratings).item()
            rating_std = np.std(ratings).item()
        else:
            # np.mean([]) would yield NaN (with a RuntimeWarning); store NULL instead.
            rating_avg = rating_std = None
        ratings_text = ', '.join(map(str, ratings))

        # sqlite3 cannot bind numpy scalars, so unwrap them to plain Python values.
        if isinstance(_id, np.generic):
            _id = _id.item()

        # Placeholders let sqlite3 do the quoting, so quotes inside `url`
        # can neither break the statement nor inject SQL.
        self.cursor.execute(
            "INSERT INTO submissions "
            "(id, url, rating_0_cnt, rating_0_avg, rating_0_std, ratings_0) "
            "VALUES (?, ?, ?, ?, ?, ?)",
            (_id, url, num_rating, rating_avg, rating_std, ratings_text),
        )
        self.database.commit()

    def close(self):
        """Close the cursor and the connection.

        Safe to call before `initialize` and safe to call more than once.
        """
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.database is not None:
            self.database.close()
            self.database = None

In [31]:
# Open the SQLite file that stores the scraped submissions; create=True also
# issues the CREATE TABLE statement for the `submissions` table.
# NOTE(review): the path is relative, so it resolves against the kernel's
# working directory; the assets/ folder presumably has to exist already - confirm.
db = DataBase('assets/iclr2022.db')
db.initialize(create=True)

In [34]:
# Close the cursor and connection opened by the previous cell.
# NOTE(review): the scraping loop further down still calls `db.write_item(...)`;
# if this cell has been run, `db` must be initialized again before that loop.
db.close()

## Parse each item

In [None]:
# Start a Chrome session through a local chromedriver binary.
# NOTE(review): the chromedriver path is an absolute, machine-specific location
# (/opt/homebrew/...); adjust it when running on another machine.
s = Service('/opt/homebrew/bin/chromedriver')
op = Options()
op.add_argument('headless')  # passes the 'headless' flag to Chrome
driver = webdriver.Chrome(service=s, options=op)

In [None]:
# Visit every submission page, parse title / authors / keywords / ratings /
# decisions from the rendered DOM, and hand the result to the database.
#
# NOTE(review): `url_dict` and `num_items` are not defined in any cell visible
# here; they are assumed to come from an earlier cell - confirm before
# running on a fresh kernel.
MAX_LOAD_TRIES = 10000  # polls allowed before giving up on a page
POLL_INTERVAL = .5      # seconds to wait between polls

url_list = list(url_dict.keys())
for i in trange(num_items):
    url = url_list[i].strip()
    item_id = url.split('id=')[-1]
    cat = url_dict[url_list[i]]  # expected decision category for this URL
    driver.get(url)
    time.sleep(1)

    # Poll until at least one top-level comment block has been rendered.
    num_try = 0
    while True:
        _comments = driver.find_element(By.ID, 'note_children')
        comments = _comments.find_elements(By.XPATH, "div[@class='note_with_children comment-level-odd']")
        if len(comments) > 0:
            break
        time.sleep(POLL_INTERVAL)
        assert num_try < MAX_LOAD_TRIES, f'Failed to load {url} | tried: {num_try}'
        num_try += 1

    item = driver.find_element(By.ID, f'note_{item_id}')

    # title
    title_el = item.find_element(
        By.CSS_SELECTOR, "div[class='title_pdf_row clearfix']"
    ).find_element(By.CLASS_NAME, 'note_content_title').find_element(By.TAG_NAME, 'a')
    item_title = title_el.text.strip()

    # authors
    author_el = item.find_element(
        By.CSS_SELECTOR, "div[class='meta_row']"
    ).find_element(By.TAG_NAME, 'span').find_elements(By.TAG_NAME, 'a')
    item_authors = ", ".join([a.text for a in author_el])

    # keywords: the first span must be the "Keywords:" label, the second its value
    key_el = item.find_element(By.CLASS_NAME, 'note_contents').find_elements(By.TAG_NAME, 'span')
    assert key_el[0].text == 'Keywords:', f'{url}: Keywords not found!'
    item_keywords = key_el[1].text

    # comments (looked up again after the paper fields have been read)
    _comments = driver.find_element(By.ID, 'note_children')
    comments = _comments.find_elements(By.XPATH, "div[@class='note_with_children comment-level-odd']")

    # Reset per-paper state so nothing leaks over from the previous iteration.
    item_ratings = []
    item_final_decision = None
    item_decision = None
    two_decision = False
    for comment in comments:
        keys = comment.find_elements(By.CLASS_NAME, 'note_content_field')
        values = comment.find_elements(By.CLASS_NAME, 'note_content_value')
        assert len(keys) == len(values), f'key not match with value for {url}'

        # Read each label once; every `.text` access goes through the driver.
        key_texts = [k.text for k in keys]

        # paper decision box
        if 'Decision:' in key_texts:
            for key_text, value_el in zip(key_texts, values):
                if key_text == 'Decision:':
                    item_final_decision = value_el.text
                    # the last word, without surrounding parentheses, is compared to `cat`
                    _item_final_decision = item_final_decision.split(' ')[-1].strip('(').strip(')')
                    assert _item_final_decision.lower() == cat, f'final decision not match for {url}'
                if key_text == 'Consistency Experiment:':
                    two_decision = True
                    _value = value_el.text
                    _value_re = re.findall(r'.*This copy’s committee reached the following decision: (.*)', _value)
                    if len(_value_re) == 0:  # fall back to the "both committees" wording
                        _value_re = re.findall(r'.*Both committees reached the same decision: (.*)', _value)
                    assert len(_value_re) > 0, f'cannot parse consistency experiment decision for {url}'
                    item_decision = _value_re[0]

        # reviewer comment box
        elif 'Rating:' in key_texts:
            for key_text, value_el in zip(key_texts, values):
                if key_text == 'Rating:':
                    # the integer score is the part before the first ':'
                    _rating = int(value_el.text.split(':')[0])
                    item_ratings.append(_rating)

    # Without this check a page lacking a decision box would silently reuse
    # the previous paper's decision (or raise NameError on the first page).
    assert item_final_decision is not None, f'no decision found for {url}'

    if two_decision:
        num_decision = 2
    else:
        num_decision = 1
        item_decision = item_final_decision
    # print(i, url, item_title, item_keywords, item_authors, item_final_decision, item_decision, item_ratings)
    # NOTE(review): the DataBase class defined in this notebook only provides
    # `write_item_0`; `write_item` presumably comes from the `src.DataBase`
    # variant whose import is commented out at the top - confirm.
    db.write_item(i, url, item_title, item_keywords, item_authors, num_decision, item_final_decision, item_decision, item_ratings)

        

In [None]:
# Close the database cursor and connection once scraping has finished.
db.close()

In [None]:
# Quit the Chrome webdriver session started earlier in the notebook.
driver.quit()