In [None]:
import sys,os
import pandas as pd

from tqdm.notebook import tqdm_notebook as tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


import warnings
warnings.filterwarnings('ignore')

In [None]:
# Browser driver class instantiated by each scraping cell.
DRIVER = webdriver.Safari # change to Chrome or Firefox based on local availability

# Prefix of a single fit page; a numeric id is appended to it when scraping.
base_url = "https://motifcentral.org/fit/"

# Labels searched for in the text of the rows on each fit page.
data_keys = [
    "Pearson's r2 (k-mer)",
    "Pearson's r2 (affinity-binned)",
    "Maximum predicted bin enrichment",
    "Maximum observed bin enrichment",
    "Partial Pearson's r2",
    "Predicted partial max bin enrichment",
    "Observed partial max bin enrichment",
    "R2",
    "Enrichment",
    "R2, partial",
    "Enrichment, partial",
]

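The next cell walks through every page of the motifcentral.org home listing and collects the numeric IDs it finds into `ids`, which the following cells iterate over.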

In [None]:
import re

# Collect the numeric ids shown on every page of the home listing into `ids`.
#
# `ids` and `browser` are bound before the try block so that
#   * later cells can always rely on `ids` existing (it is simply empty or
#     partial when scraping fails), and
#   * the cleanup code never touches an unbound `browser` when the driver
#     itself fails to start.
ids = []
browser = None
try:
    browser = DRIVER()
    browser.get('https://motifcentral.org/home')

    # Block until the text 'Complex' is present in <body> (10 s timeout).
    WebDriverWait(browser, timeout=10).until(
        EC.text_to_be_present_in_element((By.TAG_NAME, 'body'), 'Complex')
    )

    # write entire source to html file
    # This dump is best-effort: failing to write it (e.g. no Desktop folder)
    # must not abort the scrape. UTF-8 is forced so non-ASCII page content
    # does not depend on the platform's default encoding.
    file_path = os.path.join(os.path.expanduser('~'), 'Desktop', 'home_source.html')
    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(browser.page_source)
    except OSError as dump_error:
        print(f'Could not save page source to {file_path}: {dump_error}')

    page_count_match = re.search(r'Number of Pages: ([0-9]+)', browser.page_source)
    if page_count_match is None:
        raise RuntimeError('Could not find "Number of Pages" in the home page source')
    num_pages = int(page_count_match.group(1))
    print('num_pages', num_pages)

    for page in tqdm(range(num_pages)):
        # get ids from page; `+` (not `*`) so an empty id can never reach int()
        m = re.findall(r'ID: ([0-9]+); Score', browser.page_source)
        id_set = set(int(x) for x in m)
        ids.extend(id_set)

        # click next page
        # NOTE(review): there is no explicit wait after the click; this assumes
        # the listing has updated by the time page_source is read again - confirm.
        if page != num_pages - 1: # don't click next page on last page
            button = browser.find_element(By.XPATH, '//button[@aria-label = "Next page"]')
            button.click()

    print(f'Collected {len(ids)} ids')
except Exception as e:
    print(e)
finally:
    # Always release the browser, but only if it was actually started.
    if browser is not None:
        browser.quit()

For each ID in `ids`, load its fit page and parse the values listed in `data_keys` from the page's rows.

In [None]:
browser = DRIVER()

# Label whose row is shown on the webpage without the ':' separator.
COLONLESS_KEY = 'Predicted partial max bin enrichment'


def _parse_fit_rows(row_texts, keys):
    """Turn the text of a fit page's rows into a ``label -> value`` dict.

    Parameters
    ----------
    row_texts : iterable of str
        Text of each element with class "row" on the page.
    keys : iterable of str
        Labels of interest; a row is used only if it contains at least one
        of them and contains no underscore.

    Returns
    -------
    dict
        (str, str) dict mapping the text before the first ':' of each used
        row to the stripped text after it. Rows without any ':' are skipped.
    """
    parsed = dict()
    for text in row_texts:
        if '_' in text:
            continue
        matched = [key for key in keys if key in text]
        if not matched:
            continue

        if COLONLESS_KEY in matched:
            # hacky fix because colon is missing from webpage: insert it right
            # after the label, unless the page already provides one.
            split_at = text.index(COLONLESS_KEY) + len(COLONLESS_KEY)
            if not text[split_at:].lstrip().startswith(':'):
                text = text[:split_at] + ':' + text[split_at:]

        # Split on the first colon only, so values containing ':' survive.
        label, separator, value = text.partition(':')
        if not separator:
            continue
        parsed[label] = value.strip()
    return parsed


id_results = dict() # (int, dict) dict
try:
    # `motif_id` rather than `id`, to avoid shadowing the builtin.
    for motif_id in tqdm(ids):
        try:
            browser.get(base_url + str(motif_id))

            # Block until 'Pearson' shows up in the element with id
            # 'selexKMerPrediction' (10 s timeout).
            WebDriverWait(browser, timeout=10).until(
                EC.text_to_be_present_in_element((By.ID, 'selexKMerPrediction'), 'Pearson')
            )

            rows = browser.find_elements(By.CLASS_NAME, "row")

            # Read each row's text exactly once, then parse offline.
            id_results[motif_id] = _parse_fit_rows([r.text for r in rows], data_keys)
        except Exception as e:
            # A failure on one id must not end the session for the remaining
            # ids: report it and move on.
            print(f'id {motif_id}: {e}')
finally:
    # Close the browser exactly once, whether or not every id succeeded.
    browser.quit()
        