In [1]:
import sys,os
import time
import pandas as pd
import pickle

from tqdm.notebook import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


import warnings
# Silence every Python warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

In [2]:
# Root URL of a single fit page; the numeric fit id is appended to it when scraping.
base_url = "https://motifcentral.org/fit/"

# Selenium driver class used to open browser sessions.
# TODO change to Chrome or Firefox based on local availability
DRIVER = webdriver.Safari

# Labels searched for in the text of each fit page; a row is kept when it
# contains one of these strings.
# NOTE(review): the last four entries look like short-form variants of the
# long labels above them -- confirm against the site.
data_keys = [
    "Pearson's r2 (k-mer)",
    "Pearson's r2 (affinity-binned)",
    "Maximum predicted bin enrichment",
    "Maximum observed bin enrichment",
    "Partial Pearson's r2",
    "Predicted partial max bin enrichment",
    "Observed partial max bin enrichment",
    "R2",
    "Enrichment",
    "R2, partial",
    "Enrichment, partial",
]

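The next cell pages through the motif listing on the MotifCentral home page and fills `ids` with `(study, experiment, fit id)` tuples that later cells use.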

In [7]:
import re

# Seconds to wait after clicking "Next page" so the listing can re-render
# before it is read again.
PAGE_LOAD_SECONDS = 2

# (study, experiment, fit id) tuples. Defined before the browser is opened so
# that whatever was collected is still available if scraping stops early.
ids = []
browser = None
try:
    browser = DRIVER()
    browser.get('https://motifcentral.org/home')

    # Wait until the listing has rendered.
    WebDriverWait(browser, timeout=10).until(
        EC.text_to_be_present_in_element((By.TAG_NAME, 'body'), 'Complex')
    )

    match = re.search(r'Number of Pages: ([0-9]+)', browser.page_source)
    if match is None:
        raise RuntimeError('Could not find "Number of Pages: <n>" on the home page')
    num_pages = int(match.group(1))
    print('num_pages', num_pages)

    for page in tqdm(range(num_pages)):
        b1 = browser.find_elements(By.XPATH, '//span[@class = "badge badge-pill badge-success"]')
        b2 = browser.find_elements(By.XPATH, '//span[@class = "badge badge-pill badge-dark"]')
        b3 = browser.find_elements(By.XPATH, '//span[@class = "badge badge-pill badge-light ng-star-inserted"]')
        if not (len(b1) == len(b2) == len(b3)):
            # zip() below would silently drop the unmatched badges.
            print(f'Page {page}: badge counts differ ({len(b1)}, {len(b2)}, {len(b3)}); '
                  'extra badges are ignored')
        for a, b, c in zip(b1, b2, b3):
            # The id badge text carries a one-character prefix before the number.
            ids.append((a.text, b.text, int(c.text[1:])))

        # click next page
        if page != num_pages - 1:  # don't click next page on last page
            button = browser.find_element(By.XPATH, '//button[@aria-label = "Next page"]')
            button.click()
            # Pause AFTER the click: reading immediately could capture the
            # previous page again and produce duplicate entries.
            time.sleep(PAGE_LOAD_SECONDS)

    print(f'Collected {len(ids)} ids')
finally:
    # Always release the browser session, including when an error propagates.
    if browser is not None:
        browser.quit()

num_pages 88


  0%|          | 0/88 [00:00<?, ?it/s]

Collected 880 ids


In [8]:
# Persist the scraped ids. The path matches the one the analysis section loads
# from ('../probound_ids.pkl'), so a full re-run reads the file written here.
with open('../probound_ids.pkl', 'wb') as f:
    pickle.dump(ids, f)

For each id, open its fit page and parse the values listed in `data_keys` from the HTML.

In [29]:
# Rebinds `tqdm` to the plain-text progress bar for this cell onwards.
from tqdm import tqdm

results_path = '../probound_id_results.pkl'


def _parse_result_rows(row_texts, keys):
    """Extract `label: value` pairs from the text of a fit page's rows.

    Parameters
    ----------
    row_texts : iterable of str
        Text content of each candidate row on the page.
    keys : iterable of str
        Labels to look for; a row is kept when it contains one of them.

    Returns
    -------
    dict
        Maps the text before the colon to the stripped text after it.
        Rows that do not contain exactly one colon are reported and skipped.
    """
    selected = set()
    for key in keys:
        for text in row_texts:
            # Rows containing an underscore are excluded.
            # NOTE(review): presumably this filters out rows that carry
            # experiment names -- confirm against the page layout.
            if key not in text or '_' in text:
                continue
            if key == 'Predicted partial max bin enrichment' and ':' not in text:
                # hacky fix because colon is missing from webpage:
                # insert it right after the label.
                idx = text.index(key) + len(key)
                text = text[:idx] + ':' + text[idx:]
            selected.add(text)

    parsed = dict()  # (str, str) dict
    for text in selected:
        parts = text.split(':')
        if len(parts) != 2:
            print(f'Skipping row that is not a single "label: value" pair: {text!r}')
            continue
        parsed[parts[0]] = parts[1].strip()
    return parsed


# Resume from the checkpoint written by earlier runs of this cell so a fresh
# kernel neither fails on an undefined name nor re-scrapes finished ids.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
if os.path.exists(results_path):
    with open(results_path, 'rb') as f:
        id_results = pickle.load(f)
else:
    id_results = dict()  # (int, dict) dict
print(f'Total collected: {len(id_results)}')

browser = DRIVER()
try:
    for study, exp, fit_id in tqdm(ids):
        if fit_id in id_results:
            continue
        try:
            browser.get(base_url + str(fit_id))

            # Wait until the prediction panel has rendered.
            WebDriverWait(browser, timeout=20).until(
                EC.text_to_be_present_in_element((By.ID, 'selexKMerPrediction'), 'Pearson')
            )

            # Read each row's text once; every `.text` access is a WebDriver call.
            row_texts = [row.text for row in browser.find_elements(By.CLASS_NAME, "row")]
            id_results[fit_id] = _parse_result_rows(row_texts, data_keys)
            time.sleep(2)

            # Checkpoint after every id so an interrupted run loses nothing.
            with open(results_path, 'wb') as f:
                pickle.dump(id_results, f)
        except Exception as e:
            # Report and move on; the browser stays open for the remaining ids.
            print(f'Failed to collect id {fit_id}: {e!r}')
finally:
    # Release the browser session exactly once, after the whole loop.
    browser.quit()
print(f'Total collected: {len(id_results)}')
        

Total collected: 680


100%|██████████| 880/880 [00:00<00:00, 1910449.03it/s]

Total collected: 680





## Analysis

In [10]:
# Reload the scraped artifacts from disk so the analysis can run without re-scraping.
# Files are opened read-only ('rb'); nothing is written back here.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
with open('../probound_ids.pkl', 'rb') as f:
    ids = pickle.load(f)
print(f'Loaded {len(ids)} ids')

with open('../probound_id_results.pkl', 'rb') as f:
    id_results = pickle.load(f)
print(f'Loaded {len(id_results)} results')

Loaded 880 ids
Loaded 680 results


In [17]:
# Tabulate the scraped id tuples: one row per tuple, one column per field.
ids_df = pd.DataFrame(
    ids,
    columns=["study", "exp", "id"],
)

In [20]:
# Display the table (the rendered output elides the middle rows).
ids_df

Unnamed: 0,study,exp,id
0,Nitta2015,abd-A_KU_TCACTT40NTTG,15412
1,Slattery2011,AbdB.16mer2_rep1,11232
2,Jolma2013,SPIB_ESAK_TGTCTA20NTCG,16273
3,Nitta2015,Aef1_KY_TCGAGT40NACT,16388
4,Nitta2015,al_KY_TTTCAA40NTGA,15421
...,...,...,...
875,Yin2017,ARNTL_eDBD_KV_TCTCCG40NCG...,17963
876,Jolma2013,Arx_ESAC_TCGCAT20NACT,13720
877,Yin2017,ARX_eDBD_KO_TAGCGC40NCGC-...,16638
878,Yin2017,ASCL1_eDBD_KR_TGCGCA40NCA...,17025
