show attributes counts for both human and mouse, scale up to 100 GSEs

In [1]:
import concurrent.futures
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
import re
import json
from tqdm import tqdm
import torch

In [2]:
def import_GSE():
    GREIN_data = pd.read_csv("data/GREIN_data.csv")
    GREIN_data = GREIN_data[GREIN_data.Species != 'Rattus norvegicus'] #drop brown rat
    GREIN_human = GREIN_data[GREIN_data.Species == 'Homo sapiens']
    GREIN_mouse = GREIN_data[GREIN_data.Species == 'Mus musculus']
    
    GSE_human = GREIN_human['GEO accession'].tolist()
    GSE_mouse = GREIN_mouse['GEO accession'].tolist()
    
    return GSE_human, GSE_mouse

In [3]:
def get_url(geo_id):
    return f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={geo_id}"

In [4]:
def fetch_page(url):
    try:
        response = requests.get(url)
        return response.text if response.status_code == 200 else f"Failed to retrieve the page. Status code: {response.status_code}"
    except requests.exceptions.RequestException as e:
        return f"Error: {e}"


In [5]:
def scrape_geo_data(geo_id):
    url = get_url(geo_id)
    page_content = fetch_page(url)
    
    if not isinstance(page_content, str):
        return page_content

    soup = BeautifulSoup(page_content, 'html.parser')
    gsm_links = soup.find_all('a', href=lambda href: href and href.startswith('/geo/query/acc.cgi?acc=GSM'))
    gsm_values = [link.text for link in gsm_links]

    # Store GSM values with their corresponding GSE (geo_id) in a dictionary
    gse_gsm_dict = {geo_id: gsm_values}

    return gse_gsm_dict


In [6]:
def scrape_characteristics(geo_id):
    url = get_url(geo_id)
    page_content = fetch_page(url)
    
    if not isinstance(page_content, str):
        return page_content

    soup = BeautifulSoup(page_content, 'html.parser')
    characteristics_label = soup.find('td', text='Characteristics')

    if characteristics_label:
        characteristics_content = characteristics_label.find_next_sibling('td')
        return str(characteristics_content)

    return f"Failed to find Characteristics for {geo_id}"

In [7]:
def extract_characteristics(input_str):
    input_str = re.sub(r'<td[^>]*>', '', input_str)
    pattern = r'(\w+): ([^<]+)'
    matches = re.findall(pattern, input_str)
    
    characteristics_dictionary = dict(matches)
    
    return characteristics_dictionary

In [8]:
def process_gsm(gsm):
    characteristics_string = scrape_characteristics(gsm)
    characteristics_dictionary = extract_characteristics(characteristics_string)
    return gsm, characteristics_dictionary

In [9]:
def main():
    # Specify the device (CPU or GPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    GSEs_human = import_GSE()[0]

    GSM_human = {}
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Use executor.map to parallelize the execution of scrape_geo_data
#         gse_gsm_dicts = list(tqdm(executor.map(scrape_geo_data, GSEs_human), total=len(GSEs_human), desc="Scraping GEO Data"))
        
        gse_gsm_dicts = list(tqdm(executor.map(scrape_geo_data, GSEs_human[:100]), total=len(GSEs_human[:100]), desc="Scraping GEO Data"))
        
        for gse_gsm_dict in gse_gsm_dicts:
            GSM_human.update(gse_gsm_dict)

    results = {}
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Use executor.map to parallelize the execution of process_gsm
        gsm_characteristics = list(tqdm(executor.map(process_gsm, (gsm for gsm_list in GSM_human.values() for gsm in gsm_list)), total=len(GSM_human), desc="Processing GSM Characteristics"))

        for gsm, characteristics_dictionary in gsm_characteristics:
            results[gsm] = characteristics_dictionary

    characteristics_dict = {}
    for gsm_id, characteristics in results.items():
        for gse_id, gsm_list in GSM_human.items():
            if gsm_id in gsm_list:
                if gse_id not in characteristics_dict:
                    characteristics_dict[gse_id] = {}
                characteristics_dict[gse_id][gsm_id] = characteristics

    # Store the dictionary in a JSON file
    with open('data/charateristics_human.json', 'w') as json_file:
        json.dump(characteristics_dict, json_file)


In [10]:
main()

Scraping GEO Data: 100%|██████████| 100/100 [00:12<00:00,  7.92it/s]
  characteristics_label = soup.find('td', text='Characteristics')
Processing GSM Characteristics: 2446it [03:27, 11.77it/s]                       
