show attributes counts for both human and mouse

In [1]:
import concurrent.futures
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
import re
import json
from tqdm import tqdm

In [2]:
def import_GSE():
    GREIN_data = pd.read_csv("data/GREIN_data.csv")
    GREIN_data = GREIN_data[GREIN_data.Species != 'Rattus norvegicus'] #drop brown rat
    GREIN_human = GREIN_data[GREIN_data.Species == 'Homo sapiens']
    GREIN_mouse = GREIN_data[GREIN_data.Species == 'Mus musculus']
    
    GSE_human = GREIN_human['GEO accession'].tolist()
    GSE_mouse = GREIN_mouse['GEO accession'].tolist()
    
    return GSE_human, GSE_mouse

In [3]:
def get_url(geo_id):
    return f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={geo_id}"

In [4]:
def fetch_page(url):
    try:
        response = requests.get(url)
        return response.text if response.status_code == 200 else f"Failed to retrieve the page. Status code: {response.status_code}"
    except requests.exceptions.RequestException as e:
        return f"Error: {e}"


In [5]:
def scrape_geo_data(geo_id):
    url = get_url(geo_id)
    page_content = fetch_page(url)
    
    if not isinstance(page_content, str):
        return page_content

    soup = BeautifulSoup(page_content, 'html.parser')
    gsm_links = soup.find_all('a', href=lambda href: href and href.startswith('/geo/query/acc.cgi?acc=GSM'))
    gsm_values = [link.text for link in gsm_links]

    # Store GSM values with their corresponding GSE (geo_id) in a dictionary
    gse_gsm_dict = {geo_id: gsm_values}

    return gse_gsm_dict


In [6]:
def scrape_characteristics(geo_id):
    url = get_url(geo_id)
    page_content = fetch_page(url)
    
    if not isinstance(page_content, str):
        return page_content

    soup = BeautifulSoup(page_content, 'html.parser')
    characteristics_label = soup.find('td', text='Characteristics')

    if characteristics_label:
        characteristics_content = characteristics_label.find_next_sibling('td')
        return str(characteristics_content)

    return f"Failed to find Characteristics for {geo_id}"

In [7]:
def extract_characteristics(input_str):
    input_str = re.sub(r'<td[^>]*>', '', input_str)
    pattern = r'(\w+): ([^<]+)'
    matches = re.findall(pattern, input_str)
    
    characteristics_dictionary = dict(matches)
    
    return characteristics_dictionary

In [8]:
def process_gsm(gsm):
    characteristics_string = scrape_characteristics(gsm)
    characteristics_dictionary = extract_characteristics(characteristics_string)
    return gsm, characteristics_dictionary

In [9]:
def main():
    GSEs_human = import_GSE()[0]

    GSM_human = {}
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Use executor.map to parallelize the execution of scrape_geo_data
#         gse_gsm_dicts = list(tqdm(executor.map(scrape_geo_data, GSEs_human), total=len(GSEs_human), desc="Scraping GEO Data"))
        
        gse_gsm_dicts = list(tqdm(executor.map(scrape_geo_data, GSEs_human[:10]), total=len(GSEs_human[:10]), desc="Scraping GEO Data"))
        
        for gse_gsm_dict in gse_gsm_dicts:
            GSM_human.update(gse_gsm_dict)

    results = {}
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Use executor.map to parallelize the execution of process_gsm
        gsm_characteristics = list(tqdm(executor.map(process_gsm, (gsm for gsm_list in GSM_human.values() for gsm in gsm_list)), total=len(GSM_human), desc="Processing GSM Characteristics"))

        for gsm, characteristics_dictionary in gsm_characteristics:
            results[gsm] = characteristics_dictionary

    characteristics_dict = {}
    for gsm_id, characteristics in results.items():
        for gse_id, gsm_list in GSM_human.items():
            if gsm_id in gsm_list:
                if gse_id not in characteristics_dict:
                    characteristics_dict[gse_id] = {}
                characteristics_dict[gse_id][gsm_id] = characteristics

    # Store the dictionary in a JSON file
    with open('data/charateristics_human.json', 'w') as json_file:
        json.dump(characteristics_dict, json_file)


In [10]:
main()

Scraping GEO Data: 100%|██████████| 10/10 [00:00<00:00, 15.19it/s]
  characteristics_label = soup.find('td', text='Characteristics')
Processing GSM Characteristics: 325it [00:12, 25.35it/s]                      


In [11]:
def extract_label_content(soup, label_text):
    label = soup.find('td', text=label_text)
    if label:
        content = label.find_next_sibling('td').get_text()
    else:
        content = f"{label_text} not found on the page."
    return content

In [12]:
def scrape_metadata(geo_id):
    url = f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={geo_id}"

    try:
        response = requests.get(url)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            
            metadata = {
                'Title': extract_label_content(soup, 'Title'),
                'Experiment type': extract_label_content(soup, 'Experiment type'),
                'Organism': extract_label_content(soup, 'Organism'),
                'Summary': extract_label_content(soup, 'Summary'),
                'Overall design': extract_label_content(soup, 'Overall design'),
                'SRA': extract_label_content(soup, 'SRA'),
                'Samples': [] 
            }

           # Find all <a> tags with href attributes containing "GSM"
            gsm_links = soup.find_all('a', href=lambda href: href and href.startswith('/geo/query/acc.cgi?acc=GSM'))

            gsm_values = [link.text for link in gsm_links]

           
            metadata['Samples'] = gsm_values
            
            platforms_label = soup.find('td', text=re.compile(r'Platforms \(\d+\)'))
            if platforms_label:
                metadata['Platforms'] = platforms_label.find_next_sibling('td').get_text()
            else:
                metadata['Platforms'] = "Platforms not found on the page."

            return metadata

        else:
            return f"Failed to retrieve the page. Status code: {response.status_code}"

    except requests.exceptions.RequestException as e:
        return f"Error: {e}"

In [13]:
def fetch_metadata_parallel(gse_values):
    metadata_dict = {}
    with concurrent.futures.ThreadPoolExecutor() as executor:
        metadata_list = list(tqdm(executor.map(scrape_metadata, gse_values), total=len(gse_values), desc="Fetching Metadata"))
        metadata_dict = dict(zip(gse_values, metadata_list))
    return metadata_dict

In [14]:
geo_id = "GSE100040"
metadata = scrape_metadata(geo_id)
print("metadata:", metadata)

metadata: {'Title': 'Human TFIIH kinase CDK7 regulates transcription-associated epigenetic modification', 'Experiment type': 'Expression profiling by high throughput sequencingGenome binding/occupancy profiling by high throughput sequencing', 'Organism': 'Homo sapiens', 'Summary': 'CDK7 phosphorylates the RNA polymerase II (pol II) CTD and activates the P-TEFb- associated kinase, CDK9, but its regulatory roles remain obscure. Using human CDK7 analog-sensitive (CDK7as) cells, we observed reduced capping enzyme recruitment, increased pol II promoter-proximal pausing, and defective termination at gene 3\'-ends upon CDK7 inhibition. We also found that CDK7 regulates chromatin modifications downstream of transcription start sites. H3K4me3 spreading was restricted at gene 5\'-ends and H3K36me3 was displaced toward gene 3\'-ends in CDK7as cells. Together, these results implicate a CDK7-dependent "CTD code" that regulates epigenetic marks in addition to RNA processing and pol II pausing.', 'Ov

  label = soup.find('td', text=label_text)
  platforms_label = soup.find('td', text=re.compile(r'Platforms \(\d+\)'))


In [15]:
GSEs_mouse = import_GSE()[1]

metadata_dict_mouse = fetch_metadata_parallel(GSEs_mouse)

# Save metadata as JSON file
with open('data/metadata_mouse.json', 'w') as json_file:
    json.dump(metadata_dict_mouse, json_file)

  label = soup.find('td', text=label_text)
  platforms_label = soup.find('td', text=re.compile(r'Platforms \(\d+\)'))
Fetching Metadata: 100%|██████████| 4066/4066 [04:53<00:00, 13.87it/s]


In [16]:
GSEs_human = import_GSE()[0]

metadata_dict = fetch_metadata_parallel(GSEs_human)

# Save metadata as JSON file
with open('data/metadata_human.json', 'w') as json_file:
    json.dump(metadata_dict, json_file)
        
# for gse_id, metadata in metadata_dict.items():
#     print(f"Metadata for {gse_id}:", metadata)

  label = soup.find('td', text=label_text)
  platforms_label = soup.find('td', text=re.compile(r'Platforms \(\d+\)'))
Fetching Metadata: 100%|██████████| 3395/3395 [06:55<00:00,  8.16it/s]  
