In [4]:
import requests
from bs4 import BeautifulSoup
import html2text
import json
import pickle
import multiprocessing as mp
from tqdm import tqdm

import mygene
# Shared MyGene.info client; used further down (mg.querymany) to look up the
# dataset's gene symbols.
mg = mygene.MyGeneInfo()

import scanpy as sc

In [2]:
# Show the current working directory: the relative '../../../data/...' paths
# used for reading and writing in this notebook resolve against it.
pwd

'/mnt/ufs18/home-230/pandavis/cellular_resilience/CellResilienceModel/codebase/notebooks/genePT'

In [7]:
# Read the 10x-format HDF5 cell-by-feature matrix into an AnnData object,
# then echo its summary as the cell output.
adata = sc.read_10x_h5(
    '../../../data/BreastCancer10xGenomics_Rep1/Xenium_FFPE_Human_Breast_Cancer_Rep1_cell_feature_matrix.h5'
)
adata

AnnData object with n_obs × n_vars = 167782 × 313
    var: 'gene_ids', 'feature_types', 'genome'

In [8]:
# Feature names of the matrix as a plain Python list
# (`var_names` is AnnData's alias for `var.index`).
gene_list = adata.var_names.tolist()

In [9]:
# Text fragments that extract_gene_text() replaces with a single space in the
# html2text output of a gene summary section. They are applied in list order
# using plain, case-sensitive str.replace.
# NOTE(review): presumably these are page boilerplate rather than summary
# content - confirm against a live page.
parts_to_remove = [
    '##  Summary\n',                          # markdown heading emitted for the section title
    'NEW',
    'Try the newGene table',
    'Try the newTranscript table',
    '**',                                     # markdown bold markers
    '\nGo to the top of the page Help\n',
]

In [10]:
def extract_gene_text(gene_number):
    """Fetch an NCBI Gene page and return its summary section as plain text.

    Requests ``https://www.ncbi.nlm.nih.gov/gene/<gene_number>``, locates the
    ``div`` with class ``rprt-section gene-summary``, converts it to text with
    html2text (links ignored), replaces every fragment listed in the
    module-level ``parts_to_remove`` with a space and collapses all whitespace
    runs to single spaces.

    Args:
        gene_number: Identifier interpolated into the gene page URL.

    Returns:
        A ``(summary_text, soup)`` tuple. ``summary_text`` is ``''`` when the
        request timed out, the status code was not 200, or the summary
        section was not found. ``soup`` is the parsed page, or ``None`` when
        no page was parsed (timeout or non-200 status).

    Raises:
        requests.exceptions.RequestException: for request failures other than
            a timeout; those are not handled here and propagate to the caller.
    """
    url = f"https://www.ncbi.nlm.nih.gov/gene/{gene_number}"

    summary_text = ''
    soup = None

    try:
        response = requests.get(url, timeout=30)
    except requests.exceptions.Timeout:
        # Bail out with the empty result: `response` was never bound, so
        # falling through to the status check would raise UnboundLocalError.
        print('time out')
        return summary_text, soup

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return summary_text, soup

    soup = BeautifulSoup(response.content, 'html.parser')
    summary_tab = soup.find('div', {'class': 'rprt-section gene-summary'})
    if not summary_tab:
        print("Summary tab not found on the page.")
        return summary_text, soup

    html_to_text = html2text.HTML2Text()
    html_to_text.ignore_links = True
    summary_text = html_to_text.handle(str(summary_tab))
    for part in parts_to_remove:
        summary_text = summary_text.replace(part, ' ')

    # str.split() with no argument splits on any whitespace, newlines
    # included, so this both flattens the text to one line and squeezes
    # repeated spaces.
    summary_text = ' '.join(summary_text.split())

    return summary_text, soup

In [11]:
# Batch-query MyGene.info for every gene name in the dataset, matching on the
# gene symbol field and restricting hits to human.
gene_list_results = mg.querymany(
    sorted(gene_list),
    scopes='symbol',
    species='human',
)

7 input query terms found no hit:	['CLECL1', 'FAM49A', 'KARS', 'LARS', 'NARS', 'QARS', 'WARS']


In [12]:
# Map each matched gene symbol to the `_id` of its MyGene hit.
# NOTE(review): despite the "tax_id" name, these ids are later passed to
# extract_gene_text(), which places them in the NCBI Gene page URL.
gene_name_to_tax_id = {}
for result in gene_list_results:
    # Entries without an `_id` are skipped. `symbol` is checked too because it
    # is the key read below; the previous guard tested only `query`, so a hit
    # lacking `symbol` would have raised KeyError.
    if "_id" in result and "query" in result and "symbol" in result:
        gene_name_to_tax_id[result['symbol']] = result['_id']

In [13]:
# Gene name -> summary text; starts empty and is filled from the pool results
# in a later cell.
gene_name_to_summary_page = dict()

def process_gene_ids(args):
    """Pool worker: fetch the summary text for one gene.

    Args:
        args: A ``(gene_name, page_id)`` pair, packed into one argument so the
            function can be handed to ``Pool.imap``.

    Returns:
        ``(gene_name, parsed_text)`` where ``parsed_text`` is the first item
        returned by ``extract_gene_text(page_id)``; the parsed page (second
        item) is discarded.
    """
    symbol, ncbi_page_id = args
    text_and_soup = extract_gene_text(ncbi_page_id)
    return symbol, text_and_soup[0]

# One (gene_name, page_id) work item per mapped gene.
args_list = list(gene_name_to_tax_id.items())

# Fetch the pages in parallel worker processes (Pool() with no argument uses
# os.cpu_count() workers). imap yields results in input order, which lets
# tqdm show progress as they arrive.
with mp.Pool() as pool:
    results = [
        fetched
        for fetched in tqdm(
            pool.imap(process_gene_ids, args_list),
            total=len(args_list),
        )
    ]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 306/306 [00:29<00:00, 10.51it/s]


In [14]:
# `results` is a list of (gene_name, parsed_text) pairs; dict.update accepts
# such an iterable and, for repeated names, keeps the last text seen.
gene_name_to_summary_page.update(results)

In [15]:
# Write the gene name -> summary text mapping to disk as JSON.
with open(
    '../../../data/BreastCancer10xGenomics_Rep1/exported_data/gene_data_from_ncbi_genept.json',
    'w',
) as json_out:
    json.dump(gene_name_to_summary_page, json_out)