In [9]:
import requests
from bs4 import BeautifulSoup
import html2text
import mygene
import json
import pickle
import os
# Shared mygene client used by every querymany() call in this notebook.
mg = mygene.MyGeneInfo()

# Machine-specific project root; edit this to point at your local GenePT checkout.
# Written as a raw string so the Windows backslashes are not parsed as escape
# sequences (a plain string with "\O", "\K", ... triggers invalid-escape warnings).
PROJECT_DIR = r"C:\OneDrive\OneDrive - UW-Madison\Kris\Code\GenePT"
# Relative paths such as ./data/... used later are resolved against this directory.
os.chdir(PROJECT_DIR)

In [5]:
# Boilerplate fragments stripped from the html2text rendering of the NCBI
# summary section. They are replaced one by one, in this order, with a space.
parts_to_remove = [
    # section heading emitted by html2text
    "##  Summary\n",
    # banner text on the NCBI page
    "NEW",
    "Try the newGene table",
    "Try the newTranscript table",
    # markdown bold markers
    "**",
    # page footer navigation
    "\nGo to the top of the page Help\n",
]

def rough_text_from_gene_name(gene_number):
    """Download an NCBI Gene page and return its summary section as plain text.

    Parameters
    ----------
    gene_number : str or int
        Identifier interpolated into ``https://www.ncbi.nlm.nih.gov/gene/{gene_number}``.

    Returns
    -------
    tuple
        ``(summary_text, soup)``.
        ``summary_text`` is the whitespace-normalised text of the
        ``rprt-section gene-summary`` div with every entry of the module-level
        ``parts_to_remove`` list stripped out; it is ``''`` when the request
        fails, the status code is not 200, or the div is missing.
        ``soup`` is the parsed ``BeautifulSoup`` page, or ``None`` when no
        successful (status 200) response was obtained.

    Notes
    -----
    Failures are reported with ``print`` and never raised, so a long scraping
    loop is not aborted by a single bad request.
    """
    url = f"https://www.ncbi.nlm.nih.gov/gene/{gene_number}"
    summary_text = ''
    soup = None

    try:
        response = requests.get(url, timeout=30)
    except requests.exceptions.Timeout:
        print('time out')
        return summary_text, soup
    except requests.exceptions.RequestException as exc:
        # Connection errors, DNS failures, etc. previously propagated and
        # killed the caller's loop; treat them like a timeout instead.
        print(f"Request failed: {exc}")
        return summary_text, soup

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return summary_text, soup

    # Parse the page and locate the summary section by its CSS classes.
    soup = BeautifulSoup(response.content, 'html.parser')
    summary_tab = soup.find('div', {'class': 'rprt-section gene-summary'})
    if not summary_tab:
        print("Summary tab not found on the page.")
        return summary_text, soup

    # Render the section's HTML as text, dropping hyperlinks.
    html_to_text = html2text.HTML2Text()
    html_to_text.ignore_links = True
    summary_text = html_to_text.handle(str(summary_tab))

    # Strip known boilerplate fragments (order follows parts_to_remove).
    for part in parts_to_remove:
        summary_text = summary_text.replace(part, ' ')

    # split() breaks on any whitespace, newlines included, so this both
    # flattens line breaks and collapses runs of spaces into one.
    summary_text = ' '.join(summary_text.split())
    return summary_text, soup

In [6]:
# Worked example: look up the human gene whose symbol is CD24
cd_24_name = mg.querymany(
    'CD24',
    scopes='symbol',
    species='human',
)

In [7]:
# Map each gene symbol to the `_id` mygene returned for it. Despite the
# variable name, these values are later used as NCBI Gene page IDs (they are
# passed to rough_text_from_gene_name), not as taxonomy IDs.
gene_name_to_tax_id = {}
for result in cd_24_name:
    # Skip 'notfound' entries (they carry no '_id') and any hit without a
    # 'symbol' field, which would otherwise raise KeyError on the line below.
    # NOTE(review): if several hits share a symbol, the last one wins.
    if all(key in result for key in ("_id", "query", "symbol")):
        gene_name_to_tax_id[result['symbol']] = result['_id']

In [8]:
# Display the symbol -> mygene `_id` mapping built in the previous cell.
gene_name_to_tax_id

{'CD24': '100133941'}

In [16]:
# with open('./data/vocab.json', 'rb') as handle:
#     vocab_gene = json.load(handle)
# vocab_gene_list = list(vocab_gene.keys())

In [12]:
# Gene symbol -> cleaned summary text; starts empty and is filled by the scraping loop.
gene_name_to_summary_page = dict()

In [13]:
# Scrape one summary per gene, in alphabetical order of symbol.
for symbol, ncbi_page_id in sorted(gene_name_to_tax_id.items()):
    # Symbols that already have an entry are not fetched again.
    if symbol in gene_name_to_summary_page:
        continue
    print('gene_name', symbol)
    # The parsed HTML (second tuple element) is not needed here.
    summary, _ = rough_text_from_gene_name(ncbi_page_id)
    gene_name_to_summary_page[symbol] = summary

gene_name CD24


In [14]:
# Display the scraped summary text for each gene symbol.
gene_name_to_summary_page

{'CD24': 'Official Symbol CD24provided by HGNC Official Full Name CD24 moleculeprovided by HGNC Primary source HGNC:HGNC:1645 See related Ensembl:ENSG00000272398 MIM:600074; AllianceGenome:HGNC:1645 Gene type protein coding RefSeq status REVIEWED Organism Homo sapiens Lineage Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; Homo Also known as CD24A Summary This gene encodes a sialoglycoprotein that is expressed on mature granulocytes and B cells and modulates growth and differentiation signals to these cells. The precursor protein is cleaved to a short 32 amino acid mature peptide which is anchored via a glycosyl phosphatidylinositol (GPI) link to the cell surface. This gene was missing from previous genome assemblies, but is properly located on chromosome 6. Non-transcribed pseudogenes have been designated on chromosomes 1, 15, 20, and Y. Alternative splicing results in multiple transc

### Experiment with Varied Gene Sets Tailored to Your Needs
#### For any set of gene names, `mygene` can be used to translate them into NCBI Gene page IDs. The example below uses the gene vocabularies from scGPT and Geneformer. Download links for these files are available in the repository's README.

In [19]:
# Load the scGPT gene vocabulary; its keys are the names queried below.
with open('./data/scGPT_vocab.json', 'rb') as handle:
    vocab_gene = json.load(handle)
vocab_gene_list = list(vocab_gene.keys())

# Load the Geneformer token dictionary.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only open this file if it was downloaded from a source you trust.
with open('./data/geneformer_token_dictionary.pkl', 'rb') as handle:
    token_dictionary = pickle.load(handle)

# Example queries that translate gene identifiers into mygene `_id`s (NCBI page ids).
# scGPT names are matched as human gene symbols.
vocab_gene_list_results = mg.querymany(sorted(vocab_gene_list), scopes='symbol', species='human')
# Geneformer keys are sent with mygene's default scopes and species; only the
# 'symbol' field is requested back.
token_dictionary_results = mg.querymany(sorted(token_dictionary.keys()), fields="symbol")

7094 input query terms found dup hits:	[('A2ML1-AS1', 2), ('A2ML1-AS2', 2), ('A2MP1', 2), ('AACSP1', 2), ('AADACL2-AS1', 3), ('AADACP1', 2)
21905 input query terms found no hit:	['5S_rRNA_ENSG00000276861', '5S_rRNA_ENSG00000277411', '5S_rRNA_ENSG00000277488', '5S_rRNA_ENSG00000
1 input query terms found dup hits:	[('ENSG00000268674', 3)]
3109 input query terms found no hit:	['<mask>', '<pad>', 'ENSG00000005955', 'ENSG00000006074', 'ENSG00000006075', 'ENSG00000006114', 'ENS


In [20]:
# Inspect the raw mygene results for the scGPT vocabulary; entries with
# 'notfound': True are names mygene could not match.
vocab_gene_list_results

[{'query': '5S_rRNA_ENSG00000276861', 'notfound': True},
 {'query': '5S_rRNA_ENSG00000277411', 'notfound': True},
 {'query': '5S_rRNA_ENSG00000277488', 'notfound': True},
 {'query': '5S_rRNA_ENSG00000278457', 'notfound': True},
 {'query': '5S_rRNA_ENSG00000285609', 'notfound': True},
 {'query': '5S_rRNA_ENSG00000285626', 'notfound': True},
 {'query': '5S_rRNA_ENSG00000285674', 'notfound': True},
 {'query': '5S_rRNA_ENSG00000285776', 'notfound': True},
 {'query': '5S_rRNA_ENSG00000285912', 'notfound': True},
 {'query': '5S_rRNA_ENSG00000288601', 'notfound': True},
 {'query': '5_8S_rRNA_ENSG00000273730', 'notfound': True},
 {'query': '5_8S_rRNA_ENSG00000275757', 'notfound': True},
 {'query': '5_8S_rRNA_ENSG00000275877', 'notfound': True},
 {'query': '5_8S_rRNA_ENSG00000276871', 'notfound': True},
 {'query': '5_8S_rRNA_ENSG00000277739', 'notfound': True},
 {'query': '5_8S_rRNA_ENSG00000278294', 'notfound': True},
 {'query': '5_8S_rRNA_ENSG00000283274', 'notfound': True},
 {'query': '5_8S_

In [21]:
# Inspect the raw mygene results for the Geneformer token dictionary keys;
# special tokens such as '<mask>' and '<pad>' come back as 'notfound'.
token_dictionary_results

[{'query': '<mask>', 'notfound': True},
 {'query': '<pad>', 'notfound': True},
 {'query': 'ENSG00000000003',
  '_id': '7105',
  '_score': 26.872438,
  'symbol': 'TSPAN6'},
 {'query': 'ENSG00000000005',
  '_id': '64102',
  '_score': 27.7306,
  'symbol': 'TNMD'},
 {'query': 'ENSG00000000419',
  '_id': '8813',
  '_score': 27.73056,
  'symbol': 'DPM1'},
 {'query': 'ENSG00000000457',
  '_id': '57147',
  '_score': 27.730627,
  'symbol': 'SCYL3'},
 {'query': 'ENSG00000000460',
  '_id': '55732',
  '_score': 27.730616,
  'symbol': 'FIRRM'},
 {'query': 'ENSG00000000938',
  '_id': '2268',
  '_score': 27.730633,
  'symbol': 'FGR'},
 {'query': 'ENSG00000000971',
  '_id': '3075',
  '_score': 27.730633,
  'symbol': 'CFH'},
 {'query': 'ENSG00000001036',
  '_id': '2519',
  '_score': 27.730707,
  'symbol': 'FUCA2'},
 {'query': 'ENSG00000001084',
  '_id': '2729',
  '_score': 27.73056,
  'symbol': 'GCLC'},
 {'query': 'ENSG00000001167',
  '_id': '4800',
  '_score': 27.730627,
  'symbol': 'NFYA'},
 {'query'