# Query NCBI's Entrez Programming Utilities (API)

In [1]:
from Bio import Entrez
import re

Entrez.email = "thaliasingh25@gmail.com"  # your email for NCBI

rsid = "rs11540652"   # dbSNP ID for TP53 R248Q

# 1. Search ClinVar using rsID
handle = Entrez.esearch(db="clinvar", term=rsid)
record = Entrez.read(handle)
handle.close()

id_list = record["IdList"]
print("ClinVar IDs found:", id_list)

if not id_list:
    raise SystemExit("No ClinVar entries found for this rsID.")

# 2. Loop over IDs, find the one with p.Arg248Gln
r248q_id = None
for cid in id_list:
    summary = Entrez.esummary(db="clinvar", id=cid, retmode="xml")
    data = Entrez.read(summary, validate=False)
    docsum = data["DocumentSummarySet"]["DocumentSummary"][0]
    title = docsum["title"]  # e.g. "NM_000546.5(TP53):c.743G>A (p.Arg248Gln)"
    print(cid, "→", title)

    if "p.Arg248Gln" in title or "Arg248Gln" in title:
        r248q_id = cid
        r248q_docsum = docsum   # keep this one
        break

if r248q_id is None:
    raise SystemExit("Could not find a ClinVar entry with p.Arg248Gln in the title.")

print("\nUsing ClinVar ID for TP53 R248Q:", r248q_id)

# 3. Extract annotations from the chosen DocumentSummary
docsum = r248q_docsum

# Clinical significance (germline classification)
clinical_sig = docsum["germline_classification"]["description"]

# HGVS cDNA + protein change from title (robust parsing)
title = docsum["title"]
match_cdna = re.search(r"c\.[0-9]+[ACGT]?>[ACGT]?", title)
match_protein = re.search(r"p\.[A-Za-z0-9]+", title)

hgvs_cdna = match_cdna.group(0) if match_cdna else "NA"
protein_change = match_protein.group(0) if match_protein else "NA"

# Associated conditions (trait_set)
conditions = [t["trait_name"] for t in docsum.get("trait_set", [])
              if "trait_name" in t]

print("\nFinal annotation for TP53 R248Q:")
print("Clinical significance:", clinical_sig)
print("HGVS cDNA:", hgvs_cdna)
print("Protein change:", protein_change)
print("Conditions:", "; ".join(conditions) if conditions else "None listed")

ClinVar IDs found: ['237954', '230253', '12356']
237954 → NM_000546.6(TP53):c.743G>C (p.Arg248Pro)
230253 → NM_000546.6(TP53):c.743G>T (p.Arg248Leu)
12356 → NM_000546.6(TP53):c.743G>A (p.Arg248Gln)

Using ClinVar ID for TP53 R248Q: 12356

Final annotation for TP53 R248Q:
Clinical significance: Pathogenic
HGVS cDNA: c.743G>A
Protein change: p.Arg248Gln
Conditions: None listed
