In [17]:
import requests
import json
import emoji
import re
import html  # <-- to decode HTML entities
ENSEMBL_API = "https://rest.ensembl.org"

In [28]:
import time

def get_variant_phenotype(variant_id, retries=3):
    url = f"https://rest.ensembl.org/variation/homo_sapiens/{variant_id}?phenotypes=1"
    headers = {"Content-Type": "application/json"}

    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"⚠️ Error on {variant_id} (attempt {attempt+1}): {e}")
            time.sleep(1)  # wait a bit before retry

    print(f"❌ Failed to fetch phenotype for {variant_id} after {retries} tries.")
    return {}


In [29]:
def extract_traits_from_variant(json_data):
    traits = []

    if not json_data or "phenotypes" not in json_data:
        return traits

    for item in json_data["phenotypes"]:
        try:
            trait = item.get("trait", "N/A")
            traits.append(trait)
        except:
            continue

    return traits


In [30]:
def get_genes(phenotype_term, species):
    phenotype = phenotype_term
    species = species

    url = f"{ENSEMBL_API}/phenotype/term/{species}/{phenotype_term}?content-type=application/json"
    response = requests.get(url)
    if response.status_code == 200:
      data = response.json()
      genes = [entry["Gene"] for entry in data if "Gene" in entry]
      genes = genes + [entry["Variation"] for entry in data if "Variation" in entry]
      return genes
    else:
      print(emoji.emojize(":warning: "),f"Error fetching genes for {phenotype_term}: {response.status_code}")
      return []

In [31]:
def get_ld_variants(species, variant_id, population):
    """
    Fetch LD variants and return structured JSON:
    {
        "variant1": [
            {"variant2": ..., "d_prime": ..., "r2": ...},
            ...
        ]
    }
    """
    url = f"https://rest.ensembl.org/ld/{species}/{variant_id}/{population}?d_prime=1.0;r2=0.85"
    headers = {"Content-Type": "application/json"}

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json()

        results = []
        for entry in data:
            results.append({
                "variant2": entry.get("variation2"),
                "d_prime": float(entry.get("d_prime", 0)),
                "r2": float(entry.get("r2", 0))
            })

        return {variant_id: results}

    except requests.exceptions.HTTPError as e:
        print(f"⚠️ HTTP error for {variant_id}: {e}")
    except requests.exceptions.RequestException as e:
        print(f"⚠️ Request error for {variant_id}: {e}")
    except ValueError:
        print(f"⚠️ Could not parse JSON for {variant_id}")
    return {variant_id: []}


In [32]:
variant_list = get_genes("huntington's_disease", "homo_sapiens")

In [33]:
variant_list

['LRG_763',
 'ENSG00000197386',
 'ENSG00000059804',
 'ENSG00000197386',
 'rs12668183',
 'rs780493120',
 'rs79029191',
 'rs114643193',
 'rs28406206',
 'rs1712396191',
 'rs1232027',
 'rs71180116',
 'rs72715653',
 'rs8031584',
 'rs150393409',
 'rs78621558',
 'rs4736525',
 'rs1721646894',
 'rs114688092',
 'rs73786719',
 'rs6882169',
 'rs150393409',
 'rs1560534953',
 'rs932428',
 'rs116220136',
 'rs8031584',
 'rs116293982',
 'rs3013648',
 'rs4720024',
 'rs71180116',
 'rs11197481',
 'rs79218467',
 'rs117440785',
 'rs3889139',
 'rs80260687',
 'rs117933444']

In [34]:
ld_storage = {}

for variant_id in variant_list:  # don't use `list` as a variable name!
    result = get_ld_variants("homo_sapiens", variant_id, "1000GENOMES:phase_3:KHV")
    if result and result.get(variant_id):
        ld_storage[variant_id] = result[variant_id]


⚠️ HTTP error for LRG_763: 400 Client Error: Bad Request for url: https://rest.ensembl.org/ld/homo_sapiens/LRG_763/1000GENOMES:phase_3:KHV?d_prime=1.0;r2=0.85
⚠️ HTTP error for ENSG00000197386: 400 Client Error: Bad Request for url: https://rest.ensembl.org/ld/homo_sapiens/ENSG00000197386/1000GENOMES:phase_3:KHV?d_prime=1.0;r2=0.85
⚠️ HTTP error for ENSG00000059804: 400 Client Error: Bad Request for url: https://rest.ensembl.org/ld/homo_sapiens/ENSG00000059804/1000GENOMES:phase_3:KHV?d_prime=1.0;r2=0.85
⚠️ HTTP error for ENSG00000197386: 400 Client Error: Bad Request for url: https://rest.ensembl.org/ld/homo_sapiens/ENSG00000197386/1000GENOMES:phase_3:KHV?d_prime=1.0;r2=0.85
⚠️ HTTP error for rs79218467: 502 Server Error: Bad Gateway for url: https://rest.ensembl.org/ld/homo_sapiens/rs79218467/1000GENOMES:phase_3:KHV?d_prime=1.0;r2=0.85
⚠️ HTTP error for rs117440785: 502 Server Error: Bad Gateway for url: https://rest.ensembl.org/ld/homo_sapiens/rs117440785/1000GENOMES:phase_3:KHV?d_pr

In [35]:
ld_storage.items()

dict_items([('rs12668183', [{'variant2': 'rs12699801', 'd_prime': 1.0, 'r2': 0.958916}, {'variant2': 'rs11766828', 'd_prime': 1.0, 'r2': 0.979114}, {'variant2': 'rs10085439', 'd_prime': 1.0, 'r2': 1.0}, {'variant2': 'rs10085777', 'd_prime': 1.0, 'r2': 1.0}, {'variant2': 'rs10499469', 'd_prime': 1.0, 'r2': 1.0}, {'variant2': 'rs10085489', 'd_prime': 1.0, 'r2': 1.0}, {'variant2': 'rs9791820', 'd_prime': 1.0, 'r2': 1.0}, {'variant2': 'rs2141804', 'd_prime': 1.0, 'r2': 1.0}, {'variant2': 'rs7798302', 'd_prime': 1.0, 'r2': 0.979114}, {'variant2': 'rs12699796', 'd_prime': 1.0, 'r2': 1.0}]), ('rs1232027', [{'variant2': 'rs1222809', 'd_prime': 1.0, 'r2': 1.0}, {'variant2': 'rs59724253', 'd_prime': 1.0, 'r2': 1.0}, {'variant2': 'rs34931264', 'd_prime': 1.0, 'r2': 1.0}, {'variant2': 'rs1643635', 'd_prime': 1.0, 'r2': 1.0}, {'variant2': 'rs113903049', 'd_prime': 1.0, 'r2': 1.0}, {'variant2': 'rs73765837', 'd_prime': 1.0, 'r2': 1.0}, {'variant2': 'rs112325102', 'd_prime': 1.0, 'r2': 1.0}, {'varian

In [36]:
linked_pheno_map = {}

for variant1, links in ld_storage.items():
    print(f"assessing {variant1}")
    phenos = set()

    if links:
        for link in links:
            variant2 = link["variant2"]
            print(f"  → checking {variant2}")
            json_data = get_variant_phenotype(variant2)
            traits = extract_traits_from_variant(json_data)
            phenos.update(traits)

    if phenos:
        linked_pheno_map[variant1] = list(phenos)
        print(f"✅ {variant1} linked to traits: {phenos}")

assessing rs12668183
  → checking rs12699801
  → checking rs11766828
  → checking rs10085439
  → checking rs10085777
  → checking rs10499469
  → checking rs10085489
  → checking rs9791820
  → checking rs2141804
  → checking rs7798302
  → checking rs12699796
assessing rs1232027
  → checking rs1222809
  → checking rs59724253
  → checking rs34931264
  → checking rs1643635
  → checking rs113903049
  → checking rs73765837
  → checking rs112325102
  → checking rs11951910
  → checking rs35124495
  → checking rs73125358
  → checking rs13185915
  → checking rs60186226
  → checking rs10040918
  → checking rs1019450
  → checking rs111692241
  → checking rs11951283
  → checking rs249212
  → checking rs71603550
  → checking rs139493525
  → checking rs73765836
  → checking rs185784473
assessing rs4736525
  → checking rs7834499
  → checking rs2270879
  → checking rs7000712
  → checking rs12678949
  → checking rs4736526
  → checking rs4736521
  → checking rs12550644
  → checking rs10956624
  → checkin

In [37]:
linked_pheno_map

{'rs117933444': ['Smooth-surface caries']}