In [1]:
import requests
import urllib.parse
import pandas as pd

In [3]:

def get_clinvar_data(ucsc_coords: str, assembly="GRCh38", timeout=30):
    """
    Fetches ClinVar data for a given UCSC-style chromosome location.

    Args:
        ucsc_coords (str): Chromosome coordinates in UCSC format
            (e.g., 'chr2:12345-67890'). The leading 'chr' is optional and
            matched case-insensitively; thousands separators
            ('chr2:12,345-67,890') and surrounding whitespace are accepted.
        assembly (str): Genome assembly ('GRCh38' or 'GRCh37'). The UCSC
            aliases 'hg38' and 'hg19' are accepted too; matching is
            case-insensitive.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        dict: JSON response from ClinVar if successful, otherwise None.
        None is returned (and the reason printed) for malformed coordinates,
        an unsupported assembly, an HTTP/network failure, or a response body
        that is not valid JSON.
    """
    base_url = "https://www.ncbi.nlm.nih.gov/clinvar/variation/search/"

    # Assembly accessions sent to ClinVar, keyed by lower-cased assembly name.
    # An unknown name is rejected instead of silently querying GRCh37.
    assembly_ids = {
        "grch38": "GCF_000001405.38",
        "hg38": "GCF_000001405.38",
        "grch37": "GCF_000001405.25",
        "hg19": "GCF_000001405.25",
    }
    assembly_id = assembly_ids.get(str(assembly).strip().lower())
    if assembly_id is None:
        print(f"Unsupported assembly {assembly!r}. Expected 'GRCh38' or 'GRCh37'.")
        return None

    # Parse the UCSC coordinate format (e.g., 'chr2:12345-67890')
    try:
        coords = ucsc_coords.strip()
        # Drop only a leading 'chr'; str.replace would strip every occurrence.
        if coords[:3].lower() == "chr":
            coords = coords[3:]
        chrom, pos_range = coords.split(":")
        start_text, end_text = pos_range.split("-")
        chrom = chrom.strip()
        # int() rejects non-numeric positions before they reach the server.
        start = int(start_text.replace(",", ""))
        end = int(end_text.replace(",", ""))
        if not chrom or start > end:
            raise ValueError("empty chromosome or reversed range")
    except (AttributeError, ValueError):
        # AttributeError covers a non-string argument such as None.
        print("Invalid UCSC coordinate format. Expected format: 'chr2:12345-67890'")
        return None

    # Construct the query for ClinVar
    query = f"({chrom}[CHR] AND {start}:{end}[CPOS])"

    # URL encode the query (spaces become %20; brackets and colon stay literal)
    encoded_query = urllib.parse.quote(query, safe="()[]:")

    # Construct the full URL with the appropriate genome assembly
    full_url = f"{base_url}?term={encoded_query}&assembly={assembly_id}"

    headers = {
        "accept": "application/json, text/javascript, */*; q=0.01",
        "accept-language": "en-US,en;q=0.9",
        "Referer": "https://www.ncbi.nlm.nih.gov/clinvar/",
    }

    try:
        # Without a timeout, requests can block forever on a stalled connection.
        response = requests.get(full_url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an error for HTTP issues
        return response.json()  # Return JSON response
    except requests.exceptions.RequestException as e:
        print(f"Error fetching ClinVar data: {e}")
        return None
    except ValueError as e:
        # Older requests versions raise a plain ValueError for a non-JSON body.
        print(f"ClinVar response was not valid JSON: {e}")
        return None



In [4]:
# Example usage: fetch the ClinVar search payload for a window on chr2.
ucsc_input = "chr2:12345-67890"
result = get_clinvar_data(ucsc_input)

# get_clinvar_data returns None on any failure (after printing the reason).
# Stop here so the following cells do not fail with a less helpful error
# when they index into a None result.
if result is None:
    raise RuntimeError(f"No ClinVar data returned for {ucsc_input}")



In [5]:
# Show the top-level keys of the payload returned by get_clinvar_data.
# NOTE(review): assumes the fetch succeeded; get_clinvar_data returns None on
# failure, in which case this line raises AttributeError.
print(result.keys())

dict_keys(['vars', 'genes', 'chr_info', 'query'])


In [6]:
# Take the variant records from the response and print the first one.
# NOTE(review): the name `vars` shadows Python's built-in vars(); renaming it
# means updating every cell that reads it.
vars = result["vars"]
print(vars[0])

{'id': 155714, 'ci': 'Uncertain significance', 'revstat': 'no assertion criteria provided', 'desc': '4.3Mb copy number gain', 'locs': [{'from': 12770, 'to': 4318861, 'assm': 'GCF_000001405.38', 'chr': '2'}]}


In [7]:
# Build a DataFrame from the variant records held in `vars` (the value taken
# from result["vars"] in an earlier cell, not the built-in function), then
# preview the first rows.
variant_df = pd.DataFrame(vars)
variant_df.head()

Unnamed: 0,id,ci,revstat,desc,locs
0,155714,Uncertain significance,no assertion criteria provided,4.3Mb copy number gain,"[{'from': 12770, 'to': 4318861, 'assm': 'GCF_0..."
1,155609,Uncertain significance,no assertion criteria provided,221.8kb copy number loss,"[{'from': 12770, 'to': 234590, 'assm': 'GCF_00..."
2,153520,Pathogenic,no assertion criteria provided,25.0Mb copy number gain,"[{'from': 12770, 'to': 25039694, 'assm': 'GCF_..."
3,153475,Pathogenic,no assertion criteria provided,2.7Mb copy number loss,"[{'from': 12770, 'to': 2748672, 'assm': 'GCF_0..."
4,153441,Pathogenic,no assertion criteria provided,33.7Mb copy number gain,"[{'from': 12770, 'to': 33711509, 'assm': 'GCF_..."


In [None]:
# Count how many variants carry each distinct value of the `ci` column
# (values seen in the preview include 'Pathogenic' and 'Uncertain significance').
count_variant = variant_df['ci'].value_counts()
print(count_variant)

SyntaxError: unmatched ')' (1510544452.py, line 1)