## Use case 1: Obtain germline allele frequencies given gene and mutation entities
- Test passing gene and mutation entities to the gnomad API to retrieve germline frequencies
- documentation here https://gnomad.broadinstitute.org/data#api
- needs gql, a python graphQL client installed (which depends on aiohttp)
- pip install gql and aiohttp in your env 

In [82]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # note the GPU index as a string 

In [1]:
import sys
from pathlib import Path

# this notebook is in notebooks — go up one level
project_root = Path().resolve().parent
sys.path.append(str(project_root))

In [38]:
import requests
import pandas as pd
import numpy as np
from methods.gdc_api_calls import get_ssm_id
from gql import gql, Client
from gql.transport.aiohttp import AIOHTTPTransport

In [3]:
transport = AIOHTTPTransport(url="https://gnomad.broadinstitute.org/api")
client = Client(transport=transport, fetch_schema_from_transport=True)



In [4]:
async def get_germline_af(input):
  query = gql(
    """
    query VariantsInGene($gene_symbol: String!) {
      gene(gene_symbol: $gene_symbol, reference_genome: GRCh38) {
        variants(dataset: gnomad_r4) {
          variant_id
          pos
          exome {
            ac
            ac_hemi
            ac_hom
            an
            af
          }
        }
      }
    }
    """
  )
  result = await client.execute_async(query, variable_values=input)
  return result


In [5]:
def get_start_position(ssm_id):
    ssms_endpt = 'https://api.gdc.cancer.gov/ssms/{}?fields=start_position'.format(ssm_id)
    output = requests.get(ssms_endpt).json()
    start_pos = output['data']['start_position']
    return start_pos

In [6]:
async def compute_gnomad_af(gene, mutation):
    germline_af = {}
    ssm_id = get_ssm_id(gene=gene, mutation=mutation)
    start_position = get_start_position(ssm_id=ssm_id)
    input = {"gene_symbol": gene }
    result = await get_germline_af(input)
    for d in result['gene']['variants']:
        if d['pos'] == start_position:
            germline_af[d['variant_id']] = d['exome']['af']
    return germline_af

In [7]:
# super low germline frequency as expected for the G12D variant
await compute_gnomad_af(gene='KRAS', mutation='G12D')

{'12-25245350-C-G': 1.369332163010778e-06,
 '12-25245350-C-T': 2.7386718263243876e-06,
 '12-25245350-C-A': 6.84666081505389e-07}

## Use case 2: Get all publications for a variant given the rsid
- using rsid returned by the GDC /ssms endpt get all publications for a variant using litvar2 API
- e.g. https://api.gdc.cancer.gov/ssms/edd1ae2c-3ca9-52bd-a124-b09ed304fcc2?fields=consequence.transcript.annotation.dbsnp_rs

In [105]:
def get_variant_pubs(gene, mutation):
    pmids = {}
    ssm_id = get_ssm_id(gene=gene, mutation=mutation)
    ssms_endpt = 'https://api.gdc.cancer.gov/ssms/{}?fields=consequence.transcript.annotation.dbsnp_rs'.format(ssm_id)
    output = requests.get(ssms_endpt).json()
    dbsnp_rs_ids = []
    for item in output['data']['consequence']:
        dbsnp_rs_ids.append(item['transcript']['annotation']['dbsnp_rs'])
    dbsnp_rs_ids = set(dbsnp_rs_ids)
    for rsid in dbsnp_rs_ids:
        variant_id = 'litvar@' + rsid + '%23%23'
        endpt = 'https://www.ncbi.nlm.nih.gov/research/litvar2-api/variant/get/{}/publications'.format(variant_id)
        output = requests.get(endpt).json()
        pmids[rsid] = {'pmids': output['pmids'], 'pmids_count': output['pmids_count']}
    # print output info
    for k in pmids.keys():
        print('number of pmids for {}: {}'.format(k, pmids[k]['pmids_count']))
    return pmids

In [106]:
pmids = get_variant_pubs(gene='KRAS', mutation='G12D')

number of pmids for rs121913529: 13163


In [112]:
pd.DataFrame(pmids).T.explode('pmids').iloc[:, :1]

Unnamed: 0,pmids
rs121913529,28639239
rs121913529,32604167
rs121913529,29720585
rs121913529,18645002
rs121913529,32636940
...,...
rs121913529,31817717
rs121913529,31358966
rs121913529,36634615
rs121913529,25985019


### Use case 3: Query civic to get therapies for a variant
- Consider gene and mutation entities, KRAS G12D
- https://api.gdc.cancer.gov/ssms/edd1ae2c-3ca9-52bd-a124-b09ed304fcc2?fields=clinical_annotations.civic.variant_id
- query civic API using these entities and format results in a df

In [77]:
transport = AIOHTTPTransport(url="https://civicdb.org/api/graphql")
client = Client(transport=transport, fetch_schema_from_transport=True)



In [78]:
async def get_therapies(input):
    query = gql(
        """
        query gene_variant_by_name($entrezSymbol: String!, $name: String!){
            gene(entrezSymbol: $entrezSymbol) {
                variants(name: $name) {
                    nodes {
                        name
                        id
                        link
                        molecularProfiles {
                            nodes {
                                id
                                name
                                evidenceItems {
                                    nodes {
                                        id
                                        status
                                        phenotypes {
                                            id
                                            hpoId
                                            name
                                        }
                                        description
                                        therapies {
                                            id
                                            ncitId
                                            name
                                            therapyAliases
                                        }
                                        source {
                                            ascoAbstractId
                                            citationId
                                            pmcId
                                            sourceType
                                            title
                                        }
                                        therapyInteractionType
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
        """
    )
    result = await client.execute_async(query, variable_values=input)
    return result


In [79]:
input = {'entrezSymbol': 'KRAS', 'name': 'G12D'}
result = await get_therapies(input)

In [80]:
# format data for df
df_data = {}
df_data['id'] = []
df_data['name'] = []
df_data['therapies'] = []
for item in result['gene']['variants']['nodes'][0]['molecularProfiles']['nodes']:
    df_data['id'].append(item['id'])
    df_data['name'].append(item['name'])
    for evidence_items in item['evidenceItems']['nodes']:
        therapy_list = [ therapy_details['name'] for therapy_details in evidence_items['therapies']]
    df_data['therapies'].append(therapy_list)


In [81]:
pd.DataFrame(df_data).explode('therapies')

Unnamed: 0,id,name,therapies
0,79,KRAS G12D,KRAS G12D Inhibitor HRS-4642
1,4571,KRAS G12D AND MET Splice Site (c.2888_3028del),Crizotinib
2,4632,KRAS G12D AND ERBB2 S310F,
3,4634,KRAS G12D AND ERBB2 S423R,
4,4635,ERBB2 R678Q AND KRAS G12D,
5,4638,KRAS G12D AND ERBB2 Q679L,
6,4640,KRAS G12D AND ERBB2 E717D,
7,4641,ERBB2 L755S AND KRAS G12D,
8,4642,ERBB2 V777L AND KRAS G12D,
9,4643,ERBB2 V842I AND KRAS G12D,
