In [1]:
import json
import numpy as np

## Gene summary GDC REST API method

In [2]:
# import gene summary -- using GDC REST API method
!curl "https://api.gdc.cancer.gov/genes/ENSG00000164199?pretty=true&format=json" -o adgrv1.json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1081  100  1081    0     0   7244      0 --:--:-- --:--:-- --:--:--  7404


In [3]:
with open('adgrv1.json') as adgrv1_summary:
    gene_summary = json.load(adgrv1_summary)


In [4]:
gene_summary 

{'data': {'biotype': 'protein_coding',
  'symbol': 'ADGRV1',
  'canonical_transcript_length_genomic': 605641,
  'cytoband': ['5q14.3'],
  'canonical_transcript_length': 19557,
  'synonyms': ['DKFZp761P0710',
   'FEB4',
   'GPR98',
   'KIAA0686',
   'MASS1',
   'USH2C',
   'VLGR1'],
  'description': 'This gene encodes a member of the G-protein coupled receptor superfamily. The encoded protein contains a 7-transmembrane receptor domain, binds calcium and is expressed in the central nervous system. Mutations in this gene are associated with Usher syndrome 2 and familial febrile seizures. Several alternatively spliced transcripts have been described. [provided by RefSeq, Jul 2008]',
  'canonical_transcript_length_cds': 18918,
  'gene_chromosome': '5',
  'gene_end': 91164437,
  'canonical_transcript_id': 'ENST00000405460',
  'gene_strand': 1,
  'gene_start': 90529344,
  'name': 'adhesion G protein-coupled receptor V1',
  'gene_id': 'ENSG00000164199'},

In [5]:
gene_summary['data']['symbol']

'ADGRV1'

In [6]:
json_to_text = "The gene {} is a {} {} on cytoband {} with a canonical transcript length of {}. Synonyms for {} are {}.{}".format(
    gene_summary['data']['symbol'], 
    gene_summary['data']['biotype'],
    gene_summary['data']['name'],
    gene_summary['data']['cytoband'][0],
    gene_summary['data']['canonical_transcript_length'],
    gene_summary['data']['symbol'], 
    ','.join(gene_summary['data']['synonyms']),
    gene_summary['data']['description'],
)


In [7]:
json_to_text

'The gene ADGRV1 is a protein_coding adhesion G protein-coupled receptor V1 on cytoband 5q14.3 with a canonical transcript length of 19557. Synonyms for ADGRV1 are DKFZp761P0710,FEB4,GPR98,KIAA0686,MASS1,USH2C,VLGR1.This gene encodes a member of the G-protein coupled receptor superfamily. The encoded protein contains a 7-transmembrane receptor domain, binds calcium and is expressed in the central nervous system. Mutations in this gene are associated with Usher syndrome 2 and familial febrile seizures. Several alternatively spliced transcripts have been described. [provided by RefSeq, Jul 2008]'

In [8]:
with open('adgrv1_blurb.txt', 'a') as gene_blurb:
    gene_blurb.write(json_to_text)
    gene_blurb.write('\n')

## Gene summary GDC API/Python 

In [9]:
import requests

In [10]:
genes_endpt = "https://api.gdc.cancer.gov/genes"
# get default fields, not specifying fields

In [11]:
filters = {
 "op": "in",
 "content": {
   "field": "gene_id",
   "value": ["ENSG00000164199"]
 }
}

In [12]:
params = {
  "filters": json.dumps(filters),
  "format": "JSON"
}

In [13]:
response = requests.get(genes_endpt, params = params)

In [14]:
print(json.dumps(response.json(), indent=2))

{
  "data": {
    "hits": [
      {
        "id": "ENSG00000164199",
        "biotype": "protein_coding",
        "symbol": "ADGRV1",
        "canonical_transcript_length_genomic": 605641,
        "cytoband": [
          "5q14.3"
        ],
        "canonical_transcript_length": 19557,
        "synonyms": [
          "DKFZp761P0710",
          "FEB4",
          "GPR98",
          "KIAA0686",
          "MASS1",
          "USH2C",
          "VLGR1"
        ],
        "description": "This gene encodes a member of the G-protein coupled receptor superfamily. The encoded protein contains a 7-transmembrane receptor domain, binds calcium and is expressed in the central nervous system. Mutations in this gene are associated with Usher syndrome 2 and familial febrile seizures. Several alternatively spliced transcripts have been described. [provided by RefSeq, Jul 2008]",
        "canonical_transcript_length_cds": 18918,
        "gene_chromosome": "5",
        "gene_end": 91164437,
        "canoni

## Get all ssm in a gene, GDC API method

In [15]:
ssms_endpt = "https://api.gdc.cancer.gov/ssms"

In [16]:
# lets see if we can filter based on gene start and gene end, from gene summary
# gene start = 90529344
# gene end = 91164437
# gene chromosome = 5

"""
Note: if you set fields to this list 
and pass it to params {}, it will only
show these in the output

fields = [\"chromosome\"]

fields = ",".join(fields)
"""


'\nNote: if you set fields to this list \nand pass it to params {}, it will only\nshow these in the output\n\nfields = ["chromosome"]\n\nfields = ",".join(fields)\n'

In [17]:
#filters includes format, field and size, see page 10 of GDC API guide
# you can pass these in the params dict

filters = {
 "op": "and",
    "content": [
        {
            "op": ">=",
            "content": {
                "field": "start_position",
                "value": "90529344"
            }
       },
       {
           "op": "<=",
           "content": {
               "field": "end_position",
               "value": "91164437"
           }
        },
        {
            "op": "=",
            "content": {
                "field": "chromosome",
                "value": "chr5"
            }
        }
    ]
}


In [18]:
params = {
    "filters": json.dumps(filters),
    #"fields": fields,
    # somehow passing this below doesn't filter expand
    # results by is_canonical
    #"fields": "consequence.transcript.is_canonical", 
    "expand": "consequence.transcript,consequence.transcript.annotation,occurrence.case,occurrence.case.demographic,occurrence.case.diagnoses,occurrence.case.diagnoses.pathology_details,occurrence.case.diagnoses.treatments,occurrence.case.family_histories,occurrence.case.project,occurrence.case.exposures",
    "response": "JSON",
    "size": 2000
}

In [19]:
response = requests.get(ssms_endpt, params=params)

In [20]:
# dump output to file for inspection
res = json.dumps(response.json(), indent=2)
with open('dump.json', 'w') as res_dump:
    res_dump.write(res)

In [21]:
print(json.dumps(response.json(), indent=2))

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [22]:
# lets try to convert info in res to some meaningful text

In [23]:
res_data = response.json()

In [24]:
len(res_data['data']['hits'])

1648

## fun part: translate that information into meaningful text

In [25]:
# need to pass gene name as arg
mutation_summary = "There are {} somatic mutations reported in the Genomic Data Commons in gene ADGRV1, located on chromosome {}".format(
    len(res_data['data']['hits']),
    res_data['data']['hits'][0]['chromosome']
)

In [26]:
mutation_summary

'There are 1648 somatic mutations reported in the Genomic Data Commons in gene ADGRV1, located on chromosome chr5'

In [27]:
occurrence_list = []

In [28]:
for idx, h in enumerate(res_data['data']['hits']):
    occurrence_list = []
    treatment_list = []
    chromosomal_pos = h['genomic_dna_change']
    occurrence_list.append("Genomic DNA change {} at start position {} is a {}. ".format(
         h['genomic_dna_change'], h['start_position'], h['mutation_subtype']
    ))
    for idx2, h2 in enumerate(h['gene_aa_change']):
        gene_name, aa_change = h2.split(' ')
    for idx3, consequence in enumerate(h['consequence']):
        try:
            consequence['transcript']['aa_change']
            occurrence_list.append("{} corresponds to amino acid change {} in gene {} with " \
                                   "transcript_id {} and refseq accession {}. " \
                                   "Consequences of {} mutation in transcript_id {}, specifically " \
                                   "the amino acid change {} is a {} with {} VEP impact. " \
                                   "hgvsc annotation for {} is {}. hgvsp annotation for {} is {} .".format(
                chromosomal_pos,
                consequence['transcript']['aa_change'],
                gene_name,
                consequence['transcript']['transcript_id'],
                consequence['transcript']['ref_seq_accession'],
                chromosomal_pos,
                consequence['transcript']['transcript_id'],
                consequence['transcript']['aa_change'],
                consequence['transcript']['consequence_type'],
                consequence['transcript']['annotation']['vep_impact'],
                chromosomal_pos,
                consequence['transcript']['annotation']['hgvsc'],
                chromosomal_pos,
                consequence['transcript']['annotation']['hgvsp']
            ))
        except Exception as e:
            pass
        if consequence['transcript']['is_canonical']:
            occurrence_list.append("Transcript {} is a canonical transcript for gene {}. ".format(
                consequence['transcript']['transcript_id'],
                gene_name
            ))
    try:
        for idx4, occurrence in enumerate(h['occurrence']):
            occurrence_list.append(
                "In the cases that report {}, alcohol history is {}. ".format(
                chromosomal_pos,
                occurrence['case']['exposures'][idx4]['alcohol_history']))
            occurrence_list.append(
                "The ethnicity of cases reported with {} is {}. ".format(
                chromosomal_pos,
                occurrence['case']['demographic']['ethnicity']
                ))
            occurrence_list.append(
                "{} is seen in {} {}. ".format(
                    chromosomal_pos,
                    occurrence['case']['primary_site'],
                    occurrence['case']['disease_type']
                ))
            for idx5, diag in enumerate(occurrence['case']['diagnoses']):
                occurrence_list.append(
                    "{} is diagnosed in ajcc pathologic stage {} with a primary diagnosis of {}. ".format(
                    chromosomal_pos,
                    diag['ajcc_pathologic_stage'],
                    diag['primary_diagnosis']))
                occurrence_list.append(
                    "Other ajcc pathologic T, N and M stages for {} are reported to be {}, {} and {} respectively. ".format(
                    chromosomal_pos,
                    diag['ajcc_pathologic_t'],
                    diag['ajcc_pathologic_n'],
                    diag['ajcc_pathologic_m']))
                occurrence_list.append(
                    "The tissue or organ of origin for {} is {}. ".format(
                    chromosomal_pos,
                    diag['tissue_or_organ_of_origin']
                ))
                for idx6, treatments in enumerate(diag['treatments']):
                    treatment_list.append(treatments['treatment_type'])
            
                occurrence_list.append(
                    "Treatment type for {} and primary diagnosis of {} is {}. ".format(
                        chromosomal_pos,
                        diag['primary_diagnosis'],
                        ' '.join(treatment_list)))
                
        res,ind = np.unique(occurrence_list, return_index=True)
        per_variant_blurb = res[np.argsort(ind)]
        with open('adgrv1_blurb.txt', 'a') as gene_blurb:
            gene_blurb.write(' '.join(per_variant_blurb))
            gene_blurb.write('\n')
        #print(' '.join(per_variant_blurb))
    except Exception as e:
        pass
  

## dummy example

In [29]:
fields = [
"submitter_id",
"case_id",
"primary_site",
"disease_type",
"diagnoses.vital_status"
]

fields = ",".join(fields)

cases_endpt = "https://api.gdc.cancer.gov/cases"

filters = {
 "op": "in",
 "content":{
 "field": "primary_site",
 "value": ["Kidney"]
}
}


params = {
"filters": json.dumps(filters),
"fields": fields,
"format": "JSON",
"size": "100"
}

response = requests.get(cases_endpt, params = params)

#print(json.dumps(response.json(), indent=2))