In [1]:
import io
import json

import numpy as np 
import pandas as pd
import requests

In [2]:
# Base URL of the GDC "cases" endpoint used for the clinical queries below.
cases_endpt = "https://api.gdc.cancer.gov/cases"

In [3]:
# Base URL of the GDC "files" endpoint (queried by the example request in the next cell).
files_endpt = "https://api.gdc.cancer.gov/files"

In [4]:
import requests
import json

# Comma-separated list of the fields the API should return for every hit.
fields = ",".join((
    "file_name",
    "cases.submitter_id",
    "cases.samples.sample_type",
    "cases.disease_type",
    "cases.project.project_id",
))

files_endpt = "https://api.gdc.cancer.gov/files"

# Three "in" clauses (primary site, experimental strategy, data format),
# all nested under a single "and" operator.
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": allowed}}
        for field_name, allowed in (
            ("cases.project.primary_site", ["Lung"]),
            ("files.experimental_strategy", ["RNA-Seq"]),
            ("files.data_format", ["BAM"]),
        )
    ],
}

# Because this is a POST, the filters stay a plain dict inside the payload.
params = dict(filters=filters, fields=fields, format="TSV", size="5")

# The payload goes in through `json=` (request body), not `params=` (query string).
response = requests.post(
    files_endpt,
    json=params,
    headers={"Content-Type": "application/json"},
)

# Show the raw TSV text returned by the API.
print(str(response.content, "utf-8"))

cases.0.disease_type	cases.0.project.project_id	cases.0.samples.0.sample_type	cases.0.submitter_id	file_name	id
Squamous Cell Neoplasms	TCGA-LUSC	Primary Tumor	TCGA-85-6561	8a52e74d-e830-4c13-81e1-caaf032b4549_gdc_realn_rehead.bam	73e7a49e-87fd-4809-a076-1394a1a619a2
Squamous Cell Neoplasms	TCGA-LUSC	Primary Tumor	TCGA-77-8144	fc131a10-fa8f-45da-8732-642cc45c91e6_gdc_realn_rehead.bam	a27ca6ec-7c8b-45cf-9daf-0bff9fdb66f0
Squamous Cell Neoplasms	TCGA-LUSC	Primary Tumor	TCGA-18-3411	ddcab400-d9a0-40da-907f-adc5805c6bbb_gdc_realn_rehead.bam	e0979f1e-e816-4322-b1e1-cde921032091
Squamous Cell Neoplasms	TCGA-LUSC	Solid Tissue Normal	TCGA-56-7582	8ef3c107-48d0-4203-820f-5b70394fe261_gdc_realn_rehead.bam	ddeeaa88-e030-4d0b-9a46-531323461bb9
Squamous Cell Neoplasms	TCGA-LUSC	Primary Tumor	TCGA-43-7658	9560e4b7-5f34-4139-8280-faf4e5b97a22_gdc_realn_rehead.bam	03bb0e2c-7a1c-4195-92a2-7385b19f87f9



In [119]:
# Writing my own actual request.
# Goal: get RNA-seq for primary tumor, case id, primary site, drug used,
# and progression-free days (or an equivalent).
#
# The first table just needs to be case id and primary site, for files that
# have RNA-seq and a drug used.
#
# NOTE(review): no vital_status column comes back for "diagnoses.vital_status"
# (see the response further down). The GDC field list appears to expose it as
# "demographic.vital_status" -- TODO confirm before switching, because an extra
# column changes the row width that the later reshape cell expects.
fields_table1 = (
    "case_id",
    "primary_site",
    "diagnoses.days_to_recurrence",
    "diagnoses.last_known_disease_status",
    "diagnoses.progression_or_recurrence",
    "diagnoses.treatments.therapeutic_agents",
    "diagnoses.treatments.treatment_or_therapy",
    "diagnoses.vital_status",
)

# Number of requested fields, captured before the names are joined into one string.
array_reshape_value = len(fields_table1)
fields_table1 = ",".join(fields_table1)

In [120]:
# Cases whose primary site is Kidney AND that have at least one RNA-Seq file:
# two "in" clauses combined under a single "and" operator.
filters_table1 = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": allowed}}
        for field_name, allowed in (
            ("primary_site", ["Kidney"]),
            ("files.experimental_strategy", ["RNA-Seq"]),
        )
    ],
}

In [121]:
# Query-string parameters for the GET request: the filter dict is serialized
# to a JSON string here, the other values are passed through as plain strings.
params_table1 = dict(
    filters=json.dumps(filters_table1),
    fields=fields_table1,
    format="TSV",
    size="2",
)

In [122]:
# GET variant: every parameter travels in the query string, which is why the
# filters were serialized with json.dumps when params_table1 was built.
response_table1 = requests.get(cases_endpt, params = params_table1)
# Fail here on an HTTP error rather than parsing an error body as TSV below.
response_table1.raise_for_status()

# POST variant of the *same* table-1 query. This previously sent `params`,
# the unrelated Lung/BAM files query from the earlier example cell, to the
# cases endpoint. In a JSON body the filters go in as a nested object rather
# than a JSON-encoded string, so the serialized entry is swapped for the dict.
response_table1_1 = requests.post(
    cases_endpt,
    headers = {"Content-Type": "application/json"},
    json = {**params_table1, "filters": filters_table1},
)


In [123]:
# Decode the raw response bytes as UTF-8 to get the TSV payload as one string.
clinical_data = str(response_table1.content, "utf-8")
clinical_data

'case_id\tdiagnoses.0.days_to_recurrence\tdiagnoses.0.last_known_disease_status\tdiagnoses.0.progression_or_recurrence\tdiagnoses.0.treatments.0.therapeutic_agents\tdiagnoses.0.treatments.0.treatment_or_therapy\tdiagnoses.0.treatments.1.therapeutic_agents\tdiagnoses.0.treatments.1.treatment_or_therapy\tid\tprimary_site\r\n5338d435-68fb-4f0d-a3e6-c843f703f75f\t\tnot reported\tnot reported\t\tno\t\tno\t5338d435-68fb-4f0d-a3e6-c843f703f75f\tKidney\r\n22b6724c-a59f-4796-8166-992253e8caf1\t\tnot reported\tnot reported\t\tnot reported\t\tnot reported\t22b6724c-a59f-4796-8166-992253e8caf1\tKidney\r\n'

In [124]:
# Flatten the TSV into one comma-separated string: column separators (tabs)
# and row separators (CRLF) both become commas.
# NOTE(review): a value that itself contains a comma would be split apart by
# the later split(",") step.
clinical_data_processed = clinical_data.replace("\t", ",").replace("\r\n", ",")

In [125]:
# Flat list of every cell value (header cells first, then each row in order).
cd = clinical_data_processed.split(sep=",")

In [126]:
# 31 for this response: 3 lines (header + 2 cases) x 10 columns = 30 values,
# plus one empty string produced by the trailing "\r\n" at the end of the payload.
len(cd)

31

In [127]:
# Splitting leaves one empty string at the end because the payload finishes
# with "\r\n", which became a trailing comma. Drop it only when it really is
# that empty artifact, so a payload without a final line break does not lose
# a real value.
cd_array = np.asarray(cd[:-1] if cd and cd[-1] == "" else cd)

In [129]:
# Derive the row width from the TSV header instead of hard-coding 10: the
# response flattens nested fields into one column per entry (treatments.0,
# treatments.1, ...) and adds an `id` column, so the width is not fixed by
# the list of requested fields alone.
n_columns = clinical_data.split("\r\n", 1)[0].count("\t") + 1
cd_array = cd_array.reshape(-1, n_columns)

In [130]:
# First row of the reshaped array: the column names from the TSV header.
cd_array[0]

array(['case_id', 'diagnoses.0.days_to_recurrence',
       'diagnoses.0.last_known_disease_status',
       'diagnoses.0.progression_or_recurrence',
       'diagnoses.0.treatments.0.therapeutic_agents',
       'diagnoses.0.treatments.0.treatment_or_therapy',
       'diagnoses.0.treatments.1.therapeutic_agents',
       'diagnoses.0.treatments.1.treatment_or_therapy', 'id',
       'primary_site'], dtype='<U45')

In [131]:
# Parse the TSV payload directly instead of going through the comma-joined
# list: replacing tabs with commas and splitting on commas mis-aligns every
# row as soon as one value contains a comma.
# dtype=str and keep_default_na=False keep every cell a string and leave
# empty cells as "" rather than converting them to NaN.
cd_df = pd.read_csv(
    io.StringIO(clinical_data),
    sep="\t",
    dtype=str,
    keep_default_na=False,
)

In [132]:
cd_df  # NOTE: no vital_status column appears even though "diagnoses.vital_status" was requested -- presumably that field path is not valid/populated for cases; TODO confirm against the GDC field list.

Unnamed: 0,case_id,diagnoses.0.days_to_recurrence,diagnoses.0.last_known_disease_status,diagnoses.0.progression_or_recurrence,diagnoses.0.treatments.0.therapeutic_agents,diagnoses.0.treatments.0.treatment_or_therapy,diagnoses.0.treatments.1.therapeutic_agents,diagnoses.0.treatments.1.treatment_or_therapy,id,primary_site
0,5338d435-68fb-4f0d-a3e6-c843f703f75f,,not reported,not reported,,no,,no,5338d435-68fb-4f0d-a3e6-c843f703f75f,Kidney
1,22b6724c-a59f-4796-8166-992253e8caf1,,not reported,not reported,,not reported,,not reported,22b6724c-a59f-4796-8166-992253e8caf1,Kidney


In [None]:
# So the vital status is located in a text file that contains all of the clinical information for all the patients in that project. File name for the first case: nationwidechildrens.org_clinical_patient_kirc.txt
# Drugs for each patient are located in another text file. File name for the first case: nationwidechildrens.org_clinical_drug_kirc.txt
# The RNA-seq file for this case is called d7984046-2ea2-4b2e-ba78-2909ed46f649.FPKM.txt.gz

In [15]:
# Note for later: filter expression copied from the file search (kept commented out because it is not valid Python).
# cases.case_id in ["set_id:AXD11pcSNya3Dok10xD9"] 
# and files.analysis.workflow_type in ["HTSeq - FPKM"]
# and files.data_format in ["txt"] and files.data_type 
# in ["Gene Expression Quantification"] and
# files.experimental_strategy in ["RNA-Seq"]

SyntaxError: invalid syntax (<ipython-input-15-acf02c292740>, line 3)