In [2]:
import requests
import json
import re
import os


In [3]:
# Query the GDC files endpoint for file IDs and their matching metadata.

files_endpt = "https://api.gdc.cancer.gov/files"

# Keep only files that satisfy every clause below: the case's primary site is
# pancreas, the data category is DNA methylation, and the data type is the one
# selected in the last clause.
#
# Data type choices noted for this query:
#   "Masked Intensities"     -> .IDAT raw methylation files
#   "Methylation Beta Value" -> .txt beta array files
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": accepted}}
        for field_name, accepted in (
            ("cases.primary_site", ["pancreas"]),
            ("files.data_category", ["dna methylation"]),
            ("files.data_type", ["Methylation Beta Value"]),
        )
    ],
}

# Metadata columns requested for every matching file. More data categories
# (bmi, sex, ...) could be added here; the available field names are listed at
# https://docs.gdc.cancer.gov/API/Users_Guide/Appendix_A_Available_Fields/#file-fields
# The names are sent to the API as one comma-separated string.
fields = ",".join([
    # file-level
    "file_id",
    "file_name",
    "platform",
    # case-level
    "cases.case_id",
    # diagnosis
    "cases.diagnoses.created_datetime",
    # demographics
    "cases.demographic.vital_status",
    "cases.demographic.days_to_birth",
    "cases.demographic.days_to_death",
    # samples
    "cases.samples.sample_id",
    "cases.samples.created_datetime",
])

# Query-string parameters for the files endpoint. The filter tree has to be
# serialised to a JSON string because it travels as a single URL parameter.
# NOTE(review): "size" presumably caps the number of records returned, so a
# query with more than 1000 matches would come back truncated -- confirm the
# hit count in the response before relying on it.
params = {
    "filters": json.dumps(filters),
    "fields": fields,
    "format": "JSON",
    "size": "1000"
    }

# requests applies no timeout by default, so without one a stalled connection
# would block the kernel indefinitely.
# NOTE: `params` and `response` are reassigned by the download cell further
# down, so inspect this response before running that cell.
response = requests.get(files_endpt, params = params, timeout = 60)

# Stop here on an HTTP error instead of carrying an error payload forward.
response.raise_for_status()


In [18]:
# Download the listed files from the GDC data endpoint and write the returned
# payload to disk under the name the server supplies.
data_endpt = "https://api.gdc.cancer.gov/data"

# File IDs to download in a single request.
ids = [
    "b658d635-258a-4f6f-8377-767a43771fe4",
    "3968213d-b293-4b3d-8033-5b5a0ca07b6c"
    ]

params = {"ids": ids}

# requests applies no timeout by default; the value bounds each wait on the
# socket (not the total transfer time), so a stalled download cannot hang the
# kernel forever.
response = requests.post(data_endpt,
                        data = json.dumps(params),
                        headers={
                            "Content-Type": "application/json"
                            },
                        timeout = 300)

# Surface an HTTP failure as such, rather than as a confusing KeyError on the
# missing Content-Disposition header a few lines later.
response.raise_for_status()

response_head_cd = response.headers.get("Content-Disposition", "")

# Accept both quoted and unquoted filenames and stop at a following parameter,
# so quotes or trailing "; ..." text never end up in the saved name.
name_matches = re.findall(r'filename="?([^";]+)"?', response_head_cd)
if not name_matches:
    raise ValueError(
        "No filename in Content-Disposition header: %r" % response_head_cd
    )

# The header value comes from the server: keep only the final path component
# so it cannot steer the write outside the output directory.
file_name = os.path.basename(name_matches[0].strip())
if not file_name:
    raise ValueError(
        "Unusable filename in Content-Disposition header: %r" % response_head_cd
    )

# Output directory: override with the METHYLSPAN_BETA_DIR environment variable;
# the default is the location this notebook has always written to.
save_dir = os.environ.get(
    "METHYLSPAN_BETA_DIR",
    "/Users/zacksiegfried/Documents/methylspan/beta_arrays/"
)
os.makedirs(save_dir, exist_ok=True)

save_name = os.path.join(save_dir, file_name)
with open(save_name, "wb") as output_file:
    output_file.write(response.content)