In [2]:
import requests
import json
import re


In [3]:
# get file IDs and matching metadata in 

files_endpt = "https://api.gdc.cancer.gov/files"

# filtering for cases that are primary site pancreas, and have methylation data
filters = {
    "op": "and",
    "content":[
        {
        "op": "in",
        "content":{
            "field": "cases.primary_site",
            "value": ["pancreas"]
            }
        },
        {
        "op": "in",
        "content":{
            "field": "files.data_category",
            "value": ["dna methylation"]
            }
        },
        {
        "op": "in",
        "content":{
            "field": "files.data_type",

            ### "Masked Intensities" selects for .IDAT raw methylation files
            ### "Methylation Beta Values" selects for .txt beta arrays files
            "value": ["Methylation Beta Value"]
            }
        }
    ]
}

### could add more data catagories here (bmi, sex... ) https://docs.gdc.cancer.gov/API/Users_Guide/Appendix_A_Available_Fields/#file-fields
fields = [
    "file_id",
    "file_name",
    "platform",

    "cases.case_id",

    "cases.diagnoses.created_datetime",

    "cases.demographic.vital_status",
    "cases.demographic.days_to_birth",
    "cases.demographic.days_to_death",

    "cases.samples.sample_id",
    "cases.samples.created_datetime"
    ]
fields = ",".join(fields)

params = {
    "filters": json.dumps(filters),
    "fields": fields,
    "format": "JSON",
    "size": "1000"
    }

response = requests.get(files_endpt, params = params)


In [6]:
data_endpt = "https://api.gdc.cancer.gov/data"

ids = [
    "b658d635-258a-4f6f-8377-767a43771fe4",
    "3968213d-b293-4b3d-8033-5b5a0ca07b6c"
    ]

params = {"ids": ids}

response = requests.post(data_endpt,
                        data = json.dumps(params),
                        headers={
                            "Content-Type": "application/json"
                            })

response_head_cd = response.headers["Content-Disposition"]

file_name = re.findall("filename=(.+)", response_head_cd)[0]

save_name = "/Users/zacksiegfried/Documents/methylspan/beta_arrays/tests/" + file_name

print(response.content)
with open(save_name, "wb") as output_file:
    output_file.write(response.content)

b'\x1f\x8b\x08\x08\x1ehmd\x02\xffgdc_download_20230524_012758.070123.tar\x00\xec\xb7uL\x1c^\x10\x06\x88{\xb1\x02\x85\xe2\xee\xee^Z(.\xa5\xb0\xb8\xb3\xb8k)\xeeP\xdc\xa5\xb8\xcb\xe2\xeeZ\xdc\xdd\xddaYtq\xf6~w\xc9\xe5\xe4\xaf\xbb\xe4\xeer\x97\xdc\xf7\xfe\x98\xc9\xcb$\xef\x9b7\xdf$3\xcaR*\xf2\xdfe~jp\xb8\xfdr\x83\xfb\xbf\t\\\xffA\x80\x8f\xef\x7f\xb2\xff\xe1\x7fo\xb9\xf8\x05\x04\xe1\xb8\xf9\xf8xyy\x04\xf8\xb8x\x05\xe0\xb8\xb8\xb9\xb9\x05\xb9\xe0\xa8\xb8\xe0\xfe\x1f\x80\xbb\xab\x9b\x89\x0b\x15\xd5\xff%I\xfe\xaf\x93\xfb\xff\x08\xac\xcd\xd1-\xac\xed\x80\x0e&\xf6@t{s~tW\xeb\xdf@\xf4\xff\xbe\xc4\r\x88\xc1+, \xc4\xc3\xcdk\xcen\xca#\xcc\xcb\xceg\xfa\x9f\'\xc4\xc5\xcb\xcb\xceo\xcao\xc2ef\xc2%h*`\x86\xfe\x7f$\x88\x93\x97\xd7\x02h\xc1\'`\xc1n\xc6\xc3\'\xc4\xceg\xce/\xc4nj*\x00d\xe75\xe13\xe1\xe7\xe7\x11\xe4\xe1\xe5\xe5\xe3\xf8\xae\xa6\xa8\xfc?\xaa\x90\xc3\xf27:\x9f\xa0\x19\xb7\x85\xb0\xb0\x197\x8f\xb99\xb7\x107\xd0\\\xc0L\x88\x8f\x9f\xcf\xc4\xec?\x95p\x9b\np\xa3\xf3\xf3\xf0\xf3\x08\x0b\xa2{\x98\xd8Y\

In [9]:
import sys

sys.path.insert(0, '/Users/zacksiegfried/Documents/methylspan')
import MethylDataFetch


def methylDFPopulationMethod(primary_site):
    """"""

    # get all releavent files and compile into uuid_list
    uuid_list = []
    for file in MethylDataFetch.getMethylMetaData(str(primary_site)):   ### change if moving function elsewhere
        uuid_list.append(file['id'])

    print(uuid_list)

    # requests info
    data_endpt = "https://api.gdc.cancer.gov/data"


    for i in uuid_list:

        params = {"ids": str(i)}

        response = requests.post(data_endpt,
                        data = json.dumps(params),
                        headers={
                            "Content-Type": "application/json"
                            })
        
        file_name = re.findall("filename=(.+)", response_head_cd)[0]

        save_name = "/Users/zacksiegfried/Documents/methylspan/beta_arrays/tests/" + file_name




methylDFPopulationMethod('pancreas')


['827f32ab-6ef3-4e47-b912-6fd93f79dc48', '5066930d-7fb0-4927-8853-f2b36e0b3efb', 'b2fb76bc-1f61-4e17-aff3-3b038c780766', 'ab766fee-4574-4642-9cd0-12d400789abc', 'c0746719-52db-4f58-ab6d-2a5cbee5e175', 'd0bd5a24-b496-4373-83c1-c49cd45d37c7', 'ede24b00-d834-41a0-96b3-ab3a0a704f17', 'f2263b1a-5ce4-4c0a-9720-17271a4e53ea', 'aaaa85ca-0ec8-4bf7-a85a-57634ad920ab', '05fac380-d3b8-4134-9ae9-5c980f34a943', '6ab2856e-1a0d-4dfe-aa42-48af8b8f5c5c', '66d2a138-d1af-4f53-88cd-d11b236ad793', '34551977-a300-461d-92ae-fb6cf05a24a4', 'a70e2e88-6a14-4c54-8211-05970aa5121a', '2bf8b71c-be12-484d-ba80-7dee5eb9e3a4', '4b1b84b1-144b-4fb7-afcb-2887c5bf2839', 'fa27fdd0-cfd2-44f7-b3d8-19db9de8c42b', '96fcf55e-af6f-4dcc-94b4-8709a19a8a5f', 'f8186e89-50ea-4807-94d2-182f5e88c92c', 'b990e885-61b7-4dc4-a0f8-70f682e77738', '7b424eb3-7f5e-4c09-8a84-3d5b00744f03', '116fd928-357e-4f50-98db-b0d8fa8a68ca', 'adaff1bd-9778-4697-b9df-a60274684495', 'ada3a7ad-6a1e-4691-8991-075986e7f242', '453d6db0-bfe0-4bb5-a8e1-e786af09f635',