In [3]:
import requests
import json
import re

# Query the GDC files endpoint for file IDs and their matching case/sample metadata.

files_endpt = "https://api.gdc.cancer.gov/files"

# Keep only files whose case has primary site pancreas and that hold DNA methylation data.
filters = {
    "op": "and",
    "content":[
        {
        "op": "in",
        "content":{
            "field": "cases.primary_site",
            "value": ["pancreas"]
            }
        },
        {
        "op": "in",
        "content":{
            "field": "files.data_category",
            "value": ["dna methylation"]
            }
        },
        {
        "op": "in",
        "content":{
            "field": "files.data_type",

            ### "Masked Intensities" selects the raw .IDAT methylation files
            ### "Methylation Beta Value" selects the .txt beta-value array files
            "value": ["Methylation Beta Value"]
            }
        }
    ]
}

### More metadata categories (bmi, sex, ...) can be added here; see
### https://docs.gdc.cancer.gov/API/Users_Guide/Appendix_A_Available_Fields/#file-fields
fields = [
    "file_id",
    "file_name",
    "platform",

    "cases.case_id",

    "cases.diagnoses.created_datetime",

    "cases.demographic.vital_status",
    "cases.demographic.days_to_birth",
    "cases.demographic.days_to_death",

    "cases.samples.sample_id",
    "cases.samples.created_datetime"
    ]
# The endpoint takes the requested fields as one comma-separated string.
fields = ",".join(fields)

params = {
    "filters": json.dumps(filters),
    "fields": fields,
    "format": "JSON",
    "size": "1000"
    }

response = requests.get(files_endpt, params = params)
# Fail here on an HTTP error instead of leaving an error payload in `response`
# for a later cell to trip over.
response.raise_for_status()


In [13]:
# Download one file by UUID from the GDC data endpoint and save it locally.
data_endpt = "https://api.gdc.cancer.gov/data"

ids = [
    "b658d635-258a-4f6f-8377-767a43771fe4"
    ]

params = {"ids": ids}

response = requests.post(data_endpt,
                        data = json.dumps(params),
                        headers={
                            "Content-Type": "application/json"
                            })
# Stop on an HTTP error; otherwise the failure only shows up below as a
# missing Content-Disposition header (KeyError).
response.raise_for_status()

# The server supplies the download's file name in the Content-Disposition header.
response_head_cd = response.headers["Content-Disposition"]

file_name = re.findall("filename=(.+)", response_head_cd)[0]

save_name = "/Users/zacksiegfried/Documents/methylspan/beta_arrays/tests/" + file_name

with open(save_name, "wb") as output_file:
    output_file.write(response.content)

In [None]:
### methyl beta array function build

import sys
import pandas as pd
import os
import requests
import json
import re
import tarfile
import shutil

sys.path.insert(0, '/Users/zacksiegfried/Documents/methylspan')
from MethylDataFetch import getMethylMetaData


def methylDFPopulationMethod(primary_site, script_dir="/Users/zacksiegfried/Documents/methylspan", max_files=10):
    """Download methylation beta-value files for a primary site and merge them into one frame.

    File records are looked up with getMethylMetaData, the first `max_files`
    of them are requested from the GDC data endpoint in a single POST, and
    every downloaded table is joined on its CpG column.

    Parameters
    ----------
    primary_site : value passed (as a string) to getMethylMetaData.
    script_dir : directory under which a "data" folder is used for temporary
        storage. Defaults to the previously hard-coded project path.
    max_files : maximum number of files to download (default 10, the previous
        hard-coded cap). None downloads every file returned.

    Returns
    -------
    dict with keys:
        'data'         -- DataFrame indexed by 'cpg'; one column per downloaded
                          file, named after the part of the file name before
                          its first '.'.
        'primary_site' -- str(primary_site).

    Raises
    ------
    ValueError : if no file records are returned or no data file is downloaded.
    requests.HTTPError : if the data endpoint answers with an error status.
    """

    # Directory for temporary file storage (an already existing one is fine).
    data_dir = os.path.join(script_dir, "data")
    os.makedirs(data_dir, exist_ok=True)

    # Collect the ids of the relevant files.
    uuid_list = [file['id'] for file in getMethylMetaData(str(primary_site))[:max_files]]
    if not uuid_list:
        raise ValueError("no methylation files found for primary site " + repr(primary_site))

    response = requests.post("https://api.gdc.cancer.gov/data",
                             data=json.dumps({"ids": uuid_list}),
                             headers={"Content-Type": "application/json"})
    # Surface HTTP failures here rather than as a missing-header KeyError below.
    response.raise_for_status()

    response_head_cd = response.headers["Content-Disposition"]
    file_name = re.findall("filename=(.+)", response_head_cd)[0]

    # basename() keeps a server-supplied name from escaping the data directory.
    save_name = os.path.join(data_dir, os.path.basename(file_name))
    extracted_dirs = set()
    df = None
    try:
        with open(save_name, "wb") as output_file:
            output_file.write(response.content)

        if tarfile.is_tarfile(save_name):
            with tarfile.open(save_name) as archive:
                member_names = []
                for member in archive.getmembers():
                    name = os.path.normpath(member.name)
                    # Ignore entries that would resolve outside the data directory.
                    if (member.isfile() and not os.path.isabs(name)
                            and name.split(os.sep)[0] != os.pardir):
                        member_names.append(name)
                member_names.sort()

                # Remember the per-file folders so they can be removed afterwards.
                extracted_dirs.update(name.split(os.sep)[0]
                                      for name in member_names if os.sep in name)

                # Use the safe extraction filter where this Python version provides it.
                if hasattr(tarfile, "data_filter"):
                    archive.extractall(data_dir, filter="data")
                else:
                    archive.extractall(data_dir)

            # Data files sit inside a per-file folder; top-level members such as
            # MANIFEST.txt are not data. Skipping annotations.txt assumes GDC may
            # add such a file beside a data file -- TODO confirm.
            beta_paths = [os.path.join(data_dir, name) for name in member_names
                          if os.sep in name and os.path.basename(name) != "annotations.txt"]
        else:
            # Presumably a single requested file comes back as the raw file rather
            # than an archive -- TODO confirm against the GDC API guide.
            beta_paths = [save_name]

        for path in beta_paths:
            # Each file is read as a headerless table: CpG id, beta value.
            read_file = pd.read_table(path, header=None)
            read_file = read_file.rename(
                columns={0: 'cpg', 1: str(os.path.basename(path).split('.')[0])})

            # `None` marks "no file read yet", so a merge that legitimately yields
            # zero rows is kept instead of being replaced by the next file.
            df = read_file if df is None else df.merge(read_file, on='cpg')
    finally:
        # Remove everything this call downloaded or extracted, even on failure.
        for leftover in (save_name, os.path.join(data_dir, "MANIFEST.txt")):
            if os.path.isfile(leftover):
                os.remove(leftover)
        for dir_name in extracted_dirs:
            shutil.rmtree(os.path.join(data_dir, dir_name), ignore_errors=True)

    if df is None:
        raise ValueError("download for primary site " + repr(primary_site) + " contained no data files")

    df = df.set_index('cpg')
    frame_dict = {'data': df, 'primary_site': str(primary_site)}

    return frame_dict



# Smoke-run the builder for one site and show the resulting dict.
thymus_frame_dict = methylDFPopulationMethod('thymus')
print(thymus_frame_dict)


In [None]:
sys.path.insert(0, '/Users/zacksiegfried/Documents/methylspan')
from MethylDataFetch import getMethylBetaArrays

# Fetch the thymus result and keep only its beta-value table under `frame`.
thymus_beta_result = getMethylBetaArrays('thymus')
frame = thymus_beta_result['data']

In [None]:
### Mapping genes onto CpG location and formatting

from MethylMapping import methylMap450
from MethylMapping import averageGene

# `frame` is already the beta-value table (its 'data' entry was taken when it
# was loaded), so it is transposed directly; indexing it with 'data' again
# would be a column lookup and fail.
mapped_frame = methylMap450(frame.transpose())
av_mapped_frame = averageGene(mapped_frame)
av_mapped_frame.columns.names = ['gene']

av_mapped_frame = av_mapped_frame.transpose()

# Row-wise mean over the numeric columns, ignoring missing values.
av_mapped_frame['mean'] = av_mapped_frame.mean(axis=1, skipna=True, numeric_only=True)
print(av_mapped_frame['mean'])

In [None]:
import sys
sys.path.insert(0, '/Users/zacksiegfried/Documents/methylspan')
from AnalysisFunctions import compareGeneMethylContent2

# Primary sites handed to the comparison, in this order.
sites_to_compare = ['eye and adnexa', 'thymus']
df = compareGeneMethylContent2(sites_to_compare)
print(df)

In [None]:
### EXAMPLE SURVIVAL ANALYSIS SET UP WITH NO DOWNLOAD METHOD (needs work)

import sys
sys.path.insert(0, '/Users/zacksiegfried/Documents/methylspan')
import MethylDataFetch
import MethylFormatting

# Keep the fetched result under its own name: it is still needed for its
# 'primary_site' entry after the beta table has been pulled out and transposed.
methyl_result = MethylDataFetch.getMethylBetaArrays('thymus')
methyl_array = methyl_result['data'].transpose()

# Look the site up on the fetched result, not on the transposed table, where
# 'primary_site' would be treated as a column name.
full_meta_data = MethylDataFetch.getMethylMetaData(methyl_result['primary_site'])
formatted_meta_data = MethylFormatting.metaDataFormat(full_meta_data)