In [1]:
#Checa o status do Servidor

import requests
# GDC status endpoint; it is queried once to confirm the API is reachable.
status_endpt = "https://api.gdc.cancer.gov/status"
# Bound the wait so an unreachable server fails the cell instead of hanging the kernel.
response = requests.get(status_endpt, timeout=30)
display(response.content)

b'{\n  "commit": "8acb9697bd3e944def954f8d1c0b5c22a0fbc8fa",\n  "data_release": "Data Release 12.0 - August 23, 2018",\n  "status": "OK",\n  "tag": "1.15.1",\n  "version": 1\n}'

In [2]:
#COMPLEX FILTERS
#EXTRAIR OS ID'S DE CADA ARQUIVO

import requests
import json
import pandas as pd
from io import StringIO

# Columns requested for every matching file, sent to the API as a single
# comma-separated string.
fields = ",".join((
    "file_name",
    "cases.submitter_id",
    "cases.samples.sample_type",
    "cases.disease_type",
    "cases.project.project_id",
))

files_endpt = "https://api.gdc.cancer.gov/files"

# Filter selecting breast primary site, female donors, RNA-Seq strategy and
# TXT-format files. Every criterion is an "in" clause over one field, and the
# clauses are combined with a top-level "and".
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": accepted}}
        for field_name, accepted in (
            ("cases.project.primary_site", ["Breast"]),
            ("files.experimental_strategy", ["RNA-Seq"]),
            ("cases.demographic.gender", ["female"]),
            ("files.data_format", ["TXT"]),
        )
    ],
}

# A POST is used, so the filter parameters can be passed directly as a dict.
# NOTE(review): "size" presumably caps the number of returned rows; confirm that
# the query does not match more than 4000 files, otherwise the list is truncated.
params = {
    "filters": filters,
    "fields": fields,
    "format": "TSV",
    "size": "4000"
    }

# The parameters are passed to 'json' (request body) rather than 'params' in this case.
# The timeout keeps an unreachable server from hanging the kernel indefinitely.
response = requests.post(
    files_endpt,
    headers={"Content-Type": "application/json"},
    json=params,
    timeout=120,
)
# Stop here on an HTTP error instead of handing an error payload to the TSV parser.
response.raise_for_status()

# Decode the response bytes into text.
s = str(response.content, 'utf-8')

# Load the TSV text into a DataFrame so the id of every file can be extracted.
data = StringIO(s)
# No custom lineterminator: forcing '\r' would leave a stray '\n' at the start of
# every row after the first if the payload uses CRLF line endings, whereas the
# default parser accepts '\n', '\r' and '\r\n' alike.
df = pd.read_csv(data, sep='\t')
display(df.id)

0       e7a77848-0df8-4260-b7e9-df8587e9f9b7
1       1837ad2a-4edf-4d80-9050-f78115e54454
2       b9610459-bd3e-4d65-92cd-8eb34541f259
3       7061c88c-3a5b-4a6b-bf55-bcf20d89a6a7
4       21dd15c2-099d-422f-9f05-cfd1170d5bfc
5       a159a8ba-a1e7-4ab6-8820-d67a9c897457
6       c6921a78-f687-46b5-9de7-0603a4a0e99c
7       caaf288e-7672-46af-bc68-302afa0c6478
8       54dbdf55-a610-4967-808a-7c4e36178595
9       a62fbc8b-c366-4205-82c0-2a864a2eca87
10      048e5e34-ab11-43d6-9788-771502d99152
11      1d4fd1cb-6538-4325-bddc-2ced7f437f13
12      c56d71fe-e3cb-4c7f-b464-db7e8b26f71d
13      e05c95c8-bf99-4e0a-bcc6-07e9a5db3c3e
14      ad3b3339-8a4a-4a58-8439-7b82544acec8
15      dbaba045-5bf8-47b7-b883-ff97abe9a8df
16      bc43acfb-2c72-402b-9be9-d42139c182c1
17      ef184941-92be-4c12-9d56-125d605e9cce
18      6416384b-2b9d-473b-a7e6-016a1df4ed29
19      104d21c5-4d9e-4c81-9399-914c81faa8ad
20      1ffd88db-f671-4886-8daa-b291ecad58f3
21      f45b27f2-b1c3-46f1-8cbb-4f9a22676c18
22      82

In [3]:
#DOWNLOAD

import requests
import json
import re

# Slice of the id list to download: positions ini (inclusive) to fim (exclusive).
ini = 0
fim = 200

data_endpt = "https://api.gdc.cancer.gov/data"

# The endpoint takes a list of ids, so slicing df.id.tolist() with [ini:fim]
# chooses which files are fetched.
# NOTE: the batch size that works has been inconsistent. 100 files were fetched
# in one go once, but at other times the network dropped whenever a large batch
# was requested. On a robust connection everything can probably be fetched at
# once; otherwise, loop over smaller slices.
ids = (df.id.tolist())[ini:fim]

params = {"ids": ids}

# stream=True keeps the archive out of memory; it is written to disk chunk by
# chunk below. The timeout is (connect, read) in seconds so a stalled connection
# raises instead of blocking the kernel forever.
response = requests.post(data_endpt,
                        data = json.dumps(params),
                        headers={
                            "Content-Type": "application/json"
                            },
                        stream=True,
                        timeout=(30, 600))
# An error reply carries no Content-Disposition header; raise the HTTP error
# itself rather than an unrelated KeyError on the header lookup below.
response.raise_for_status()

response_head_cd = response.headers["Content-Disposition"]

# The file name to save under is taken from the Content-Disposition header.
file_name = re.findall("filename=(.+)", response_head_cd)[0]

with open(file_name, "wb") as output_file:
    for chunk in response.iter_content(chunk_size=1024 * 1024):
        output_file.write(chunk)

# Release the connection held open by the streamed response.
response.close()

In [10]:
#Extrai o lote baixado
#Esse lote resulta em varias pastas com mais arquivos
#Abre cada uma das pastas e extrai o conteudo dos arquivos .gz que tem em cada uma e coloca em outra pasta

import tarfile
import os
import glob
import re
import gzip
import shutil

def get_gz_files(base_dir):
    """Lazily yield the path of every ``.gz`` file located under ``base_dir``.

    The search is recursive, so files directly inside ``base_dir`` and files in
    any nested sub-directory are all returned.

    Args:
        base_dir: Directory to search. It is treated as a literal path, not as
            a glob pattern.

    Returns:
        An iterator of path strings, one per matching ``.gz`` file.
    """
    # Escape glob metacharacters so a directory name containing '[', '*' or '?'
    # is matched literally instead of being interpreted as a pattern.
    pattern = f"{glob.escape(str(base_dir))}/**/*.gz"
    return glob.iglob(pattern, recursive=True)

# Scratch directory that receives the extracted contents of the downloaded batch.
files_path = "C:/Users/lenovo i5/Documents/TCGA_API/files"
# exist_ok: a directory left over from an interrupted run must not abort the cell.
os.makedirs(files_path, exist_ok=True)

# Directory that receives the decompressed .txt files. It is created up front
# because open() does not create missing parent directories.
text_path = "C:/Users/lenovo i5/Documents/TCGA_API/text"
os.makedirs(text_path, exist_ok=True)

try:
    # Open the downloaded batch and extract everything into files_path; the
    # with block closes the archive handle afterwards.
    # NOTE(review): the archive name is hard-coded, while the download cell keeps
    # the real name in file_name. Confirm they refer to the same file.
    with tarfile.open('C:/Users/lenovo i5/Documents/TCGA_API/gdc_download_20180830_235355.731702.tar.gz', 'r') as file:
        file.extractall(files_path)

    # Visit every .gz file found under the extracted tree.
    for name in get_gz_files(files_path):
        txt_name = os.path.basename(name)
        print(txt_name)

        # Build the output stem by dropping ".gz" and then an optional ".txt".
        # A fixed [:-7] slice is only right for "*.txt.gz"; it would cut
        # "*.htseq.counts.gz" down to "*.htseq.co".
        stem = txt_name[:-len(".gz")]
        if stem.endswith(".txt"):
            stem = stem[:-len(".txt")]

        # Copy the decompressed bytes straight to disk. This avoids any
        # dependency on a platform-specific text codec and never holds a whole
        # file in memory.
        with gzip.open(name, 'rb') as gz_file, \
                open("{}/{}.txt".format(text_path, stem), 'wb') as txt_file:
            shutil.copyfileobj(gz_file, txt_file)
finally:
    # Always remove the scratch directory, even when extraction fails midway.
    shutil.rmtree(files_path)

print("DONE")



93c46e63-3686-4883-a9de-7606ddfc11f0.FPKM.txt.gz
79c1a9c6-a12e-4e12-8acd-52a8ea0f9dea.htseq.counts.gz
e7f4d11c-520d-4682-aa1b-5cf821a49c2c.htseq.counts.gz
e8201ee9-2e0a-4a8e-bc99-6e2482da919d.htseq.counts.gz
0211f2ea-a4f6-4a57-8f98-66bf0a327479.FPKM.txt.gz
c3ed7c7a-b80a-4de2-8dfb-b60bcf336375.FPKM.txt.gz
1c89ff2f-6e0a-4244-952b-1391dba1b815.htseq.counts.gz
afe5f01a-b3c1-4d24-94cf-27ec0894e308.FPKM-UQ.txt.gz
c1efb56c-bcb2-4f9d-a291-b2c05217de2a.htseq.counts.gz
3ec39b79-fd84-4006-b513-ea78ba8ba16c.htseq.counts.gz
462312a9-821f-4049-9712-b937358a60ce.FPKM.txt.gz
0f5282f4-e702-41f2-b93f-a1966203879f.FPKM.txt.gz
96833a09-14ce-4483-b17d-f4328e0b68f5.FPKM.txt.gz
bb12bb45-2c4a-46dc-98dd-9acbe2a0c4ee.FPKM.txt.gz
4f6e9a85-31c9-468b-b121-1e235f159010.FPKM.txt.gz
5525258b-a896-4735-bcfb-63db002189e3.FPKM-UQ.txt.gz
1943018e-c964-4fd1-b23a-92c1202257b0.FPKM.txt.gz
4ba6c212-ef90-4ab3-b551-35c05cb6df90.FPKM.txt.gz
5fbf674e-49f3-4af8-9c9c-20a528ad0684.FPKM-UQ.txt.gz
bb12bb45-2c4a-46dc-98dd-9acbe2a0c4ee

e6bec4f4-0b51-43ba-b62b-5af777712488.FPKM-UQ.txt.gz
2bb5a0f3-eaa6-4138-8bec-38c7c36c95fc.htseq.counts.gz
7487d66f-3ca5-4df9-ac9e-88f0af7838c3.FPKM-UQ.txt.gz
bd556315-a050-465f-9a41-bdebe7e3eb61.FPKM-UQ.txt.gz
ddf98c57-1c48-4277-8a49-c74f52141723.htseq.counts.gz
34dd2d8f-6b3c-42ec-94c6-2c3eb75e8b91.htseq.counts.gz
0f8ed930-950b-4bae-a02d-e598b74bd82b.FPKM-UQ.txt.gz
b493310b-77e6-43fb-96cd-ab56d12de273.htseq.counts.gz
a8a58442-78f5-4876-b25e-c04339eb6f26.htseq.counts.gz
6f618400-b914-4103-bf46-9c026b470aa0.FPKM.txt.gz
ad1b1ba2-a050-4d62-9e44-4dda1b68aa2e.FPKM-UQ.txt.gz
24d7f10a-ac34-4f7e-8424-d2fc0d32ac58.FPKM.txt.gz
b73518ee-287c-4130-9da2-61772aa195fc.FPKM.txt.gz
d7bff2a7-1ffe-495c-b297-3b734b688267.FPKM-UQ.txt.gz
c1f0e689-1d11-4f7d-8386-616efe367e52.htseq.counts.gz
9ae1330c-a197-416a-9d5f-3ca16b36ec90.FPKM.txt.gz
33dbd5a5-f760-491a-aef5-5ff8be13a8cd.htseq.counts.gz
eeff4d01-f161-423f-9d6d-d2d31cc69047.FPKM-UQ.txt.gz
8970e6da-5904-483d-8770-0ede67f98cf4.FPKM.txt.gz
249869f5-8812-410c-8

In [9]:
#STILL WORKING

from pathlib import Path
import pandas

def _reader(fname):
    return pandas.read_csv(fname, sep='\t', header=None, encoding='ansi')

# Folder, relative to the working directory, that holds the decompressed .txt files.
folder = Path("text")

# Stack every file into one frame. The paths are sorted because glob order
# depends on the filesystem, and an unsorted listing makes the row order differ
# between runs and machines.
# NOTE(review): this rebinds df, which the query cell uses for the file listing
# (df.id). Re-run the query cell before downloading again.
df = pandas.concat([
    _reader(txt)
    for txt in sorted(folder.glob("*.txt"))
])

display(df)

Unnamed: 0,0,1
0,ENSG00000242268.2,0.000000
1,ENSG00000270112.3,0.000000
2,ENSG00000167578.15,10.751059
3,ENSG00000273842.1,0.000000
4,ENSG00000078237.5,2.929764
5,ENSG00000146083.10,5.158955
6,ENSG00000225275.4,0.000000
7,ENSG00000158486.12,1.324671
8,ENSG00000198242.12,95.606198
9,ENSG00000259883.1,0.436570
