In [1]:
#Checa o status do Servidor

import requests
status_endpt = "https://api.gdc.cancer.gov/status"
response = requests.get(status_endpt)
display(response.content)

b'{\n  "commit": "22932f557e607b117bb1e6af75729ac1ef7417d4",\n  "data_release": "Data Release 13.0 - September 27, 2018",\n  "status": "OK",\n  "tag": "1.15.1",\n  "version": 1\n}'

In [2]:
#COMPLEX FILTERS
#EXTRAIR OS ID'S DE CADA ARQUIVO

import requests
import json
import pandas as pd
from io import StringIO

fields = [
    "file_name",
    "cases.submitter_id",
    "cases.samples.sample_type",
    "cases.disease_type",
    "cases.project.project_id"
    ]

fields = ",".join(fields)

files_endpt = "https://api.gdc.cancer.gov/files"

# Filtros para selecionar Breast Cancer, em Females, sobre RNA-Seq e arquivos .txt
filters = {
    "op": "and",
    "content":[
        {
        "op": "in",
        "content":{
            "field": "cases.project.primary_site",
            "value": ["Breast"]
            }
        },
        {
        "op": "in",
        "content":{
            "field": "files.experimental_strategy",
            "value": ["RNA-Seq"]
            }
        },
        {
        "op": "in",
        "content":{
            "field": "cases.demographic.gender",
            "value": ["female"]
            }
        },
        {
        "op": "in",
        "content":{
            "field": "files.data_format",
            "value": ["TXT"]
            }
        }
    ]
}

# A POST is used, so the filter parameters can be passed directly as a Dict object.
params = {
    "filters": filters,
    "fields": fields,
    "format": "TSV",
    "size": "4000"
    }

# The parameters are passed to 'json' rather than 'params' in this case
response = requests.post(files_endpt, headers = {"Content-Type": "application/json"}, json = params)

#Transformo de bytes para String
s=str(response.content,'utf-8')

#Transformo a String em um Data Frame pra poder extrair as id's de cada estudo
data = StringIO(s) 
df=pd.read_csv(data, sep='\t', lineterminator='\r')
#print(df.id.head(15).tolist())
display(df.id)

0       d094f341-9b5b-4f34-8321-f6d97d31a9b3
1       89b6b8b5-321a-408f-9408-1b862c71ba2a
2       59c647c4-8172-4365-a15e-e95ab2100ae6
3       55592256-af90-46ec-bd61-c38f6eaa171c
4       0c42df5b-c1a7-4ad7-b32c-96bf31c8f407
5       305dbc71-2339-48af-a890-c5810cb37ac7
6       7d62fa5a-9df0-42eb-8111-989b15862d3f
7       fdbac226-ffe4-427b-b9aa-d6859c0c6cf5
8       e21fa645-230b-433b-bd54-ad4182971285
9       4a56a3ce-48c7-4640-98b6-964b5a905396
10      e9bcc089-a19c-4bd7-9950-fb4724574ed9
11      4c164548-16d9-4241-b82f-fa712815129e
12      d6e21b6b-f661-4ba0-9674-5fcd5399d89b
13      5a4551ed-147a-44e1-b155-e0e9f8c29298
14      1fb17a5b-367a-4d30-8b8b-a58d51182459
15      187f3eef-fc49-4f48-b8ec-fd9cd33f1ef8
16      5f693ba5-3845-4231-895b-c431db35452b
17      c66bfdc3-873c-413b-b6d6-495f3b43c0e9
18      d7684a62-120a-4598-b736-d152b55763f4
19      8e665c9e-cef0-4c3e-a0ba-260ed5fd412e
20      0f5e8e23-81de-4c14-b413-6e06a29fd096
21      cf5b6af9-ec8b-48ef-b799-14fe78be4fdc
22      ed

In [13]:
#DOWNLOAD

import requests
import json
import re

#Quantidade de arquivos a serem baixados
ini = 0
fim = 999
data_endpt = "https://api.gdc.cancer.gov/data"

#Ele recebe uma lista de id's entao e so usar o dataFrame.tolist[ini:fim] para escolher os arquivos a serem baixados
#OBS.: A quantidade a ser baixada e meio inconsitente, uma vez consegui baixar 100 de uma vez, outras vezes a minha rede caia
#sempre que tentava baixar uma quantidade muito grande, creio que numa rede robusta seja possivel baixar todos os arquivos de uma
#unica vez, se nao, so criar um laco para baixar em pedaços
ids = (df.id.tolist())[ini:fim]

params = {"ids": ids}

response = requests.post(data_endpt,
                        data = json.dumps(params),
                        headers={
                            "Content-Type": "application/json"
                            })

response_head_cd = response.headers["Content-Disposition"]

file_name = re.findall("filename=(.+)", response_head_cd)[0]

with open(file_name, "wb") as output_file:
    output_file.write(response.content)
print("Download complete: ",ini," to ",fim)

Download complete:  0  to  999


In [15]:
#Extrai o lote baixado
#Esse lote resulta em varias pastas com mais arquivos
#Abre cada uma das pastas e extrai o conteudo dos arquivos .gz que tem em cada uma e coloca em outra pasta

import tarfile
import os
import glob
import re
import gzip
import shutil

#Encontra todos os diretorios que possuem algum arquivo .gz no caminho atual
def get_files(base_dir, extension):
    return glob.iglob(f"{base_dir}/**/*.{extension}", recursive=True)

#Diretorio onde tem o vai extrair o lote de arquivos
files_path = "./files"
os.makedirs(files_path)

#Abre o lote baixado
file = tarfile.open(file_name, 'r')

#Extrai todos os arquivos do lote baixado no files_path
file.extractall(files_path)
os.makedirs("./text")

#Chama a funcao de buscar os arquivos .gz e para cada diretorio retornado
for name in get_files(files_path, "gz"):

    #Abre o arquivo gzip, le o conteudo e escreve em um .txt que fica noutro diretorio
    gz_file = gzip.open(name, 'rt', encoding='utf-8')
    s = gz_file.read()
    gz_file.close()
    txt_name = os.path.basename(name)
    print(txt_name[:-7])
    txt_file = open("./text/{}.txt".format(txt_name[:-7]),'w') 
    txt_file.write(s) 
    txt_file.close()

shutil.rmtree(files_path)
print("DONE")



b2a6c9e3-65eb-43bd-849a-5829007379ab.htseq.co
4bf2bea7-00ef-485c-b9e5-035f0ea62b1d.FPKM
3ab0577a-0d95-4f3b-9ec8-850820c5ecb3.FPKM-UQ
8e5a3c88-35f5-4620-86d7-53c014a2687b.FPKM-UQ
2eeb46a1-0422-4a77-b58d-3b7d57d51b44.htseq.co
3c8923de-0d68-4137-a5d9-657a77cac893.FPKM-UQ
39d6716f-d898-494e-9f41-7972de1e6670.htseq.co
9ce4048c-b607-4677-b874-647e9c29785d.FPKM
ffd7b10e-8c0c-4aeb-81a0-b060108d57fa.FPKM
f828b2e0-2ad4-4669-9f1e-2ef2dd74706d.htseq.co
b44cdb13-1812-4bf6-825e-edae5bc5376a.FPKM
93c46e63-3686-4883-a9de-7606ddfc11f0.FPKM
4fe7951d-87fd-49b7-91d6-1d455415daa1.htseq.co
2d721010-840f-4ca7-b41c-6792385b410d.FPKM-UQ
03895e36-d4e0-4849-a0f1-26550373338d.htseq.co
4aa06322-99a1-49e8-ad18-c92cb676da51.FPKM
6999d309-8502-49ee-8d80-9d0bde00081f.htseq.co
b02d1d2d-13e3-4246-ba2f-ae22d934c776.htseq.co
79c1a9c6-a12e-4e12-8acd-52a8ea0f9dea.htseq.co
f828b2e0-2ad4-4669-9f1e-2ef2dd74706d.FPKM
e7f4d11c-520d-4682-aa1b-5cf821a49c2c.htseq.co
3a04fb30-378c-48a0-a02e-85cf81b31b43.htseq.co
5d6adefb-ceda-4ef2-a

379553b2-13c2-4543-a9c3-9616fc249730.htseq.co
87ba691e-883a-4e45-9649-ab80e53c9243.FPKM
47af78ae-c41a-401f-a88b-d7e35831f816.htseq.co
bceffa81-996d-4775-a452-23b31a25daf9.FPKM
6c6d7c1d-7a53-4877-8cbe-f50a26e7156b.FPKM-UQ
7e7b4326-0563-466d-90e7-47edae5f6961.FPKM-UQ
157d34d0-1e1b-4004-9141-34bfe201788c.FPKM-UQ
2eeb46a1-0422-4a77-b58d-3b7d57d51b44.FPKM
8b178cb1-d22e-4657-80c6-d7efcddf43a6.htseq.co
a32c6a56-997e-46b6-8da9-fcfe24aafc08.FPKM
02939ecf-77d0-49fc-a04b-8fd5334b44da.htseq.co
6a818968-d75b-44b5-8b1e-53ff8274ea92.FPKM
1119c89a-a4db-4962-8578-997bf19b7b23.htseq.co
592f19d7-8afb-4db6-9409-24ded8862985.FPKM
78e16228-2d9f-4e19-bbd2-b0bb85e8faa2.FPKM
71857763-8175-4a80-8bf4-ebf3c5018151.FPKM
62a17809-7816-47cf-959d-f00e5ea0c3b4.FPKM-UQ
64c6dc6b-5fb3-48fe-8103-ee7cd86b989d.FPKM-UQ
ce137517-0320-46dd-b4e0-82c02773ba97.FPKM
7f2cf950-b5e1-4a01-a44b-88a4e3303233.htseq.co
1d7afc91-311c-4f26-9be0-b76ac284ee96.FPKM
381e2cf8-6fd4-4561-a881-683bfed6b1f8.htseq.co
a5dc521e-bee4-489c-8679-d4b90a327

95e7cabf-16bf-43b8-9478-84ef60df13dd.FPKM
111d998f-4133-436a-9990-64cb293a986e.FPKM
d806b4fb-807b-4d66-b17f-c2e381675e4f.FPKM-UQ
8e5a3c88-35f5-4620-86d7-53c014a2687b.FPKM
5018e9b3-06a9-47f9-b6c9-73d6dbaf188c.FPKM-UQ
6360d281-1d89-43a0-aaea-46a46533ae38.htseq.co
ccdbc757-4bf2-4132-bc03-dced7d6c2c71.FPKM-UQ
62a17809-7816-47cf-959d-f00e5ea0c3b4.htseq.co
4d186011-bf83-49da-85b9-a42337fa1a23.htseq.co
148d950b-4202-4f3f-be15-84735cd08a48.htseq.co
5958e543-9a75-4144-9f93-9daa1cc14190.htseq.co
b07b373c-29e1-4e88-a3a9-783568cc12eb.FPKM
d65401e7-2bde-4b7e-b901-c173dd3cd33d.FPKM
f12fbfdf-efd4-462a-a34c-8c01947bfb78.FPKM-UQ
c6e9cad6-fa27-439a-a5eb-eab1bc1c6a71.htseq.co
148d950b-4202-4f3f-be15-84735cd08a48.FPKM
f1afa28b-fac9-4e2e-b447-36097ef45a0b.htseq.co
c14af0e0-81e9-43e7-b08d-b751194e35ef.FPKM
ab1f0023-6b52-41b6-9867-87599e4d6d60.FPKM
1b70d7d9-1d4f-4912-820a-e7ccaa0c9038.FPKM-UQ
1885eb4c-2149-4746-9c7e-620125b03d94.FPKM
9e62a3e2-20f0-4de4-aec0-317369ce4798.htseq.co
fd168f37-df3d-4cf2-881e-c90c6

3b187f2d-fa67-46a4-ad60-0d26375a9632.htseq.co
6fdbbe35-6392-4100-8f96-0e0d02257b73.FPKM
7a284ab1-3849-4c83-ae30-86cf1136f641.FPKM-UQ
f95c523a-cfb0-4b7e-9a66-48dcf9d24515.FPKM-UQ
98921931-2f32-470d-97ae-824716d1c034.htseq.co
82bfc74a-cb0f-4966-b53e-c7b535ae7f23.htseq.co
2edcaaa7-63b4-40b4-abbe-5d7a84012e60.FPKM-UQ
b44cdb13-1812-4bf6-825e-edae5bc5376a.FPKM-UQ
5ab6232c-5cba-47ca-af29-3799b0263a58.htseq.co
220c7593-ae84-4db4-9ce5-c6372fd696e3.FPKM
a4cc111b-f49a-4aea-91ed-10e31c60d522.FPKM-UQ
a5de7a1d-cbce-4e2f-99bd-9b72e9f9405a.htseq.co
aee34477-7bfd-46c3-88a8-f3e7338a8444.FPKM-UQ
d806b4fb-807b-4d66-b17f-c2e381675e4f.FPKM
19f52003-78b1-41db-af49-65536c538ce3.htseq.co
edc3bfaa-aad7-44d5-8aa0-bb5a311b24c5.FPKM-UQ
c14ae3ed-5d43-452c-8955-631a8c62660d.FPKM
bdceb6c5-77d1-43e6-9046-1340502c2a60.htseq.co
318901b4-4f1c-4017-8113-5115747c6a67.FPKM
3ca8233e-434b-477c-8683-b1f770cedc63.htseq.co
ca5d0721-99cf-4b7c-b708-6acea2ae86bd.FPKM-UQ
18fbee33-4731-41f4-bdc9-6cf90952d370.FPKM-UQ
c77a24cf-b389-4c7

022a665e-1ed2-4e37-a6cd-911ccdb1dfcf.htseq.co
cf4d50b3-f391-43ee-8887-10de9bca126e.FPKM-UQ
742a6c93-affc-4447-8951-4447a5f439c2.htseq.co
e6981360-42ea-402a-aead-f6f2b31ef30c.FPKM
e1353694-9f60-442a-a164-5b2aca82419a.htseq.co
a4c7b6e8-ed30-42b2-9daa-0cf0b9d59259.htseq.co
b76eefac-da30-4ee4-9670-4afc45f86b4b.FPKM
d4d6abaa-9134-42ac-9adf-2045f747abb4.FPKM-UQ
deaa56c1-8031-4c65-b3bc-1228407e06ea.FPKM
63a39ca4-6ea7-468e-b726-b3b8ce17d7b1.htseq.co
22c16f46-950f-4389-bb16-b5cec4d8b94e.FPKM
b8001d68-74fb-46a4-b0c5-388387479685.htseq.co
a5be0ae4-accd-4fa5-84b7-fa8377054755.htseq.co
00511204-3512-4a5e-b664-60271e968903.htseq.co
7af2075c-0386-4971-ae25-375330ef6cec.htseq.co
29c4a2e1-2e48-4f4a-9ad6-9b0a7c04b8e9.htseq.co
30621023-91a3-4b13-bc10-922491b5f9d3.FPKM
8247ad98-4681-438c-880e-ff2a89b5a094.htseq.co
d4675ebb-2444-4e2f-ade2-5c5af9882399.FPKM
a09e07b7-9834-465a-8e85-0cec02fc1881.FPKM-UQ
baa3cdc1-1ee8-4aab-80f9-dafa35cc5fbf.FPKM-UQ
0b6603e7-a619-40d0-8af8-ddc7d30ffd7e.FPKM-UQ
0b5a79f8-be5b-4b7

78f2dfc0-9452-4547-b9a9-eb9dc920a4a9.htseq.co
ca223822-8759-42a7-8058-ccd90a02320f.FPKM-UQ
95c75be7-d55d-4d74-9179-9be223230a5b.FPKM-UQ
31024f61-9191-4a91-a8fd-a8f4b63e6da6.FPKM-UQ
3fb92a97-b53b-4f00-88b2-958b0ad2b46d.FPKM-UQ
5d6adefb-ceda-4ef2-a6b9-8077fd618ae6.FPKM-UQ
ec57a9a3-90f1-4559-a241-d647a9ce37e8.htseq.co
fd39d52c-5ae1-429c-b85c-96321db417d3.FPKM
609420eb-0650-4187-8b48-cb1ea5fa0d8d.FPKM
d6b9e128-8f20-43a1-9d16-b3e2b3d60e34.htseq.co
a0c9781b-0c1e-4dc0-a3f6-84e938b07641.htseq.co
0a2c1866-d6e1-4add-bcb0-b297ba1394ac.FPKM
64c6dc6b-5fb3-48fe-8103-ee7cd86b989d.htseq.co
b76eefac-da30-4ee4-9670-4afc45f86b4b.htseq.co
b41174a5-4db8-438c-a9fa-6da8c08a9c75.FPKM-UQ
aa42dd76-efb1-4fb7-9922-cf54540844e7.FPKM-UQ
a4c7b6e8-ed30-42b2-9daa-0cf0b9d59259.FPKM
d65401e7-2bde-4b7e-b901-c173dd3cd33d.FPKM-UQ
44caf0b5-d05f-49fd-b8ec-c32d0003c5f4.FPKM-UQ
ba65f95e-4a58-4378-925b-60f864c4f5de.FPKM
90d19c5c-f80b-454b-b776-99d13f01617e.FPKM-UQ
1f853089-c40d-4bee-9833-837f327b275a.FPKM-UQ
39ea216a-6bff-41bc-

In [None]:
#CRIANDO DATAFRAMES

#Listando os arquivos no diretorio
files = os.listdir("./text")

#Cria um dataframe com todos os arquivos de texto que possuem "htseq.co" o mesmo pode ser feito pra fpkm e fpkm-uq
htseq = pd.concat([pd.read_csv("./text/{}".format(item), names=[item[:-4]], index_col=0, sep='\t') for item in files if "FPKM" in item], axis=1)
display(htseq)

In [12]:
display(df)

Unnamed: 0,cases.0.samples.0.sample_type,cases.0.disease_type,file_name,cases.0.submitter_id,cases.0.project.project_id,id
0,\nPrimary Tumor,Ductal and Lobular Neoplasms,36725a42-52bc-4e9a-bb7c-728fbb8b1d19.FPKM-UQ.t...,TCGA-5L-AAT1,TCGA-BRCA,d094f341-9b5b-4f34-8321-f6d97d31a9b3
1,\nPrimary Tumor,Ductal and Lobular Neoplasms,6f618400-b914-4103-bf46-9c026b470aa0.FPKM-UQ.t...,TCGA-C8-A26V,TCGA-BRCA,89b6b8b5-321a-408f-9408-1b862c71ba2a
2,\nPrimary Tumor,Ductal and Lobular Neoplasms,e023c283-afcf-4334-b741-3dce8e98b0f6.htseq.cou...,TCGA-AC-A3W5,TCGA-BRCA,59c647c4-8172-4365-a15e-e95ab2100ae6
3,\nPrimary Tumor,Ductal and Lobular Neoplasms,eaee0d82-cd7e-47a5-859f-59b7bf5ad6a0.FPKM.txt.gz,TCGA-A7-A13G,TCGA-BRCA,55592256-af90-46ec-bd61-c38f6eaa171c
4,\nPrimary Tumor,Ductal and Lobular Neoplasms,afe5f01a-b3c1-4d24-94cf-27ec0894e308.FPKM-UQ.t...,TCGA-A7-A26I,TCGA-BRCA,0c42df5b-c1a7-4ad7-b32c-96bf31c8f407
5,\nPrimary Tumor,Ductal and Lobular Neoplasms,6c6d7c1d-7a53-4877-8cbe-f50a26e7156b.FPKM-UQ.t...,TCGA-A8-A08C,TCGA-BRCA,305dbc71-2339-48af-a890-c5810cb37ac7
6,\nSolid Tissue Normal,Ductal and Lobular Neoplasms,6598740f-64d5-4b20-af26-dac644efc45f.htseq.cou...,TCGA-E9-A1N4,TCGA-BRCA,7d62fa5a-9df0-42eb-8111-989b15862d3f
7,\nPrimary Tumor,Ductal and Lobular Neoplasms,c22879b5-4183-4c3c-804b-f6cef1df617e.FPKM.txt.gz,TCGA-A7-A26E,TCGA-BRCA,fdbac226-ffe4-427b-b9aa-d6859c0c6cf5
8,\nPrimary Tumor,Ductal and Lobular Neoplasms,44f52c83-3eb1-4f91-a885-599fb960049c.FPKM-UQ.t...,TCGA-PL-A8LX,TCGA-BRCA,e21fa645-230b-433b-bd54-ad4182971285
9,\nPrimary Tumor,Ductal and Lobular Neoplasms,f125d4a1-cb15-4729-9318-433c39c685d7.FPKM-UQ.t...,TCGA-AR-A0TY,TCGA-BRCA,4a56a3ce-48c7-4640-98b6-964b5a905396
