In [1]:
#Checa o status do Servidor

import requests
status_endpt = "https://api.gdc.cancer.gov/status"
response = requests.get(status_endpt)
display(response.content)

b'{\n  "commit": "8acb9697bd3e944def954f8d1c0b5c22a0fbc8fa",\n  "data_release": "Data Release 12.0 - August 23, 2018",\n  "status": "OK",\n  "tag": "1.15.1",\n  "version": 1\n}'

In [2]:
#COMPLEX FILTERS
#EXTRAIR OS ID'S DE CADA ARQUIVO

import requests
import json
import pandas as pd
from io import StringIO

fields = [
    "file_name",
    "cases.submitter_id",
    "cases.samples.sample_type",
    "cases.disease_type",
    "cases.project.project_id"
    ]

fields = ",".join(fields)

files_endpt = "https://api.gdc.cancer.gov/files"

# Filtros para selecionar Breast Cancer, em Females, sobre RNA-Seq e arquivos .txt
filters = {
    "op": "and",
    "content":[
        {
        "op": "in",
        "content":{
            "field": "cases.project.primary_site",
            "value": ["Breast"]
            }
        },
        {
        "op": "in",
        "content":{
            "field": "files.experimental_strategy",
            "value": ["RNA-Seq"]
            }
        },
        {
        "op": "in",
        "content":{
            "field": "cases.demographic.gender",
            "value": ["female"]
            }
        },
        {
        "op": "in",
        "content":{
            "field": "files.data_format",
            "value": ["TXT"]
            }
        }
    ]
}

# A POST is used, so the filter parameters can be passed directly as a Dict object.
params = {
    "filters": filters,
    "fields": fields,
    "format": "TSV",
    "size": "4000"
    }

# The parameters are passed to 'json' rather than 'params' in this case
response = requests.post(files_endpt, headers = {"Content-Type": "application/json"}, json = params)

#Transformo de bytes para String
s=str(response.content,'utf-8')

#Transformo a String em um Data Frame pra poder extrair as id's de cada estudo
data = StringIO(s) 
df=pd.read_csv(data, sep='\t', lineterminator='\r')
#print(df.id.head(15).tolist())
display(df.id)

0       e7a77848-0df8-4260-b7e9-df8587e9f9b7
1       1837ad2a-4edf-4d80-9050-f78115e54454
2       b9610459-bd3e-4d65-92cd-8eb34541f259
3       7061c88c-3a5b-4a6b-bf55-bcf20d89a6a7
4       21dd15c2-099d-422f-9f05-cfd1170d5bfc
5       a159a8ba-a1e7-4ab6-8820-d67a9c897457
6       c6921a78-f687-46b5-9de7-0603a4a0e99c
7       caaf288e-7672-46af-bc68-302afa0c6478
8       54dbdf55-a610-4967-808a-7c4e36178595
9       a62fbc8b-c366-4205-82c0-2a864a2eca87
10      048e5e34-ab11-43d6-9788-771502d99152
11      1d4fd1cb-6538-4325-bddc-2ced7f437f13
12      c56d71fe-e3cb-4c7f-b464-db7e8b26f71d
13      e05c95c8-bf99-4e0a-bcc6-07e9a5db3c3e
14      ad3b3339-8a4a-4a58-8439-7b82544acec8
15      dbaba045-5bf8-47b7-b883-ff97abe9a8df
16      bc43acfb-2c72-402b-9be9-d42139c182c1
17      ef184941-92be-4c12-9d56-125d605e9cce
18      6416384b-2b9d-473b-a7e6-016a1df4ed29
19      104d21c5-4d9e-4c81-9399-914c81faa8ad
20      1ffd88db-f671-4886-8daa-b291ecad58f3
21      f45b27f2-b1c3-46f1-8cbb-4f9a22676c18
22      82

In [3]:
#DOWNLOAD

import requests
import json
import re

#Quantidade de arquivos a serem baixados
ini = 0
fim = 200

data_endpt = "https://api.gdc.cancer.gov/data"

#Ele recebe uma lista de id's entao e so usar o dataFrame.tolist[ini:fim] para escolher os arquivos a serem baixados
#OBS.: A quantidade a ser baixada e meio inconsitente, uma vez consegui baixar 100 de uma vez, outras vezes a minha rede caia
#sempre que tentava baixar uma quantidade muito grande, creio que numa rede robusta seja possivel baixar todos os arquivos de uma
#unica vez, se nao, so criar um laco para baixar em pedaços
ids = (df.id.tolist())[ini:fim]

params = {"ids": ids}

response = requests.post(data_endpt,
                        data = json.dumps(params),
                        headers={
                            "Content-Type": "application/json"
                            })

response_head_cd = response.headers["Content-Disposition"]

file_name = re.findall("filename=(.+)", response_head_cd)[0]

with open(file_name, "wb") as output_file:
    output_file.write(response.content)

In [10]:
#Extrai o lote baixado
#Esse lote resulta em varias pastas com mais arquivos
#Abre cada uma das pastas e extrai o conteudo dos arquivos .gz que tem em cada uma e coloca em outra pasta

import tarfile
import os
import glob
import re
import gzip
import shutil

#Encontra todos os diretorios que possuem algum arquivo .gz no caminho atual
def get_files(base_dir, extension):
    return glob.iglob(f"{base_dir}/**/*.{extension}", recursive=True)

#Diretorio onde tem o vai extrair o lote de arquivos
files_path = "./files"
os.makedirs(files_path)

#Abre o lote baixado
file = tarfile.open('./gdc_download_20180830_235355.731702.tar.gz', 'r')

#Extrai todos os arquivos do lote baixado no files_path
file.extractall(files_path)
os.makedirs("./text")

#Chama a funcao de buscar os arquivos .gz e para cada diretorio retornado
for name in get_files(files_path, "gz"):

    #Abre o arquivo gzip, le o conteudo e escreve em um .txt que fica noutro diretorio
    gz_file = gzip.open(name, 'rt', encoding='utf-8')
    s = gz_file.read()
    gz_file.close()
    txt_name = os.path.basename(name)
    print(txt_name[:-7])
    txt_file = open("./text/{}.txt".format(txt_name[:-7]),'w') 
    txt_file.write(s) 
    txt_file.close()

shutil.rmtree(files_path)
print("DONE")



d4d6abaa-9134-42ac-9adf-2045f747abb4.FPKM-UQ.txt.gz
97745843-751b-4437-82e0-31ed8c323e91.FPKM.txt.gz
ad778372-ad13-4d11-ac15-900de7d9637a.FPKM.txt.gz
f31566a9-ffde-40fa-b879-71827a3540a9.FPKM.txt.gz
d6bbcaba-61a7-4360-bc15-1426f8eafa82.htseq.counts.gz
6577de41-48a3-4933-94de-64553f13bf05.htseq.counts.gz
e7f4d11c-520d-4682-aa1b-5cf821a49c2c.htseq.counts.gz
5ab6232c-5cba-47ca-af29-3799b0263a58.htseq.counts.gz
c4489317-3e8a-4159-b32b-4cfde723359a.FPKM-UQ.txt.gz
ce85344b-0f16-4f96-b8da-af518e2adc71.htseq.counts.gz
c1c1111e-858a-438e-8bb6-90df4defcede.htseq.counts.gz
00511204-3512-4a5e-b664-60271e968903.htseq.counts.gz
67418961-d894-42a9-ae6a-21f41e6ca9bf.htseq.counts.gz
8677d42c-aef1-444d-b9a2-0dbcced0e886.FPKM-UQ.txt.gz
79f4c86a-f806-4fcb-ac89-db4e95a6eaad.FPKM-UQ.txt.gz
6c6d7c1d-7a53-4877-8cbe-f50a26e7156b.FPKM.txt.gz
fa49537b-edee-4d4f-823d-b6c574d33900.FPKM.txt.gz
be0d2e41-7373-4fa9-9c05-877e9ddcb88c.htseq.counts.gz
e740e34f-31fe-4fe4-8965-4e9ec5ec50fd.FPKM.txt.gz
de2ffea0-e172-45e2-aa

18fbee33-4731-41f4-bdc9-6cf90952d370.FPKM-UQ.txt.gz
2656413d-a5d2-4812-9ce9-a02c15ab04bd.FPKM.txt.gz
84bf2e9a-56a4-49ba-bbb9-ad931c77e117.htseq.counts.gz
82bfc74a-cb0f-4966-b53e-c7b535ae7f23.FPKM.txt.gz
0d38dab7-b983-4829-9a5b-d79f0fee6f26.FPKM-UQ.txt.gz
eab58571-a6f1-4bc4-aa01-0df9c4beb79f.FPKM.txt.gz
1d293e9f-7672-428c-be8d-3d180c8e8590.FPKM-UQ.txt.gz
82bfc74a-cb0f-4966-b53e-c7b535ae7f23.htseq.counts.gz
a6f657a3-439e-4afd-be04-413f2ed02828.htseq.counts.gz
7d229eb4-cdbd-4578-9e54-c7cbd7ef1378.htseq.counts.gz
ffd7b10e-8c0c-4aeb-81a0-b060108d57fa.htseq.counts.gz
f641a19d-8497-4bc7-9df1-5228adfd5334.FPKM.txt.gz
48436ca7-2b7f-4615-9c88-8ede9398d11a.FPKM-UQ.txt.gz
1c89ff2f-6e0a-4244-952b-1391dba1b815.htseq.counts.gz
62a17809-7816-47cf-959d-f00e5ea0c3b4.htseq.counts.gz
709670f6-cbb6-49ac-bc82-982be7eff36a.htseq.counts.gz
05eb3daf-d392-4ede-87d6-8f0aef8b6372.htseq.counts.gz
249869f5-8812-410c-89f8-2b7df632ac85.FPKM-UQ.txt.gz
8659f2c4-7492-46b3-9bbe-ea8ae7bbdd53.FPKM-UQ.txt.gz
0b27a4e5-3e45-4

In [24]:
#STILL WORKING

from pathlib import Path
import pandas

def _reader(fname):
    return pandas.read_csv(fname, sep='\t', header=None, encoding='utf-8')

folder = Path("text")
df = pandas.concat([
    _reader(txt)
    for txt in folder.glob("*htseq.co.txt")
])

display(df)

Unnamed: 0,0,1
0,ENSG00000000003.13,600
1,ENSG00000000005.5,0
2,ENSG00000000419.11,2851
3,ENSG00000000457.12,2994
4,ENSG00000000460.15,1016
5,ENSG00000000938.11,101
6,ENSG00000000971.14,3360
7,ENSG00000001036.12,1575
8,ENSG00000001084.9,2555
9,ENSG00000001167.13,3214


In [41]:
#CRIANDO DATAFRAMES

#Listando os arquivos no diretorio
files = os.listdir("./text")

#Cria um dataframe com todos os arquivos de texto que possuem "htseq.co" o mesmo pode ser feito pra fpkm e fpkm-uq
htseq = pd.concat([pd.read_csv("./text/htseq/{}".format(item), names=[item[:-4]], index_col=0, sep='\t') for item in files if "htseq.co" in item], axis=1)
display(htseq)

Unnamed: 0,6577de41-48a3-4933-94de-64553f13bf05.htseq.co,d4d6abaa-9134-42ac-9adf-2045f747abb4.htseq.co,ffd7b10e-8c0c-4aeb-81a0-b060108d57fa.htseq.co,eb5bc3d0-9f66-413f-b6bb-72a5a6b7fde9.htseq.co,446db9e9-d5aa-48ba-a84d-cf6a7d8f8fc4.htseq.co,82bfc74a-cb0f-4966-b53e-c7b535ae7f23.htseq.co,2edcaaa7-63b4-40b4-abbe-5d7a84012e60.htseq.co,45f27da5-7fc4-40e2-a78f-3f9af7500804.htseq.co,ae53b898-c61d-4137-824a-104a4202299c.htseq.co,1e215a66-f961-4870-a370-4b945e1f7053.htseq.co,...,67418961-d894-42a9-ae6a-21f41e6ca9bf.htseq.co,fdc8ad41-ba94-485d-8bd6-6ea2794847f2.htseq.co,79c1a9c6-a12e-4e12-8acd-52a8ea0f9dea.htseq.co,bcf137e8-156d-4609-8b1b-9a9b379aa2ce.htseq.co,2c6a02ee-c88a-4e23-8915-6976b95386e3.htseq.co,29c4a2e1-2e48-4f4a-9ad6-9b0a7c04b8e9.htseq.co,a8a58442-78f5-4876-b25e-c04339eb6f26.htseq.co,c1efb56c-bcb2-4f9d-a291-b2c05217de2a.htseq.co,2bb5a0f3-eaa6-4138-8bec-38c7c36c95fc.htseq.co,a6f657a3-439e-4afd-be04-413f2ed02828.htseq.co
ENSG00000000003.13,600,4707,6955,1991,4049,2371,857,1774,1294,1123,...,2464,2110,3219,3846,5404,7036,3655,3658,1359,5017
ENSG00000000005.5,0,1,18,143,413,2,5,0,20,94,...,46,9,267,3,320,7,3,3,7,4
ENSG00000000419.11,2851,1812,1816,4269,1903,1383,1695,1135,965,1345,...,1952,1798,2212,1245,2472,1022,4030,1134,623,6617
ENSG00000000457.12,2994,1215,2030,1896,743,1502,1930,1500,1119,733,...,2423,1198,2562,1360,1483,2065,3514,1542,891,2170
ENSG00000000460.15,1016,625,437,695,210,315,463,346,336,183,...,761,1082,1001,624,312,442,1110,272,352,3222
ENSG00000000938.11,101,599,919,555,673,369,336,135,439,905,...,620,835,398,346,869,166,995,349,473,537
ENSG00000000971.14,3360,2220,993,5493,3631,3546,1290,363,2959,3945,...,4601,1588,8523,4182,5760,1997,8993,967,2702,5337
ENSG00000001036.12,1575,3664,2133,3302,2676,2328,2779,1792,1282,1941,...,3315,3751,3115,2474,4397,2728,13130,3207,1423,3993
ENSG00000001084.9,2555,2292,3305,3053,1499,1111,4496,3892,1138,820,...,2533,3769,2185,4400,2775,3460,4216,745,665,7899
ENSG00000001167.13,3214,1730,2639,6064,1375,2226,2618,1317,1507,1178,...,3552,5530,3108,5074,2060,2407,4048,1698,906,11961
