In [1]:
import requests
import json
import pandas as pd

In [71]:
# GDC `files` endpoint, plus the header declaring that the request body is JSON.
gdc_url = 'https://api.gdc.cancer.gov/files'
headers = {'Content-Type': 'application/json'}

# Comma-separated list of the fields requested for every file record.
fields = ','.join((
    # file-level attributes
    'file_id',
    'file_name',
    'data_category',
    'data_type',
    'experimental_strategy',
    # case / sample / aliquot submitter ids and sample type
    'cases.submitter_id',
    'cases.samples.submitter_id',
    'cases.samples.sample_type',
    'cases.samples.portions.analytes.aliquots.submitter_id',
    # analysis workflow and per-read-group capture-kit details
    'analysis.workflow_type',
    'analysis.metadata.read_groups.read_group_id',
    'analysis.metadata.read_groups.target_capture_kit_name',
    'analysis.metadata.read_groups.target_capture_kit_target_region',
))

# Tab-separated manifest; its `id` column supplies the file ids used in the GDC query.
# NOTE(review): absolute, user-specific path -- the notebook will not run on another
# machine until this points at a shared or configurable location.
MANIFEST_PATH = "/Users/zhuy/Downloads/gdc_manifest.2021-06-28-1.txt"
manifest = pd.read_csv(MANIFEST_PATH, sep='\t')

In [72]:
# Ask the GDC `files` endpoint for the metadata of every file listed in the manifest.
file_ids = manifest['id'].tolist()

payload = json.dumps({
    'filters': {
        'op': '=',
        'content': {
            'field': 'file_id',
            'value': file_ids
        }
    },
    'format': 'json',
    'fields': fields,
    # Make sure we get all the returns: a fixed 5000 would silently truncate the
    # result for a manifest with more than 5000 files, so the page size grows with
    # the manifest. (Presumably each file id yields at most one hit -- confirm.)
    'size': max(5000, len(file_ids))
})

http_response = requests.post(gdc_url, headers=headers, data=payload)
# Fail loudly on an HTTP error instead of parsing an error body as if it were results.
http_response.raise_for_status()
# Parsed JSON body; the file records are under gdc_response['data']['hits'].
gdc_response = http_response.json()

In [80]:
# Flatten the GDC response into one row per (file, read group).

# Stand-in read group for hits that carry no read-group information, so the file
# still contributes one row whose read-group columns hold the literal string 'null'.
NULL_READ_GROUP = {
    'read_group_id': 'null',
    'target_capture_kit_name': 'null',
    'target_capture_kit_target_region': 'null',
}

lib_info = []
for hit in gdc_response['data']['hits']:
    # Only the first case / sample / portion / analyte / aliquot of a hit is used.
    # NOTE(review): assumes each file belongs to a single aliquot -- confirm; any
    # additional cases or samples on a hit are ignored here.
    case = hit['cases'][0]
    sample = case['samples'][0]
    aliquot = sample['portions'][0]['analytes'][0]['aliquots'][0]

    # Explicit lookups with defaults replace the former bare `except:`. They also
    # cover a hit whose `analysis` is present but incomplete (no metadata, no read
    # groups, no workflow type), which previously raised KeyError.
    analysis = hit.get('analysis') or {}
    metadata = analysis.get('metadata') or {}
    read_groups = metadata.get('read_groups') or [NULL_READ_GROUP]

    for read_group in read_groups:
        lib_info.append([
            hit['file_id'], hit['file_name'], hit['experimental_strategy'],
            hit['data_type'], hit['data_category'],
            case['submitter_id'],
            sample['submitter_id'], sample['sample_type'],
            aliquot['submitter_id'],
            analysis.get('workflow_type', 'null'),
            read_group.get('read_group_id', 'null'),
            read_group.get('target_capture_kit_name', 'null'),
            read_group.get('target_capture_kit_target_region', 'null')
        ])

In [81]:
# Column names, in the same order as the values appended to each `lib_info` row.
# `case_id`, `sample_id` and `aliquot_id` hold the corresponding submitter ids.
lib_info_columns = [
    'file_id',
    'file_name',
    'experimental_strategy',
    'data_type',
    'data_category',
    'case_id',
    'sample_id',
    'sample_type',
    'aliquot_id',
    'workflow_type',
    'read_group_id',
    'target_capture_kit_name',
    'target_capture_kit_target_region'
]

# Naming the columns in the constructor (instead of assigning `df.columns` afterwards)
# keeps this cell working when `lib_info` is empty: a frame built from an empty list
# has zero columns, so the later assignment failed with a length-mismatch ValueError.
df = pd.DataFrame(lib_info, columns=lib_info_columns).drop_duplicates()
df

Unnamed: 0,file_id,file_name,experimental_strategy,data_type,data_category,case_id,sample_id,sample_type,aliquot_id,workflow_type,read_group_id,target_capture_kit_name,target_capture_kit_target_region
0,88ae30af-4e26-4ebb-9fdf-c6bf465bed5b,fc64f2ec-270a-4d0d-b976-aba7d5afa357_wxs_gdc_r...,WXS,Aligned Reads,Sequencing Reads,TARGET-40-NAAGJT,TARGET-40-NAAGJT-10B,Blood Derived Normal,TARGET-40-NAAGJT-10B-01D,BWA with Mark Duplicates and BQSR,5cee1f2f-3505-42a9-a2c8-f8feb0151d47,,
1,88ae30af-4e26-4ebb-9fdf-c6bf465bed5b,fc64f2ec-270a-4d0d-b976-aba7d5afa357_wxs_gdc_r...,WXS,Aligned Reads,Sequencing Reads,TARGET-40-NAAGJT,TARGET-40-NAAGJT-10B,Blood Derived Normal,TARGET-40-NAAGJT-10B-01D,BWA with Mark Duplicates and BQSR,d8a45ced-d16b-400c-ad58-1812f4a0ec41,,
2,57e0e578-3bf9-4859-b821-a75cec5ee0da,2f016ec8-7ab4-42f6-9a4b-0b17b231d3ac_wxs_gdc_r...,WXS,Aligned Reads,Sequencing Reads,TARGET-40-0A4I8O,TARGET-40-0A4I8O-10A,Blood Derived Normal,TARGET-40-0A4I8O-10A-01D,BWA with Mark Duplicates and BQSR,0da41968-16d3-4db3-b761-680c890bd6ee,,
3,b456d94f-9b65-4df9-b641-d889d0b0e273,5408aaba-d172-525f-9c8d-22c9b6b88eef_wxs_gdc_r...,WXS,Aligned Reads,Sequencing Reads,TARGET-40-PANPUM,TARGET-40-PANPUM-10A,Blood Derived Normal,TARGET-40-PANPUM-10A-01D,BWA with Mark Duplicates and BQSR,1f115a0d-7918-49fb-a97a-d8b88db3ce6f,,
4,1292f8ac-24ce-4f94-8fe9-e3da52e5ae49,47ac282c-9780-5606-8173-688dda823a1b_wxs_gdc_r...,WXS,Aligned Reads,Sequencing Reads,TARGET-40-NAAEDH,TARGET-40-NAAEDH-08A,Post neo-adjuvant therapy,TARGET-40-NAAEDH-08A-01D,BWA with Mark Duplicates and BQSR,899646a7-d428-4616-a32e-c155fc7d1826,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10675,77435788-f3c6-4059-a14f-001e949ddd57,ef5ac7e5-ca44-592c-96ec-4640c6c916bc_wxs_gdc_r...,WXS,Aligned Reads,Sequencing Reads,TARGET-40-PAUTYB,TARGET-40-PAUTYB-01A,Primary Tumor,TARGET-40-PAUTYB-01A-01D,BWA with Mark Duplicates and BQSR,05a19385-7d77-4429-83fe-42f2cae7f70f,,
10676,77435788-f3c6-4059-a14f-001e949ddd57,ef5ac7e5-ca44-592c-96ec-4640c6c916bc_wxs_gdc_r...,WXS,Aligned Reads,Sequencing Reads,TARGET-40-PAUTYB,TARGET-40-PAUTYB-01A,Primary Tumor,TARGET-40-PAUTYB-01A-01D,BWA with Mark Duplicates and BQSR,6a63607a-1809-4813-a827-67ce954edf25,,
10677,77435788-f3c6-4059-a14f-001e949ddd57,ef5ac7e5-ca44-592c-96ec-4640c6c916bc_wxs_gdc_r...,WXS,Aligned Reads,Sequencing Reads,TARGET-40-PAUTYB,TARGET-40-PAUTYB-01A,Primary Tumor,TARGET-40-PAUTYB-01A-01D,BWA with Mark Duplicates and BQSR,77cac466-ce20-499c-aa54-b61dda3301ee,,
10678,77435788-f3c6-4059-a14f-001e949ddd57,ef5ac7e5-ca44-592c-96ec-4640c6c916bc_wxs_gdc_r...,WXS,Aligned Reads,Sequencing Reads,TARGET-40-PAUTYB,TARGET-40-PAUTYB-01A,Primary Tumor,TARGET-40-PAUTYB-01A-01D,BWA with Mark Duplicates and BQSR,8e741adf-1ee2-4943-be39-6fe450497a26,,


In [82]:
# Save the table as tab-separated text without the index column.
# Note that the file keeps a `.csv` extension even though the delimiter is a tab.
lib_info_path = './target-wxs.lib-info.csv'
df.to_csv(lib_info_path, sep="\t", index=False)

In [78]:
import json
# Keep a copy of the parsed GDC response on disk as JSON.
with open('data.json', mode='w') as response_file:
    json.dump(gdc_response, fp=response_file)

In [91]:
# Second query: for a hand-picked list of aliquots, fetch the experimental strategy
# and analysis workflow of the files associated with them.
gdc_url = 'https://api.gdc.cancer.gov/files'
headers = {'Content-Type': 'application/json'}

fields = ','.join([
    'experimental_strategy',
    'cases.samples.portions.analytes.aliquots.submitter_id',
    'analysis.workflow_type'
])

# Aliquot submitter ids to look up.
aliquot_ids = [
    'TARGET-30-PAPHPE-01A-01W',
    'TARGET-40-0A4I42-01A-01D',
    'TARGET-50-PAJMLZ-01A-01D',
    'TARGET-50-PAEAFB-01A-01D',
    'TARGET-50-PAJMIZ-01A-01D',
    'TARGET-50-PAKULH-01A-02D',
    'TARGET-40-PAUBIT-01A-01D',
    'TARGET-40-PAUUML-01A-01D',
    'TARGET-40-PAUVUL-01A-01D',
    'TARGET-40-PAVALD-01A-01D',
    'TARGET-40-PAUYTT-01A-01D',
    'TARGET-40-PAVCLP-01A-01D',
    'TARGET-10-PATXNR-09A-01D',
    'TARGET-10-PATZYR-03A-01D',
    'TARGET-10-PASMHF-09A-01D',
    'TARGET-10-PASRMM-09A-01D',
    'TARGET-10-PASSHC-09A-01D',
    'TARGET-10-PASWSR-09A-01D',
    'TARGET-10-PATEMI-09A-01D',
    'TARGET-10-PATEYS-09A-01D',
    'TARGET-10-PARPRW-04A-01D',
    'TARGET-20-PAEEYP-04A-01D',
    'TARGET-20-PAKKBK-04A-01D',
    'TARGET-20-PASRTP-40A-01D',
    'TARGET-40-PATKSS-01A-01D',
    'TARGET-30-PAPHPE-10A-01W',
    'TARGET-30-PAMMXF-10A-01W',
    'TARGET-40-PAVECB-10A-01D',
    'TARGET-50-PAKXXF-10A-01D',
    'TARGET-50-PALDTE-10A-01D',
    'TARGET-40-PAUBIT-10A-01Y',
    'TARGET-40-PAUUML-10A-01D',
    'TARGET-40-PAUVUL-10A-01D',
    'TARGET-40-PAVALD-10A-01D',
    'TARGET-40-PAUYTT-10A-01D',
    'TARGET-40-PAUTWB-10A-01D',
    'TARGET-40-PAUTYB-10A-01D',
    'TARGET-40-PAUXPZ-10A-01D',
    'TARGET-40-PAVCLP-10A-01D',
    'TARGET-40-PAVDTY-10A-01D',
    'TARGET-10-PASKAY-10A-01D',
    'TARGET-10-PATRUN-10A-01D',
    'TARGET-10-PAUBCB-10A-01D',
    'TARGET-10-PAUBCT-10A-01D',
    'TARGET-10-PASUSV-10A-01D',
    'TARGET-10-PASWZJ-10A-01D',
    'TARGET-10-PATEMI-10A-01D',
    'TARGET-10-PATFWF-10A-01D',
    'TARGET-40-PATKSS-10A-01D',
    'TARGET-30-PAPTAN-14A-01D'
]

payload = json.dumps({
    'filters': {
        'op': '=',
        'content': {
            'field': 'cases.samples.portions.analytes.aliquots.submitter_id',
            'value': aliquot_ids
        }
    },
    'format': 'json',
    'fields': fields,
    # Make sure we get all the returns.
    # NOTE(review): the number of files per aliquot is not known up front, so 5000 is
    # only a guess at "large enough" -- confirm the response was not truncated.
    'size': 5000
})

http_response = requests.post(gdc_url, headers=headers, data=payload)
# Fail loudly on an HTTP error instead of parsing an error body as if it were results.
http_response.raise_for_status()
gdc_response = http_response.json()

# The per-read-group flattening loop used for the manifest query is deliberately not
# repeated here: this request asks for only three fields, so that loop's lookups of
# file_id, file_name, sample ids and read groups would fail on these hits.
gdc_response

{'data': {'hits': [{'id': '8f6a3259-9181-42ad-8561-f9b0b17b108e',
    'cases': [{'samples': [{'portions': [{'analytes': [{'aliquots': [{'submitter_id': 'TARGET-40-PAUYTT-01A-01D'}]}]}]},
       {'portions': [{'analytes': [{'aliquots': [{'submitter_id': 'TARGET-40-PAUYTT-10A-01D'}]}]}]}]}],
    'analysis': {'workflow_type': 'ASCAT2'},
    'experimental_strategy': 'Genotyping Array'},
   {'id': '3e7c17a1-af6f-4011-88d1-17b1f3b73cb7',
    'cases': [{'samples': [{'portions': [{'analytes': [{'aliquots': [{'submitter_id': 'TARGET-40-PAUYTT-01A-01D'}]}]}]},
       {'portions': [{'analytes': [{'aliquots': [{'submitter_id': 'TARGET-40-PAUYTT-10A-01D'}]}]}]}]}],
    'analysis': {'workflow_type': 'ASCAT2'},
    'experimental_strategy': 'Genotyping Array'},
   {'id': 'ba3fb3c7-4beb-4d19-8a32-71084a8b222f',
    'cases': [{'samples': [{'portions': [{'analytes': [{'aliquots': [{'submitter_id': 'TARGET-40-PATKSS-10A-01D'}]}]}]},
       {'portions': [{'analytes': [{'aliquots': [{'submitter_id': 'TARGET

In [92]:
# Render the parsed response as JSON text with 2-space indentation and print it,
# which is easier to scan than the raw dict repr.
json_formatted_str = json.dumps(obj=gdc_response, indent=2)
print(json_formatted_str)

{
  "data": {
    "hits": [
      {
        "id": "8f6a3259-9181-42ad-8561-f9b0b17b108e",
        "cases": [
          {
            "samples": [
              {
                "portions": [
                  {
                    "analytes": [
                      {
                        "aliquots": [
                          {
                            "submitter_id": "TARGET-40-PAUYTT-01A-01D"
                          }
                        ]
                      }
                    ]
                  }
                ]
              },
              {
                "portions": [
                  {
                    "analytes": [
                      {
                        "aliquots": [
                          {
                            "submitter_id": "TARGET-40-PAUYTT-10A-01D"
                          }
                        ]
                      }
                    ]
                  }
                ]
              }
            ]
         