In [1]:
import requests
import json
import pandas as pd

In [2]:
# Load the GDC download manifest (tab-separated text) into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# re-runs on a machine where this exact file exists.
manifest_path = "/Users/zhuy/Downloads/gdc_manifest.2021-06-04.txt"
manifest = pd.read_csv(manifest_path, sep='\t')

In [4]:
# Endpoint of the GDC "files" API and the header announcing a JSON request body.
gdc_url = 'https://api.gdc.cancer.gov/files'
headers = {'Content-Type': 'application/json'}

# Fields to request for every file: the file name plus four read-group
# attributes, joined into the single comma-separated string sent in the payload.
read_group_prefix = 'analysis.metadata.read_groups.'
read_group_attrs = (
    'read_group_id',
    'library_selection',
    'library_strand',
    'read_length',
)
fields = ','.join(
    ['file_name'] + [read_group_prefix + attr for attr in read_group_attrs]
)

In [5]:
# Build the JSON request body for the GDC files endpoint: select every file
# whose id appears in the manifest and return only the requested fields.
file_ids = manifest.id.tolist()
payload = json.dumps({
    'filters': {
        # 'in' is the GDC filter operator for matching against a list of values.
        'op': 'in',
        'content': {
            'field': 'files.file_id',
            'value': file_ids,
        },
    },
    'format': 'json',
    'fields': fields,
    # Request one hit per manifest id so the result is never truncated by a
    # fixed page size, however large the manifest is.
    'size': len(file_ids),
})

In [6]:
# POST the query to the GDC API and decode the JSON body into a dict.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here rather than as a confusing
# KeyError when the response is parsed further down.
http_response = requests.post(gdc_url, headers=headers, data=payload, timeout=300)
http_response.raise_for_status()
gdc_response = http_response.json()

In [8]:
# Flatten the response into one row per (file, read group):
# [file_name, read_group_id, library_selection, library_strand, read_length].
lib_info = []
for hit in gdc_response['data']['hits']:
    for read_group in hit['analysis']['metadata']['read_groups']:
        lib_info.append([
            hit['file_name'],
            read_group['read_group_id'],
            read_group['library_selection'],
            # Read groups without a library_strand entry get "N/A"; .get()
            # avoids a bare except and leaves the response dict unmodified.
            read_group.get('library_strand', "N/A"),
            read_group['read_length'],
        ])

In [9]:
# Tabulate the collected rows and drop exact duplicate rows.
# Passing the column names to the constructor (instead of assigning
# df.columns afterwards) also works when lib_info is empty, where the
# later assignment would fail with a length mismatch.
lib_info_columns = [
    'file_name',
    'read_group_id',
    'library_selection',
    'library_strand',
    'read_length',
]
df = pd.DataFrame(lib_info, columns=lib_info_columns).drop_duplicates()
df

Unnamed: 0,file_name,read_group_id,library_selection,library_strand,read_length
0,30d453ef-20ad-45b7-8df5-b611a18fd246.rna_seq.g...,f1a21119-f562-4169-902b-4fdbb3fa29b3,Poly-T Enrichment,,101
1,ed8cedc6-8fb6-41bf-ae0b-f270e7e4dee6.rna_seq.g...,6d29d470-303b-4b47-93b2-99f120f1e176,Other,,0
2,631e94bd-6701-44fe-8906-81343d23b48f.rna_seq.g...,512ef7e2-bd1d-4426-b1e5-0e6ab82c3d58,Other,,0
3,b66acbe1-b96e-4b10-a2be-f0ea9b44286e.rna_seq.g...,1c8edb97-8f01-4e49-9beb-d29590e7def7,Other,,0
4,733d8a0f-9696-4a21-b15c-fea97a6ae45e.rna_seq.g...,1b16b425-8a1b-4856-bb5c-a5039e9378cc,Poly-T Enrichment,,81
...,...,...,...,...,...
1646,38e25cba-73ca-486c-ae65-bcaab23b9ac1.rna_seq.g...,a5fa591f-9c4d-48ab-a60a-3bafc76a5386,Other,,0
1647,513bc281-785d-4451-af46-aaa1a98c5827.rna_seq.g...,3e9b8731-abc5-4756-b0cb-4dc6206339d3,Other,,0
1648,da21fe89-67e0-4a85-b98c-228534b8e0bf.rna_seq.g...,6a3009fd-e6ff-4342-8adb-8e5ef91f4e02,,,75
1649,c80e1288-9209-4021-9339-a1c50c8b4d2c.rna_seq.g...,2b1b9c41-f1fd-424f-882c-10f80a0a19ad,,,75


In [10]:
# Write the library-info table to a CSV file in the working directory,
# leaving out the DataFrame index.
output_csv = './target-rnaseq.lib-info.csv'
df.to_csv(output_csv, index=False)