# Data Commons Pilot - TOPMed Public Metadata Export

This demo shows how metadata can be exported from the Gen3 platform.

In [45]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import dcp_analysis_functions as dcp
# Hand the local credentials file to the dcp helper module.
# NOTE(review): presumably these keys authenticate the later dcp.query_* calls;
# confirm against dcp_analysis_functions.
dcp.add_keys('credentials.json')

### Get summary metrics for each data type in the data-model for one project:

In [46]:
# Per-category record counts for the 'topmed-public' project. This is the
# cell's last expression, so the returned table is displayed as the output.
dcp.query_summary_counts('topmed-public')

Category,Counts
Cases,5.0
,
Studies,1.0
,
Demographic records,5.0
,
Diagnosis records,0.0
,
Exposure records,0.0
,


## Get an index of CRAM files

First, we'll define a query to collect the submitted aligned reads for the public TOPMed project (`topmed-public`).

In [54]:
from pmap.core import pmap

# GraphQL query for the submitted_aligned_reads nodes of a project, paged
# with $first/$offset. Besides the file metadata it requests `_links`, whose
# id/type pairs are what the index-lookup cell further down reads to find the
# aligned_reads_index record of each alignment.
query = """
query AlignedReads ($projectID: [String], $first: Int, $offset: Int) 
    {
     submitted_aligned_reads(project_id: $projectID, first: $first, offset: $offset) {
        id
        file_name
        file_size
        md5sum
        submitter_id
        updated_datetime
        created_datetime
       _links {
        id
        type
      }
        
      }
    }

"""
# Request a single record (first=1, offset=0) from the 'topmed-public' project.
variables = {'first': 1, 'offset': 0, 'projectID': 'topmed-public'}
# NOTE(review): the return value is discarded and, because this is not the
# cell's last expression, nothing is displayed. Presumably a smoke test that
# the query is accepted; confirm whether this extra request is still wanted.
dcp.query_api(query, variables)

def print_query(query, variables):
    """Run `query` with `variables` via dcp.query_api, echo the result, return it.

    The response is printed before being handed back, so every call leaves a
    line in the cell output as well as producing a value.
    """
    result = dcp.query_api(query, variables)
    print(result)
    return result

# Number of submitted_aligned_reads records to fetch, one record per request.
# NOTE(review): 107 is hard-coded; presumably it is the number of alignments
# in 'topmed-public' at the time of writing. Confirm before reusing elsewhere.
N_ALIGNMENTS = 107

# One set of query variables per record: page size 1, offset 0..N_ALIGNMENTS-1.
pages = [{'first': 1, 'offset': offset, 'projectID': 'topmed-public'}
         for offset in range(N_ALIGNMENTS)]

# Build a real list rather than calling map(): on Python 3 map() returns a
# one-shot iterator, which would break the later len(alignments) call and the
# second pass over the responses when they are paired with their indices.
alignments = [print_query(query, page) for page in pages]

{u'data': {u'submitted_aligned_reads': [{u'updated_datetime': u'2018-04-15T21:54:21.545596+00:00', u'created_datetime': u'2018-01-30T21:37:29.252021+00:00', u'file_name': u'NWD319341.recab.cram', u'md5sum': u'0044399fa0f9356565c8cc0234217338', u'submitter_id': u'NWD319341-cram', u'_links': [{u'type': u'read_group', u'id': u'337a3f79-2c69-4553-b887-16360d693254'}, {u'type': u'aligned_reads_index', u'id': u'28264de4-93b3-44ba-9a0a-b8a45a4a1a22'}, {u'type': u'alignment_workflow', u'id': u'eb0dd251-f364-4085-9c21-a4a995e0fa15'}, {u'type': u'core_metadata', u'id': u'ea7a44f2-6edc-4f59-85ae-2830bec0e9a4'}], u'file_size': 23438579833, u'id': u'73b12502-f60c-5f81-bd35-b0afc4678759'}]}}
{u'data': {u'submitted_aligned_reads': [{u'updated_datetime': u'2018-04-14T19:57:04.632848+00:00', u'created_datetime': u'2018-01-31T13:10:18.401187+00:00', u'file_name': u'NWD991001.recab.cram', u'md5sum': u'e1bb6e18d3a61e7c0c83b597733c7e15', u'submitter_id': u'NWD991001-cram', u'_links': [{u'type': u'read_grou

{u'data': {u'submitted_aligned_reads': [{u'updated_datetime': u'2018-04-14T19:57:04.632848+00:00', u'created_datetime': u'2018-01-31T13:10:18.401187+00:00', u'file_name': u'NWD897509.recab.cram', u'md5sum': u'aac42f2e3228517bccd914b37ee2d3ab', u'submitter_id': u'NWD897509-cram', u'_links': [{u'type': u'read_group', u'id': u'd20a460e-4e52-4c0a-9afa-c9241e13466d'}, {u'type': u'aligned_reads_index', u'id': u'884539b7-9dd9-48e8-b88f-fb1520fa4c42'}, {u'type': u'alignment_workflow', u'id': u'255514c1-d78f-4044-802a-d6ed35e2f2f2'}, {u'type': u'core_metadata', u'id': u'e4599e0e-bc79-4374-80a9-ac684cfb6e77'}], u'file_size': 16643703287, u'id': u'eb434ec6-8521-58b2-8cc7-b6c718ad8ea9'}]}}
{u'data': {u'submitted_aligned_reads': [{u'updated_datetime': u'2018-04-14T19:57:04.632848+00:00', u'created_datetime': u'2018-01-31T13:10:18.401187+00:00', u'file_name': u'NWD886731.recab.cram', u'md5sum': u'6e41aea6f511f774260b26de15424a99', u'submitter_id': u'NWD886731-cram', u'_links': [{u'type': u'read_grou

{u'data': {u'submitted_aligned_reads': [{u'updated_datetime': u'2018-04-14T19:57:04.632848+00:00', u'created_datetime': u'2018-01-30T21:37:29.252021+00:00', u'file_name': u'NWD805667.recab.cram', u'md5sum': u'74c0d700532b973e96748584a2dc8dad', u'submitter_id': u'NWD805667-cram', u'_links': [{u'type': u'read_group', u'id': u'85f5f7e0-5b22-4dcb-89da-bc4f1d8c6fc6'}, {u'type': u'aligned_reads_index', u'id': u'32669628-341a-4510-bce9-3d3551a95214'}, {u'type': u'alignment_workflow', u'id': u'1a52df4e-df6c-4cd5-bf27-b6e3477400ac'}, {u'type': u'core_metadata', u'id': u'333cf17a-352f-4133-a042-d4705c09e650'}], u'file_size': 21964459067, u'id': u'adcda4e3-9b2b-5aba-b00b-ace40d14d1a3'}]}}
{u'data': {u'submitted_aligned_reads': [{u'updated_datetime': u'2018-04-14T19:57:04.632848+00:00', u'created_datetime': u'2018-01-30T21:37:29.252021+00:00', u'file_name': u'NWD791319.recab.cram', u'md5sum': u'6315166a1ed3e7c6cc04b2108102f05a', u'submitter_id': u'NWD791319-cram', u'_links': [{u'type': u'read_grou

{u'data': {u'submitted_aligned_reads': [{u'updated_datetime': u'2018-04-14T19:57:04.632848+00:00', u'created_datetime': u'2018-01-31T13:10:18.401187+00:00', u'file_name': u'NWD754590.recab.cram', u'md5sum': u'599431c7288be66e3818c0985edbac85', u'submitter_id': u'NWD754590-cram', u'_links': [{u'type': u'read_group', u'id': u'5c70d56c-5638-4407-b36f-c05af7f554c7'}, {u'type': u'aligned_reads_index', u'id': u'4b5cf2a8-05eb-49ce-93b4-b72f17dbb90a'}, {u'type': u'alignment_workflow', u'id': u'9e610999-76e9-43f3-b31c-f83209008f6c'}, {u'type': u'core_metadata', u'id': u'd120e3e8-42d7-4bce-9115-34b8731a28b3'}], u'file_size': 28978819118, u'id': u'08d151ba-8593-5621-841e-167cad956657'}]}}
{u'data': {u'submitted_aligned_reads': [{u'updated_datetime': u'2018-04-14T19:57:04.632848+00:00', u'created_datetime': u'2018-01-31T13:10:18.401187+00:00', u'file_name': u'NWD746396.recab.cram', u'md5sum': u'919987c75a93cce6645bb752314b3e84', u'submitter_id': u'NWD746396-cram', u'_links': [{u'type': u'read_grou

{u'data': {u'submitted_aligned_reads': [{u'updated_datetime': u'2018-04-14T19:57:04.632848+00:00', u'created_datetime': u'2018-01-31T13:10:18.401187+00:00', u'file_name': u'NWD651359.recab.cram', u'md5sum': u'5e3b51845b1167f7d8b1f78953b0ac87', u'submitter_id': u'NWD651359-cram', u'_links': [{u'type': u'read_group', u'id': u'de6aeb1b-3e6c-4135-b8d5-4ef530df3b3e'}, {u'type': u'aligned_reads_index', u'id': u'faf60f17-2e18-4e7a-a6a8-80b30d77b3a9'}, {u'type': u'alignment_workflow', u'id': u'91ff9cc1-9b32-4664-ac11-a7f7bbc2f973'}, {u'type': u'core_metadata', u'id': u'c093f0cc-9a42-406c-92ce-38aef8d3db5c'}], u'file_size': 21917446857, u'id': u'1c5de3b1-69dd-543b-bd2a-dfa41c7a2921'}]}}
{u'data': {u'submitted_aligned_reads': [{u'updated_datetime': u'2018-04-14T19:57:04.632848+00:00', u'created_datetime': u'2018-01-31T13:10:18.401187+00:00', u'file_name': u'NWD651283.recab.cram', u'md5sum': u'7ef80ed0f53f250123ac2860f199314d', u'submitter_id': u'NWD651283-cram', u'_links': [{u'type': u'read_grou

{u'data': {u'submitted_aligned_reads': [{u'updated_datetime': u'2018-04-14T19:57:04.632848+00:00', u'created_datetime': u'2018-01-30T21:37:29.252021+00:00', u'file_name': u'NWD548310.recab.cram', u'md5sum': u'140c79cdc6a0aac8f567925734e9e89f', u'submitter_id': u'NWD548310-cram', u'_links': [{u'type': u'read_group', u'id': u'09c8a109-087c-468e-a3ae-096112684f27'}, {u'type': u'aligned_reads_index', u'id': u'b5d72965-bbcf-4202-a5f0-9c593c965c91'}, {u'type': u'alignment_workflow', u'id': u'1062a71c-5150-4af7-b072-fe7a611553f0'}, {u'type': u'core_metadata', u'id': u'66f13fb3-5896-42e2-b93c-da66393f84f1'}], u'file_size': 23046599563, u'id': u'fb50df74-3602-5692-be16-17f41b84de01'}]}}
{u'data': {u'submitted_aligned_reads': [{u'updated_datetime': u'2018-04-14T19:57:04.632848+00:00', u'created_datetime': u'2018-01-30T21:37:29.252021+00:00', u'file_name': u'NWD535753.recab.cram', u'md5sum': u'd63b713b54ce42d9f0a765b88517ae3a', u'submitter_id': u'NWD535753-cram', u'_links': [{u'type': u'read_grou

{u'data': {u'submitted_aligned_reads': [{u'updated_datetime': u'2018-04-14T19:57:04.632848+00:00', u'created_datetime': u'2018-01-30T21:37:29.252021+00:00', u'file_name': u'NWD470340.recab.cram', u'md5sum': u'5e7d55ce0e71c1e6aaac8ecf5decd00c', u'submitter_id': u'NWD470340-cram', u'_links': [{u'type': u'read_group', u'id': u'e7653168-4308-48c1-bc07-fa76ea1448eb'}, {u'type': u'aligned_reads_index', u'id': u'bb0a9d07-65d6-4b9d-bd12-8fa5e5ccd050'}, {u'type': u'alignment_workflow', u'id': u'22ba898c-3b4b-4679-b0a5-5c04862cc839'}, {u'type': u'core_metadata', u'id': u'9fd50a3a-e3b7-4ecf-8c96-5acdabd4249f'}], u'file_size': 17718046948, u'id': u'2eaea47c-7f04-5d9a-a65d-ba0a9e61d951'}]}}
{u'data': {u'submitted_aligned_reads': [{u'updated_datetime': u'2018-04-14T19:57:04.632848+00:00', u'created_datetime': u'2018-01-30T21:37:29.252021+00:00', u'file_name': u'NWD465900.recab.cram', u'md5sum': u'0e50a4643166261ed442a90d7fe981c1', u'submitter_id': u'NWD465900-cram', u'_links': [{u'type': u'read_grou

{u'data': {u'submitted_aligned_reads': [{u'updated_datetime': u'2018-04-14T19:57:04.632848+00:00', u'created_datetime': u'2018-01-30T21:37:29.252021+00:00', u'file_name': u'NWD315403.recab.cram', u'md5sum': u'3668a7438c86d4f64ce7b3f9cfa8c887', u'submitter_id': u'NWD315403-cram', u'_links': [{u'type': u'read_group', u'id': u'a1160382-0d97-477c-b0db-e89a907cde16'}, {u'type': u'aligned_reads_index', u'id': u'842baf63-6539-44d1-8de7-7eea0d8c020e'}, {u'type': u'alignment_workflow', u'id': u'9c0f7b06-c6fa-465f-8f0c-732b4c82e813'}, {u'type': u'core_metadata', u'id': u'a24bb063-7d49-4c59-9830-4297df752c14'}], u'file_size': 19404138841, u'id': u'1cd619cf-7a81-5286-a623-fd58b4720abd'}]}}
{u'data': {u'submitted_aligned_reads': [{u'updated_datetime': u'2018-04-14T19:57:04.632848+00:00', u'created_datetime': u'2018-01-30T21:37:29.252021+00:00', u'file_name': u'NWD315195.recab.cram', u'md5sum': u'672667330160da3b80c24bddfa68da14', u'submitter_id': u'NWD315195-cram', u'_links': [{u'type': u'read_grou

{u'data': {u'submitted_aligned_reads': [{u'updated_datetime': u'2018-04-14T19:57:04.632848+00:00', u'created_datetime': u'2018-01-30T21:37:29.252021+00:00', u'file_name': u'NWD230091.recab.cram', u'md5sum': u'e89690ec6d76bb34cb521c0b53b3347e', u'submitter_id': u'NWD230091-cram', u'_links': [{u'type': u'read_group', u'id': u'5df87f40-fb8c-430a-83f0-97df02ce23fd'}, {u'type': u'aligned_reads_index', u'id': u'f1af8f44-7439-468d-a168-65d7064c9a62'}, {u'type': u'alignment_workflow', u'id': u'e81a47c5-5443-4375-b72c-bf6cf4e5a67a'}, {u'type': u'core_metadata', u'id': u'7cd1c7f2-e0b3-4090-8454-caf9563b2539'}], u'file_size': 38868223528, u'id': u'1f7730ef-42b9-51bd-b74e-0fc3e991d9e4'}]}}
{u'data': {u'submitted_aligned_reads': [{u'updated_datetime': u'2018-04-14T19:57:04.632848+00:00', u'created_datetime': u'2018-01-30T21:37:29.252021+00:00', u'file_name': u'NWD191048.recab.cram', u'md5sum': u'ec324e174789e9f35b1842dcf3cc34ab', u'submitter_id': u'NWD191048-cram', u'_links': [{u'type': u'read_grou

### Get the indexes for each alignment

We'll now build a list of the index records for each alignment as well, using the `aligned_reads_index` link attached to each alignment.

In [55]:
# GraphQL query that looks up a single aligned_reads_index node by its id and
# returns its file metadata. This rebinds `query`, replacing the AlignedReads
# query defined in the previous cell.
query = """
query ReadIndexes ($id: String) 
    {
     aligned_reads_index(id: $id) {
        id
        file_name
        file_size
        md5sum
        submitter_id
        updated_datetime
        created_datetime
      }
    }

"""
# NOTE(review): ReadIndexes only declares $id, so these paging variables are
# not consumed by it. The assignment is kept so `variables` keeps the value it
# had before this cell, in case a later cell reads it.
variables = {'first': 1, 'offset': 0, 'projectID': 'topmed-public'}

# Collect the id of every `aligned_reads_index` link. Each response in
# `alignments` was requested with first=1, so element [0] is its only record.
index_ids = [
    link['id']
    for alignment in alignments
    for link in alignment['data']['submitted_aligned_reads'][0]['_links']
    if link['type'] == 'aligned_reads_index'
]

# Sanity check: the two counts should match (one index per alignment),
# because a later cell pairs alignments and indices purely by position.
print(len(alignments))
print(len(index_ids))
pages = [{'id': index_id} for index_id in index_ids]

# Build a list rather than calling map(): on Python 3 map() is a lazy,
# one-shot iterator, and the result is zipped and subscripted further down.
indices = [print_query(query, page) for page in pages]

107
107
{u'data': {u'aligned_reads_index': [{u'updated_datetime': u'2018-04-14T19:57:36.324661+00:00', u'created_datetime': u'2018-01-30T21:47:17.975531+00:00', u'file_name': u'NWD319341.recab.cram.crai', u'md5sum': u'c5fd2b9973f68840a4d9f155376d009a', u'submitter_id': u'NWD319341-crai', u'file_size': 1429317, u'id': u'28264de4-93b3-44ba-9a0a-b8a45a4a1a22'}]}}
{u'data': {u'aligned_reads_index': [{u'updated_datetime': u'2018-04-14T19:57:36.324661+00:00', u'created_datetime': u'2018-01-31T13:10:52.768064+00:00', u'file_name': u'NWD991001.recab.cram.crai', u'md5sum': u'b68e51eb464ce2450be49236ad5d1bf8', u'submitter_id': u'NWD991001-crai', u'file_size': 1472693, u'id': u'1db6a13d-7c08-47d8-967d-cc2dca2accf1'}]}}
{u'data': {u'aligned_reads_index': [{u'updated_datetime': u'2018-04-14T19:57:36.324661+00:00', u'created_datetime': u'2018-01-31T13:10:52.768064+00:00', u'file_name': u'NWD968809.recab.cram.crai', u'md5sum': u'7a2436d139d61904d9f4607ea8e06139', u'submitter_id': u'NWD968809-crai', u

{u'data': {u'aligned_reads_index': [{u'updated_datetime': u'2018-04-14T19:57:36.324661+00:00', u'created_datetime': u'2018-01-30T21:47:17.975531+00:00', u'file_name': u'NWD805667.recab.cram.crai', u'md5sum': u'00a90dca71b204f23f990fd7949fa31e', u'submitter_id': u'NWD805667-crai', u'file_size': 1543172, u'id': u'32669628-341a-4510-bce9-3d3551a95214'}]}}
{u'data': {u'aligned_reads_index': [{u'updated_datetime': u'2018-04-14T19:57:36.324661+00:00', u'created_datetime': u'2018-01-30T21:47:17.975531+00:00', u'file_name': u'NWD791319.recab.cram.crai', u'md5sum': u'83ec7aba08568885338a0bcaec70ff73', u'submitter_id': u'NWD791319-crai', u'file_size': 1632221, u'id': u'6f1f6566-c2c5-4f49-9801-ef0bfb1eb594'}]}}
{u'data': {u'aligned_reads_index': [{u'updated_datetime': u'2018-04-14T19:57:36.324661+00:00', u'created_datetime': u'2018-01-30T21:47:17.975531+00:00', u'file_name': u'NWD790783.recab.cram.crai', u'md5sum': u'eea145c27504595a7a050c1ffb71129d', u'submitter_id': u'NWD790783-crai', u'file_si

{u'data': {u'aligned_reads_index': [{u'updated_datetime': u'2018-04-14T19:57:36.324661+00:00', u'created_datetime': u'2018-01-31T13:10:52.768064+00:00', u'file_name': u'NWD651359.recab.cram.crai', u'md5sum': u'd2fb8f943c737c43eebbf55cb6c37828', u'submitter_id': u'NWD651359-crai', u'file_size': 1369119, u'id': u'faf60f17-2e18-4e7a-a6a8-80b30d77b3a9'}]}}
{u'data': {u'aligned_reads_index': [{u'updated_datetime': u'2018-04-14T19:57:36.324661+00:00', u'created_datetime': u'2018-01-31T13:10:52.768064+00:00', u'file_name': u'NWD651283.recab.cram.crai', u'md5sum': u'5351eb7794e34ee21f05a462b17de713', u'submitter_id': u'NWD651283-crai', u'file_size': 1600894, u'id': u'51d67e4d-4d0e-4ce4-9052-4add5c39de9c'}]}}
{u'data': {u'aligned_reads_index': [{u'updated_datetime': u'2018-04-14T19:57:36.324661+00:00', u'created_datetime': u'2018-01-31T13:10:52.768064+00:00', u'file_name': u'NWD651125.recab.cram.crai', u'md5sum': u'e68e97d8cdba017ea2244943f13920f4', u'submitter_id': u'NWD651125-crai', u'file_si

{u'data': {u'aligned_reads_index': [{u'updated_datetime': u'2018-04-14T19:57:36.324661+00:00', u'created_datetime': u'2018-01-30T21:47:17.975531+00:00', u'file_name': u'NWD470340.recab.cram.crai', u'md5sum': u'c28362d8200c2fe7667cd3839a4e9e9f', u'submitter_id': u'NWD470340-crai', u'file_size': 1686586, u'id': u'bb0a9d07-65d6-4b9d-bd12-8fa5e5ccd050'}]}}
{u'data': {u'aligned_reads_index': [{u'updated_datetime': u'2018-04-14T19:57:36.324661+00:00', u'created_datetime': u'2018-01-30T21:47:17.975531+00:00', u'file_name': u'NWD465900.recab.cram.crai', u'md5sum': u'16cef10bcee60df0c92f9576cfb9fc72', u'submitter_id': u'NWD465900-crai', u'file_size': 1133121, u'id': u'88678b10-0e14-48f2-a393-cb5a30843177'}]}}
{u'data': {u'aligned_reads_index': [{u'updated_datetime': u'2018-04-14T19:57:36.324661+00:00', u'created_datetime': u'2018-01-30T21:47:17.975531+00:00', u'file_name': u'NWD463423.recab.cram.crai', u'md5sum': u'cd8326bd93c9f99b1ec922fc6f16670e', u'submitter_id': u'NWD463423-crai', u'file_si

{u'data': {u'aligned_reads_index': [{u'updated_datetime': u'2018-04-14T19:57:36.324661+00:00', u'created_datetime': u'2018-01-30T21:47:17.975531+00:00', u'file_name': u'NWD230091.recab.cram.crai', u'md5sum': u'f1620b1d31c626d06b3f33d6b0b1b86d', u'submitter_id': u'NWD230091-crai', u'file_size': 2383606, u'id': u'f1af8f44-7439-468d-a168-65d7064c9a62'}]}}
{u'data': {u'aligned_reads_index': [{u'updated_datetime': u'2018-04-14T19:57:36.324661+00:00', u'created_datetime': u'2018-01-30T21:47:17.975531+00:00', u'file_name': u'NWD191048.recab.cram.crai', u'md5sum': u'532aec198bbe91c3636210661a5e1e25', u'submitter_id': u'NWD191048-crai', u'file_size': 1192848, u'id': u'eb073361-a7ac-4d90-836e-7e45980a9172'}]}}
{u'data': {u'aligned_reads_index': [{u'updated_datetime': u'2018-04-14T19:57:36.324661+00:00', u'created_datetime': u'2018-01-30T21:47:17.975531+00:00', u'file_name': u'NWD183321.recab.cram.crai', u'md5sum': u'0f375f5d2778908108fbe99b16e9671c', u'submitter_id': u'NWD183321-crai', u'file_si

Now that we can get metadata for submitted aligned reads and indices, we can begin to generate a bundle-like object that brings each alignment and its index together (for now, as a tuple of the two responses).

In [56]:
# Pair each alignment response with the index response at the same position.
# list() keeps the result subscriptable on Python 3, where zip() is lazy; on
# Python 2 zip() already returns a list, so the output is unchanged there.
raw_bundles = list(zip(alignments, indices))
print(raw_bundles[0])

({u'data': {u'submitted_aligned_reads': [{u'updated_datetime': u'2018-04-15T21:54:21.545596+00:00', u'created_datetime': u'2018-01-30T21:37:29.252021+00:00', u'file_name': u'NWD319341.recab.cram', u'md5sum': u'0044399fa0f9356565c8cc0234217338', u'submitter_id': u'NWD319341-cram', u'_links': [{u'type': u'read_group', u'id': u'337a3f79-2c69-4553-b887-16360d693254'}, {u'type': u'aligned_reads_index', u'id': u'28264de4-93b3-44ba-9a0a-b8a45a4a1a22'}, {u'type': u'alignment_workflow', u'id': u'eb0dd251-f364-4085-9c21-a4a995e0fa15'}, {u'type': u'core_metadata', u'id': u'ea7a44f2-6edc-4f59-85ae-2830bec0e9a4'}], u'file_size': 23438579833, u'id': u'73b12502-f60c-5f81-bd35-b0afc4678759'}]}}, {u'data': {u'aligned_reads_index': [{u'updated_datetime': u'2018-04-14T19:57:36.324661+00:00', u'created_datetime': u'2018-01-30T21:47:17.975531+00:00', u'file_name': u'NWD319341.recab.cram.crai', u'md5sum': u'c5fd2b9973f68840a4d9f155376d009a', u'submitter_id': u'NWD319341-crai', u'file_size': 1429317, u'id': 

## Getting indexd URLs for submissions

Now that we have a way of generating the list of alignments to be exported, along with their indices, we also want to provide the cloud URLs where these can be found.

We can get these links from indexd using the identifiers gathered from the metadata service above.

In [65]:
import requests

# Base URL of the indexd service.
indexd_url = 'https://dcp.bionimbus.org/index'

# id of the aligned_reads_index record for NWD319341.recab.cram.crai, copied
# from the metadata output above.
index_record_id = '28264de4-93b3-44ba-9a0a-b8a45a4a1a22'

# A timeout stops an unresponsive server from hanging the kernel indefinitely.
# NOTE(review): this lookup returned 404 when the notebook was last run;
# presumably indexd does not key its records on the metadata node id (or the
# record is not registered). Confirm which identifier indexd expects.
requests.get("{}/{}".format(indexd_url, index_record_id), timeout=30)

<Response [404]>