# Export Arbitrary Metadata from DCP

The sheepdog submission tracking software makes it easy to see what has been loaded into the DCP platform.


## List the Metadata Dictionary by Project

In [15]:
import requests  # required by the request helpers defined in this notebook

import dcp_analysis_functions as dcp

# NOTE(review): presumably loads the API credentials so that dcp.auth (read by
# the helpers in this notebook) is populated -- confirm in dcp_analysis_functions.
dcp.add_keys('credentials.json')

# Root of the submission API, and the program/project used in the first example.
base_url = "https://dcp.bionimbus.org/api/v0/submission"
program_name = "topmed"
project_name = "public"

def get_dictionary(program, project):
    """Fetch the metadata dictionary listing for one program/project.

    Sends a GET request to ``<base_url>/<program>/<project>/_dictionary``
    with the bearer token taken from ``dcp.auth`` and returns the decoded
    JSON body of the response.
    """
    endpoint = '{}/{}/{}/_dictionary'.format(base_url, program, project)
    auth_header = {'Authorization': 'bearer ' + dcp.auth.json()['access_token']}
    response = requests.get(endpoint, headers=auth_header)
    return response.json()

print(get_dictionary(program_name, project_name))

{u'links': [u'/v0/submission/topmed/public/_dictionary/medication', u'/v0/submission/topmed/public/_dictionary/electrocardiogram_test', u'/v0/submission/topmed/public/_dictionary/aggregated_snp_array', u'/v0/submission/topmed/public/_dictionary/medical_history', u'/v0/submission/topmed/public/_dictionary/sample', u'/v0/submission/topmed/public/_dictionary/psychosocial_questionnaire', u'/v0/submission/topmed/public/_dictionary/death_record', u'/v0/submission/topmed/public/_dictionary/simple_germline_variation', u'/v0/submission/topmed/public/_dictionary/core_metadata', u'/v0/submission/topmed/public/_dictionary/primary_history', u'/v0/submission/topmed/public/_dictionary/read_group_qc', u'/v0/submission/topmed/public/_dictionary/data_collection', u'/v0/submission/topmed/public/_dictionary/publication', u'/v0/submission/topmed/public/_dictionary/cardiac_ct_scan', u'/v0/submission/topmed/public/_dictionary/demographic', u'/v0/submission/topmed/public/_dictionary/copy_number_workflow', u'/

## Get the JSON Schema for a metadata type

Using the DCP, you can retrieve the JSON schema for a metadata type and use it to validate metadata entries.

In [18]:
def get_schema_for_type(program, project, my_type):
    """Fetch the JSON schema of a single metadata type.

    Sends a GET request to
    ``<base_url>/<program>/<project>/_dictionary/<my_type>`` with the bearer
    token taken from ``dcp.auth`` and returns the decoded JSON body.
    """
    schema_url = '{}/{}/{}/_dictionary/{}'.format(base_url, program, project, my_type)
    auth_header = {'Authorization': 'bearer ' + dcp.auth.json()['access_token']}
    response = requests.get(schema_url, headers=auth_header)
    return response.json()

In [85]:
get_schema_for_type('topmed', 'public', 'cardiac_mri')['properties'].keys()

[u'osbse1',
 u'submitter_id',
 u'apoea11',
 u'odbsd1',
 u'oaormx1',
 u'odapf1',
 u'lungmss1',
 u'massnd1',
 u'osapc1',
 u'project_id',
 u'osbsd1',
 u'r6wm1',
 u'odmda1',
 u'olvef1',
 u'odbse1',
 u'apoea21',
 u'olvsv1t',
 u'globscr1',
 u'omassv1',
 u'postdia1',
 u'osapb1',
 u'osmdf1',
 u'lvedd1c',
 u'r7wm1',
 u'odmdf1',
 u'r12wm1',
 u'odbsf1',
 u'olvedm1t',
 u'osmde1',
 u'created_datetime',
 u'odapd1',
 u'r4wm1',
 u'r15wm1',
 u'osapa1',
 u'state',
 u'refstat1',
 u'predia1',
 u'olvsv1',
 u'aneursm1',
 u'crdout1c',
 u'perieff1',
 u'pheval1',
 u'othalrt1',
 u'osmdd1',
 u'mripp1',
 u'odape1',
 u'ppa_101',
 u'olvesv1',
 u'osbsa1',
 u'osv1',
 u'r3wm1',
 u'postsys1',
 u'olvesv1t',
 u'updated_datetime',
 u'r16wm1',
 u'oardis1',
 u'osbsf1',
 u'osmdc1',
 u'r10wm1',
 u'type',
 u'odapb1',
 u'olvedv1t',
 u'aad1c',
 u'cases',
 u'datrcvd1',
 u'olvef1t',
 u'odbsa1',
 u'osmdb1',
 u'odmdd1',
 u'osapf1',
 u'r8wm1',
 u'odapc1',
 u'ovpm1',
 u'r2wm1',
 u'osbsc1',
 u'odmdb1',
 u'presys1',
 u'id',
 u'osape1',


## Getting Metadata for each Type

Now that we know all the types in the metadata dictionary for a given program and project, we can get the submissions for each.

In [38]:
def get_submissions_by_type(program, project, my_type):
    """Download the export of all submissions of one metadata type.

    Requests ``<base_url>/<program>/<project>/export/?node_label=<my_type>``
    with the bearer token taken from ``dcp.auth`` and returns the raw
    response body (``.content``) without parsing it.
    """
    export_url = '{}/{}/{}/export/?node_label={}'.format(
        base_url, program, project, my_type)
    auth_header = {'Authorization': 'bearer ' + dcp.auth.json()['access_token']}
    # FIXME: the endpoint returns a TSV, so the body is handed back unparsed
    reply = requests.get(export_url, headers=auth_header)
    return reply.content

In [84]:
get_submissions_by_type('topmed', 'public', 'submitted_aligned_reads')[0:500]

'type\tid\tproject_id\tsubmitter_id\tdata_category\tdata_format\tdata_type\texperimental_strategy\tfile_name\tfile_size\tmd5sum\tcore_metadata_records.id\tcore_metadata_records.submitter_id\tread_groups.id#1\tread_groups.submitter_id#1\nsubmitted_aligned_reads\t7733b6cb-811f-5839-a5dc-ac4262ded126\ttopmed-public\tNWD100953-cram\tSequencing Reads\tCRAM\tAligned Reads\tWGS\tNWD100953.recab.cram\t37501686827\tf9e72bdf25bf4b4f0e581d9218fec2eb\t76c66ee2-f0a7-455b-af2f-57461129c9c5\tNWD100953-cram-core-metadata\t4dd92bb3-4e67-41f'

### Coercing to JSON

As a workaround for a current problem in the DCP serialization, we parse the TSV contents back into a list of dictionaries, one per row.



In [56]:
import csv
from StringIO import StringIO

def get_json_submission_by_type(program, project, my_type):
    """Return the exported submissions of ``my_type`` as a list of row dicts.

    The TSV text returned by ``get_submissions_by_type`` is parsed with
    ``csv.DictReader`` (tab-delimited); each data row becomes one dict keyed
    by the header columns. No type conversion is applied to the values.
    """
    tsv_text = get_submissions_by_type(program, project, my_type)
    rows = csv.DictReader(StringIO(tsv_text), delimiter='\t')
    return [row for row in rows]

In [81]:
get_json_submission_by_type('topmed', 'public', 'submitted_aligned_reads')[0]

{'core_metadata_records.id': '76c66ee2-f0a7-455b-af2f-57461129c9c5',
 'core_metadata_records.submitter_id': 'NWD100953-cram-core-metadata',
 'data_category': 'Sequencing Reads',
 'data_format': 'CRAM',
 'data_type': 'Aligned Reads',
 'experimental_strategy': 'WGS',
 'file_name': 'NWD100953.recab.cram',
 'file_size': '37501686827',
 'id': '7733b6cb-811f-5839-a5dc-ac4262ded126',
 'md5sum': 'f9e72bdf25bf4b4f0e581d9218fec2eb',
 'project_id': 'topmed-public',
 'read_groups.id#1': '4dd92bb3-4e67-41fd-a5c1-37e30155c208',
 'read_groups.submitter_id#1': 'NWD100953-rg',
 'submitter_id': 'NWD100953-cram',
 'type': 'submitted_aligned_reads'}

## Selecting all the fields for export

We can now create a function that exports the submissions for every metadata type in a project.

In [58]:
import os

def get_all_submissions(program, project):
    """Export the submissions of every metadata type in a program/project.

    Returns a dict that maps each link from the project's dictionary listing
    (the full link string, not just the type name) to the list of row dicts
    produced by ``get_json_submission_by_type`` for that type.
    """
    dictionary_links = get_dictionary(program, project)['links']
    # The last path component of each link is used as the metadata type name.
    return {
        link: get_json_submission_by_type(program, project, os.path.basename(link))
        for link in dictionary_links
    }

In [59]:
%time all_fields = get_all_submissions('topmed', 'public')

CPU times: user 704 ms, sys: 24 ms, total: 728 ms
Wall time: 27.3 s


In [80]:
all_fields['/v0/submission/topmed/public/_dictionary/submitted_aligned_reads'][0]

{'core_metadata_records.id': '76c66ee2-f0a7-455b-af2f-57461129c9c5',
 'core_metadata_records.submitter_id': 'NWD100953-cram-core-metadata',
 'data_category': 'Sequencing Reads',
 'data_format': 'CRAM',
 'data_type': 'Aligned Reads',
 'experimental_strategy': 'WGS',
 'file_name': 'NWD100953.recab.cram',
 'file_size': '37501686827',
 'id': '7733b6cb-811f-5839-a5dc-ac4262ded126',
 'md5sum': 'f9e72bdf25bf4b4f0e581d9218fec2eb',
 'project_id': 'topmed-public',
 'read_groups.id#1': '4dd92bb3-4e67-41fd-a5c1-37e30155c208',
 'read_groups.submitter_id#1': 'NWD100953-rg',
 'submitter_id': 'NWD100953-cram',
 'type': 'submitted_aligned_reads'}

Since there is a dictionary entry for each metadata type, it should be possible to annotate entries with their schema; however, because the serialization goes through TSV, that is left for future work.

## Getting Metadata for Another Project

We can use the same functions to access metadata from other projects.

In [75]:
%time gtex_metadata = get_all_submissions('gtex', 'GTEx-v7')

CPU times: user 2.2 s, sys: 608 ms, total: 2.8 s
Wall time: 59.1 s


In [76]:
gtex_metadata.keys()

[u'/v0/submission/gtex/GTEx-v7/_dictionary/exposure',
 u'/v0/submission/gtex/GTEx-v7/_dictionary/carotid_ultrasound_test',
 u'/v0/submission/gtex/GTEx-v7/_dictionary/psychosocial_questionnaire',
 u'/v0/submission/gtex/GTEx-v7/_dictionary/diagnosis',
 u'/v0/submission/gtex/GTEx-v7/_dictionary/demographic',
 u'/v0/submission/gtex/GTEx-v7/_dictionary/blood_pressure_test',
 u'/v0/submission/gtex/GTEx-v7/_dictionary/allele_expression',
 u'/v0/submission/gtex/GTEx-v7/_dictionary/aligned_reads_index',
 u'/v0/submission/gtex/GTEx-v7/_dictionary/read_group',
 u'/v0/submission/gtex/GTEx-v7/_dictionary/physical_activity_questionnaire',
 u'/v0/submission/gtex/GTEx-v7/_dictionary/program',
 u'/v0/submission/gtex/GTEx-v7/_dictionary/exon_expression',
 u'/v0/submission/gtex/GTEx-v7/_dictionary/sample',
 u'/v0/submission/gtex/GTEx-v7/_dictionary/snp_array_variation',
 u'/v0/submission/gtex/GTEx-v7/_dictionary/socio_demo_questionnaire',
 u'/v0/submission/gtex/GTEx-v7/_dictionary/root',
 u'/v0/submissio

In [79]:
print(gtex_metadata['/v0/submission/gtex/GTEx-v7/_dictionary/sample'][0].keys())

['pathology_notes', 'biospecimen_anatomic_site', 'internal_notes', 'submitter_id', 'hours_to_collection', 'prosector_notes', 'id', 'collection_kit', 'biospecimen_anatomic_site_uberon_term', 'sample_type', 'autolysis_score', 'cases.id', 'project_id', 'current_weight', 'composition', 'cases.submitter_id', 'oct_embedded', 'is_ffpe', 'biospecimen_anatomic_site_detail', 'method_of_sample_procurement', 'tissue_type', 'collection_site', 'freezing_method', 'biospecimen_physical_site', 'biospecimen_type', 'type', 'preservation_method', 'biospecimen_anatomic_site_uberon_id', 'hours_to_sample_procurement']
