In [87]:
from pfb.reader import PFBReader
from pfb.writer import PFBWriter
from pfb.base import PFBBase
import pfb.importers.gen3dict as gen3dict

from dictionaryutils import DataDictionary, dictionary
import json

import requests

## Example of an ontology reference in a dictionary

In [88]:
# Download the published GTEx data dictionary (schema.json) and show how one
# term from its "_terms.yaml" section carries an external reference in `termDef`.
r = requests.get(
    "https://s3.amazonaws.com/dictionary-artifacts/gtexdictionary/4.4.0/schema.json",
    timeout=30,  # do not let an unresponsive server hang the kernel forever
)
# Fail loudly on an HTTP error rather than trying to parse an error page as JSON.
r.raise_for_status()
j = r.json()

print(json.dumps(j["_terms.yaml"]["biospecimen_anatomic_site"]))

{"description": "Text term that represents the name of the primary disease site of the submitted tumor sample.\n", "termDef": {"term": "Submitted Tumor Sample Primary Anatomic Site", "source": "caDSR", "cde_id": 4742851, "cde_version": 1.0, "term_url": "https://cdebrowser.nci.nih.gov/CDEBrowser/search?elementDetails=9&FirstTimer=0&PageId=ElementDetailsGroup&publicId=4742851&version=1.0"}}


## We now create a PFB file using the same dictionary as above
Alternatively, the same PFB can be created from the command line with the pypfb CLI by running `pfb from -o test_schema.avro dict URL`, where `URL` is the address of the dictionary's `schema.json`.

In [44]:
# Load the data dictionary first, so that a failed download does not leave a
# freshly opened (and empty) output file behind.
d = DataDictionary(url = "https://s3.amazonaws.com/dictionary-artifacts/gtexdictionary/4.4.0/schema.json")
dictionary.init(d)

# NOTE(review): both gen3dict helpers below are underscore-prefixed (private)
# functions of the importer module; confirm there is no public equivalent.
# `_parse_dictionary` yields the three values unpacked here; the second and
# third are fed to `_get_ontology_references` to build the PFB metadata.
records, ontology_references, links = gen3dict._parse_dictionary(d)
metadata = gen3dict._get_ontology_references(ontology_references, links)

# Enter the writer through `with` so that __exit__ always runs, even if a step
# below raises. The original called __enter__ by hand and never released the
# writer before the next cell re-opened the same file for reading.
w = PFBWriter(file_or_path="test_schema.avro")
with w:
    w.set_schema(records)
    w.set_metadata(metadata)
    w.write()

## Now we can read the data from the PFB that we just created and look at the metadata

In [84]:
# Open the PFB written above and bind the entered reader to `r`; the cells
# that follow read `r.metadata` from it.
# To inspect a different file, pass e.g. "bdc_schema.avro" instead.
# NOTE(review): __enter__ is invoked by hand because `r` must outlive this
# cell; nothing visible in this notebook calls __exit__ on it afterwards.
r = PFBReader("test_schema.avro").__enter__()

In [85]:
# Serialise the reader's metadata to a single JSON string, then print it.
metadata_json = json.dumps(r.metadata)
print(metadata_json)

{"nodes": [{"name": "root", "ontology_reference": "", "values": {}, "links": [], "properties": []}, {"name": "data_release", "ontology_reference": "", "values": {}, "links": [{"multiplicity": "MANY_TO_ONE", "dst": "root", "name": "roots"}], "properties": []}, {"name": "reference_file_index", "ontology_reference": "", "values": {}, "links": [{"multiplicity": "MANY_TO_MANY", "dst": "reference_file", "name": "reference_files"}, {"multiplicity": "MANY_TO_MANY", "dst": "core_metadata_collection", "name": "core_metadata_collections"}], "properties": []}, {"name": "subject", "ontology_reference": "", "values": {}, "links": [{"multiplicity": "MANY_TO_MANY", "dst": "study", "name": "studies"}], "properties": []}, {"name": "submitted_unaligned_reads", "ontology_reference": "", "values": {}, "links": [{"multiplicity": "MANY_TO_MANY", "dst": "read_group", "name": "read_groups"}, {"multiplicity": "MANY_TO_MANY", "dst": "core_metadata_collection", "name": "core_metadata_collections"}], "properties":

## Finally, we print all of the external ontology references that exist in this PFB

In [86]:
# Walk every property of every node in the PFB metadata and report those that
# have a non-empty ontology reference together with a "term_url" value.
for node in r.metadata["nodes"]:
    for prop in node["properties"]:
        has_reference = prop["ontology_reference"] != ""
        # Skip properties without a reference or without a term URL; the
        # short-circuit keeps "values" untouched when there is no reference.
        if not has_reference or "term_url" not in prop["values"]:
            continue

        report_lines = (
            ("Dictionary Definiton: ", prop["name"]),
            ("Ontology Reference: ", prop["ontology_reference"]),
            ("Term URL: ", prop["values"]["term_url"]),
        )
        for label, value in report_lines:
            print(label, value)

Dictionary Definiton:  biospecimen_anatomic_site
Ontology Reference:  Submitted Tumor Sample Primary Anatomic Site
Term URL:  https://cdebrowser.nci.nih.gov/CDEBrowser/search?elementDetails=9&FirstTimer=0&PageId=ElementDetailsGroup&publicId=4742851&version=1.0
Dictionary Definiton:  composition
Ontology Reference:  Biospecimen Cellular Composition Type
Term URL:  https://cdebrowser.nci.nih.gov/CDEBrowser/search?elementDetails=9&FirstTimer=0&PageId=ElementDetailsGroup&publicId=5432591&version=1.0
Dictionary Definiton:  current_weight
Ontology Reference:  Tissue Sample Current Weight Milligram Value
Term URL:  https://cdebrowser.nci.nih.gov/CDEBrowser/search?elementDetails=9&FirstTimer=0&PageId=ElementDetailsGroup&publicId=5432606&version=1.0
Dictionary Definiton:  freezing_method
Ontology Reference:  Tissue Sample Freezing Method Type
Term URL:  https://cdebrowser.nci.nih.gov/CDEBrowser/search?elementDetails=9&FirstTimer=0&PageId=ElementDetailsGroup&publicId=5432607&version=1.0
Dictiona