In [1]:
from pfb.reader import PFBReader
from pfb.writer import PFBWriter
from pfb.base import PFBBase
import pfb.importers.gen3dict as gen3dict
import pfb.importers.json as pfbJSON

from dictionaryutils import DataDictionary, dictionary
import json

import requests

## Example of an ontology reference in a dictionary

In [2]:
# Download the published dictionary schema and show one term definition that
# carries an external ontology reference.
# The timeout keeps the cell from hanging on an unresponsive endpoint, and
# raise_for_status() turns an HTTP error into an immediate, readable failure
# instead of a JSON decoding error or KeyError further down.
r = requests.get(
    "https://s3.amazonaws.com/dictionary-artifacts/gtexdictionary/4.4.0/schema.json",
    timeout=60,
)
r.raise_for_status()
j = r.json()

print(json.dumps(j["_terms.yaml"]["biospecimen_anatomic_site"]))

{"description": "Text term that represents the name of the primary disease site of the submitted tumor sample.\n", "termDef": {"term": "Submitted Tumor Sample Primary Anatomic Site", "source": "caDSR", "cde_id": 4742851, "cde_version": 1.0, "term_url": "https://cdebrowser.nci.nih.gov/CDEBrowser/search?elementDetails=9&FirstTimer=0&PageId=ElementDetailsGroup&publicId=4742851&version=1.0"}}


## We now create a PFB file using the same dictionary as above
Alternatively, the same PFB could be created from the command line with the pypfb command-line tool by running `pfb from -o test_schema.avro dict URL`.

In [3]:
# Parse the dictionary first, so a failed download or parse cannot leave an
# opened, empty output file behind.
d = DataDictionary(url = "https://s3.amazonaws.com/dictionary-artifacts/gtexdictionary/4.4.0/schema.json")
dictionary.init(d)

records, ontology_references, links = gen3dict._parse_dictionary(d)

metadata = gen3dict._get_ontology_references(ontology_references, links)

# Use the writer as a context manager instead of a bare __enter__() call:
# the file is then closed when the block ends (even on error), so the next
# cells read a complete file rather than one with pending buffered writes.
w = PFBWriter(file_or_path="test_schema.avro")
with w:
    w.set_schema(records)
    w.set_metadata(metadata)
    # write() with no records stores only the schema and metadata set above.
    w.write()

In [4]:
# Report every property that has a non-empty ontology reference and a term
# URL; everything else is skipped.
for node in metadata["nodes"]:
    for prop in node["properties"]:
        if prop["ontology_reference"] == "":
            continue
        details = prop["values"]
        if "term_url" not in details:
            continue
        print("Dictionary Definiton: ", prop["name"])
        print("Ontology Reference: ", prop["ontology_reference"])
        print("Term URL: ", details["term_url"])
        print("CDE ID: ",  details["cde_id"])

Dictionary Definiton:  biospecimen_anatomic_site
Ontology Reference:  Submitted Tumor Sample Primary Anatomic Site
Term URL:  https://cdebrowser.nci.nih.gov/CDEBrowser/search?elementDetails=9&FirstTimer=0&PageId=ElementDetailsGroup&publicId=4742851&version=1.0
CDE ID:  4742851
Dictionary Definiton:  composition
Ontology Reference:  Biospecimen Cellular Composition Type
Term URL:  https://cdebrowser.nci.nih.gov/CDEBrowser/search?elementDetails=9&FirstTimer=0&PageId=ElementDetailsGroup&publicId=5432591&version=1.0
CDE ID:  5432591
Dictionary Definiton:  current_weight
Ontology Reference:  Tissue Sample Current Weight Milligram Value
Term URL:  https://cdebrowser.nci.nih.gov/CDEBrowser/search?elementDetails=9&FirstTimer=0&PageId=ElementDetailsGroup&publicId=5432606&version=1.0
CDE ID:  5432606
Dictionary Definiton:  freezing_method
Ontology Reference:  Tissue Sample Freezing Method Type
Term URL:  https://cdebrowser.nci.nih.gov/CDEBrowser/search?elementDetails=9&FirstTimer=0&PageId=Elemen

## Now we can read the data from the PFB that we just created and look at the metadata

In [5]:
# Open the PFB written above. The reader is entered manually (not via `with`)
# because the following cells keep using `r` after this cell has finished.
r = PFBReader("test_schema.avro").__enter__()

In [6]:
# Serialise the metadata exposed by the reader and show it as one JSON line.
metadata_json = json.dumps(r.metadata)
print(metadata_json)

{"nodes": [{"name": "root", "ontology_reference": "", "values": {}, "links": [], "properties": []}, {"name": "data_release", "ontology_reference": "", "values": {}, "links": [{"multiplicity": "MANY_TO_ONE", "dst": "root", "name": "roots"}], "properties": []}, {"name": "reference_file_index", "ontology_reference": "", "values": {}, "links": [{"multiplicity": "MANY_TO_MANY", "dst": "reference_file", "name": "reference_files"}, {"multiplicity": "MANY_TO_MANY", "dst": "core_metadata_collection", "name": "core_metadata_collections"}], "properties": []}, {"name": "subject", "ontology_reference": "", "values": {}, "links": [{"multiplicity": "MANY_TO_MANY", "dst": "study", "name": "studies"}], "properties": []}, {"name": "submitted_unaligned_reads", "ontology_reference": "", "values": {}, "links": [{"multiplicity": "MANY_TO_MANY", "dst": "read_group", "name": "read_groups"}, {"multiplicity": "MANY_TO_MANY", "dst": "core_metadata_collection", "name": "core_metadata_collections"}], "properties":

## Finally we will print out all of the external ontology references that exist in this PFB

In [7]:
# Same report as for the in-memory metadata, this time driven by the metadata
# read back from the PFB file.
for node in r.metadata["nodes"]:
    for prop in node["properties"]:
        has_reference = prop["ontology_reference"] != ""
        if not (has_reference and "term_url" in prop["values"]):
            continue
        term_values = prop["values"]
        print("Dictionary Definiton: ", prop["name"])
        print("Ontology Reference: ", prop["ontology_reference"])
        print("Term URL: ", term_values["term_url"])
        print("CDE ID: ",  term_values["cde_id"])

Dictionary Definiton:  biospecimen_anatomic_site
Ontology Reference:  Submitted Tumor Sample Primary Anatomic Site
Term URL:  https://cdebrowser.nci.nih.gov/CDEBrowser/search?elementDetails=9&FirstTimer=0&PageId=ElementDetailsGroup&publicId=4742851&version=1.0
CDE ID:  4742851
Dictionary Definiton:  composition
Ontology Reference:  Biospecimen Cellular Composition Type
Term URL:  https://cdebrowser.nci.nih.gov/CDEBrowser/search?elementDetails=9&FirstTimer=0&PageId=ElementDetailsGroup&publicId=5432591&version=1.0
CDE ID:  5432591
Dictionary Definiton:  current_weight
Ontology Reference:  Tissue Sample Current Weight Milligram Value
Term URL:  https://cdebrowser.nci.nih.gov/CDEBrowser/search?elementDetails=9&FirstTimer=0&PageId=ElementDetailsGroup&publicId=5432606&version=1.0
CDE ID:  5432606
Dictionary Definiton:  freezing_method
Ontology Reference:  Tissue Sample Freezing Method Type
Term URL:  https://cdebrowser.nci.nih.gov/CDEBrowser/search?elementDetails=9&FirstTimer=0&PageId=Elemen

## Now we add data into our PFB file
This is essentially equivalent to running the command `pfb from -o test_data.avro json -s test_schema.avro --program DEV --project TEST ./bdc-sample`

In [8]:
# Write the sample data into a new PFB that reuses the schema of
# test_schema.avro.
# The writer is used as a context manager instead of a bare __enter__() call,
# so test_data.avro is closed when the block ends, even if the import fails
# part-way through.
writer = PFBWriter(file_or_path="test_data.avro")
with writer:
    # Copy the schema from the schema-only PFB created earlier.
    with PFBReader("test_schema.avro") as reader:
        writer.copy_schema(reader)

    # Read the JSON files under ./bdc-sample for program DEV / project TEST
    # and write them as records.
    writer.write(pfbJSON._from_json(writer.metadata, "./bdc-sample", "DEV", "TEST"))

[34m1/31: [0m[37mdemographic[0m
[34m2/31: [0m[37mproject[0m
[34m3/31: [0m[37mexposure[0m
[34m4/31: [0m[37msleep_test_file[0m
[34m5/31: [0m[37mimaging_file[0m
[34m6/31: [0m[37mgermline_variation_index[0m
[34m7/31: [0m[37mpublication[0m
[34m8/31: [0m[37macknowledgement[0m
[34m9/31: [0m[37maligned_reads_index[0m
[34m10/31: [0m[37mmedical_history[0m
[34m11/31: [0m[37mimaging_file_reference[0m
[34m12/31: [0m[37mblood_pressure_test[0m
[34m13/31: [0m[37msubmitted_unaligned_reads[0m
[34m14/31: [0m[37mlab_result[0m
[34m15/31: [0m[37mstudy[0m
[34m16/31: [0m[37mmedication[0m
[34m17/31: [0m[37maliquot[0m
[34m18/31: [0m[37msimple_germline_variation[0m
[34m19/31: [0m[37mgermline_mutation_calling_workflow[0m
[34m20/31: [0m[37msubmitted_aligned_reads[0m
[34m21/31: [0m[37malignment_cocleaning_workflow[0m
[34m22/31: [0m[37melectrocardiogram_test[0m
[34m23/31: [0m[37malignment_workflow[0m
[34m24/31: [0m[37mread

## Now we want to show that we can translate the ontology references to CDISC

In [9]:
# Translation table from CDE ids to CDISC terms.
# Key:   CDE id, as a string.
# Value: [CDISC id, CDISC term name, CDISC term URL] -- consumers index this
#        list by position, so the order of the three entries matters.
ontology_translation = {
    "649": [
        "sendct-2021-12-17.C90012.C25209",
        "MEASUREMENT",
        "https://library.cdisc.org/browser/#/mdr/ct/2021-12-17/packages/sendct-2021-12-17/codelists/C90012/terms/C25209",
    ],
    "4973892": [
        "sdtmct-2021-12-17.C67153.C16358",
        "Body Mass Index",
        "https://library.cdisc.org/browser/#/mdr/ct/2021-12-17/packages/sdtmct-2021-12-17/codelists/C67153/terms/C16358",
    ],
    "2192217": [
        "glossaryct-2021-12-17.C67497.C16564",
        "ethnicity",
        "https://library.cdisc.org/browser/#/mdr/ct/2021-12-17/packages/glossaryct-2021-12-17/codelists/C67497/terms/C16564",
    ],
    "2192199": [
        "glossaryct-2021-12-17.C67497.C17049",
        "race",
        "https://library.cdisc.org/browser/#/mdr/ct/2021-12-17/packages/glossaryct-2021-12-17/codelists/C67497/terms/C17049",
    ],
    "5278775": [
        "sdtmct-2021-06-25.C124300.C63637",
        "RIN",
        "https://library.cdisc.org/browser/#/mdr/ct/2021-06-25/packages/sdtmct-2021-06-25/codelists/C124300/terms/C63637",
    ],
}

In [10]:
# Rebuild the schema records and ontology metadata from the same dictionary.
# This happens before the output file is opened, so a failed download or
# parse cannot leave an opened, empty file behind.
d = DataDictionary(url = "https://s3.amazonaws.com/dictionary-artifacts/gtexdictionary/4.4.0/schema.json")
dictionary.init(d)

records, ontology_references, links = gen3dict._parse_dictionary(d)

metadata_new = gen3dict._get_ontology_references(ontology_references, links)

# Replace the ontology reference of every property whose CDE id has an entry
# in `ontology_translation`. `prop` is the very object stored inside
# `metadata_new`, so it is updated directly; no index counters are needed.
for node in metadata_new["nodes"]:
    for prop in node["properties"]:
        if prop["ontology_reference"] == "" or "term_url" not in prop["values"]:
            continue
        values = prop["values"]
        # The translation table is keyed by strings, while the dictionary
        # stores cde_id as a JSON number. Normalising to str lets the lookup
        # match either representation. A property with no cde_id is skipped
        # instead of raising KeyError.
        translation = ontology_translation.get(str(values.get("cde_id")))
        if translation is None:
            continue
        # Entry layout: [CDISC id, CDISC term name, CDISC term URL].
        cdisc_id, cdisc_term, cdisc_url = translation[:3]
        values["term_url"] = cdisc_url
        prop["ontology_reference"] = cdisc_term
        values["cde_id"] = cdisc_id

# Open the output writer last; it stays open for the next cell, which writes
# the schema and translated metadata.
w_out = PFBWriter(file_or_path="ontology_schema.avro")
w_out.__enter__()


In [11]:
# Store the schema and the translated metadata in ontology_schema.avro.
# `w_out` was opened with a manual __enter__() call, so it is closed here with
# the matching __exit__() call. That guarantees the file is closed before it
# is read back, even if writing fails.
try:
    w_out.set_schema(records)
    w_out.set_metadata(metadata_new)
    # write() with no records stores only the schema and metadata set above.
    w_out.write()
finally:
    w_out.__exit__(None, None, None)

In [12]:
# Open the translated PFB. The reader is entered manually (not via `with`)
# because the next cell keeps using `r_translation` after this one finishes.
r_translation = PFBReader("ontology_schema.avro").__enter__()

## We can see our new translations in the metadata

In [13]:
# List each property of the translated PFB that has a non-empty ontology
# reference and a term URL.
for node in r_translation.metadata["nodes"]:
    for prop in node["properties"]:
        if prop["ontology_reference"] == "":
            continue
        if "term_url" not in prop["values"]:
            continue
        reference_values = prop["values"]
        print("Dictionary Definiton: ", prop["name"])
        print("Ontology Reference: ", prop["ontology_reference"])
        print("Term URL: ", reference_values["term_url"])
        print("CDE ID: ",  reference_values["cde_id"])

Dictionary Definiton:  biospecimen_anatomic_site
Ontology Reference:  Submitted Tumor Sample Primary Anatomic Site
Term URL:  https://cdebrowser.nci.nih.gov/CDEBrowser/search?elementDetails=9&FirstTimer=0&PageId=ElementDetailsGroup&publicId=4742851&version=1.0
CDE ID:  4742851
Dictionary Definiton:  composition
Ontology Reference:  Biospecimen Cellular Composition Type
Term URL:  https://cdebrowser.nci.nih.gov/CDEBrowser/search?elementDetails=9&FirstTimer=0&PageId=ElementDetailsGroup&publicId=5432591&version=1.0
CDE ID:  5432591
Dictionary Definiton:  current_weight
Ontology Reference:  Tissue Sample Current Weight Milligram Value
Term URL:  https://cdebrowser.nci.nih.gov/CDEBrowser/search?elementDetails=9&FirstTimer=0&PageId=ElementDetailsGroup&publicId=5432606&version=1.0
CDE ID:  5432606
Dictionary Definiton:  freezing_method
Ontology Reference:  Tissue Sample Freezing Method Type
Term URL:  https://cdebrowser.nci.nih.gov/CDEBrowser/search?elementDetails=9&FirstTimer=0&PageId=Elemen

## Now we add the data as before

In [14]:
# Write the sample data into a new PFB that reuses the schema of
# ontology_schema.avro.
# The writer is used as a context manager instead of a bare __enter__() call,
# so ontology_data.avro is closed when the block ends, even if the import
# fails part-way through.
writer = PFBWriter(file_or_path="ontology_data.avro")
with writer:
    # Copy the schema from the translated schema-only PFB.
    with PFBReader("ontology_schema.avro") as reader:
        writer.copy_schema(reader)

    # Read the JSON files under ./bdc-sample for program DEV / project TEST
    # and write them as records.
    writer.write(pfbJSON._from_json(writer.metadata, "./bdc-sample", "DEV", "TEST"))

[34m1/31: [0m[37mdemographic[0m
[34m2/31: [0m[37mproject[0m
[34m3/31: [0m[37mexposure[0m
[34m4/31: [0m[37msleep_test_file[0m
[34m5/31: [0m[37mimaging_file[0m
[34m6/31: [0m[37mgermline_variation_index[0m
[34m7/31: [0m[37mpublication[0m
[34m8/31: [0m[37macknowledgement[0m
[34m9/31: [0m[37maligned_reads_index[0m
[34m10/31: [0m[37mmedical_history[0m
[34m11/31: [0m[37mimaging_file_reference[0m
[34m12/31: [0m[37mblood_pressure_test[0m
[34m13/31: [0m[37msubmitted_unaligned_reads[0m
[34m14/31: [0m[37mlab_result[0m
[34m15/31: [0m[37mstudy[0m
[34m16/31: [0m[37mmedication[0m
[34m17/31: [0m[37maliquot[0m
[34m18/31: [0m[37msimple_germline_variation[0m
[34m19/31: [0m[37mgermline_mutation_calling_workflow[0m
[34m20/31: [0m[37msubmitted_aligned_reads[0m
[34m21/31: [0m[37malignment_cocleaning_workflow[0m
[34m22/31: [0m[37melectrocardiogram_test[0m
[34m23/31: [0m[37malignment_workflow[0m
[34m24/31: [0m[37mread