### This is the MIDRC Pre-Ingestion QC Checklist. 
---
The purpose of this notebook is to run basic quality checks on batches of data received from MIDRC data contributors before ingestion work begins, so that we can notify contributors of any issues with their submissions sooner rather than later.

Chris Meyer, PhD <br>
PlanX Manager of Data and User Services <br>
April 2023 <br>

In [1]:
# Notebook configuration: edit these values for the batch under review.
wd = "/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/"  # local directory that holds one sub-folder per batch
pid = "TCIA-COVID-19-NY-SBU"  # presumably the staging project_id for this data - TODO confirm

# Exactly one batch is active at a time. Earlier batches are kept commented out
# for reference instead of being assigned and then immediately overwritten.
#batch="RSNA_20220812"
#batch="RSNA_20230131"
#batch="RSNA_20230214"
batch="RSNA_20230303"

# Join with exactly one separator so the result is the same whether or not
# `wd` is written with a trailing slash.
batch_dir = "{}/{}".format(wd.rstrip("/"),batch)

print(batch_dir)

/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303


In [2]:
"""
0. Prepare Python environment
"""

import pandas as pd
import sys, os
import gen3
from gen3.submission import Gen3Submission
from gen3.auth import Gen3Auth
from gen3.index import Gen3Index
from gen3.query import Gen3Query



In [3]:
!rm expansion.py*
!wget https://raw.githubusercontent.com/cgmeyer/gen3sdk-python/master/expansion/expansion.py
%run ./expansion.py


--2023-04-14 14:38:16--  https://raw.githubusercontent.com/cgmeyer/gen3sdk-python/master/expansion/expansion.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 199943 (195K) [text/plain]
Saving to: ‘expansion.py’


2023-04-14 14:38:16 (3.62 MB/s) - ‘expansion.py’ saved [199943/199943]



In [4]:
# ## override the github file with local copy; 
# ## comment this out when running outside cgmeyer's laptop
# git_dir='/Users/christopher/Documents/GitHub'
# sdk_dir='/cgmeyer/gen3sdk-python'
# sys.path.insert(1, '{}{}'.format(git_dir,sdk_dir))
# from expansion import Gen3Expansion
# %run /Users/christopher/Documents/GitHub/cgmeyer/gen3sdk-python/expansion/expansion.py


In [5]:
# initialize the SDK classes
sapi = 'https://staging.midrc.org'
# Credentials file location. Set the MIDRC_STAGING_CREDENTIALS environment variable
# to point at your own file; when it is not set, the original default path is used.
scred = os.environ.get(
    'MIDRC_STAGING_CREDENTIALS',
    '/Users/christopher/Downloads/midrc-staging-credentials.json',
)
sauth = Gen3Auth(sapi, refresh_file=scred)
ssub = Gen3Submission(sapi, sauth)
sindex = Gen3Index(sauth)
squery = Gen3Query(sauth)
sexp = Gen3Expansion(sapi,sauth,ssub)
# Last expression of the cell, so its return value is displayed by the notebook.
sexp.get_project_ids()


Getting all project_ids you have access to in the data commons.
['Open-A1', 'Open-A1_PETAL_REDCORAL', 'Open-R1', 'TCIA-COVID-19-AR', 'TCIA-COVID-19-NY-SBU', 'TCIA-COVID-19_CT_Images', 'TCIA-RICORD']


['Open-A1',
 'Open-A1_PETAL_REDCORAL',
 'Open-R1',
 'TCIA-COVID-19-AR',
 'TCIA-COVID-19-NY-SBU',
 'TCIA-COVID-19_CT_Images',
 'TCIA-RICORD']


## 1. Fetch the batch metadata TSVs and clinical/image manifests
---
Run the following in linux/unix shell:

* a. Pull data from AWS bucket to utilityvm.midrc.csoc, e.g.:
```
aws s3 sync s3://external-data-midrc-replication/replicated-data-acr/RSNA_20220812/ RSNA_20220812/ --exclude "*" --include "*.tsv"
```
* b. Sync the data locally for submission, or run this notebook directly on the utility VM via an IPython shell. For example, to sync locally:
```
wd="/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3"
batch="RSNA_20230303"
rsync -rP utilityvm.midrc.csoc:/home/ubuntu/download/${batch} ${wd}
```



## 2. Sort the TSVs into manifests, submission TSVs, and supplemental/other
---
Provide the batch name (`batch`) and the directory where the batch TSVs are located (`batch_dir`); both are set in the first code cell of this notebook.



In [6]:
# Sort the TSVs found for this batch into groups. The result is indexed in later
# cells by "node_tsvs", "image_manifests", "clinical_manifests", "other_tsvs"
# and "nomatch_tsvs".
batch_tsvs = sexp.sort_batch_tsvs(batch,batch_dir)
batch_tsvs  # bare last expression so the notebook displays the result


/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/dx_series_RSNA_20230303.tsv
['dx_series']
/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/imaging_study_RSNA_20230303.tsv
['imaging_study']
/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/ct_series_RSNA_20230303.tsv
['ct_series']
/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/image_manifest_RSNA_20230303.tsv
/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/clinical_manifest_RSNA_20230303.tsv
/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/visit_RSNA_20230303.tsv
['visit']
/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/measurement_RSNA_20230303.tsv
['measurement']
/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/cr_series_RSNA_20230303.tsv
['cr_series']
/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/case_RSNA_20230303.tsv
['case']


{'batch': 'RSNA_20230303',
 'node_tsvs': {'dx_series_file': '/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/dx_series_RSNA_20230303.tsv',
  'imaging_study': '/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/imaging_study_RSNA_20230303.tsv',
  'ct_series_file': '/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/ct_series_RSNA_20230303.tsv',
  'visit': '/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/visit_RSNA_20230303.tsv',
  'measurement': '/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/measurement_RSNA_20230303.tsv',
  'cr_series_file': '/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/cr_series_RSNA_20230303.tsv',
  'case': '/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/case_RSNA_20230303.tsv'},
 'image_manifests': ['/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/image_manifest_RSNA_20230303.tsv'],
 'clinical_manifests': ['/

In [7]:
## Display batch TSV information

# Groups that are only worth showing when they contain something.
for tsv_group, heading in (
    ("other_tsvs", "Other TSVs not matched with data model:"),
    ("nomatch_tsvs", "TSVs that don't match regex for finding TSVs:"),
):
    if len(batch_tsvs[tsv_group]) > 0:
        print(heading)
        display(batch_tsvs[tsv_group])

# Groups that are always shown, in this order.
for tsv_group, heading in (
    ("clinical_manifests", "Clinical manifests:"),
    ("image_manifests", "Image manifests:"),
    ("node_tsvs", "Submission TSVs:"),
):
    print(heading)
    display(batch_tsvs[tsv_group])


Clinical manifests:


['/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/clinical_manifest_RSNA_20230303.tsv']

Image manifests:


['/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/image_manifest_RSNA_20230303.tsv']

Submission TSVs:


{'dx_series_file': '/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/dx_series_RSNA_20230303.tsv',
 'imaging_study': '/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/imaging_study_RSNA_20230303.tsv',
 'ct_series_file': '/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/ct_series_RSNA_20230303.tsv',
 'visit': '/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/visit_RSNA_20230303.tsv',
 'measurement': '/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/measurement_RSNA_20230303.tsv',
 'cr_series_file': '/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/cr_series_RSNA_20230303.tsv',
 'case': '/Users/christopher/Documents/Notes/MIDRC/data/ssot-s3/RSNA_20230303/case_RSNA_20230303.tsv'}

## Main Script
---


In [8]:
# Main script

# for active development, re-import and re-initialize the Gen3Expansion class; 
# comment out next two lines if you're not developing
%run /Users/christopher/Documents/GitHub/cgmeyer/gen3sdk-python/expansion/expansion.py
sexp = Gen3Expansion(sapi,sauth,ssub)

# set some vars
report = {}
dd = ssub.get_dictionary_all() # Get the data dictionary from staging

exclude_props = [ # submitters don't provide these properties, so remove them from QC check
    # case props not provided by submitters
    "datasets.submitter_id",
    "token_record_id",
    "linked_external_data",
    #series_file props not provided by submitters    
    "file_name",
    "md5sum",
    "file_size",
    "object_id",
    "storage_urls",
    "core_metadata_collections.submitter_id",
    "core_metadata_collections",
    "associated_ids",
    #imaging_study props not provided by submitters
    "loinc_code",
    "loinc_system",
    "loinc_contrast",
    "loinc_long_common_name",
    "loinc_method",
    "days_from_study_to_neg_covid_test",
    "days_from_study_to_pos_covid_test"
]


# get list of unique case_ids (case.submitter_id) from case TSV
df = pd.read_csv(batch_tsvs["node_tsvs"]["case"],sep='\t',header=0,dtype=str)
cids = list(set(df.submitter_id))
print("Found {} unique submitter_ids in the case TSV.".format(len(cids)))


# QC submission TSVs
for node in list(batch_tsvs["node_tsvs"]):
    
    # read in the node TSV
    print("\n{}".format(node))
    report[node] = []
    df = pd.read_csv(batch_tsvs["node_tsvs"][node],sep='\t',dtype=str)

    # check case IDs
    errors = sexp.check_case_ids(df,node,cids)
    report[node]+=errors

    # check that the 'type' column (node ID) is present and correct
    errors = sexp.check_type_field(df,node)
    report[node]+=errors

    # check for submitter_id column completeness and uniqueness
    errors = sexp.check_submitter_id(df,node)
    report[node]+=errors
    
    # check for presence and completeness of link columns
    errors = sexp.check_links(df,node,dd)
    report[node]+=errors

    # check for special characters in the TSV
    errors = sexp.check_special_chars(node,batch_tsvs)
    report[node]+=errors
    
    # check for required property completeness
    errors = sexp.check_required_props(df,node,dd,exclude_props)
    report[node]+=errors
    
    # check for missing data in non-requred properties; check if TSV has columns with all null values
    errors = sexp.check_completeness(df,node)
    report[node]+=errors
    
    # Check for correct property value types; i.e., number properties are numbers, strings are strings, etc.
    errors = sexp.check_prop_types(df,node,dd,exclude_props)
    report[node]+=errors

    if len(report[node]) == 0:
        msg = "No errors for '{}' TSV".format(node)
        report[node] = msg

# QC image manifest:
idf = sexp.read_image_manifests(batch_tsvs["image_manifests"])
errors = sexp.check_image_manifest(idf,cids)
if len(errors) > 0:
    report["image_manifest"] = errors


Found 2406 unique submitter_ids in the case TSV.

dx_series_file
'dx_series_file' TSV has all NA values for these properties: ['contrast_bolus_agent']

imaging_study
'cases.submitter_id' link header not found in 'imaging_study' TSV.
'study_location' property in dictionary is not in the 'imaging_study' TSV.

ct_series_file
'ct_series_file' TSV has all NA values for these properties: ['acquisition_type']

visit
'cases.submitter_id' link header not found in 'visit' TSV.

measurement
'conditions.submitter_id' link header not found in 'measurement' TSV.
'procedures.submitter_id' link header not found in 'measurement' TSV.
'cases.submitter_id' link header not found in 'measurement' TSV.

cr_series_file
'cr_series_file' TSV has all NA values for these properties: ['contrast_bolus_agent', 'spatial_resolution']

case
'datasets.submitter_id' link header not found in 'case' TSV.
'case' TSV has all NA values for these properties: ['ventilator_indicator']
'treatment_info' property in dictionary is 

In [9]:
#display(errors)
# Show the QC report built by the main script: one entry per node TSV, plus an
# "image_manifest" entry when the image manifest check returned errors.
display(report)

{'dx_series_file': ["'dx_series_file' TSV has all NA values for these properties: ['contrast_bolus_agent']"],
 'imaging_study': ["'cases.submitter_id' link header not found in 'imaging_study' TSV.",
  "'study_location' property in dictionary is not in the 'imaging_study' TSV."],
 'ct_series_file': ["'ct_series_file' TSV has all NA values for these properties: ['acquisition_type']"],
 'visit': ["'cases.submitter_id' link header not found in 'visit' TSV."],
 'measurement': ["'conditions.submitter_id' link header not found in 'measurement' TSV.",
  "'procedures.submitter_id' link header not found in 'measurement' TSV.",
  "'cases.submitter_id' link header not found in 'measurement' TSV."],
 'cr_series_file': ["'cr_series_file' TSV has all NA values for these properties: ['contrast_bolus_agent', 'spatial_resolution']"],
 'case': ["'datasets.submitter_id' link header not found in 'case' TSV.",
  "'case' TSV has all NA values for these properties: ['ventilator_indicator']",
  "'treatment_inf

In [10]:
## Check completeness of submission and generate report TSV
# The result is kept in `s`; the following cells list its keys and display its
# 'report', 'null_props' and 'null_nodes' entries.
s = sexp.summarize_new_batch(batch_tsvs=batch_tsvs,dd=dd,bin_limit=10)


dx_series_file.contrast_bolus_agent
dx_series_file.detector_type
	'dx_series_file.detector_type'                                                                                                                                                                         dx_series_file.image_type
	'dx_series_file.image_type'                                                                                                                                                                            dx_series_file.imager_pixel_spacing
	'dx_series_file.imager_pixel_spacing'                                                                                                                                                                  dx_series_file.lossy_image_compression
	'dx_series_file.lossy_image_compression'                                                                                                                                                               dx_series_file.manufacturer
	'dx_s

	'measurement.test_result_text'                                                                                                                                                                         measurement.test_name
	'measurement.test_name'                                                                                                                                                                                cr_series_file.contrast_bolus_agent
cr_series_file.detector_type
	'cr_series_file.detector_type'                                                                                                                                                                         cr_series_file.image_type
	'cr_series_file.image_type'                                                                                                                                                                            cr_series_file.imager_pixel_spacing
	'cr_series_file.imager_pixel_spacing'             

In [11]:
# Show the names of the sections contained in the summary `s`.
list(s)

['report', 'nn_props', 'null_props', 'nn_nodes', 'null_nodes']

In [12]:
# Per-property summary table first, then the all-null properties and nodes.
display(s['report'])

for heading, summary_key in (
    ("\nNull properties:", 'null_props'),
    ("\nNull Nodes:", 'null_nodes'),
):
    print(heading)
    display(s[summary_key])

Unnamed: 0,node,property,type,N,nn,null,perc_null,all_null,min,max,median,mean,stdev,outliers,bin_number,bins
0,case,age_at_index,number,2406,2356,50,0.020781,False,0.0,89.0,59.0,53.679117,21.660078,[],,
0,case,age_at_index_gt89,enum,2406,2406,0,0.000000,False,,,,,,,2.0,"[(No, 2356), (Yes, 50)]"
0,case,country_of_residence,string,2406,2406,0,0.000000,False,,,,,,,1.0,"[(US, 2406)]"
0,case,covid19_positive,enum,2406,2406,0,0.000000,False,,,,,,,2.0,"[(No, 1862), (Yes, 544)]"
0,case,ethnicity,enum,2406,2406,0,0.000000,False,,,,,,,3.0,"[(Not Hispanic or Latino, 2257), (Not Reported..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,case,ventilator_indicator,boolean,2406,0,2406,1.000000,True,,,,,,,,
0,cr_series_file,contrast_bolus_agent,array,2798,0,2798,1.000000,True,,,,,,,,
0,cr_series_file,spatial_resolution,number,2798,0,2798,1.000000,True,,,,,,,,
0,ct_series_file,acquisition_type,enum,4102,0,4102,1.000000,True,,,,,,,,



Null properties:


['case.ventilator_indicator',
 'cr_series_file.contrast_bolus_agent',
 'cr_series_file.spatial_resolution',
 'ct_series_file.acquisition_type',
 'dx_series_file.contrast_bolus_agent']


Null Nodes:


['condition',
 'core_metadata_collection',
 'dataset',
 'medication',
 'mg_series_file',
 'mr_series_file',
 'nm_series_file',
 'observation',
 'procedure',
 'pt_series_file',
 'rf_series_file',
 'supplementary_file',
 'us_series_file',
 'xa_series_file']

In [None]:
## Dry submit; probably not necessary, but interesting to know this is an option
# Iterate over the same node names the main QC loop uses; the `nodes` variable
# this cell previously referenced is not defined in the visible notebook cells.
# NOTE(review): `check_dry_submit` is not defined in the visible notebook cells
# either - confirm where it comes from (and its signature) before running this cell.
for node in list(batch_tsvs["node_tsvs"]):
    print(node)
    check_dry_submit(node)
