# Mapping SOP Class Name from SOP Class UID on the MG Series File Node
---
by Eric Giger

Scientific Support Analyst at the Center for Translational Data Science at University of Chicago

August 2023

---
This Jupyter notebook demonstrates how to map the `mg_series_file` node's `sop_class_uid` property to a `sop_class_name` by using the DICOM table of standard SOP Classes: [Table B.5-1. Standard SOP Classes](https://dicom.nema.org/medical/dicom/current/output/chtml/part04/sect_B.5.html).


In [None]:
# Import Python Packages and scripts
import pandas as pd
import numpy as np
from pathlib import Path
import sys, os, copy, datetime, shutil

import gen3
from gen3.submission import Gen3Submission
from gen3.auth import Gen3Auth
from gen3.index import Gen3Index
from gen3.query import Gen3Query

In [None]:
# Local account name of the person running this notebook; it is substituted
# into the "/Users/{}/..." paths built in the cells below, so fill it in first.
username = ''

In [None]:
# Download the custom Gen3Expansion helper script from
# https://github.com/cgmeyer/gen3sdk-python into the working directory and import it.
wd = "/Users/{}/Documents/Notes/MIDRC/wd".format(username) #set your working dir
os.chdir(wd)
# os.system() does not raise when the command fails, so check its return value
# here rather than hitting a confusing error on the import below.
wget_status = os.system("wget https://raw.githubusercontent.com/cgmeyer/gen3sdk-python/master/expansion/expansion.py -O expansion.py")
if wget_status != 0:
    raise RuntimeError("Download of expansion.py failed (wget status {}).".format(wget_status))
# Imported here rather than in the imports cell because the file only exists after the download.
from expansion import Gen3Expansion


In [None]:
###############
# The Gen3 SDK code/functions can be browsed on GitHub: https://github.com/uc-cdis/gen3sdk-python
###############

###############
# MIDRC Staging: build the SDK class instances from the credentials file
# downloaded from https://staging.midrc.org/identity
###############
sapi = 'https://staging.midrc.org'
scred = '/Users/{user}/Downloads/midrc-staging-credentials.json'.format(user=username)
sauth = Gen3Auth(sapi, refresh_file=scred)  # authentication
ssub = Gen3Submission(sapi, sauth)  # submission
squery = Gen3Query(sauth)  # query
sexp = Gen3Expansion(sapi, sauth, ssub)  # custom helper scripts
sprojects = sexp.get_project_ids()

###############
# MIDRC ValidateStaging: same set of instances, using the credentials file
# downloaded from https://validatestaging.midrc.org/identity
###############
vsapi = 'https://validatestaging.midrc.org'
vscred = '/Users/{user}/Downloads/midrc-validatestaging-credentials.json'.format(user=username)
vsauth = Gen3Auth(vsapi, refresh_file=vscred)  # authentication
vssub = Gen3Submission(vsapi, vsauth)  # submission
vsquery = Gen3Query(vsauth)  # query
vsexp = Gen3Expansion(vsapi, vsauth, vssub)  # custom helper scripts
vprojects = vsexp.get_project_ids()

###############
# Project ids of both environments combined
###############
projects = sprojects + vprojects

## Prepare the mapping table
---
Get the mapping table from the [uc-cdis/midrc-etl repo](https://github.com/uc-cdis/midrc-etl/tree/master/sop_class_mapping) and reformat (if necessary) to prepare for the SOP class mapping.

Prior to this, you will need to clone the repo using `git clone git@github.com:uc-cdis/midrc-etl.git`

In [None]:
## Point at the local clone of the midrc-etl repo and refresh its master branch from GitHub
git_dir = "/Users/{user}/Documents/GitHub/uc-cdis/midrc-etl".format(user=username)
os.chdir(git_dir)
for git_cmd in ("git checkout master", "git pull origin master"):
    os.system(git_cmd)

## Continue from the sop_class_mapping sub-directory of the repo
sop_dir = "{repo}/sop_class_mapping".format(repo=git_dir)
os.chdir(sop_dir)

In [None]:
## Build the dated output/working directories (each is created if it does not exist yet)
now = datetime.datetime.now()
today = "-".join(str(part) for part in (now.year, now.month, now.day))

results_dir = "{base}/sop_mapping/results_{day}".format(base=wd, day=today)
staging_results_dir = "{base}/staging_results_{day}".format(base=results_dir, day=today)
validatestaging_results_dir = "{base}/validatestaging_results_{day}".format(base=results_dir, day=today)

for out_dir in (results_dir, staging_results_dir, validatestaging_results_dir):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

In [None]:
## Load the SOP class mapping table; every column is read as a string.
## Columns used below: sop_class_name, sop_class_uid, iod_specification, specialization
mapping_file = "{}/TableB5-1_Standard_SOP_Classes.tsv".format(sop_dir)
mapping = pd.read_csv(mapping_file,sep='\t',dtype=str)

shutil.copy2(mapping_file, results_dir) # copy the mapping table version used for this mapping to the results_dir

## Remove any leading/trailing whitespace in the codes/descriptions first, so that rows
## differing only by stray whitespace are also removed when duplicates are dropped.
for text_col in ['sop_class_name', 'sop_class_uid', 'iod_specification', 'specialization']:
    mapping[text_col] = mapping[text_col].str.strip()
mapping = mapping.drop_duplicates()

## Summarize the mapping table
mapping_codes = list(set(mapping['sop_class_uid']))
mapping_names = list(set(mapping['sop_class_name']))

display(mapping)
print("Number of unique SOP Class UIDs in mapping table: {}".format(len(mapping_codes)))
print("Number of unique SOP CLass Names in mapping table: {}".format(len(mapping_names)))



#### While you may notice that there are four properties, one of which is mostly NaN, fear not as we will only need `sop_class_name` and `sop_class_uid` for this process.

In [None]:
# Preview the two columns the mapping relies on (assign the result to a variable to keep it)
mapping.loc[:, ['sop_class_name', 'sop_class_uid']]

## Download the MG Series File metadata from MIDRC Staging and Validate Staging
---
* Use the [`Gen3Expansion.get_node_tsvs()` SDK function](https://github.com/cgmeyer/gen3sdk-python/blob/389e3945482439ace6e4536e6d0e35c6e48de9c9/expansion/expansion.py#L219) to get all the MG Series File metadata in MIDRC Staging (staging.midrc.org) and ValidateStaging (validatestaging.midrc.org).

* The function `exp.get_node_tsvs()` will return a master DataFrame containing all the `mg_series_file` data from the projects specified.

### In this version of the notebook, we're getting five projects: 
Staging:
* Open-R1 (data from RSNA)
* Open-A1 (data from ACR)
* Open-A1_PETAL_REDCORAL (PETAL data from ACR)

ValidateStaging:
* SEQ_Open-R3
* SEQ_Open-A3

In [None]:
os.chdir(results_dir)

# MIDRC Staging projects (use sexp.get_project_ids() instead to fetch every project)
sprojects = ['Open-R1','Open-A1','Open-A1_PETAL_REDCORAL']
s_mg_series_file = sexp.get_node_tsvs(
    node='mg_series_file',
    projects=sprojects,
    outdir=staging_results_dir,
    overwrite=True,
)

# MIDRC ValidateStaging projects (use vsexp.get_project_ids() instead to fetch every project)
vsprojects = ['SEQ_Open-R3','SEQ_Open-A3']
vs_mg_series_file = vsexp.get_node_tsvs(
    node='mg_series_file',
    projects=vsprojects,
    outdir=validatestaging_results_dir,
    overwrite=True,
)

In [None]:
# Concatenate the Staging (OPEN) and ValidateStaging (SEQ) master MG Series File tables

msf = pd.concat([s_mg_series_file,vs_mg_series_file], ignore_index=False)
# Sanity check: the combined table must hold exactly the rows of both inputs.
assert len(msf) == len(s_mg_series_file) + len(vs_mg_series_file)

## Reformat the MG Series File data to prepare for mapping
---


In [None]:
# Properties needed to submit the MG Series File metadata + SOP Class UID (the name comes from the mapping table).
# project_id is included because the submission cells further down split the mapped table by project_id.
# NOTE(review): assumes the downloaded TSVs carry a project_id column — confirm.
series_props=['type','project_id','submitter_id','md5sum','file_size','file_name','data_type','data_format','data_category','sop_class_uid']

## Make an independent copy of the master MG Series File table with only data relevant to SOP Class Mapping
mg_series_file = msf[series_props].copy()

## remove any leading/trailing whitespaces
mg_series_file['sop_class_uid'] = mg_series_file['sop_class_uid'].str.strip()


## Identify duplicates (every copy of a duplicated row is kept in `dupes` for inspection), then drop them
dupes = mg_series_file.loc[mg_series_file.duplicated(keep=False)]
mg_series_file = mg_series_file.drop_duplicates()
display(mg_series_file)

## Do mapping using pandas pd.merge(): 
---

-  Merge `sop_class_name` in `mapping` DataFrame into the `mg_series_file` DataFrame on `sop_class_uid`.


In [None]:
# MDF (Master Data Frame): attach sop_class_name to every series via its sop_class_uid.
# Only the two mapping columns needed are merged in, so the remaining mapping columns
# (iod_specification, specialization) neither end up in the table that is submitted
# nor multiply rows for a UID listed under several specifications.
uid_to_name = mapping[['sop_class_uid','sop_class_name']].drop_duplicates()
mdf = mg_series_file.merge(uid_to_name,on='sop_class_uid',how='left').drop_duplicates()
display(mdf)

## Submit the data to Staging and ValidateStaging
---
Use the `submit_df()` function of the `Gen3Expansion` class (from the custom `expansion.py` script) to update the SOP Class Name property for each of the newly mapped series in `mdf`.

In [None]:
# Split the mapped series in mdf into the open and seq subsets before submitting

# OPEN
in_open_projects = mdf['project_id'].isin(sprojects)
smdf = mdf[in_open_projects].reset_index(drop=True)

# SEQ
in_seq_projects = mdf['project_id'].isin(vsprojects)
vsmdf = mdf[in_seq_projects].reset_index(drop=True)

In [None]:
# ValidateStaging is submitted first: it is the quicker of the two submissions.
# The value returned by submit_df() is stored per project in vsdata.

projects = list(set(vsmdf['project_id']))
vsdata = {}
for vspid in projects:
    print("Submitting data to project '{}'.".format(vspid))
    project_rows = vsmdf[vsmdf["project_id"] == vspid]
    vsdata[vspid] = vsexp.submit_df(df=project_rows, project_id=vspid, chunk_size=1000)

In [None]:
# Submit the metadata to Staging, one project at a time.
# The value returned by submit_df() is stored per project in sdata.

projects = list(set(smdf['project_id']))
sdata = {}
for spid in projects:
    # Report the project of this loop (spid), not the variable left over from the ValidateStaging loop.
    print("Submitting data to project '{}'.".format(spid))
    sdata[spid] = sexp.submit_df(df=smdf.loc[smdf["project_id"]==spid],project_id=spid, chunk_size=1000)