# Calculate and submit days_from_study_to_pos/neg_covid_test
---
by Chris Meyer, PhD

Manager of Data and User Services at the Center for Translational Data Science at University of Chicago

October 2024

---
This Jupyter notebook demonstrates how to calculate the time interval in days between each of a patient's imaging studies and each of their positive/negative COVID-19 test results. These intervals are stored in the following properties on the imaging_study node in the [MIDRC data dictionary](https://github.com/uc-cdis/midrc_dictionary/blob/deae581f0fb8b9ae5add1458d7882e189ba97af6/gdcdictionary/schemas/imaging_study.yaml#L61):
* days_from_study_to_neg_covid_test
* days_from_study_to_pos_covid_test

# Install some Python packages:

In [None]:
# You may need to install or upgrade some Python packages first (uncomment the lines below)

# !pip install --upgrade pip
# !pip install --upgrade gen3


# Import Python Packages and scripts

In [None]:
# Import Python Packages and scripts
import pandas as pd
import sys, os, copy, datetime

import gen3
from gen3.submission import Gen3Submission
from gen3.auth import Gen3Auth
from gen3.index import Gen3Index
from gen3.query import Gen3Query


In [None]:
# Set your working directory and change to it.
# The location can be overridden with the MIDRC_TEMPORAL_DIR environment variable so the
# notebook can run on machines other than the author's; the original path stays the default.
temp_dir = os.environ.get("MIDRC_TEMPORAL_DIR", "/Users/christopher/Documents/Notes/MIDRC/temporal")
os.makedirs(temp_dir, exist_ok=True)  # create on first run so os.chdir does not fail on a missing directory
os.chdir(temp_dir)


In [None]:
# Download the custom "expansion" helper module from https://github.com/cgmeyer/gen3sdk-python and import it.
# urlretrieve always writes to "expansion.py", replacing any stale copy. `wget` would save a
# re-download as "expansion.py.1" (so the old module keeps being imported), its exit status was
# ignored, and it is not installed on every platform.
# NOTE(review): this executes code fetched from the moving "master" branch; consider pinning a commit hash.
import urllib.request

EXPANSION_URL = "https://raw.githubusercontent.com/cgmeyer/gen3sdk-python/master/expansion/expansion.py"
urllib.request.urlretrieve(EXPANSION_URL, "expansion.py")  # raises on network/HTTP errors instead of failing silently
from expansion import Gen3Expansion


In [None]:
# Create the API clients for the MIDRC "staging" data commons.
# NOTE(review): the credentials path below is an absolute path on the author's machine;
# point it at your own downloaded credentials JSON before running.
sapi = 'https://staging.midrc.org'
scred = '/Users/christopher/Downloads/midrc-staging-credentials.json'
# Authentication object built from the credentials file; every client below uses it.
sauth = Gen3Auth(sapi, refresh_file=scred)
ssub = Gen3Submission(sapi, sauth)
# sindex and squery are created here but are not referenced by the other cells shown in this notebook.
sindex = Gen3Index(sauth)
squery = Gen3Query(sauth)
# Gen3Expansion comes from the downloaded expansion.py helper module.
sexp = Gen3Expansion(sapi,sauth,ssub)
# Project IDs for staging; used later to export nodes and to route submissions.
# (presumably the projects these credentials can access -- verify against expansion.py)
spids = sexp.get_project_ids()


In [None]:
# Create the API clients for the MIDRC "validatestaging" data commons.
# NOTE(review): the credentials path below is an absolute path on the author's machine;
# point it at your own downloaded credentials JSON before running.
vsapi = "https://validatestaging.midrc.org"
vscred = '/Users/christopher/Downloads/midrc-validatestaging-credentials.json'
# Authentication object built from the credentials file; every client below uses it.
vsauth = Gen3Auth(vsapi, refresh_file=vscred)
vssub = Gen3Submission(vsapi, vsauth)
# vsquery is created here but is not referenced by the other cells shown in this notebook.
vsquery = Gen3Query(vsauth)
# Gen3Expansion comes from the downloaded expansion.py helper module.
vsexp = Gen3Expansion(vsapi,vsauth,vssub)
# Project IDs for validatestaging; used later to export nodes and to route submissions.
# (presumably the projects these credentials can access -- verify against expansion.py)
vpids = vsexp.get_project_ids()


## Export metadata using submission API
---
Here we'll utilize the MIDRC submission API (the "sheepdog" Gen3 service) to export all the imaging study and measurement (COVID-19 tests) data using the ["Gen3Expansion.get_node_tsvs" function](https://github.com/cgmeyer/gen3sdk-python/blob/2aecc6575b22f9cca279b650914971dd6723a2ce/expansion/expansion.py#L219), which is a wrapper to export and merge all the records in a node across each project in the data commons using the [Gen3SDK](https://github.com/uc-cdis/gen3sdk-python/) function [Gen3Submission.export_node()](https://github.com/uc-cdis/gen3sdk-python/blob/5d7b5270ff11cf7037f211cf01e410d8e73d6b84/gen3/submission.py#L361).

In [None]:
# Export every imaging_study record for the listed projects, once per data commons.
# overwrite=True re-exports the TSVs so the newest version of the data is used.
# Each call returns one merged ("master") dataframe and also writes TSVs under `outdir`.
# The numbers in the trailing comments appear to be record counts noted on earlier runs.

# staging (earlier counts: 181077, 178843)
sst = sexp.get_node_tsvs(
    projects=spids,
    node="imaging_study",
    overwrite=True,
    outdir="staging_tsvs",
)
display(len(sst))

# validatestaging (earlier counts: 41925, 41048)
vst = vsexp.get_node_tsvs(
    projects=vpids,
    node="imaging_study",
    overwrite=True,
    outdir="validatestaging_tsvs",
)
display(len(vst))

In [None]:
# Export the measurement node, which holds the COVID test data, from both data commons.
# The numbers in the trailing comments appear to be record counts noted on an earlier run.
sm = sexp.get_node_tsvs(
    projects=spids,
    node='measurement',
    overwrite=True,
    outdir="staging_tsvs",
)
vm = vsexp.get_node_tsvs(
    projects=vpids,
    node='measurement',
    overwrite=True,
    outdir="validatestaging_tsvs",
)
display(len(sm))  # 188294
display(len(vm))  # 48981


In [None]:
# Concatenate all staging and validatestaging imaging study data.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it the two
# inputs' row labels are kept, leaving duplicate labels that make later label-based
# selection and alignment fragile.
ast = pd.concat([sst, vst], ignore_index=True)
display(len(ast))

# Concatenate all staging and validatestaging measurement data (same index handling)
am = pd.concat([sm, vm], ignore_index=True)
display(len(am))

In [None]:
# Keep only the imaging studies that have a days_to_study value, as an independent copy
# (the trailing number on the original line, 204371, appears to be a count from an earlier run)
has_study_day = ast['days_to_study'].notna()
sdf = ast.loc[has_study_day].copy()
display(len(sdf))

In [None]:
# Keep only measurements that (1) have a "test_days_from_index" value, (2) are COVID-19 tests,
# and (3) have a result of either Positive or Negative; store the result as an independent copy.
has_test_day = am['test_days_from_index'].notna()
is_covid_test = am['test_name'] == 'COVID-19'
has_pos_or_neg_result = am['test_result_text'].isin(['Positive', 'Negative'])
mdf = am.loc[has_test_day & is_covid_test & has_pos_or_neg_result].copy()
display(len(mdf))


In [None]:
# Find the projects that have usable (non-null) data in BOTH nodes.

# Projects with at least one imaging study that has days_to_study
stpids = list(set(sdf['project_id']))
display(stpids)

# Projects with at least one qualifying COVID-19 measurement
mpids = list(set(mdf['project_id']))
display(mpids)

# Projects present in both lists
pids = list(set(mpids).intersection(stpids))
display(pids)

In [None]:
# Restrict both dataframes to the projects that have non-null data in both nodes,
# then report how many records remain relative to the full exports.
study_in_shared_project = sdf['project_id'].isin(pids)
measurement_in_shared_project = mdf['project_id'].isin(pids)
sdf = sdf.loc[study_in_shared_project]
mdf = mdf.loc[measurement_in_shared_project]
print(
    "{} imaging studies out of {} total have matching non-null measurement temporal data in the same projects.".format(
        len(sdf), len(ast)
    )
)
print(
    "{} measurements out of {} total have matching non-null imaging_study temporal data in the same projects.".format(
        len(mdf), len(am)
    )
)

In [None]:
# Case IDs that appear in both the measurement and the imaging study dataframes
measurement_case_ids = set(mdf['cases.submitter_id'])
study_case_ids = set(sdf['cases.submitter_id'])
cases = list(measurement_case_ids & study_case_ids)
print(
    "Both imaging study and measurement temporal data available for:\n\t{} case IDs with\n\t\t{} imaging studies and\n\t\t{} measurements.".format(
        len(cases), len(sdf), len(mdf)
    )
)


In [None]:
# Restrict both dataframes to the case IDs they have in common and report what remains.
study_has_shared_case = sdf['cases.submitter_id'].isin(cases)
sdf = sdf.loc[study_has_shared_case]
print(
    "{} imaging studies out of {} total have matching measurement temporal data based on matching project and case IDs.".format(
        len(sdf), len(ast)
    )
)

measurement_has_shared_case = mdf['cases.submitter_id'].isin(cases)
mdf = mdf.loc[measurement_has_shared_case]
print(
    "{} measurements out of {} total have matching imaging study temporal data based on matching project and case IDs.".format(
        len(mdf), len(am)
    )
)


In [None]:
# Collapse the measurements to one row per (case ID, test result): each row holds the
# sorted list of that case's test_days_from_index values for that result.
mdf = (
    mdf.groupby(['cases.submitter_id', 'test_result_text'])['test_days_from_index']
    .apply(sorted)
    .reset_index()
)
display(mdf)


In [None]:
# Reshape to one row per case ID with one column per "test_result_text" value;
# each cell holds that case's list of "test_days_from_index" for that result.
mdf = mdf.pivot(
    index='cases.submitter_id',
    columns='test_result_text',
    values='test_days_from_index',
).reset_index()
display(mdf)


In [None]:
# Attach each case's per-result test-day lists to every one of its imaging studies.
# A left join keeps every study row; the two lengths displayed should therefore match.
tdf = pd.merge(sdf, mdf, how='left', on='cases.submitter_id')
display(len(sdf))
display(len(tdf))

## Calculate the days from each imaging study to each COVID-19 test.
---
Now that we have the temporal data for imaging studies and COVID-19 tests in a single DataFrame for all cases in MIDRC for which this data is provided, we can calculate the number of days between each COVID-19 test and each imaging study, which we'll call `days_from_study_to_test`.

* Note: In MIDRC, a negative "days to XYZ" indicates that the event "XYZ" took place that many days *before* the index event, while a positive "days to XYZ" indicates that it took place that many days *after* the index event. For example, a "days_to_study" of "-10" indicates that the imaging study was performed 10 days *before* the index event. A value of "365" indicates the imaging study took place one year *after* the index event.
* Note: The `index_event` property is on the `case` node and is often the date of the first imaging exam or the date of the first COVID-19 test; all other temporal "days_to" or "days_from_" properties are in relation to that index event.

For `days_from_study_to_test`, we expect a positive value if the test was performed after the study, as the test date is forward in time in relation to the study, and a negative value if the test was performed prior to the study, as the test is backwards in time in relation to the study.

- So, if `test_days_from_index` is `1` and `days_to_study` is `4`, the `days_from_study_to_test` should be `-3`, which means the COVID test took place 3 days before the imaging study. 
- If the COVID test is on day 4 and the imaging study is on day 1, then the `days_from_study_to_test` is `3`, meaning the COVID test took place 3 days after the imaging study.


In [None]:
"""Define a function for calculating the list of values given:
    - days_to_study (int): for an imaging study and 
    - days_to_tests (list of int): which is a list of "test_days_from_index" for each of the patient's COVID-19 tests.
    Returns the list "days_from_study_to_tests", which is the list of the difference between each COVID test's "test_days_from_index" and the study's "days_to_study"
"""
def get_days_from_study_to_tests(days_to_tests, days_to_study):
    days_from_study_to_tests = []
    if hasattr(days_to_tests, '__iter__'):
        for days_to_test in days_to_tests:
            days_from_study_to_tests.append(int(days_to_test - days_to_study))
        days_from_study_to_tests = list(set(days_from_study_to_tests))
        days_from_study_to_tests = sorted(days_from_study_to_tests, reverse=True)
        days_from_study_to_tests = ",".join(str(x) for x in days_from_study_to_tests).rstrip(',')
        return days_from_study_to_tests
    else:
        return days_to_tests


In [None]:
# Calculate the two derived properties by applying get_days_from_study_to_tests to every row:
# the "Negative" / "Positive" columns hold each case's lists of test days, and
# "days_to_study" is the study's own day offset.
tdf['days_from_study_to_neg_covid_test'] = tdf.apply(
    lambda row: get_days_from_study_to_tests(row['Negative'], row['days_to_study']),
    axis=1,
)
tdf['days_from_study_to_pos_covid_test'] = tdf.apply(
    lambda row: get_days_from_study_to_tests(row['Positive'], row['days_to_study']),
    axis=1,
)
display(tdf[['days_from_study_to_pos_covid_test', 'days_from_study_to_neg_covid_test']])


In [None]:
# Keep only the columns needed for the submission: the record's identifying/link
# columns plus the two derived temporal properties.
study_props = [
    'type',
    'project_id',
    'submitter_id',
    'cases.submitter_id',
    'days_from_study_to_neg_covid_test',
    'days_from_study_to_pos_covid_test',
    'datasets.submitter_id',
]
df = tdf[study_props]
display(df)


In [None]:
# Keep the imaging studies that have a value for at least one of the two temporal
# properties, and renumber the rows from 0.
has_neg_value = df['days_from_study_to_neg_covid_test'].notna()
has_pos_value = df['days_from_study_to_pos_covid_test'].notna()
df = df.loc[has_neg_value | has_pos_value].reset_index(drop=True)
display(len(df))


In [None]:
# Prepare the previously stored values for comparison with the newly calculated ones.
# The existing temporal columns are renamed to short aliases (dsnt / dspt) so they do not
# collide with the new columns of the same name when the two frames are merged.
ost = ast.rename(
    columns={
        'days_from_study_to_neg_covid_test': 'dsnt',
        'days_from_study_to_pos_covid_test': 'dspt',
    }
).copy()

# Keep only original records whose project, case and study IDs all occur in the new data
in_new_projects = ost['project_id'].isin(df['project_id'].unique())
in_new_cases = ost['cases.submitter_id'].isin(df['cases.submitter_id'].unique())
in_new_studies = ost['submitter_id'].isin(df['submitter_id'].unique())
ost = ost.loc[in_new_projects & in_new_cases & in_new_studies]
display(len(ost))


In [None]:
# Put the previously stored values (dsnt / dspt) next to the newly calculated ones.
# The join uses project_id together with submitter_id: joining on submitter_id alone
# would duplicate rows if the same submitter_id appeared in more than one project
# or data commons.
match = df.merge(
    ost[['project_id', 'submitter_id', 'dsnt', 'dspt']],
    how='left',
    on=['project_id', 'submitter_id'],
)
display(match[['days_from_study_to_neg_covid_test', 'dsnt', 'days_from_study_to_pos_covid_test', 'dspt']])


In [None]:
# Select the studies whose newly calculated values differ from what is already stored.
def _as_text(col):
    """Render a column as text for comparison: nulls become "" and whole floats lose their ".0"."""
    def _one(value):
        if pd.isna(value):
            return ""
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value).strip()
    return col.map(_one)

# Null-safe comparison: a direct `!=` treats NaN != NaN as "changed".
neg_changed = _as_text(match['days_from_study_to_neg_covid_test']) != _as_text(match['dsnt'])
pos_changed = _as_text(match['days_from_study_to_pos_covid_test']) != _as_text(match['dspt'])

# A study needs updating when EITHER property changed. Chaining the two filters
# required both to differ, which dropped studies where only one property changed.
new = match.loc[neg_changed | pos_changed].reset_index(drop=True)
display(new)


In [None]:
# Save the changed records and the full set of calculated records as date-stamped TSVs.
now = datetime.datetime.now()
# Zero-padded YYYYMMDD stamp. Concatenating the raw year/month/day numbers is ambiguous
# (2024-01-11 and 2024-11-01 both became "2024111") and does not sort chronologically.
today = now.strftime("%Y%m%d")
new_name = "new_temporal_imaging_study_{}_{}.tsv".format(len(new), today)
df_name = "all_temporal_imaging_study_{}_{}.tsv".format(len(df), today)

new.to_csv(new_name, sep='\t', index=False)
df.to_csv(df_name, sep='\t', index=False)

print("New temporal data for {} imaging studies in MIDRC saved to file: {}".format(len(new), new_name))
print("All temporal data for {} imaging studies in MIDRC saved to file: {}".format(len(df), df_name))


In [None]:
# Build the dataframe to submit: the changed records without the comparison-only
# columns (dsnt / dspt). errors='ignore' makes the drop a no-op if they are absent.
comparison_only_columns = ['dsnt', 'dspt']
sub_df = new.drop(columns=comparison_only_columns, errors='ignore').copy()


In [None]:
# Submit the new temporal data updates, one project at a time, to whichever data
# commons (staging or validatestaging) the project belongs to.
data = {}  # project_id -> result returned by submit_df
npids = list(set(sub_df['project_id']))  # sub_df holds exactly the rows being submitted
# Count from 1 so the progress reads (1/N)..(N/N) rather than (0/N)..(N-1/N).
for i, pid in enumerate(npids, start=1):
    print("({}/{}) Submitting temporal prop updates to project: {}".format(i, len(npids), pid))
    project_df = sub_df.loc[sub_df["project_id"] == pid]
    if pid in spids:
        data[pid] = sexp.submit_df(df=project_df, project_id=pid, chunk_size=200)
    elif pid in vpids:
        data[pid] = vsexp.submit_df(df=project_df, project_id=pid, chunk_size=200)
    else:
        # Previously such projects were skipped without any message.
        print("WARNING: project {} was not found in staging or validatestaging; {} record(s) NOT submitted.".format(pid, len(project_df)))
print("Done.")

In [None]:
# Summarise the submission results: for each project, show the 'responses' and
# 'invalid' entries of the result returned by submit_df.
display([{pid: (result['responses'], result['invalid'])} for pid, result in data.items()])