In [1]:
import pandas as pd
import numpy as np
import requests
import json
import re
import csv
import os

In [2]:
# Designate parameters
# Folder holding the batch files to validate; it is passed to
# main_missing_col and main_extra_col in the cells below.
folderPath = './data/RSNA/'

In [3]:
def get_template_columns(node, timeout=30):
    """Fetch the submission template header for ``node`` and split it into columns.

    The template is requested from the data.midrc.org submission-template
    endpoint and is treated as a single tab-separated header line in which
    mandatory columns are prefixed with ``*``.

    Parameters
    ----------
    node : str
        Node name appended to the template URL.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    template_columns : list of str
        Every column in the template, in template order, with ``*`` removed.
    mandatory_columns : list of str
        The ``*``-marked columns with ``*`` removed. For dotted names only the
        part after the first ``.`` is kept. Duplicates are dropped and the
        first-seen order is preserved.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with an error status.
    """
    api_url = "https://data.midrc.org/api/v0/submission/template/" + node

    response = requests.get(api_url, timeout=timeout)
    # Fail loudly rather than parsing an error page as if it were column names.
    response.raise_for_status()

    # Strip each field so a trailing newline does not stick to the last column,
    # then drop empty fields.
    raw_columns = [field.strip() for field in response.text.split('\t')]
    raw_columns = [field for field in raw_columns if field]

    template_columns = [field.replace('*', '') for field in raw_columns]

    starred = [field.replace('*', '') for field in raw_columns if field.startswith('*')]
    stripped = [name.partition('.')[-1] if '.' in name else name for name in starred]
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    mandatory_columns = list(dict.fromkeys(stripped))

    return template_columns, mandatory_columns
    
def check_mandatory_columns(folderPath, files, columnNames):
    """Report which required columns are absent from each file's header.

    Parameters
    ----------
    folderPath : str
        Directory containing the files; a trailing separator is optional.
    files : iterable of str
        File names inside ``folderPath``, read as tab-separated text.
    columnNames : iterable of str
        Column names that every file is expected to contain.

    Returns
    -------
    dict
        Maps file name to the list of missing columns (in ``columnNames``
        order). Files that have every column are omitted.
    """
    badFiles = dict()
    for file in files:
        # os.path.join works whether or not folderPath ends with a separator.
        filePath = os.path.join(folderPath, file)
        # nrows=0 reads only the header. index_col=0 turns the first column
        # into the index, so that column is not part of the comparison.
        # NOTE(review): confirm the first column is always a throwaway index.
        fileColumns = pd.read_csv(filePath, index_col=0, nrows=0, sep='\t').columns.tolist()
        present = set(fileColumns)
        missing = [col for col in columnNames if col not in present]
        if missing:
            badFiles[file] = missing
    return badFiles

def check_extra_columns(folderPath, files, columnNames):
    """Report which header columns of each file are not in the allowed set.

    Parameters
    ----------
    folderPath : str
        Directory containing the files; a trailing separator is optional.
    files : iterable of str
        File names inside ``folderPath``, read as tab-separated text.
    columnNames : iterable of str
        Column names that are allowed to appear in a file.

    Returns
    -------
    dict
        Maps file name to the list of unexpected columns (in file order).
        Files with no unexpected columns are omitted.
    """
    allowed = set(columnNames)
    extraColumns = dict()
    for file in files:
        # os.path.join works whether or not folderPath ends with a separator.
        filePath = os.path.join(folderPath, file)
        # nrows=0 reads only the header. index_col=0 turns the first column
        # into the index, so that column is never reported as extra.
        # NOTE(review): confirm the first column is always a throwaway index.
        fileColumns = pd.read_csv(filePath, index_col=0, nrows=0, sep='\t').columns.tolist()
        extra = [col for col in fileColumns if col not in allowed]
        if extra:
            extraColumns[file] = extra
    return extraColumns

In [4]:
def main_missing_col(folder_path, batchName):
    """Find batch files that lack mandatory template columns, grouped by node.

    Files in ``folder_path`` whose name contains ``batchName`` are grouped by
    node, where the node is the part of the file name before
    ``'_' + batchName``. Each group is checked against the mandatory columns
    returned by ``get_template_columns`` for that node.

    Parameters
    ----------
    folder_path : str
        Directory to scan; also the directory the files are read from.
    batchName : str
        Batch identifier expected inside each file name.

    Returns
    -------
    dict
        ``{node: {file_name: [missing columns]}}``, containing only nodes with
        at least one file that is missing a mandatory column.
    """
    badNodes = dict()  # Collection of files and graph nodes with missing column names

    separator = '_' + batchName
    files = [x for x in os.listdir(folder_path) if batchName in x]

    # Group by the exact node prefix. A substring test would hand node
    # "series" the files that belong to "cr_series" as well.
    filesByNode = dict()
    for fileName in sorted(files):
        filesByNode.setdefault(fileName.partition(separator)[0], []).append(fileName)

    for node, nodeFiles in filesByNode.items():
        _, mandatory_columns = get_template_columns(node)
        # Read from the folder that was scanned, not from a notebook global.
        badFiles = check_mandatory_columns(folder_path, nodeFiles, mandatory_columns)

        if len(badFiles) > 0:
            badNodes[node] = badFiles

    return badNodes

# Check the configured folder for files that lack mandatory columns.
batchName = 'RSNA'
main_missing_col(folderPath, batchName)

{}

In [5]:
def main_extra_col(folder_path, batchName='RSNA'):
    """Find batch files that carry columns absent from the node template.

    Files in ``folder_path`` whose name contains ``batchName`` are grouped by
    node, where the node is the part of the file name before
    ``'_' + batchName``. Each group is checked against the full column list
    returned by ``get_template_columns`` for that node.

    Parameters
    ----------
    folder_path : str
        Directory to scan; also the directory the files are read from.
    batchName : str, optional
        Batch identifier expected inside each file name. Defaults to
        ``'RSNA'``, the value that was previously hard-coded.

    Returns
    -------
    dict
        ``{node: {file_name: [extra columns]}}``, containing only nodes with
        at least one file that has an unexpected column.
    """
    badNodes = dict()

    separator = '_' + batchName
    files = [x for x in os.listdir(folder_path) if batchName in x]

    # Group by the exact node prefix. A substring test would hand node
    # "series" the files that belong to "cr_series" as well.
    filesByNode = dict()
    for fileName in sorted(files):
        filesByNode.setdefault(fileName.partition(separator)[0], []).append(fileName)

    for node, nodeFiles in filesByNode.items():
        template_columns, _ = get_template_columns(node)
        # Read from the folder that was scanned, not from a notebook global.
        extraFiles = check_extra_columns(folder_path, nodeFiles, template_columns)

        if len(extraFiles) > 0:
            badNodes[node] = extraFiles

    return badNodes

# Check the configured folder for files with columns absent from the templates.
main_extra_col(folderPath)

{'cr_series': {'cr_series_RSNA_20220214.tsv': ['contrast_bolus_agent_number',
   'view_position']},
 'cr_instance': {'cr_instance_RSNA_20220214.tsv': ['storage_urls']},
 'radiography_exam': {'radiography_exam_RSNA_20220214.tsv': ['view_position']},
 'ct_instance': {'ct_instance_RSNA_20220214.tsv': ['storage_urls']},
 'dx_series': {'dx_series_RSNA_20220214.tsv': ['contrast_bolus_agent_number',
   'view_position']},
 'ct_scan': {'ct_scan_RSNA_20220214.tsv': ['scan_notes']},
 'case': {'case_RSNA_20220204.tsv': ['zip']}}