# Generating ROI Names for Database

We want to parse each of the historical data files and determine which OARs (organs at risk) are present, based on the corresponding structure set file.

In [1]:
import dicom
import os
import sys
from extractROINames import preprocess_roi_name_ucla

In [2]:
# Collect every unique (preprocessed) ROI name found in the structure set of
# each case directory under BASE_DIR, and count how many cases contain at
# least one ROI whose name includes "ptv".

# Data root: "<two levels above the current working directory>/data/".
BASE_DIR = os.path.dirname(os.path.dirname(os.getcwd())) + "/data/"

# Legacy hard-coded case IDs. The loop below walks BASE_DIR instead, so this
# set is no longer used for the final check; it is kept only in case a later
# cell still references it.
cases = {"5", "6", "7", "8", "9", "28", "29", "30", "31", "32"}
names = set()
cases_with_ptv = 0
total_cases = 0  # number of cases whose structure set was actually loaded
for case in sorted(os.listdir(BASE_DIR)):

    # Skip zip archives that sit next to the case directories.
    if "zip" in case:
        continue

    # "ANON" cases keep their structure set in RTSTRUCT0.dcm; every other
    # case keeps it in structureset.dcm.
    is_anon = "ANON" in case
    filename = "RTSTRUCT0.dcm" if is_anon else "structureset.dcm"
    try:
        x = dicom.read_file(os.path.join(BASE_DIR, case, filename))
    except FileNotFoundError:
        # Only ANON cases are allowed to lack a structure set; a missing file
        # for any other case is still treated as an error.
        if is_anon:
            continue
        raise

    total_cases += 1

    # Normalize every ROI name in the Structure Set ROI Sequence.
    # NOTE(review): the exact normalization is defined in extractROINames;
    # the captured output suggests it lower-cases the names — confirm.
    roi_names = {
        preprocess_roi_name_ucla(seq.ROIName)
        for seq in x.StructureSetROISequence
    }
    names.update(roi_names)

    # A case counts once, no matter how many of its ROIs mention "ptv".
    if any("ptv" in roi_name for roi_name in roi_names):
        cases_with_ptv += 1

names = sorted(names)
print("ALL Unique names:" + str(names))
print("total names: " + str(len(names)))
# Compare against the number of cases actually processed, not the size of the
# unrelated hard-coded `cases` set.
print("do cases with ptv match total cases?" + str(total_cases == cases_with_ptv))

ALL Unique names:['5610[cgy]', '6270[cgy]', '6930[cgy]', 'ant avoid', 'anus', 'aoptcapp', 'aring', 'aring2', 'artifact', 'atv3wl', 'avoid', 'avoid l', 'avoid lt', 'avoid lt pst', 'avoid rt pst', 'avoid1', 'bb', 'bbs', 'bladder', 'bladder opti', 'body', 'body avoid', 'bones', 'c0 rln', 'c0gtv', 'c0mandibler', 'c0mrenhancem', 'c0mri', 'c0pet', 'c0petln', 'c1ctv', 'c1dzext', 'c1gtv', 'c1gtvln', 'c1hypophx', 'c1larynx', 'c1ln', 'c1lnjk', 'c1lnlt', 'c1lnrt', 'c1lowerlip', 'c1ltongue', 'c1npx', 'c1opbed', 'c1oraltong', 'c1orophx', 'c1pet', 'c1postoptumorb', 'c1prim', 'c1primtumr', 'c1r oro', 'c1rnasoethmoid', 'c1seroma', 'c1site', 'c1temptumor', 'c1tongbot', 'c1tongdzextrt', 'c1total', 'c1tumor', 'c1tumrbdbucmuc', 'c1tvc', 'c1upnklt', 'c2 dzext', 'c2ctv', 'c2disext', 'c2dxext', 'c2dzext', 'c2dzext1', 'c2ln', 'c2localext', 'c2lwnklt', 'c2neckltup', 'c2necklwlt', 'c2necklwrt', 'c2neckrt', 'c2neckuplt', 'c2neckupltjk', 'c2neckuprt', 'c2neckuprtjk', 'c2nkupl', 'c2nkuplt', 'c2nkupr', 'c2nkuprt', 

In [3]:
# Load the structure set of one anonymized case and list its attribute names;
# the trailing expression is what the notebook displays as the cell output.
sample_path = BASE_DIR + "ANON0/RTSTRUCT0.dcm"
x = dicom.read_file(sample_path)
x.dir("")

['AccessionNumber',
 'DeidentificationMethod',
 'EthnicGroup',
 'InstanceCreationDate',
 'InstanceCreationTime',
 'InstanceNumber',
 'InstitutionName',
 'Manufacturer',
 'ManufacturerModelName',
 'Modality',
 'PatientBirthDate',
 'PatientID',
 'PatientIdentityRemoved',
 'PatientName',
 'PatientSex',
 'ROIContourSequence',
 'RTROIObservationsSequence',
 'RefdFrameOfReferenceSequence',
 'ReferencedFrameOfReferenceSequence',
 'ReferringPhysicianName',
 'SOPClassUID',
 'SOPInstanceUID',
 'SeriesDescription',
 'SeriesInstanceUID',
 'SeriesNumber',
 'SoftwareVersions',
 'SpecificCharacterSet',
 'StructureSetDate',
 'StructureSetLabel',
 'StructureSetROISequence',
 'StructureSetTime',
 'StudyDate',
 'StudyDescription',
 'StudyID',
 'StudyInstanceUID',
 'StudyTime']