##### Extract PDF from Forum DICOM
----------------------------
v.2.0.0

## Imports

In [1]:
import os
import os.path
import pandas as pd
import numpy as np
import datetime
from os import listdir
from datetime import date

import pydicom

## Input / Output Path

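Set the directory that contains the Forum .dcm files (the extracted PDFs are written to sub-folders of this same directory) and, optionally, the path to a subset table that limits which files are converted.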

In [2]:

# Set to directory that contains the dcm files and where the pdfs will be placed
# (the pdfs are written to sub-folders that are created inside this directory).
input_dir = '/Users/nicolebrye/Desktop/HGC/Projects/vf-dicom'

# Set to the path of the subset table. The value is handed straight to pd.read_csv in a
# later cell, so it has to point at the CSV file itself rather than at its folder.
# Type "False" (without quotes) after the equal sign if there is no subset being used.
# NOTE(review): the value below is the same folder as input_dir, not a CSV file - confirm
# the intended file name before running with a subset.
subset = '/Users/nicolebrye/Desktop/HGC/Projects/vf-dicom'


In [3]:
# Names of the DICOM files found directly inside input_dir.
# listdir() returns entries in arbitrary order, so the names are sorted to make the
# processing order (and therefore any overwrite of an identically named PDF and the
# subset bookkeeping) reproducible from one run to the next.
files = sorted(name for name in listdir(input_dir) if name.endswith('.dcm'))


In [4]:
# Create a class to throw exceptions
class ContinueOuter(Exception):
    """Exception used to leave a nested loop and move on to the next outer iteration."""


def normalise_subset_date(day):
    """Convert a date from the subset table into the ``YYMMDD`` string used for matching.

    Parameters
    ----------
    day : object
        Value taken from the ``Date`` column. Strings are split on ``/`` and read as
        ``month/day/year``.

    Returns
    -------
    object
        A six-character ``YYMMDD`` string for string input; any non-string value
        (for example the NaN pandas yields for an empty cell) is returned unchanged.

    Raises
    ------
    ValueError, IndexError
        If a string cannot be interpreted as ``month/day/year``.
    """
    if not isinstance(day, str):
        return day
    parts = day.split("/")
    parsed = datetime.datetime(int(parts[-1]), int(parts[0]), int(parts[1]))
    return parsed.strftime('%y%m%d')


def build_subset_info(table):
    """Group the rows of the subset table by ``maskedID``.

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain a ``maskedID`` column. ``Eye`` and ``Date`` columns are optional;
        a missing column is treated as if every cell in it were empty.

    Returns
    -------
    dict
        Maps each ``maskedID`` to a list of ``[eye, date, found_count]`` entries, one
        per table row, with ``found_count`` starting at 0.
    """
    n_rows = len(table.index)
    # np.nan (lower case) is used because the np.NaN alias no longer exists in NumPy 2.0.
    eyes = table["Eye"].tolist() if "Eye" in table.columns else [np.nan] * n_rows
    days = table["Date"].tolist() if "Date" in table.columns else [np.nan] * n_rows

    grouped = {}
    for key, eye_value, day in zip(table["maskedID"].tolist(), eyes, days):
        grouped.setdefault(key, []).append([eye_value, normalise_subset_date(day), 0])
    return grouped


# Create a dictionary with the info
try:
    if isinstance(subset, str):
        # Read the IDs as text so that purely numeric IDs are not turned into numbers,
        # which could never equal the string patient ID stored in a DICOM file.
        info = build_subset_info(pd.read_csv(subset, dtype={"maskedID": str}))
    else:
        # subset was set to False: no subset table is in use.
        info = None
except Exception as error:
    # Keep the previous fallback (info = None) but say why the table was not loaded
    # instead of hiding the failure.
    print("Warning: the subset table could not be loaded (" + repr(error) + ")")
    info = None

## Extract PDF and write to output

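Each PDF is written to a sub-folder of the input directory named `<today's date>_<test type>`, and the file itself is named `<patient ID>_<eye>_<study date>_<test type>_<acquisition time>.pdf`.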

In [10]:

# Tag for test type
x = 0x7717
y = 0x1001

# Set today's date (YYYYMMDD, used in the name of the output folder)
today = date.today().strftime('%Y%m%d')


def subset_field_matches(expected, actual):
    """Tell whether one field of a subset-table entry is compatible with a DICOM value.

    A string taken from the table has to equal the DICOM value. Anything that is not
    a string (an empty cell is read as NaN) counts as "not given" and matches any value.
    """
    return not isinstance(expected, str) or expected == actual


for f in files:

    # File to read
    input_path = os.path.join(input_dir, f)

    # Read DICOM (dcmread is the supported name; read_file is a deprecated alias
    # that newer pydicom releases no longer provide)
    dcm = pydicom.dcmread(input_path)

    # Set title and remove overview
    title = dcm.DocumentTitle
    if title.startswith('OVERVIEW'):
        continue

    # Patient ID: prefer the value recorded in the modified-attributes sequence and
    # fall back to the top-level PatientID when that sequence is absent or too short
    try:
        patient_id = dcm.OriginalAttributesSequence[1].ModifiedAttributesSequence[0].PatientID
    except (AttributeError, IndexError):
        patient_id = dcm.PatientID

    # Eye: 'L' is the left eye (OS); every other laterality value is written as OD
    eye = 'OS' if dcm.ImageLaterality == 'L' else 'OD'

    # Date
    visit_date = dcm.StudyDate
    visit_time = dcm.AcquisitionDateTime[8:12]

    # Only output pdfs for DICOMS that are in the subset
    if isinstance(subset, str):
        if info is None:
            # A subset was requested but its table never loaded; stop with a clear
            # message instead of failing on the lookup below.
            raise RuntimeError("A subset was requested but the subset table could not be loaded: " + subset)

        # The table stores dates as YYMMDD, so drop the first two characters of StudyDate
        short_date = visit_date[2:]

        # First table entry of this patient that is compatible with the file. Only
        # that entry is marked as found; entries that do not match are left untouched
        # so that the "not found" report stays accurate.
        match = next(
            (
                entry
                for entry in info.get(patient_id, [])
                if subset_field_matches(entry[0], eye)
                and subset_field_matches(entry[1], short_date)
            ),
            None,
        )
        if match is None:
            continue
        match[-1] += 1

    # Determine the type of test: keep the digits of the tag value and put a dash
    # after the second one
    char_list = [char for char in dcm[x, y].value if char.isdigit()]
    char_list.insert(2, "-")
    test_type = "".join(char_list)

    # Set output directory
    output_dir = os.path.join(input_dir, today + '_' + test_type)

    # Create output location if needed
    os.makedirs(output_dir, exist_ok=True)

    pdf_name = '_'.join([patient_id, eye, visit_date, test_type, visit_time]) + '.pdf'
    pdf_path = os.path.join(output_dir, pdf_name)

    # Write data to output, overwriting the file if it exists; the with-block closes
    # the file even when the write fails
    with open(pdf_path, 'wb') as pdf_file:
        pdf_file.write(dcm.EncapsulatedDocument)


In [11]:
# Run this cell to find which entries in the table, if any, were not found in the subset of files
if info is None:
    # No subset table is loaded, so there are no table entries to cross-check.
    print("All files were converted to PDFs successfully :D")
else:
    count = 0
    for i, entries in info.items():
        for j in entries:
            # The last item of an entry is the number of DICOM files matched to it.
            if j[-1] != 0:
                continue

            first = str(j[0])
            second = str(j[1])

            # Empty table cells are NaN, which str() renders as "nan".
            if first == "nan":
                first = "(not given)"
            if second == "nan":
                second = "(not given)"

            # str(i) keeps the report working when an ID is not a string.
            print("A DICOM for ID " + str(i) + ", eye " + first + ", and date " + second + " was not found")
            count += 1

    if count == 0:
        print("All files in the subset were converted to PDFs successfully :D")

All files in the subset were converted to PDFs successfully :D
