##### Extract PDF from Forum DICOM
----------------------------
v.4.0.0

## Imports

In [1]:
import re
import os
import os.path
from os import listdir
from datetime import date

import pydicom

## Input / Output Path

Set the directory containing the Forum .dcm files (each holding an embedded PDF) and the directory in which to store the output.

In [9]:

# Directory where the extracted PDFs will be placed.
# NOTE: machine-specific absolute path; edit before running elsewhere.
output_dir = "/Users/nicolebrye/Desktop/HGC/Data_Management/vf-dicom"

# Directory that contains the source .dcm files.
# NOTE: machine-specific absolute path; edit before running elsewhere.
input_dir = "/Users/nicolebrye/Desktop/HGC/Data_Management/vf-dicom"


In [10]:
# Names of the .dcm files found in input_dir.
# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and therefore which copy of a duplicate is kept later in the
# notebook) the same on every run.
files = sorted(name for name in listdir(input_dir) if name.endswith('.dcm'))

## Extract PDF and write to output

Each PDF is written to a sub-folder of the output directory named with today's date and the test type. The file name contains the patient ID, eye, visit date, report title, test type, and visit time (using the .pdf extension).

In [8]:

# (group, element) of the DICOM tag that is read to determine the test type
x = 0x7717
y = 0x1001

# Today's date as YYYYMMDD, used in the output folder names
today = date.today().strftime('%Y%m%d')

# Keys (patient, eye, date, title, time) of the PDFs already written;
# used to skip duplicates
produced = set()
# Parallel lists: base file name and PDF bytes of every file whose test type
# could not be determined. They are written out by the next cell.
missing  = list()
m_files  = list()

# Every document title encountered, recorded before it is shortened
titles = []
for f in files:

    # File to read
    input_path = os.path.join(input_dir, f)

    # Read DICOM (dcmread is the current name; read_file is a deprecated alias)
    dcm = pydicom.dcmread(input_path)
    pdf_data = bytearray(dcm.EncapsulatedDocument)

    # Record the title and skip overview documents
    title = dcm.DocumentTitle
    titles.append(title)

    if title.startswith('OVERVIEW'):
        continue

    # Shorten the title to the report kind it mentions; the first match wins
    for report_kind in ('SFA', 'GPA', 'THREE_IN_ONE'):
        if report_kind in title:
            title = report_kind
            break

    # Patient ID
    patient_id = dcm.PatientID

    # Eye: 'L' maps to OS, anything else is treated as OD
    eye = 'OS' if dcm.Laterality == 'L' else 'OD'

    # Visit date (YYYYMMDD) and time (HHMM) sliced from the acquisition timestamp
    visit_date = dcm.AcquisitionDateTime[0:8]
    visit_time = dcm.AcquisitionDateTime[8:12]

    # Key identifying this document regardless of test type; also used as the
    # base file name when the test type is unavailable
    dup = "_".join([patient_id, eye, visit_date, title, visit_time])

    # Determine the type of test: a "NN-N" pattern inside the tag's string form.
    # Only the tag lookup is guarded, and with `except Exception` rather than a
    # bare except, so interrupting the kernel is not mistaken for a missing tag.
    try:
        match = re.search(r"\d{2}-\d", str(dcm[x, y]))
    except Exception:
        match = None

    if match is None:
        # Tag absent, unreadable, or without the expected pattern:
        # keep the PDF aside for the "Missing_tags" folder
        missing.append(dup)
        m_files.append(pdf_data)
        continue

    test_type = match.group()

    # Skip documents that have already been written
    if dup in produced:
        continue
    produced.add(dup)

    # Create the output folder (one per run date and test type) if needed
    out_path = os.path.join(output_dir, today + '_' + test_type)
    os.makedirs(out_path, exist_ok=True)

    pdf_path = '_'.join([patient_id, eye, visit_date, title, test_type, visit_time]) + '.pdf'
    pdf_path = os.path.join(out_path, pdf_path)

    # Write data to output, overwriting the file if it exists; the context
    # manager closes the handle even if the write fails
    with open(pdf_path, 'wb') as pdf_file:
        pdf_file.write(pdf_data)
    

## Write files with missing tags to a separate folder

Any PDFs whose test type could not be read from the DICOM tags are written to a separate folder named with today's date and "Missing_tags".

In [10]:
# Folder for PDFs whose test type could not be determined
out_path = os.path.join(output_dir, today + "_Missing_tags")

# missing and m_files are parallel lists: base file name and PDF bytes
for missing_name, f in zip(missing, m_files):

    # If a copy of this document was already written with a test type, skip it
    if missing_name in produced:
        continue

    # Create the folder only once there is something to write into it
    os.makedirs(out_path, exist_ok=True)

    pdf_path = os.path.join(out_path, missing_name + '.pdf')

    # Write data to output, overwriting the file if it exists; the context
    # manager closes the handle even if the write fails
    with open(pdf_path, 'wb') as pdf_file:
        pdf_file.write(f)

print("All files have been placed into folders successfully :D")

All files have been placed into folders successfully :D
