##### Extract PDF from Forum DICOM
----------------------------
v.10.0.0

## Imports

In [1]:
import re
import os
import os.path
import shutil
from os import listdir
from datetime import date

import pydicom
import pdfquery as pq

# Image imports
import numpy as np
import PIL.Image as Image
from pdf2image import convert_from_path
import pytesseract as pt

## Input / Output Path

Set the path to the directory containing the Forum .dcm files (each holding an encapsulated PDF) and the directory where the output will be stored

In [2]:
# Set to directory that contains the dcm files
# (either directly, or inside immediate subfolders)
input_dir = '/Volumes/glaucoma/SHILEY_VISUAL_FIELDS/Upload_DICOM_PDF'

# Set to directory where the pdfs will be placed
# NOTE: currently the same location as input_dir
output_dir = '/Volumes/glaucoma/SHILEY_VISUAL_FIELDS/Upload_DICOM_PDF'


## Functions
Functions for extracting files and converting DICOMS to PDF

In [3]:
# Functions to extract DICOMS from a folder
def extract_dcm(fp):
    """Return the names of the entries in directory `fp` that end with '.dcm'."""
    dcm_names = []
    for entry in listdir(fp):
        if entry.endswith('.dcm'):
            dcm_names.append(entry)
    return dcm_names

# Function to extract subfolders
def extract_folders(fp):
    """Return the names of the immediate subdirectories of directory `fp`."""
    subdirs = []
    for entry in listdir(fp):
        candidate = os.path.join(fp, entry)
        if os.path.isdir(candidate):
            subdirs.append(entry)
    return subdirs

# Function to check whether or not the pdf requires OCR
def check_OCR(fp):
    """Move the PDF at `fp` into the '<today>_Requires_OCR' folder when a
    fixed window of its first page is entirely 255-valued.

    The first page is rendered to an image and the pixel window covering
    rows 300-399 and columns 100-199 is sampled. When the mean of that window
    is exactly 255.0 the file is moved to
    ``os.path.join(output_dir, today + "_Requires_OCR")``; otherwise the file
    is left where it is.

    Relies on the notebook-level globals `output_dir` and `today`.

    Args:
        fp: path to the PDF file to inspect.
    """
    # Only page 1 is inspected, so do not render the remaining pages
    first_page = convert_from_path(fp, first_page=1, last_page=1)[0]
    img = np.array(first_page)[300:400, 100:200]

    if img.mean() == 255.0:
        # Create the destination lazily so no empty folder is left behind
        # when no file needs to be moved
        ofp = os.path.join(output_dir, today + "_Requires_OCR")
        os.makedirs(ofp, exist_ok=True)
        shutil.move(fp, ofp)

In [4]:
# Function to convert DICOMS to PDF
def convert_dcm(files, input_dir):
    """Write the PDF encapsulated in each DICOM file to a dated output folder.

    For every file name in `files` (resolved against `input_dir`) the DICOM is
    read, documents whose title starts with 'OVERVIEW' are skipped, and the
    title is collapsed to 'SFA', 'GPA' or 'THREE_IN_ONE' when it contains one
    of those keywords (the first match in that order wins).

    If a test type (two digits, a hyphen, one digit) can be read from the tag
    (x, y), the PDF is written to
    ``<output_dir>/<today>_<test_type>/<title>/`` under the name
    ``<patient>_<eye>_<date>_<title>_<test_type>_<time>.pdf`` and then handed
    to `check_OCR`. A later document with the same patient, eye, date, title
    and time is skipped. If the test type cannot be read, the name (without
    the test type) is appended to `missing` and the PDF bytes to `m_files`.

    Relies on the notebook-level globals `x`, `y`, `today`, `output_dir`,
    `produced`, `missing` and `m_files`.

    Args:
        files: iterable of DICOM file names located in `input_dir`.
        input_dir: directory that contains `files`.
    """
    for f in files:

        # Read DICOM (dcmread replaces the deprecated pydicom.read_file)
        dcm = pydicom.dcmread(os.path.join(input_dir, f))

        # Set title and skip overview documents before copying any payload
        title = dcm.DocumentTitle
        if title.startswith('OVERVIEW'):
            continue

        # Collapse the title to the first known keyword it contains
        for keyword in ('SFA', 'GPA', 'THREE_IN_ONE'):
            if keyword in title:
                title = keyword
                break

        pdf_data = bytearray(dcm.EncapsulatedDocument)

        # Patient ID
        patient_id = dcm.PatientID

        # Eye: Laterality 'L' maps to 'OS', every other value maps to 'OD'
        eye = 'OS' if dcm.Laterality == 'L' else 'OD'

        # Visit date (first 8 characters) and time (next 4 characters)
        visit_date = dcm.AcquisitionDateTime[0:8]
        visit_time = dcm.AcquisitionDateTime[8:12]

        # Key shared by the duplicate check and the missing-tag bookkeeping
        base_name = "_".join([patient_id, eye, visit_date, title, visit_time])

        # Determine the type of test; a missing tag or a value without the
        # expected pattern sends the document to the missing-tag lists
        try:
            test_type = re.search(r"\d{2}-\d", str(dcm[x, y])).group()
        except Exception:
            missing.append(base_name)
            m_files.append(pdf_data)
            continue

        # Skip documents that were already written under the same key
        if base_name in produced:
            continue
        produced.add(base_name)

        # Create output location if it does not exist yet
        op = os.path.join(output_dir, today + '_' + test_type, title)
        os.makedirs(op, exist_ok=True)

        pdf_name = '_'.join(
            [patient_id, eye, visit_date, title, test_type, visit_time]
        ) + '.pdf'
        pdf_path = os.path.join(op, pdf_name)

        # Write data to output, overwriting the file if it exists; the
        # context manager closes the handle even if the write fails
        with open(pdf_path, 'wb') as pdf_file:
            pdf_file.write(pdf_data)

        # Check to see if the pdf requires OCR
        check_OCR(pdf_path)

## Extract PDF and write to output

Each file is written to the output directory with a name containing the patient ID, eye, visit date, document title, test type, and visit time (using the .pdf extension)

In [5]:
# List the .dcm entries that sit directly inside input_dir
files = extract_dcm(input_dir)

In [6]:
# Tag for test type, used as dcm[x, y]
x = 0x7717
y = 0x1001

# Set today's date (YYYYMMDD); used as the prefix of every output folder
today = date.today().strftime('%Y%m%d')

# Containers for produced keys and for files whose test type is missing
produced = set()
missing  = list()
m_files  = list()

# Convert DICOMS to PDF
if files:
    # DICOM files sit directly inside input_dir
    convert_dcm(files, input_dir)

else:
    # Otherwise look one level down, in each immediate subfolder
    folders = extract_folders(input_dir)

    for folder in folders:

        sub_dir = os.path.join(input_dir, folder)

        # Use a separate name so the top-level `files` list is not
        # overwritten; re-running this cell then takes the same branch
        sub_files = extract_dcm(sub_dir)
        convert_dcm(sub_files, sub_dir)

## DICOMs with missing information

Any pdfs that are missing information will be placed into a separate folder and have the necessary information extracted

In [9]:
out_path = os.path.join(output_dir, today + "_Missing_tags")

# Create output location if it does not exist yet
os.makedirs(out_path, exist_ok=True)

# Index of the first entry of m_files to write. Keep at 0 for a complete run;
# raise it only to resume after an interrupted run.
start_index = 0

for i in range(start_index, len(m_files)):

    # If the missing file has a duplicate that was produced, skip it
    if missing[i] in produced:
        continue

    pdf_path = os.path.join(out_path, missing[i] + '.pdf')

    # Write data to output, overwriting the file if it exists; the context
    # manager closes the handle even if the write fails
    with open(pdf_path, 'wb') as pdf_file:
        pdf_file.write(m_files[i])

In [10]:
# Leftover debugging cell: displays the current value of the loop index `i`
i

18902

## Extract information and write to output

Extract test type information using PDFQuery, falling back to pytesseract OCR when the text lookup does not yield a test type

In [11]:
# Function to extract test type using pytesseract OCR
def extract_test(fp):
    """Read the test type from the first page of the PDF at `fp` using OCR.

    The first page is rendered to an image, the window covering rows 200-249
    and columns 175-484 is cropped, and the crop is passed to pytesseract.
    The first substring of the recognised text that consists of two digits,
    a hyphen and one digit is returned.

    The crop is handed to pytesseract in memory, so no temporary image file
    is written.

    Args:
        fp: path to the PDF file.

    Returns:
        The matched test type string.

    Raises:
        AttributeError: if the recognised text contains no such substring.
    """
    # Only page 1 is used, so do not render the remaining pages
    first_page = convert_from_path(fp, first_page=1, last_page=1)[0]
    region = np.array(first_page)[200:250, 175:485]

    # OCR straight from the in-memory crop (no lossy JPEG round trip)
    text = pt.image_to_string(Image.fromarray(region))

    return re.search(r"\d{2}-\d", text).group()

In [None]:
# List out files with missing tags
missing_dir = os.path.join(output_dir, today + "_Missing_tags")
ocr_dir     = os.path.join(output_dir, today + "_Requires_OCR")
move_files  = os.listdir(missing_dir)

# Bounding box searched for the test type text; the selector is the same for
# every file, so build it once
bbox = [430.17, 711.35225, 551.4795, 721.10225]
bbox_selector = 'LTTextLineHorizontal:overlaps_bbox("{}")'.format(
    ','.join(str(int(coord)) for coord in bbox)
)

for i, f in enumerate(move_files):

    # Only the PDFs written by the previous step are handled here
    if not f.endswith('.pdf'):
        continue

    # Load in each pdf
    old_op = os.path.join(missing_dir, f)

    pdf = pq.PDFQuery(old_op)
    pdf.load()

    # Extract the text lines overlapping the desired bbox
    query = [item.layout.get_text() for item in pdf.pq(bbox_selector)]

    # Extract the test type from the text, falling back to OCR
    try:
        test_type = re.search(r"\d{2}-\d", query[0]).group()
        used_ocr = False
    except (IndexError, AttributeError):
        # No text line in the bbox, or no test type pattern in it
        try:
            test_type = extract_test(old_op)
        except Exception:
            # Best effort: leave the file in the Missing_tags folder
            continue
        used_ocr = True

    # File names look like <patient>_<eye>_<date>_<title>_<time>.pdf and the
    # title itself may contain underscores (e.g. THREE_IN_ONE), so peel the
    # time off the right instead of splitting on every underscore
    head, time_part = f.rsplit("_", 1)
    fname = "_".join([head, test_type, time_part])

    if used_ocr:
        new_op = os.path.join(ocr_dir, test_type)
    else:
        # Everything after the third underscore of the head is the title
        method = head.split("_", 3)[3]
        new_op = os.path.join(output_dir, today + "_" + test_type, method)

    # Create output location if it does not exist yet
    os.makedirs(new_op, exist_ok=True)

    shutil.move(old_op, os.path.join(new_op, fname))


# Remove the temporary image file, if one exists
try:
    os.remove(os.path.join(input_dir, "image.jpeg"))
except OSError:
    pass

# Delete the Missing Tags Folder (os.rmdir only succeeds when it is empty)
try:
    os.rmdir(missing_dir)
    print("All files have been placed into folders successfully :D")
except OSError:
    print("There are still files with missing information")

In [15]:
# Leftover debugging cell: displays the current value of the loop index `i`
i

0