#### Extracting PDF Information using `PDFQuery`

v4.0.0

---

#### Input and Output Directories

Set to the directory the contains the DCM files

In [1]:
input_dir = '/Volumes/glaucoma/SHILEY_VISUAL_FIELDS/20220907_24-2/SFA'

Set to the directory where the PDFs will be placed

In [2]:
output_dir = '/Volumes/glaucoma/SHILEY_VISUAL_FIELDS/20220907_24-2/SFA'

---

#### Imports

In [3]:
import_message = "There may be other dependencies you need to install, you can let Nicole know"

import os
import re
import numpy as np
import pandas as pd
import shutil


# Imports that sometimes cause troubles

# PDF Query
try:
    import pdfquery as pq
except:
    !pip install pdfquery
    try: 
        import pdfquery as pq
    except:
        print(import_message)


        
# PIL
try: 
    import PIL.Image as Image
except:
    !pip install pillow
    try:
        import PIL.Image as Image
    except:
        print(import_message)

        
        
# pdf2image
try:
    from pdf2image import convert_from_path
except:
    !pip install pdf2image
    try:
        from pdf2image import convert_from_path
    except:
        print(import_message)

---

#### Important Global Variables

In [4]:
# List of files
files = [x for x in os.listdir(input_dir) if x.endswith(".pdf")]

In [5]:
# Column names for output
cols = ['Batch_UID', 'Exam_UID', 'InstrumentModel', 'InstrumentSerialNumber', 
        'InstrumentSoftwareVersion', 'PatientID', 'GIVEN_NAME', 'LAST_NAME', 
        'StudyCode', 'aeDOB', 'Eye', 'SeriesDateTime',
        'aeExamDate', 'aeExamTime', 'ExamDuration', 'aeIsShileyClinicHFAExam', 'aeDIGSTestType',
        'TestType', 'TestPattern', 'TestStrategy', 'StimulusColor', 'StimulusSize', 
        'BackgroundColor', 'FixationTarget', 'FixationMonitor', 'TrialRXSphereRaw', 
        'TrialRXCylRaw', 'TrialRXAxisRaw', 'PupilDiameter', 'VAType', 'BlindSpotX','BlindSpotY',
        'BlindSpotStimulusSize', 'FalseNegativePercent', 'FalsePositivePercent',
        'aeFixationCheckPercentage', 'FovealResult', 'FovealThreshold', 'ClinicalNotes',
        'SFStatus', 'SF', 'SFProb', 'SWAPFTGeneralHeight', 'GHTType', 'MD', 'MDProb', 'PSD',
        'PSDProb', 'CPSD', 'CPSDProb', 'FovealThresholdProb', 'aePDPCen4LT5Count', 
        'aeHasHighRawThreshold', 'FLAGAssessment', 'FLAGSeverity', 'AGISScore', 'AGISNas', 
        'AGISInf', 'AGISSup', 'GHSupThrSum', 'GHSupThrMean', 'GHSupThrStd', 'GHSupTDSum',
        'GHSupTDMean', 'GHSupTDStd', 'GHSupPDSum', 'GHSupPDMean', 'GHSupPDStd', 'GHSupPDCntLT50p',
        'GHSupPDCntLT10p', 'GHSupNasThrSum', 'GHSupNasThrMean', 'GHSupNasThrStd', 'GHSupNasTDSum',
        'GHSupNasTDMean', 'GHSupNasTDStd', 'GHSupNasPDSum', 'GHSupNasPDMean', 'GHSupNasPDStd',
        'GHSupNasPDCntLT50p', 'GHSupNasPDCntLT10p', 'GHInfThrSum', 'GHInfThrMean', 'GHInfThrStd',
        'GHInfTDSum', 'GHInfTDMean', 'GHInfTDStd', 'GHInfPDSum', 'GHInfPDMean', 'GHInfPDStd',
        'GHInfPDCntLT50p', 'GHInfPDCntLT10p', 'GHInfNasThrSum', 'GHInfNasThrMean', 'GHInfNasThrStd',
        'GHInfNasTDSum', 'GHInfNasTDMean', 'GHInfNasTDStd', 'GHInfNasPDSum', 'GHInfNasPDMean',
        'GHInfNasPDStd', 'GHInfNasPDCntLT50p', 'GHInfNasPDCntLT10p', 'GHCentralThrSum',
        'GHCentralThrMean', 'GHCentralThrStd', 'GHCentralTDSum', 'GHCentralTDMean', 'GHCentralTDStd',
        'GHCentralPDSum', 'GHCentralPDMean', 'GHCentralPDStd', 'GHCentralPDCntLT50p', 
        'GHCentralPDCntLT10p', 'GHTemporalThrSum', 'GHTemporalThrMean', 'GHTemporalThrStd',
        'GHTemporalTDSum', 'GHTemporalTDMean', 'GHTemporalTDStd', 'GHTemporalPDSum', 
        'GHTemporalPDMean', 'GHTemporalPDStd', 'GHTemporalPDCntLT50p', 'GHTemporalPDCntLT10p',
        'N9_S27_Thr', 'N3_S27_Thr', 'T3_S27_Thr', 'T9_S27_Thr', 'N15_S21_Thr', 'N9_S21_Thr',
        'N3_S21_Thr', 'T3_S21_Thr', 'T9_S21_Thr', 'T15_S21_Thr', 'N21_S15_Thr', 'N15_S15_Thr', 
        'N9_S15_Thr', 'N3_S15_Thr', 'T3_S15_Thr', 'T9_S15_Thr', 'T15_S15_Thr', 'T21_S15_Thr',
        'N27_S9_Thr', 'N21_S9_Thr', 'N15_S9_Thr', 'N9_S9_Thr', 'N3_S9_Thr', 'T3_S9_Thr', 'T9_S9_Thr',
        'T15_S9_Thr', 'T21_S9_Thr', 'T27_S9_Thr', 'N27_S3_Thr', 'N21_S3_Thr', 'N15_S3_Thr', 
        'N9_S3_Thr',  'N3_S3_Thr', 'T3_S3_Thr', 'T9_S3_Thr', 'T21_S3_Thr', 'T27_S3_Thr', 
        'N27_I3_Thr', 'N21_I3_Thr', 'N15_I3_Thr', 'N9_I3_Thr', 'N3_I3_Thr', 'T3_I3_Thr', 'T9_I3_Thr', 
        'T21_I3_Thr', 'T27_I3_Thr', 'N27_I9_Thr', 'N21_I9_Thr', 'N15_I9_Thr', 'N9_I9_Thr', 
        'N3_I9_Thr', 'T3_I9_Thr', 'T9_I9_Thr', 'T15_I9_Thr', 'T21_I9_Thr', 'T27_I9_Thr', 
        'N21_I15_Thr', 'N15_I15_Thr', 'N9_I15_Thr', 'N3_I15_Thr', 'T3_I15_Thr', 'T9_I15_Thr', 
        'T15_I15_Thr', 'T21_I15_Thr', 'N15_I21_Thr', 'N9_I21_Thr', 'N3_I21_Thr', 'T3_I21_Thr',
        'T9_I21_Thr', 'T15_I21_Thr', 'N9_I27_Thr', 'N3_I27_Thr', 'T3_I27_Thr', 'T9_I27_Thr', 
        'N9_S27_TD', 'N3_S27_TD', 'T3_S27_TD', 'T9_S27_TD', 'N15_S21_TD', 'N9_S21_TD', 'N3_S21_TD', 
        'T3_S21_TD', 'T9_S21_TD', 'T15_S21_TD', 'N21_S15_TD', 'N15_S15_TD', 'N9_S15_TD', 'N3_S15_TD',
        'T3_S15_TD', 'T9_S15_TD', 'T15_S15_TD', 'T21_S15_TD', 'N27_S9_TD', 'N21_S9_TD', 'N15_S9_TD', 
        'N9_S9_TD', 'N3_S9_TD', 'T3_S9_TD', 'T9_S9_TD', 'T15_S9_TD', 'T21_S9_TD', 'T27_S9_TD', 
        'N27_S3_TD', 'N21_S3_TD', 'N15_S3_TD', 'N9_S3_TD', 'N3_S3_TD', 'T3_S3_TD', 'T9_S3_TD', 
        'T21_S3_TD', 'T27_S3_TD', 'N27_I3_TD', 'N21_I3_TD', 'N15_I3_TD', 'N9_I3_TD', 'N3_I3_TD',
        'T3_I3_TD', 'T9_I3_TD', 'T21_I3_TD', 'T27_I3_TD', 'N27_I9_TD', 'N21_I9_TD', 'N15_I9_TD', 
        'N9_I9_TD', 'N3_I9_TD', 'T3_I9_TD', 'T9_I9_TD', 'T15_I9_TD', 'T21_I9_TD', 'T27_I9_TD',
        'N21_I15_TD', 'N15_I15_TD', 'N9_I15_TD', 'N3_I15_TD', 'T3_I15_TD', 'T9_I15_TD', 'T15_I15_TD',
        'T21_I15_TD', 'N15_I21_TD', 'N9_I21_TD', 'N3_I21_TD', 'T3_I21_TD', 'T9_I21_TD', 'T15_I21_TD',
        'N9_I27_TD', 'N3_I27_TD', 'T3_I27_TD', 'T9_I27_TD', 'N9_S27_PD', 'N3_S27_PD', 'T3_S27_PD', 
        'T9_S27_PD', 'N15_S21_PD', 'N9_S21_PD', 'N3_S21_PD', 'T3_S21_PD', 'T9_S21_PD', 'T15_S21_PD',
        'N21_S15_PD', 'N15_S15_PD', 'N9_S15_PD', 'N3_S15_PD', 'T3_S15_PD', 'T9_S15_PD', 'T15_S15_PD',
        'T21_S15_PD', 'N27_S9_PD', 'N21_S9_PD', 'N15_S9_PD', 'N9_S9_PD', 'N3_S9_PD', 'T3_S9_PD', 
        'T9_S9_PD', 'T15_S9_PD', 'T21_S9_PD', 'T27_S9_PD', 'N27_S3_PD', 'N21_S3_PD', 'N15_S3_PD', 
        'N9_S3_PD', 'N3_S3_PD', 'T3_S3_PD', 'T9_S3_PD', 'T21_S3_PD', 'T27_S3_PD', 'N27_I3_PD',
        'N21_I3_PD', 'N15_I3_PD', 'N9_I3_PD', 'N3_I3_PD', 'T3_I3_PD', 'T9_I3_PD', 'T21_I3_PD', 
        'T27_I3_PD', 'N27_I9_PD', 'N21_I9_PD', 'N15_I9_PD', 'N9_I9_PD', 'N3_I9_PD', 'T3_I9_PD', 
        'T9_I9_PD', 'T15_I9_PD', 'T21_I9_PD', 'T27_I9_PD', 'N21_I15_PD', 'N15_I15_PD', 'N9_I15_PD',
        'N3_I15_PD', 'T3_I15_PD', 'T9_I15_PD', 'T15_I15_PD', 'T21_I15_PD', 'N15_I21_PD', 'N9_I21_PD', 
        'N3_I21_PD', 'T3_I21_PD', 'T9_I21_PD', 'T15_I21_PD', 'N9_I27_PD', 'N3_I27_PD', 'T3_I27_PD',
        'T9_I27_PD', 'N9_S27_TDP', 'N3_S27_TDP', 'T3_S27_TDP', 'T9_S27_TDP', 'N15_S21_TDP', 'N9_S21_TDP', 
        'N3_S21_TDP', 'T3_S21_TDP', 'T9_S21_TDP', 'T15_S21_TDP', 'N21_S15_TDP', 'N15_S15_TDP', 
        'N9_S15_TDP', 'N3_S15_TDP', 'T3_S15_TDP', 'T9_S15_TDP', 'T15_S15_TDP', 'T21_S15_TDP', 
        'N27_S9_TDP', 'N21_S9_TDP', 'N15_S9_TDP', 'N9_S9_TDP', 'N3_S9_TDP', 'T3_S9_TDP', 'T9_S9_TDP', 
        'T15_S9_TDP', 'T21_S9_TDP', 'T27_S9_TDP', 'N27_S3_TDP', 'N21_S3_TDP', 'N15_S3_TDP', 'N9_S3_TDP', 
        'N3_S3_TDP', 'T3_S3_TDP', 'T9_S3_TDP', 'T21_S3_TDP', 'T27_S3_TDP', 'N27_I3_TDP', 'N21_I3_TDP',
        'N15_I3_TDP', 'N9_I3_TDP', 'N3_I3_TDP', 'T3_I3_TDP', 'T9_I3_TDP', 'T21_I3_TDP', 'T27_I3_TDP',
        'N27_I9_TDP', 'N21_I9_TDP', 'N15_I9_TDP', 'N9_I9_TDP', 'N3_I9_TDP', 'T3_I9_TDP', 'T9_I9_TDP',
        'T15_I9_TDP', 'T21_I9_TDP', 'T27_I9_TDP', 'N21_I15_TDP', 'N15_I15_TDP', 'N9_I15_TDP', 
        'N3_I15_TDP', 'T3_I15_TDP', 'T9_I15_TDP', 'T15_I15_TDP', 'T21_I15_TDP', 'N15_I21_TDP', 
        'N9_I21_TDP', 'N3_I21_TDP', 'T3_I21_TDP', 'T9_I21_TDP', 'T15_I21_TDP', 'N9_I27_TDP', 
        'N3_I27_TDP', 'T3_I27_TDP', 'T9_I27_TDP', 'N9_S27_PDP', 'N3_S27_PDP', 'T3_S27_PDP', 
        'T9_S27_PDP', 'N15_S21_PDP', 'N9_S21_PDP', 'N3_S21_PDP', 'T3_S21_PDP', 'T9_S21_PDP', 
        'T15_S21_PDP', 'N21_S15_PDP', 'N15_S15_PDP', 'N9_S15_PDP', 'N3_S15_PDP', 'T3_S15_PDP', 
        'T9_S15_PDP', 'T15_S15_PDP', 'T21_S15_PDP', 'N27_S9_PDP', 'N21_S9_PDP', 'N15_S9_PDP',
        'N9_S9_PDP', 'N3_S9_PDP', 'T3_S9_PDP', 'T9_S9_PDP', 'T15_S9_PDP', 'T21_S9_PDP', 'T27_S9_PDP', 
        'N27_S3_PDP', 'N21_S3_PDP', 'N15_S3_PDP', 'N9_S3_PDP', 'N3_S3_PDP', 'T3_S3_PDP', 
        'T9_S3_PDP', 'T21_S3_PDP', 'T27_S3_PDP', 'N27_I3_PDP', 'N21_I3_PDP', 'N15_I3_PDP',
        'N9_I3_PDP', 'N3_I3_PDP', 'T3_I3_PDP', 'T9_I3_PDP', 'T21_I3_PDP', 'T27_I3_PDP',
        'N27_I9_PDP', 'N21_I9_PDP', 'N15_I9_PDP', 'N9_I9_PDP', 'N3_I9_PDP', 'T3_I9_PDP', 
        'T9_I9_PDP', 'T15_I9_PDP', 'T21_I9_PDP', 'T27_I9_PDP', 'N21_I15_PDP', 'N15_I15_PDP',
        'N9_I15_PDP', 'N3_I15_PDP', 'T3_I15_PDP', 'T9_I15_PDP', 'T15_I15_PDP', 'T21_I15_PDP',
        'N15_I21_PDP', 'N9_I21_PDP', 'N3_I21_PDP', 'T3_I21_PDP', 'T9_I21_PDP', 'T15_I21_PDP',
        'N9_I27_PDP', 'N3_I27_PDP', 'T3_I27_PDP', 'T9_I27_PDP', 'cAutoQCStatus', 'QCFieldUsable',
        'QCReliable', 'cQCFN33Status', 'cQCAHSManualStatus', 'cQCRimArtifactStatus',
        'cQCInattentionStatus', 'cQCLearningEffectStatus', 'cQCFatigueStatus', 'cQCFixationStatus', 
        'cQCOtherDefectStatus', 'cQCUnreliableByTechnicianStatus', 'cQCUnaccPupilSizeStatus','VFI',
        'kPrevUsable_ExamTimeStamp', 'aeExamTimeStamp', 'kNextUsable_ExamTimeStamp', 
        'kPrevUsable_FLAGAssessment', 'kNextUsable_FLAGAssessment', 
        'cFLAG_Confirmation_Status_ByTestType', 'cIsABNORMAL_FLAG_Confirmed_ByTestType',
        'cIsNORMAL_FLAG_Confirmed_ByTestType', 'cCnt_TDP_LessThan5', 'cCnt_PDP_LessThan5',
        'cCnt_TDP_LessThan2', 'cCnt_PDP_LessThan2', 'cCnt_TDP_LessThan1', 'cCnt_PDP_LessThan1',
        'cCnt_TDP_LessThan05', 'cCnt_PDP_LessThan05', 'kUsedADAGESBL09','sFLAGAbn3ConsecConfirmed',
        'sFLAGAbn3ConsecUnconfirmed', 'sFLAGNorm3ConsecConfirmed', 'sFLAGNorm3ConsecUnconfirmed',
        'LowPatientReliabilityStatus']

#### Functions for PDF Querying

#### `overlaps_bbox`

In [6]:
def overlaps_bbox(pdf, bbox):
    text_query = pdf.pq(f'''LTTextLineHorizontal:overlaps_bbox("{','.join([str(int(coord)) for coord in bbox])}")''')
    text_query = [item.layout.get_text() for item in text_query]
    return text_query

#### `contains_query`

In [7]:
def contains_query(pdf, pattern):
    text_query = pdf.pq(f'''LTTextLineHorizontal:contains("{pattern}")''')
    return text_query[0].layout.get_text().strip()

#### `key_value_output`

Takes in a pdf object and converts the main text fields to key value pairs. These key value pairs are stored in a dictionary for further querying and cleaning

In [8]:
def key_value_output(pdf):
    
    # Empty dictionary to be populated
    info = {}
    
    # Patient Info
    p_pats = ["Patient", "Date of Birth", "Gender", "Patient ID"]
    p_info = [contains_query(pdf, pat).split(':') for pat in p_pats]
    p_info = dict(map(lambda x: [x[0], x[1].strip()], p_info))
    
    info.update(p_info)
    
    # Eye Info
    e_pats = ["Fixation Monitor", "Fixation Target", "Fixation Losses", "False POS Errors",
              "False NEG Errors", "Test Duration", "Stimulus", "Background", "Strategy",
              "Pupil Diameter", "Visual Acuity", "Rx", "Date:", "Time", "Age", "Fovea"]
    
    for i, x in enumerate(e_pats):
        
        bbox = list(pdf.pq(f'LTTextLineHorizontal:contains("{x}")')[0].layout.bbox)
        bbox[2] += 100
        temp = overlaps_bbox(pdf, bbox)
        
        if len(temp) == 1:
            temp = temp[0].split(":")
            
        if not temp[0].startswith(x):
            temp[0], temp[1] = temp[1], temp[0]

        try:
            key = temp[0].strip().strip(":")
            val = temp[1].strip()

        except IndexError:
            key = temp[0].strip().strip(":")
            val = ""

        info[key] = val 
        
    # Other info
    o_pats = ["GHT", "VFI", "MD", "PSD"]
    
    for i, x in enumerate(o_pats):
        
        bbox = list(pdf.pq(f'LTTextLineHorizontal:contains("{x}")')[0].layout.bbox)
        bbox[2] += 100
        temp = overlaps_bbox(pdf, bbox)
        
        if len(temp) == 1:
            temp = temp[0].split(":")
            
        if not temp[0].startswith(x):
            temp[0], temp[1] = temp[1], temp[0]
        
        try:
            val = temp[1].strip()

        except IndexError:
            val = ""

        info[x] = val 
        
    try:
        info["Reliability"] = contains_query(pdf, '***').strip("***").strip()
    except:
        info["Reliability"] = ""
        
    info["Eye"] = pdf.pq('LTTextLineHorizontal:contains("Single Field Analysis")')[0]\
    .layout.get_text()[:2]
    
    # Footer
    footer_bbox = list(pdf.pq(f'LTTextLineHorizontal:contains("Version")')[0].layout.bbox)
    footer_bbox[0] = 0
    footer_bbox[2] = 560
    info["Instrument"], info["Version"], info["Created"] = overlaps_bbox(pdf, footer_bbox)[:-1]
    
    info["Test Pattern"] = contains_query(pdf, "Threshold Test")
    
    return info

#### `clean_output`

Formats values of a dictionary produced by `key_value_output`. Stores the newly cleaned values in a DataFrame.

In [9]:
def clean_output(info, output):
    
    # Save the keys in a Series
    KEYS = pd.Series(info.keys())
    
    # Important Dates
    DOB = pd.to_datetime(info["Date of Birth"])
    VIS = pd.to_datetime(info["Date"])
    
    # Patient Info
    output["PatientID"]  = info["Patient ID"]
    
    try:
        output["GIVEN_NAME"] = info["Patient"].split(",")[1]
    except IndexError:
        output["GIVEN_NAME"] = ""
        
    output["LAST_NAME"]  = info["Patient"].strip(",")
    output["aeDOB"] = str(DOB.month) + "/" + str(DOB.day) + "/" + str(DOB.year) #[-2:]
    
    # Visit Info
    output["aeExamTime"] = info["Time"].split()[0]
    output["aeExamDate"] = str(VIS.month) + "/" + str(VIS.day) + "/" + str(VIS.year) #[-2:]
    output["TestStrategy"] = info["Strategy"]
    output["ExamDuration"] = info["Test Duration"]
    
    # Eye Info
    try:
        output["VAType"] = float(info["Visual Acuity"])
    except:
        pass
    
    MDKEY  = KEYS[KEYS.str.contains("MD").idxmax()]
    PSDKEY = KEYS[KEYS.str.contains("PSD").idxmax()]
    
    # MD and MDProb
    if info[MDKEY].endswith("dB"):
        output["MD"]     = info[MDKEY].strip(" dB")
        output["MDProb"] = "Not Significant"
        
    elif info[MDKEY].endswith("%"):
        output["MD"]     = info[MDKEY].split(" dB ")[0]
        output["MDProb"] = info[MDKEY].split(" dB ")[1]

    
    # PSD and PSDProb
    if info[PSDKEY].endswith("dB"):
        output["PSD"]     = info[PSDKEY].strip(" dB")
        output["PSDProb"] = "Not Significant"
        
    elif info[PSDKEY].endswith("%"):
        output["PSD"]     = info[PSDKEY].split(" dB ")[0]
        output["PSDProb"] = info[PSDKEY].split(" dB ")[1]

    
    # Other Info
    output["FixationTarget"]  = info["Fixation Target"]
    output["FixationMonitor"] = info["Fixation Monitor"]
    output["StimulusSize"]    = info["Stimulus"].split(", ")[0]
    output["StimulusColor"]   = info["Stimulus"].split()[1]
    output["BackgroundColor"] = output["StimulusColor"] + " (" + info["Background"] + ")"
    
    try:
        output["PupilDiameter"]   = info["Pupil Diameter"].split()[0]
    except:
        pass
    
    output["FovealThreshold"] = info["Fovea"].split()[0]
    output["GHTType"]         = info["GHT"]
    output["VFI"]             = info["VFI"].strip("%")
    
    err, trials = info["Fixation Losses"].split()[0].split("/")
    output["aeFixationCheckPercentage"] = round(float(err) / float(trials) * 100, 2)
    output["FalsePositivePercent"] = info["False POS Errors"].strip(" XX").strip("%")
    output["FalseNegativePercent"] = info["False NEG Errors"].strip(" XX").strip("%")
    
    # Populate the data frame with other acquired info
    output["Eye"]              = info["Eye"]
    output["TestType"]         = info["Test Pattern"].split()[2]
    output["TestPattern"]      = info["Test Pattern"].split()[0] + info["Test Pattern"].split()[1]
    
    try:
        output["TrialRXSphereRaw"] = re.findall("(\d+\.\d+)\s?DS", info["Rx"])[0]
    except:
        output["TrialRXSphereRaw"] = ""
        
    try:
        output["TrialRXCylRaw"]    = re.findall("(\d+\.\d+)\s?DC", info["Rx"])[0]
    except:
        output["TrialRXCylRaw"]    = ""
    
    try:
        output["TrialRXAxisRaw"]   = re.findall("X(.*)", info["Rx"])[0]
    except:
        output["TrialRXAxisRaw"]   = ""

    inst_info = info["Instrument"].split()
    
    output["InstrumentModel"]  = inst_info[-1].split("-")[0]
    output["InstrumentSerialNumber"] = inst_info[-1].split("-")[1][:4]
    output["InstrumentSoftwareVersion"] = inst_info[-1].split("/")[-1]
    
    output["LowPatientReliabilityStatus"] = info["Reliability"]
    
    return output

#### `fovea_probability`

In [10]:
def fovea_probability(filename):
    
    # Convert the pdf to an image
    img = np.array(convert_from_path(os.path.join(input_dir, filename))[0])
    sub = img[585:615, 495:525]
    avg = sub.mean()
    
    # Check if probability is needed
    if info["Fovea"] == "Off":
        return ""
    
    # Determine the probability value according to the mean pixel value
    if avg == 255.0:
        return "Not Significant"
    if avg < 125.0:
        return "P < 0.5%"
    if (avg >= 125.0 and avg < 190.0):
        return "P < 1%"
    if (avg >= 190.0 and avg < 225.0):
        return "P < 2%"
    if (avg >= 225.0 and avg < 247.0):
        return "P < 5%"
    if (avg >= 247.0 and avg < 255.0):
        return "Not Significant"       

#### `format_matrix`

Extracts and formats text from a VF plot

In [11]:
def format_matrix(pdf, bbox, prefix = "Thr"):
    
    # Helper function to determine if metadata is inputted correctly
    def value_check(values, bboxes, diff):
    
        # Rows and columns for each bbox
        out  = values
        rows = list(map(lambda x: x[1], bboxes))
        cols = list(map(lambda x: x[0], bboxes))

        levels = [4, 6, 8, 8, 8, 8, 6, 8]

        # Check each row has the proper number of entries
        start = 0
        count = 0
        for i, x in enumerate(levels):

            rblock = rows[start:(start + x)]
            cblock = cols[start:(start + x)]
            check  = pd.Series(rblock).nunique()

            if check != 1:
                idx = start + pd.Series(cblock[:-1]).diff().idxmax()
                out = out[:idx] + [out[idx][0]] + [out[idx][1]] + out[(idx+1):]

                count += 1
                start -= 1

            if len(rblock) != x:
                idx = start + pd.Series(cblock).diff().idxmax()
                out = out[:idx] + [out[idx][0]] + [out[idx][1]] + out[(idx+1):]

                count += 1
                start -= 1

            if count == diff:
                break

            start += x

        return out
    
    # Ordering
    if info["Eye"] == "OS":
        ORDER = ['T9_S21', 'T3_S21', 'N3_S21', 'N9_S21', 'T15_S15',
                 'T9_S15', 'T3_S15', 'N3_S15', 'N9_S15', 'N15_S15',
                 'T21_S9', 'T15_S9', 'T9_S9', 'T3_S9', 'N3_S9','N9_S9',
                 'N15_S9', 'N21_S9', 'T21_S3', 'T15_S3', 'T9_S3','T3_S3',
                 'N3_S3', 'N9_S3', 'N15_S3', 'N21_S3', 'N27_S3', 
                 'T21_I3', 'T15_I3', 'T9_I3', 'T3_I3', 'N3_I3', 'N9_I3',
                 'N15_I3', 'N21_I3', 'N27_I3', 'T21_I9', 'T15_I9',
                 'T9_I9', 'T3_I9', 'N3_I9', 'N9_I9', 'N15_I9', 'N21_I9',
                 'T15_I15', 'T9_I15', 'T3_I15', 'N3_I15', 'N9_I15',
                 'N15_I15', 'T9_I21', 'T3_I21', 'N3_I21', 'N9_I21'
                ]
        
    else:
        ORDER = ['N9_S21', 'N3_S21', 'T3_S21', 'T9_S21', 'N15_S15',
                 'N9_S15', 'N3_S15', 'T3_S15', 'T9_S15', 'T15_S15',
                 'N21_S9', 'N15_S9', 'N9_S9', 'N3_S9', 'T3_S9',
                 'T9_S9', 'T15_S9', 'T21_S9', 'N27_S3', 'N21_S3',
                 'N15_S3', 'N9_S3', 'N3_S3', 'T3_S3', 'T9_S3',
                 'T15_S3', 'T21_S3', 'N27_I3', 'N21_I3', 'N15_I3',
                 'N9_I3', 'N3_I3', 'T3_I3', 'T9_I3', 'T15_I3',
                 'T21_I3', 'N21_I9', 'N15_I9', 'N9_I9', 'N3_I9',
                 'T3_I9', 'T9_I9', 'T15_I9', 'T21_I9', 'N15_I15',
                 'N9_I15', 'N3_I15', 'T3_I15', 'T9_I15', 'T15_I15',
                 'N9_I21', 'N3_I21', 'T3_I21', 'T9_I21']
        
    ORDER.remove("T15_S3")
    ORDER.remove("T15_I3")
    
    mat = pdf.extract([
        ("values", f'LTTextLineHorizontal:in_bbox({bbox})')
    ])
    
    query = mat["values"]
    
    if query[0].layout.get_text() == 'MD Threshold exceeded.\n':
        return output, ORDER
    
    values = []
    bboxes = []
    
    for i, x in enumerate(query):
        values.append(x.layout.get_text().strip())
        bboxes.append(x.layout.bbox + (i,))
        
    bboxes.sort(key = lambda x: (-x[1], x[0]))
    
    new_idx = [x[-1] for x in bboxes]
    values  = list(pd.Series(values)[new_idx])
    values = " ".join(values).split()
    
    # Check for incorrect values
    if len(values) < 52:
        values = value_check(values, bboxes, 52 - len(values))
    
    # Remove blind spots
    if prefix == "Thr":
        if info["Eye"] == "OS":
            blind_spots = [19, 28]
        else:
            blind_spots = [25, 34]

        del values[blind_spots[1]]
        del values[blind_spots[0]]
        
    # Unlist values
    else:
        values = list(map(lambda x: x.strip().split(), values))
        values = sum(values, [])
        
    # Order the plot points    
    tab_order = [x + "_" + prefix for x in ORDER]
    output[tab_order] = values
    
    return output, ORDER

#### `probability_points`

Extracts and converts symbols to their respective probabilities

In [12]:
def probability_points(filename, start1, end1, start2, end2, box_width):
    
    # Convert the pdf to an image
    img = np.array(convert_from_path(os.path.join(input_dir, filename))[0])
    
    # Loop through each box and calculate mean pixel value
    PROBS = []
    for i in np.arange(start1, end1, box_width):
        for j in np.arange(start2, end2, box_width):
            
            point = img[i:(i+box_width), j:(j+box_width)]
            value = point[2:(box_width - 2), 7:(box_width-2)].mean()
            
            # Determine the probability value according to the mean pixel value
            if value == 255.0:
                continue
            if value < 125.0:
                PROBS.append("P < 0.5%")
            if (value >= 125.0 and value < 190.0):
                PROBS.append("P < 1%")
            if (value >= 190.0 and value < 225.0):
                PROBS.append("P < 2%")
            if (value >= 225.0 and value < 247.0):
                PROBS.append("P < 5%")
            if (value >= 247.0 and value < 255.0):
                PROBS.append("Not Significant")
                
    return PROBS

#### Extracting Text from PDFs

In [None]:
CSVS_242 = []
for i, x in enumerate(files):
    
    try:
        # Create an empty DataFrame
        output = pd.DataFrame(columns=cols, index=[0])

        # Load in each pdf
        pdf = pq.PDFQuery(os.path.join(input_dir, x))
        pdf.load()

        # Key value pairs
        info = key_value_output(pdf)

        # Populate output
        output = clean_output(info, output)

        # Plot points
        try:
            output, ORDER = format_matrix(pdf, '"123.612215, 450, 312.100175, 620"')
            output, ORDER = format_matrix(pdf, '"0, 330, 210, 450"', "TD")
            output, ORDER = format_matrix(pdf, '"210, 330, 400, 450"', "PD")
        except:
            # Move the files that throw errors into a different directory
            if not os.path.isdir(os.path.join(input_dir, "error_files")):
                os.makedirs(os.path.join(input_dir, "error_files"))

            shutil.move(os.path.join(input_dir, x), os.path.join(input_dir, "error_files", x))

        # Calculate the probability points for both plots
        if info["Eye"] == "OD":
            TDP = probability_points(x, 1498, 1762, 188, 485, 33)
            PDP = probability_points(x, 1498, 1762, 692, 989, 33)
        else:
            TDP = probability_points(x, 1498, 1762, 221, 518, 33)
            PDP = probability_points(x, 1498, 1762, 725, 1022, 33)

        tdp_order = [x + "_TDP" for x in ORDER]
        pdp_order = [x + "_PDP" for x in ORDER]

        try:
            output[tdp_order] = TDP
        except ValueError:
            pass

        try:
            output[pdp_order] = PDP
        except ValueError:
            pass

        # Fovea probability variable
        output["FovealThresholdProb"] = fovea_probability(x)

        if info["Test Pattern"] == "Central 24-2 Threshold Test":
            CSVS_242.append(output)

        print(str(i + 1) + " PDF(s) processed")
        
    except:
        print("PDF #" + str(i + 1) + " was skipped" + f" ({x})")

1 PDF(s) processed
2 PDF(s) processed
3 PDF(s) processed
4 PDF(s) processed
5 PDF(s) processed
6 PDF(s) processed
7 PDF(s) processed
8 PDF(s) processed
9 PDF(s) processed
10 PDF(s) processed
11 PDF(s) processed
12 PDF(s) processed
13 PDF(s) processed
14 PDF(s) processed
15 PDF(s) processed
16 PDF(s) processed
17 PDF(s) processed
18 PDF(s) processed
19 PDF(s) processed
20 PDF(s) processed
21 PDF(s) processed
22 PDF(s) processed
23 PDF(s) processed
24 PDF(s) processed
25 PDF(s) processed
26 PDF(s) processed
27 PDF(s) processed
28 PDF(s) processed
29 PDF(s) processed
30 PDF(s) processed
31 PDF(s) processed
32 PDF(s) processed
33 PDF(s) processed
34 PDF(s) processed
35 PDF(s) processed
36 PDF(s) processed
37 PDF(s) processed
38 PDF(s) processed
39 PDF(s) processed
40PDF #40 was skipped (VO1055_OS_20181004_SFA_24-2_0809.pdf)
41 PDF(s) processed
42PDF #42 was skipped (VO1505_OS_20181004_SFA_24-2_0816.pdf)
43PDF #43 was skipped (VO0220_OS_20181004_SFA_24-2_0827.pdf)
44 PDF(s) processed


In [None]:
# Save the batch of files as a csv
DF = pd.concat(CSVS_242)
DF.to_csv(os.path.join(output_dir, "24_2.csv"), index = False)