### Textract Optical Character Recognition for PDFs

#### Imports

In [28]:
# AWS imports
import boto3
import time

# Processing imports
import re
import os
import json
import numpy as np
import pandas as pd
import datetime as dt
import shutil

# Image imports
import PIL.Image as Image
from pdf2image import convert_from_path

#### Upload Documents to the AWS Bucket

The cell below may take a few minutes to run. All documents in the specified folder will be uploaded to the cloud.

In [29]:
# Set to the local directory containing the pdfs to process.
# The pdfs are uploaded from here, and the page images used for the
# close-up crops and probability points are rendered from these local copies.
input_dir = "/Users/nicolebrye/Desktop/HGC/Data_Management/pdf-ocr/SD4000s_20220426_24-2"

# Set to the local directory where the resulting csv will be written
output_dir = "/Users/nicolebrye/Desktop/HGC/Data_Management/pdf-ocr/Test_PDFs"

# Set to the name of the AWS S3 bucket the pdfs are uploaded to and read from
s3BucketName = "pdf-ocr-bucket"

# Set to the name of the AWS S3 bucket that receives the cropped
# Total Deviation / Pattern Deviation images when a close-up is needed
s3ImageName  = "pdf-ocr-images"

In [30]:
# Upload the documents to the cloud.
# Only regular files with a .pdf extension are uploaded: sub-directories
# (e.g. an "images" folder left behind by an interrupted run) cannot be
# opened as files, and stray files such as .DS_Store are not documents.
PDFS = [
    name for name in os.listdir(input_dir)
    if name.lower().endswith(".pdf")
    and os.path.isfile(os.path.join(input_dir, name))
]

# Call s3 as a resource
s3 = boto3.resource("s3")

for pdf in PDFS:
    # The context manager closes each file handle once its upload is done
    with open(os.path.join(input_dir, pdf), "rb") as data:
        s3.Bucket(s3BucketName).put_object(Key = pdf, Body = data)

#### Important Global Variables

In [31]:
# Column names of the output table, in output order.
# key_value_output creates its one-row DataFrame with exactly these columns;
# the later steps fill in the per-point columns, whose names are a point label
# followed by a suffix (_Thr, _TD, _PD, _TDP or _PDP).
cols = ['Batch_UID', 'Exam_UID', 'InstrumentModel', 'InstrumentSerialNumber', 
        'InstrumentSoftwareVersion', 'PatientID', 'GIVEN_NAME', 'LAST_NAME', 
        'StudyCode', 'aeDOB', 'Eye', 'SeriesDateTime',
        'aeExamDate', 'aeExamTime', 'ExamDuration', 'aeIsShileyClinicHFAExam', 'aeDIGSTestType',
        'TestType', 'TestPattern', 'TestStrategy', 'StimulusColor', 'StimulusSize', 
        'BackgroundColor', 'FixationTarget', 'FixationMonitor', 'TrialRXSphereRaw', 
        'TrialRXCylRaw', 'TrialRXAxisRaw', 'PupilDiameter', 'VAType', 'BlindSpotX','BlindSpotY',
        'BlindSpotStimulusSize', 'FalseNegativePercent', 'FalsePositivePercent',
        'aeFixationCheckPercentage', 'FovealResult', 'FovealThreshold', 'ClinicalNotes',
        'SFStatus', 'SF', 'SFProb', 'SWAPFTGeneralHeight', 'GHTType', 'MD', 'MDProb', 'PSD',
        'PSDProb', 'CPSD', 'CPSDProb', 'FovealThresholdProb', 'aePDPCen4LT5Count', 
        'aeHasHighRawThreshold', 'FLAGAssessment', 'FLAGSeverity', 'AGISScore', 'AGISNas', 
        'AGISInf', 'AGISSup', 'GHSupThrSum', 'GHSupThrMean', 'GHSupThrStd', 'GHSupTDSum',
        'GHSupTDMean', 'GHSupTDStd', 'GHSupPDSum', 'GHSupPDMean', 'GHSupPDStd', 'GHSupPDCntLT50p',
        'GHSupPDCntLT10p', 'GHSupNasThrSum', 'GHSupNasThrMean', 'GHSupNasThrStd', 'GHSupNasTDSum',
        'GHSupNasTDMean', 'GHSupNasTDStd', 'GHSupNasPDSum', 'GHSupNasPDMean', 'GHSupNasPDStd',
        'GHSupNasPDCntLT50p', 'GHSupNasPDCntLT10p', 'GHInfThrSum', 'GHInfThrMean', 'GHInfThrStd',
        'GHInfTDSum', 'GHInfTDMean', 'GHInfTDStd', 'GHInfPDSum', 'GHInfPDMean', 'GHInfPDStd',
        'GHInfPDCntLT50p', 'GHInfPDCntLT10p', 'GHInfNasThrSum', 'GHInfNasThrMean', 'GHInfNasThrStd',
        'GHInfNasTDSum', 'GHInfNasTDMean', 'GHInfNasTDStd', 'GHInfNasPDSum', 'GHInfNasPDMean',
        'GHInfNasPDStd', 'GHInfNasPDCntLT50p', 'GHInfNasPDCntLT10p', 'GHCentralThrSum',
        'GHCentralThrMean', 'GHCentralThrStd', 'GHCentralTDSum', 'GHCentralTDMean', 'GHCentralTDStd',
        'GHCentralPDSum', 'GHCentralPDMean', 'GHCentralPDStd', 'GHCentralPDCntLT50p', 
        'GHCentralPDCntLT10p', 'GHTemporalThrSum', 'GHTemporalThrMean', 'GHTemporalThrStd',
        'GHTemporalTDSum', 'GHTemporalTDMean', 'GHTemporalTDStd', 'GHTemporalPDSum', 
        'GHTemporalPDMean', 'GHTemporalPDStd', 'GHTemporalPDCntLT50p', 'GHTemporalPDCntLT10p',
        'N9_S27_Thr', 'N3_S27_Thr', 'T3_S27_Thr', 'T9_S27_Thr', 'N15_S21_Thr', 'N9_S21_Thr',
        'N3_S21_Thr', 'T3_S21_Thr', 'T9_S21_Thr', 'T15_S21_Thr', 'N21_S15_Thr', 'N15_S15_Thr', 
        'N9_S15_Thr', 'N3_S15_Thr', 'T3_S15_Thr', 'T9_S15_Thr', 'T15_S15_Thr', 'T21_S15_Thr',
        'N27_S9_Thr', 'N21_S9_Thr', 'N15_S9_Thr', 'N9_S9_Thr', 'N3_S9_Thr', 'T3_S9_Thr', 'T9_S9_Thr',
        'T15_S9_Thr', 'T21_S9_Thr', 'T27_S9_Thr', 'N27_S3_Thr', 'N21_S3_Thr', 'N15_S3_Thr', 
        'N9_S3_Thr',  'N3_S3_Thr', 'T3_S3_Thr', 'T9_S3_Thr', 'T21_S3_Thr', 'T27_S3_Thr', 
        'N27_I3_Thr', 'N21_I3_Thr', 'N15_I3_Thr', 'N9_I3_Thr', 'N3_I3_Thr', 'T3_I3_Thr', 'T9_I3_Thr', 
        'T21_I3_Thr', 'T27_I3_Thr', 'N27_I9_Thr', 'N21_I9_Thr', 'N15_I9_Thr', 'N9_I9_Thr', 
        'N3_I9_Thr', 'T3_I9_Thr', 'T9_I9_Thr', 'T15_I9_Thr', 'T21_I9_Thr', 'T27_I9_Thr', 
        'N21_I15_Thr', 'N15_I15_Thr', 'N9_I15_Thr', 'N3_I15_Thr', 'T3_I15_Thr', 'T9_I15_Thr', 
        'T15_I15_Thr', 'T21_I15_Thr', 'N15_I21_Thr', 'N9_I21_Thr', 'N3_I21_Thr', 'T3_I21_Thr',
        'T9_I21_Thr', 'T15_I21_Thr', 'N9_I27_Thr', 'N3_I27_Thr', 'T3_I27_Thr', 'T9_I27_Thr', 
        'N9_S27_TD', 'N3_S27_TD', 'T3_S27_TD', 'T9_S27_TD', 'N15_S21_TD', 'N9_S21_TD', 'N3_S21_TD', 
        'T3_S21_TD', 'T9_S21_TD', 'T15_S21_TD', 'N21_S15_TD', 'N15_S15_TD', 'N9_S15_TD', 'N3_S15_TD',
        'T3_S15_TD', 'T9_S15_TD', 'T15_S15_TD', 'T21_S15_TD', 'N27_S9_TD', 'N21_S9_TD', 'N15_S9_TD', 
        'N9_S9_TD', 'N3_S9_TD', 'T3_S9_TD', 'T9_S9_TD', 'T15_S9_TD', 'T21_S9_TD', 'T27_S9_TD', 
        'N27_S3_TD', 'N21_S3_TD', 'N15_S3_TD', 'N9_S3_TD', 'N3_S3_TD', 'T3_S3_TD', 'T9_S3_TD', 
        'T21_S3_TD', 'T27_S3_TD', 'N27_I3_TD', 'N21_I3_TD', 'N15_I3_TD', 'N9_I3_TD', 'N3_I3_TD',
        'T3_I3_TD', 'T9_I3_TD', 'T21_I3_TD', 'T27_I3_TD', 'N27_I9_TD', 'N21_I9_TD', 'N15_I9_TD', 
        'N9_I9_TD', 'N3_I9_TD', 'T3_I9_TD', 'T9_I9_TD', 'T15_I9_TD', 'T21_I9_TD', 'T27_I9_TD',
        'N21_I15_TD', 'N15_I15_TD', 'N9_I15_TD', 'N3_I15_TD', 'T3_I15_TD', 'T9_I15_TD', 'T15_I15_TD',
        'T21_I15_TD', 'N15_I21_TD', 'N9_I21_TD', 'N3_I21_TD', 'T3_I21_TD', 'T9_I21_TD', 'T15_I21_TD',
        'N9_I27_TD', 'N3_I27_TD', 'T3_I27_TD', 'T9_I27_TD', 'N9_S27_PD', 'N3_S27_PD', 'T3_S27_PD', 
        'T9_S27_PD', 'N15_S21_PD', 'N9_S21_PD', 'N3_S21_PD', 'T3_S21_PD', 'T9_S21_PD', 'T15_S21_PD',
        'N21_S15_PD', 'N15_S15_PD', 'N9_S15_PD', 'N3_S15_PD', 'T3_S15_PD', 'T9_S15_PD', 'T15_S15_PD',
        'T21_S15_PD', 'N27_S9_PD', 'N21_S9_PD', 'N15_S9_PD', 'N9_S9_PD', 'N3_S9_PD', 'T3_S9_PD', 
        'T9_S9_PD', 'T15_S9_PD', 'T21_S9_PD', 'T27_S9_PD', 'N27_S3_PD', 'N21_S3_PD', 'N15_S3_PD', 
        'N9_S3_PD', 'N3_S3_PD', 'T3_S3_PD', 'T9_S3_PD', 'T21_S3_PD', 'T27_S3_PD', 'N27_I3_PD',
        'N21_I3_PD', 'N15_I3_PD', 'N9_I3_PD', 'N3_I3_PD', 'T3_I3_PD', 'T9_I3_PD', 'T21_I3_PD', 
        'T27_I3_PD', 'N27_I9_PD', 'N21_I9_PD', 'N15_I9_PD', 'N9_I9_PD', 'N3_I9_PD', 'T3_I9_PD', 
        'T9_I9_PD', 'T15_I9_PD', 'T21_I9_PD', 'T27_I9_PD', 'N21_I15_PD', 'N15_I15_PD', 'N9_I15_PD',
        'N3_I15_PD', 'T3_I15_PD', 'T9_I15_PD', 'T15_I15_PD', 'T21_I15_PD', 'N15_I21_PD', 'N9_I21_PD', 
        'N3_I21_PD', 'T3_I21_PD', 'T9_I21_PD', 'T15_I21_PD', 'N9_I27_PD', 'N3_I27_PD', 'T3_I27_PD',
        'T9_I27_PD', 'N9_S27_TDP', 'N3_S27_TDP', 'T3_S27_TDP', 'T9_S27_TDP', 'N15_S21_TDP', 'N9_S21_TDP', 
        'N3_S21_TDP', 'T3_S21_TDP', 'T9_S21_TDP', 'T15_S21_TDP', 'N21_S15_TDP', 'N15_S15_TDP', 
        'N9_S15_TDP', 'N3_S15_TDP', 'T3_S15_TDP', 'T9_S15_TDP', 'T15_S15_TDP', 'T21_S15_TDP', 
        'N27_S9_TDP', 'N21_S9_TDP', 'N15_S9_TDP', 'N9_S9_TDP', 'N3_S9_TDP', 'T3_S9_TDP', 'T9_S9_TDP', 
        'T15_S9_TDP', 'T21_S9_TDP', 'T27_S9_TDP', 'N27_S3_TDP', 'N21_S3_TDP', 'N15_S3_TDP', 'N9_S3_TDP', 
        'N3_S3_TDP', 'T3_S3_TDP', 'T9_S3_TDP', 'T21_S3_TDP', 'T27_S3_TDP', 'N27_I3_TDP', 'N21_I3_TDP',
        'N15_I3_TDP', 'N9_I3_TDP', 'N3_I3_TDP', 'T3_I3_TDP', 'T9_I3_TDP', 'T21_I3_TDP', 'T27_I3_TDP',
        'N27_I9_TDP', 'N21_I9_TDP', 'N15_I9_TDP', 'N9_I9_TDP', 'N3_I9_TDP', 'T3_I9_TDP', 'T9_I9_TDP',
        'T15_I9_TDP', 'T21_I9_TDP', 'T27_I9_TDP', 'N21_I15_TDP', 'N15_I15_TDP', 'N9_I15_TDP', 
        'N3_I15_TDP', 'T3_I15_TDP', 'T9_I15_TDP', 'T15_I15_TDP', 'T21_I15_TDP', 'N15_I21_TDP', 
        'N9_I21_TDP', 'N3_I21_TDP', 'T3_I21_TDP', 'T9_I21_TDP', 'T15_I21_TDP', 'N9_I27_TDP', 
        'N3_I27_TDP', 'T3_I27_TDP', 'T9_I27_TDP', 'N9_S27_PDP', 'N3_S27_PDP', 'T3_S27_PDP', 
        'T9_S27_PDP', 'N15_S21_PDP', 'N9_S21_PDP', 'N3_S21_PDP', 'T3_S21_PDP', 'T9_S21_PDP', 
        'T15_S21_PDP', 'N21_S15_PDP', 'N15_S15_PDP', 'N9_S15_PDP', 'N3_S15_PDP', 'T3_S15_PDP', 
        'T9_S15_PDP', 'T15_S15_PDP', 'T21_S15_PDP', 'N27_S9_PDP', 'N21_S9_PDP', 'N15_S9_PDP',
        'N9_S9_PDP', 'N3_S9_PDP', 'T3_S9_PDP', 'T9_S9_PDP', 'T15_S9_PDP', 'T21_S9_PDP', 'T27_S9_PDP', 
        'N27_S3_PDP', 'N21_S3_PDP', 'N15_S3_PDP', 'N9_S3_PDP', 'N3_S3_PDP', 'T3_S3_PDP', 
        'T9_S3_PDP', 'T21_S3_PDP', 'T27_S3_PDP', 'N27_I3_PDP', 'N21_I3_PDP', 'N15_I3_PDP',
        'N9_I3_PDP', 'N3_I3_PDP', 'T3_I3_PDP', 'T9_I3_PDP', 'T21_I3_PDP', 'T27_I3_PDP',
        'N27_I9_PDP', 'N21_I9_PDP', 'N15_I9_PDP', 'N9_I9_PDP', 'N3_I9_PDP', 'T3_I9_PDP', 
        'T9_I9_PDP', 'T15_I9_PDP', 'T21_I9_PDP', 'T27_I9_PDP', 'N21_I15_PDP', 'N15_I15_PDP',
        'N9_I15_PDP', 'N3_I15_PDP', 'T3_I15_PDP', 'T9_I15_PDP', 'T15_I15_PDP', 'T21_I15_PDP',
        'N15_I21_PDP', 'N9_I21_PDP', 'N3_I21_PDP', 'T3_I21_PDP', 'T9_I21_PDP', 'T15_I21_PDP',
        'N9_I27_PDP', 'N3_I27_PDP', 'T3_I27_PDP', 'T9_I27_PDP', 'cAutoQCStatus', 'QCFieldUsable',
        'QCReliable', 'cQCFN33Status', 'cQCAHSManualStatus', 'cQCRimArtifactStatus',
        'cQCInattentionStatus', 'cQCLearningEffectStatus', 'cQCFatigueStatus', 'cQCFixationStatus', 
        'cQCOtherDefectStatus', 'cQCUnreliableByTechnicianStatus', 'cQCUnaccPupilSizeStatus','VFI',
        'kPrevUsable_ExamTimeStamp', 'aeExamTimeStamp', 'kNextUsable_ExamTimeStamp', 
        'kPrevUsable_FLAGAssessment', 'kNextUsable_FLAGAssessment', 
        'cFLAG_Confirmation_Status_ByTestType', 'cIsABNORMAL_FLAG_Confirmed_ByTestType',
        'cIsNORMAL_FLAG_Confirmed_ByTestType', 'cCnt_TDP_LessThan5', 'cCnt_PDP_LessThan5',
        'cCnt_TDP_LessThan2', 'cCnt_PDP_LessThan2', 'cCnt_TDP_LessThan1', 'cCnt_PDP_LessThan1',
        'cCnt_TDP_LessThan05', 'cCnt_PDP_LessThan05', 'kUsedADAGESBL09','sFLAGAbn3ConsecConfirmed',
        'sFLAGAbn3ConsecUnconfirmed', 'sFLAGNorm3ConsecConfirmed', 'sFLAGNorm3ConsecUnconfirmed',
        'LowPatientReliabilityStatus']

In [32]:
# Handle to the s3 bucket that holds the uploaded pdfs
bucket = s3.Bucket(s3BucketName)

# Collect the key (object name) of every object currently in the bucket
files = [stored_object.key for stored_object in bucket.objects.all()]

#### Extract Text Function

This function takes a PDF or an image from an S3 bucket, and extracts its text using AWS Textract

In [33]:
def extract_text(bucket_name, filename):
    """Run asynchronous Textract text detection on one S3 object.

    Relies on the module-level Textract ``client``, which must have been
    created before this function is called.

    Parameters
    ----------
    bucket_name : str
        Name of the S3 bucket that holds the document.
    filename : str
        Key of the document (pdf or image) inside the bucket.

    Returns
    -------
    list of str
        The text of every LINE block, in the order Textract returns them.

    Raises
    ------
    RuntimeError
        If the Textract job finishes in a state other than SUCCEEDED or
        PARTIAL_SUCCESS.
    """
    response = client.start_document_text_detection(
        DocumentLocation={
            'S3Object': {
                'Bucket': bucket_name,
                'Name': filename
            }
        })

    job_id = response["JobId"]

    # Poll until the OCR job leaves the IN_PROGRESS state
    result = client.get_document_text_detection(JobId=job_id)
    while result["JobStatus"] == "IN_PROGRESS":
        time.sleep(5)
        result = client.get_document_text_detection(JobId=job_id)

    # A failed job carries no "Blocks"; report it instead of a bare KeyError
    if result["JobStatus"] not in ("SUCCEEDED", "PARTIAL_SUCCESS"):
        raise RuntimeError(
            "Textract job for " + str(filename) + " ended with status "
            + str(result["JobStatus"]) + ": "
            + str(result.get("StatusMessage", "no status message"))
        )

    # Collect the LINE blocks of every result page; Textract hands back a
    # NextToken whenever more blocks remain to be fetched
    text = []
    while True:
        for block in result.get("Blocks", []):
            if block["BlockType"] == "LINE":
                text.append(block["Text"])

        next_token = result.get("NextToken")
        if not next_token:
            break
        result = client.get_document_text_detection(JobId=job_id,
                                                    NextToken=next_token)

    return text


#### Key Value Function

This function converts the raw text of the PDF into key-value pairs, from which the patient, visit, and test information is extracted. It must be called before the raw text function, which fills in the DataFrame created here.

In [34]:
def _split_value_and_probability(raw):
    """Split an index reading into its (value, probability) parts.

    "-3.45 dB" gives ("-3.45", "Not Significant") and
    "-3.45 dB P < 5%" gives ("-3.45", "P < 5%"). Returns None when the text
    has neither shape.
    """
    if raw.endswith("dB"):
        return raw[:-len("dB")].strip(), "Not Significant"

    if raw.endswith("%"):
        value, separator, probability = raw.partition(" dB ")
        if separator:
            return value, probability

    return None


def key_value_output(text):
    """Build the one-row output DataFrame from the "key: value" lines of a pdf.

    Parameters
    ----------
    text : list of str
        The text lines of one pdf, as returned by extract_text.

    Returns
    -------
    pandas.DataFrame
        A single-row frame whose columns are the module-level ``cols``, with
        the patient, visit and reliability fields filled in.

    Raises
    ------
    KeyError
        If a mandatory key (e.g. "Date of Birth", "Patient ID") is not
        present in the text.
    """
    # Convert text into key value pairs. Only lines with exactly one colon
    # are used; when nothing follows the colon the value is taken from the
    # next line.
    temp = {}
    for i, line in enumerate(text):

        val = line.split(":")
        if len(val) != 2:
            continue

        value = val[1]
        if value == "":
            # A trailing "Key:" on the very last line has no value to borrow
            value = text[i + 1] if i + 1 < len(text) else ""

        temp[val[0].strip()] = value.strip().strip(",")

    # Initialize pandas DataFrame
    output = pd.DataFrame(columns=cols, index=[0])

    # Important dates
    DOB = pd.to_datetime(temp["Date of Birth"])
    VIS = pd.to_datetime(temp["Date"])

    # Patient Info: the name is printed as "LAST, GIVEN"
    output["PatientID"] = temp["Patient ID"]

    name_parts = temp["Patient"].split(",")
    output["LAST_NAME"]  = name_parts[0].strip()
    output["GIVEN_NAME"] = name_parts[1].strip() if len(name_parts) > 1 else ""
    output["aeDOB"] = "{}/{}/{}".format(DOB.month, DOB.day, DOB.year)

    # Visit Info
    output["aeExamTime"] = temp["Time"].split()[0]
    output["aeExamDate"] = "{}/{}/{}".format(VIS.month, VIS.day, VIS.year)
    output["TestStrategy"] = temp["Strategy"]
    output["ExamDuration"] = temp["Test Duration"]

    # Eye Info: left empty when the acuity is missing or not a plain number
    try:
        output["VAType"] = float(temp["Visual Acuity"])
    except (KeyError, ValueError):
        pass

    # MD / PSD and their probabilities. The key is the first one containing
    # the token; the columns stay empty when no such key exists.
    for column in ("MD", "PSD"):
        key = next((k for k in temp if column in k), None)
        if key is None:
            continue

        parsed = _split_value_and_probability(temp[key])
        if parsed is not None:
            output[column]          = parsed[0]
            output[column + "Prob"] = parsed[1]

    # Other Info
    output["FixationTarget"]  = temp["Fixation Target"]
    output["FixationMonitor"] = temp["Fixation Monitor"]
    output["StimulusSize"]    = temp["Stimulus"].split(", ")[0]
    output["StimulusColor"]   = temp["Stimulus"].split()[1]
    output["BackgroundColor"] = output["StimulusColor"] + " (" + temp["Background"] + ")"
    output["PupilDiameter"]   = temp["Pupil Diameter"].split()[0]
    output["FovealThreshold"] = temp["Fovea"].split()[0]
    output["GHTType"]         = temp["GHT"]

    # VFI is optional
    if "VFI" in temp:
        output["VFI"] = temp["VFI"].strip("%")

    # Fixation losses are printed as "errors/trials"; with zero trials there
    # is no percentage to compute, so the column is left empty
    err, trials = temp["Fixation Losses"].split()[0].split("/")
    if float(trials) != 0:
        output["aeFixationCheckPercentage"] = round(float(err) / float(trials) * 100, 2)

    output["FalsePositivePercent"] = temp["False POS Errors"].strip(" XX").strip("%")
    output["FalseNegativePercent"] = temp["False NEG Errors"].strip(" XX").strip("%")

    return output

#### Close-Up Function

This function is used for the Total Deviation and Pattern Deviation points when the full-page AWS OCR is not able to extract them. It crops close-up images of the two plots from the pdf so that the values can be extracted more accurately.

In [35]:
def close_up(bucket_name, filename, mode = 1):
    """Re-read the Total Deviation and Pattern Deviation values from crops.

    The first page of the local pdf is rendered to an image, two regions are
    cropped out of it, the crops are saved under ``input_dir/images``,
    uploaded to the ``s3ImageName`` bucket and passed through extract_text.

    Parameters
    ----------
    bucket_name : str
        Not used; the crops are always uploaded to the ``s3ImageName``
        bucket. Kept so existing callers keep working.
    filename : str
        Name of the pdf inside ``input_dir``.
    mode : int, optional
        Selects one of two crop windows (1 or 2); 2 is a slightly tighter
        window.

    Returns
    -------
    tuple of (list of str, list of str)
        The integer tokens found in the first and second crop.

    Raises
    ------
    ValueError
        If ``mode`` is neither 1 nor 2.
    """
    # Pixel windows (rows, columns) of the two crops on the rendered page
    if mode == 1:
        rows, cols_td, cols_pd = slice(1120, 1390), slice(180, 480), slice(690, 990)
    elif mode == 2:
        rows, cols_td, cols_pd = slice(1130, 1390), slice(190, 480), slice(700, 990)
    else:
        raise ValueError("mode must be 1 or 2, got " + repr(mode))

    # splitext removes only the extension; str.strip(".pdf") would also eat
    # any leading or trailing '.', 'p', 'd' and 'f' characters of the name
    img_fn = os.path.splitext(filename)[0] + ".jpg"
    img_fp = os.path.join(input_dir, "images")

    # Convert the pdf to an image
    path = os.path.join(input_dir, filename)
    img  = np.array(convert_from_path(path)[0])

    # Create a new directory to store the images if it doesn't exist
    os.makedirs(img_fp, exist_ok = True)

    # Save each crop locally, then upload it to the s3 bucket
    image_bucket = s3.Bucket(s3ImageName)
    td_key = "TD_" + img_fn
    pd_key = "PD_" + img_fn

    for key, crop in ((td_key, img[rows, cols_td]), (pd_key, img[rows, cols_pd])):
        local_path = os.path.join(img_fp, key)
        Image.fromarray(crop).save(local_path)

        with open(local_path, "rb") as data:
            image_bucket.put_object(Key = key, Body = data)

    # Extract text from the images
    text1 = extract_text(s3ImageName, td_key)
    text2 = extract_text(s3ImageName, pd_key)

    # Define a helper function to clean the values
    def _clean_points(values):
        """Return every (optionally negative) integer token, in reading order."""
        return [token
                for line in values
                for token in re.findall(r"(\d+|\-\d+)", line)]

    TD = _clean_points(text1)
    PD = _clean_points(text2)

    return TD, PD


#### Raw Text Function

This function takes an output DataFrame provided by the key value function and a list of text from the pdf. This function analyzes the raw text to extract values from the pdf.

In [42]:
def raw_text_output(output, text, filename):
    """Fill ``output`` with the values read from the raw text lines of one pdf.

    Parameters
    ----------
    output : pandas.DataFrame
        The one-row frame produced by key_value_output; it is modified in
        place and also returned.
    text : list of str
        The text lines of the pdf, as returned by extract_text.
    filename : str
        Name of the pdf; passed on to close_up when the deviation values
        cannot be taken from ``text``.

    Returns
    -------
    tuple
        ``(output, TestPattern, ORDER, blind_spots)`` where ``ORDER`` is the
        list of 52 point labels in reading order (the two blind-spot labels
        removed) and ``blind_spots`` holds the two positions that were
        dropped from the threshold values.

    Raises
    ------
    ValueError
        If the eye, test, instrument or Rx line is missing from ``text``, or
        if fewer than 54 threshold values were found.
    """
    # Find the thr points as well as other important information
    POINTS = []

    # Everything below must be found in the text, except the reliability
    # status, which is only printed on some reports
    Eye = TestPattern = TestType = None
    InstrumentModel = InstrumentSerialNumber = InstrumentSoftwareVersion = None
    TrialRXSphereRaw = TrialRXCylRaw = TrialRXAxisRaw = None
    LowPatientReliabilityStatus = None

    for x in text:

        # Use regexes to extract the necessary information
        point  = re.search(r"^\-?\<?\d?[\d?\s?\-?]*\d$", x)
        eye    = re.search(r"(^OD$|^OS$)", x)
        test   = re.search(r"\s\d{2}-\d\s", x)
        inst   = re.search("HFA", x)
        rx     = re.search(r"Rx:\s.*", x)
        # [A-Za-z], not [A-z]: the latter also matches [ \ ] ^ _ and `
        status = re.search(r"[A-Za-z\s]+\*{3}", x)

        # Format the information extracted from regexes
        if point:
            POINTS = POINTS + re.findall(r"(<0|\d+|\-\d+)", x)

        if eye:
            Eye = eye.group(0)

        if test:
            words = x.split()

            TestPattern = " ".join([words[0], words[1]])
            TestType    = words[2]

        if inst:
            # The last token has the form model-serial...-.../version
            inst_token = x.split()[-1]

            InstrumentModel = inst_token.split("-")[0]
            InstrumentSerialNumber = inst_token.split("-")[1][:4]
            InstrumentSoftwareVersion = inst_token.split("/")[-1]

        if rx:
            rxval = rx.group(0)

            # A component that is not printed is stored as an empty string.
            # NOTE(review): the patterns capture digits only, so a leading
            # sign of the sphere / cylinder is not kept - confirm intended.
            sphere = re.findall(r"(\d+\.\d+)\s?DS", rxval)
            cyl    = re.findall(r"(\d+\.\d+)\s?DC", rxval)
            axis   = re.findall(r"X(.*)", rxval)

            TrialRXSphereRaw = sphere[0] if sphere else ""
            TrialRXCylRaw    = cyl[0] if cyl else ""
            TrialRXAxisRaw   = axis[0] if axis else ""

        if status:
            LowPatientReliabilityStatus = status.group(0).strip("*").strip()

    # Fail with a readable message instead of an UnboundLocalError
    required = {
        "Eye": Eye,
        "TestPattern": TestPattern,
        "TestType": TestType,
        "InstrumentModel": InstrumentModel,
        "TrialRXSphereRaw": TrialRXSphereRaw,
    }
    missing = [name for name, value in required.items() if value is None]
    if missing:
        raise ValueError("Could not find " + ", ".join(missing)
                         + " in the text of " + str(filename))

    # Order the plot points
    if Eye == "OS":
        ORDER = ['T9_S21', 'T3_S21', 'N3_S21', 'N9_S21', 'T15_S15',
                 'T9_S15', 'T3_S15', 'N3_S15', 'N9_S15', 'N15_S15',
                 'T21_S9', 'T15_S9', 'T9_S9', 'T3_S9', 'N3_S9','N9_S9',
                 'N15_S9', 'N21_S9', 'T21_S3', 'T15_S3', 'T9_S3','T3_S3',
                 'N3_S3', 'N9_S3', 'N15_S3', 'N21_S3', 'N27_S3',
                 'T21_I3', 'T15_I3', 'T9_I3', 'T3_I3', 'N3_I3', 'N9_I3',
                 'N15_I3', 'N21_I3', 'N27_I3', 'T21_I9', 'T15_I9',
                 'T9_I9', 'T3_I9', 'N3_I9', 'N9_I9', 'N15_I9', 'N21_I9',
                 'T15_I15', 'T9_I15', 'T3_I15', 'N3_I15', 'N9_I15',
                 'N15_I15', 'T9_I21', 'T3_I21', 'N3_I21', 'N9_I21'
                ]
        blind_spots = [19, 28]

    else:
        ORDER = ['N9_S21', 'N3_S21', 'T3_S21', 'T9_S21', 'N15_S15',
                 'N9_S15', 'N3_S15', 'T3_S15', 'T9_S15', 'T15_S15',
                 'N21_S9', 'N15_S9', 'N9_S9', 'N3_S9', 'T3_S9',
                 'T9_S9', 'T15_S9', 'T21_S9', 'N27_S3', 'N21_S3',
                 'N15_S3', 'N9_S3', 'N3_S3', 'T3_S3', 'T9_S3',
                 'T15_S3', 'T21_S3', 'N27_I3', 'N21_I3', 'N15_I3',
                 'N9_I3', 'N3_I3', 'T3_I3', 'T9_I3', 'T15_I3',
                 'T21_I3', 'N21_I9', 'N15_I9', 'N9_I9', 'N3_I9',
                 'T3_I9', 'T9_I9', 'T15_I9', 'T21_I9', 'N15_I15',
                 'N9_I15', 'N3_I15', 'T3_I15', 'T9_I15', 'T15_I15',
                 'N9_I21', 'N3_I21', 'T3_I21', 'T9_I21']
        blind_spots = [25, 34]

    # Define a helper function for Total Deviation and Pattern Deviation Points
    def _subset_points(TDPD):
        """Separate the values of the two plots, which are interleaved row by row."""
        TD = TDPD[:4] + TDPD[8:14] + TDPD[20:28] + TDPD[36:44] + TDPD[52:60]\
        + TDPD[68:76] + TDPD[84:90] + TDPD[96:100]
        PD = TDPD[4:8] + TDPD[14:20] + TDPD[28:36] + TDPD[44:52] + TDPD[60:68]\
        + TDPD[76:84] + TDPD[90:96] + TDPD[100:]

        return TD, PD

    # Define a helper function for the 157-value case
    def _is_small_int(value, limit = 3):
        """True when ``value`` is an integer string not larger than ``limit``."""
        try:
            return int(value) <= limit
        except ValueError:
            # e.g. "<0", which is not a deviation value
            return False

    # Remove age and subset the points accordingly
    POINTS = POINTS[1:]

    if len(POINTS) == 158:
        THR  = POINTS[:54]

        TD, PD = _subset_points(POINTS[54:])

        del THR[blind_spots[1]]
        del THR[blind_spots[0]]

    # The length is checked first so that POINTS[53] is known to exist
    elif len(POINTS) == 157 and _is_small_int(POINTS[53]):
        THR  = POINTS[:53]

        TD, PD = _subset_points(POINTS[53:])

        del THR[blind_spots[0]]

    else:
        THR = POINTS[:54]

        if len(THR) != 54:
            raise ValueError("Expected at least 54 threshold values in "
                             + str(filename) + ", found " + str(len(THR)))

        del THR[blind_spots[1]]
        del THR[blind_spots[0]]

        TD, PD = close_up(s3BucketName, filename)

    # If the TD and PD lengths aren't correct, try a different mode
    if (len(TD) != 52) or (len(PD) != 52):
        TD, PD = close_up(s3BucketName, filename, 2)

    ORDER.remove("T15_S3")
    ORDER.remove("T15_I3")

    thr_order = [x + "_Thr" for x in ORDER]
    td_order  = [x + "_TD" for x in ORDER]
    pd_order  = [x + "_PD" for x in ORDER]

    # Populate the data frame with thr points
    output[thr_order] = THR

    # Populate the data frame with td and pd points; when the count is still
    # wrong after both close-up attempts the columns are left empty
    if len(TD) == len(td_order):
        output[td_order] = TD

    if len(PD) == len(pd_order):
        output[pd_order] = PD

    # Populate the data frame with other acquired info
    output["Eye"]              = Eye
    output["TestType"]         = TestType
    output["TestPattern"]      = TestPattern
    output["TrialRXSphereRaw"] = TrialRXSphereRaw
    output["TrialRXCylRaw"]    = TrialRXCylRaw
    output["TrialRXAxisRaw"]   = TrialRXAxisRaw

    output["InstrumentModel"]  = InstrumentModel
    output["InstrumentSerialNumber"] = InstrumentSerialNumber
    output["InstrumentSoftwareVersion"] = InstrumentSoftwareVersion

    if LowPatientReliabilityStatus is not None:
        output["LowPatientReliabilityStatus"] = LowPatientReliabilityStatus

    return output, TestPattern, ORDER, blind_spots

#### Probability Points

The function below uses average pixel values in the pdf to determine the values of the probability points

In [104]:
def probability_points(filename, start1, end1, start2, end2, box_width):
    """Classify each box of a probability plot by its mean pixel value.

    The first page of the pdf ``filename`` (looked up in ``input_dir``) is
    rendered to an image and walked box by box: rows from ``start1`` to
    ``end1`` and columns from ``start2`` to ``end2``, both in steps of
    ``box_width``. A box whose mean is exactly 255.0 is skipped; every other
    box contributes one label.

    Returns the labels as a list of strings, row by row.
    """
    # Exclusive upper bound of the mean pixel value for each label,
    # from the darkest band to the lightest
    bands = (
        (155.0, "P < 0.5%"),
        (190.0, "P < 1%"),
        (225.0, "P < 2%"),
        (250.0, "P < 5%"),
        (255.0, "Not Significant"),
    )

    # Convert the pdf to an image
    page = np.array(convert_from_path(os.path.join(input_dir, filename))[0])

    labels = []
    for top in np.arange(start1, end1, box_width):
        for left in np.arange(start2, end2, box_width):

            box = page[top:(top + box_width), left:(left + box_width)]
            # Only the first and last two rows of the box are left out
            brightness = box[2:(box_width - 2)].mean()

            # The first band the value falls below decides the label;
            # a mean of 255.0 falls below none of them
            for upper, label in bands:
                if brightness < upper:
                    labels.append(label)
                    break

    return labels

#### Extracting information from the pdfs

The code below puts everything together: text is extracted from the pdfs, cleaned, and then formatted as a table.

In [105]:
# Start up the client
client = boto3.client('textract', 'us-east-1')

# One single-row DataFrame per processed "Central 24-2" exam
CSVS_242 = []
for count, pdf_key in enumerate(files, start = 1):

    # Extract text
    text = extract_text(s3BucketName, pdf_key)

    # First round of outputs: the key/value fields
    output = key_value_output(text)

    # Second round of outputs: the values found in the raw text
    output, TestPattern, ORDER, blind_spots = raw_text_output(output, text, pdf_key)

    # Probability points of both plots, stored under the matching columns
    TDP = probability_points(pdf_key, 1498, 1762, 188, 485, 33)
    PDP = probability_points(pdf_key, 1498, 1762, 692, 989, 33)

    tdp_order = [label + "_TDP" for label in ORDER]
    pdp_order = [label + "_PDP" for label in ORDER]

    output[tdp_order] = TDP
    output[pdp_order] = PDP

    # Only exams with the Central 24-2 pattern are collected
    if TestPattern == "Central 24-2":
        CSVS_242.append(output)

    print("{} PDF(s) processed".format(count))

1 PDF(s) processed
2 PDF(s) processed
3 PDF(s) processed


In [106]:
# Save the batch of files as a csv and delete the images folder
if CSVS_242:
    DF = pd.concat(CSVS_242)
    DF.to_csv(os.path.join(output_dir, "24_2.csv"), index = False)
else:
    # pd.concat raises on an empty list, so report the empty batch instead
    print("No Central 24-2 exams were processed; no csv was written")

# The images folder is only created when close_up had to crop a page, so it
# may legitimately not exist
shutil.rmtree(os.path.join(input_dir, "images"), ignore_errors = True)