#### Imports

In [1]:
import boto3
from trp import Document

import re
import os
import json
import numpy as np
import pandas as pd
import datetime as dt

#### Interaction with AWS

In [2]:
# Document names
s3BucketName = "textract-console-us-east-1-ac45ba84-46aa-4f6d-a329-c1b5067c0763"
PlaindocumentName = "VO00005.pdf"

In [3]:
# Configuration and client
textractmodule = boto3.client("textract", 'us-east-1')

In [6]:
response = textractmodule.start_document_text_detection(
    DocumentLocation={
        'S3Object': {
            'Bucket': s3BucketName,
            'Name': PlaindocumentName
        }
    })

InvalidS3ObjectException: An error occurred (InvalidS3ObjectException) when calling the StartDocumentTextDetection operation: Unable to get object metadata from S3. Check object key, region and/or access permissions.

In [4]:
# Detect text from document
response = textractmodule.detect_document_text(
    Document={
        'S3Object': {
            'Bucket': s3BucketName,
            'Name': PlaindocumentName
        }
    })

print ('------------- Print Plaintext detected text ------------------------------')
for item in response["Blocks"]:
    if item["BlockType"] == "LINE":
        print ('\033[92m'+item["Text"]+'\033[92m')

InvalidS3ObjectException: An error occurred (InvalidS3ObjectException) when calling the DetectDocumentText operation: Unable to get object metadata from S3. Check object key, region and/or access permissions.

#### Cleaning the Result: Key Value Pairs

In [5]:
fp = "/Users/nicolebrye/Desktop/HGC/Data_Management/pdf-ocr/SD4000s_20220426_24-2"

In [6]:
cols = ["HGCParserVersion", "XMLConversionTimeStamp", "HFA2_SOFTWARE_VERSION", "HFA_XML_VERSION",
        "PATIENT_ID", "FULL_NAME", "GIVEN_NAME", "MIDDLE_NAME", "LAST_NAME", "NAME_PREFIX",
        "NAME_SUFFIX", "BIRTH_DATE", "VISIT_DATE", "STUDY_UID", "SERIES_DATE_TIME", "MODALITY",
        "SITE", "INSTRUMENT_NAME", "INSTRUMENT_MANUFACTURER", "INSTRUMENT_MODEL_NUMBER",
        "INSTRUMENT_SERIAL_NUMBER", "INSTRUMENT_SOFTWARE_VERSION", "DISPLAY_NAME",
        "CLINICAL_NOTES", "EXAM_TIME", "CD_HORIZONTAL", "CD_VERTICAL", "IOP", "TRIAL_RX_SPHERE",
        "TRIAL_RX_CYLINDER", "TRIAL_RX_AXIS", "DISTANCE_RX_SPHERE", "DISTANCE_RX_CYLINDER",
        "DISTANCE_RX_AXIS", "PUPIL_DIAMETER", "PUPIL_DIAMETER_AUTO", "DIAGNOSIS_CODE",
        "PROCEDURE_CODE", "VA", "VA_STRING", "TEST_TYPE", "IMAGE_TYPE", "IMAGE_FILE_NAME",
        "TEST_PATTERN", "TEST_STRATEGY", "STIMULUS_COLOR", "STIMULUS_SIZE", "BACKGROUND_COLOR",
        "EXAM_DURATION", "FIXATION_TARGET", "FIXATION_MONITOR", "BLIND_SPOT_X", "BLIND_SPOT_Y",
        "BLIND_SPOT_STIMULUS_SIZE", "FOVEAL_RESULT", "FOVEAL_THRESHOLD", "CENTRAL_REF_LEVEL",
        "THROWN_OUT_POINTS", "MINIMUM_STIMULUS", "FIELD_SIZE", "LANGUAGE",
        "FALSE_NEGATIVE_METHOD", "FALSE_NEGATIVE_TRIALS", "FALSE_NEGATIVE_ERRORS",
        "FALSE_NEGATIVE_PERCENT", "FALSE_POSITIVE_TRIALS",
        "FALSE_POSITIVE_ERRORS", "FALSE_POSITIVE_PERCENT", "FIXATION_CHECK_TRIALS",
        "FIXATION_CHECK_ERRORS", "QUESTIONS_ASKED", "REFERENCE_TEST_DATE",
        "REFERENCE_TEST_CODE", "SF_STATUS", "SF", "NUM_THRESHOLD_POINTS",
        "N9_S27_Thr", "N3_S27_Thr", "T3_S27_Thr", "T9_S27_Thr", "N15_S21_Thr", "N9_S21_Thr",
        "N3_S21_Thr", "T3_S21_Thr", "T9_S21_Thr", "T15_S21_Thr", "N21_S15_Thr", "N15_S15_Thr",
        "N9_S15_Thr", "N3_S15_Thr", "T3_S15_Thr", "T9_S15_Thr", "T15_S15_Thr", "T21_S15_Thr",
        "N27_S9_Thr", "N21_S9_Thr", "N15_S9_Thr", "N9_S9_Thr", "N3_S9_Thr", "T3_S9_Thr",
        "T9_S9_Thr", "T15_S9_Thr", "T21_S9_Thr", "T27_S9_Thr", "N27_S3_Thr", "N21_S3_Thr",
        "N15_S3_Thr", "N9_S3_Thr", "N3_S3_Thr", "T3_S3_Thr", "T9_S3_Thr", "T15_S3_Thr",
        "T21_S3_Thr", "T27_S3_Thr", "N27_I3_Thr", "N21_I3_Thr", "N15_I3_Thr", "N9_I3_Thr",
        "N3_I3_Thr", "T3_I3_Thr", "T9_I3_Thr", "T15_I3_Thr", "T21_I3_Thr", "T27_I3_Thr",
        "N27_I9_Thr", "N21_I9_Thr", "N15_I9_Thr", "N9_I9_Thr", "N3_I9_Thr", "T3_I9_Thr",
        "T9_I9_Thr", "T15_I9_Thr", "T21_I9_Thr", "T27_I9_Thr", "N21_I15_Thr", "N15_I15_Thr",
        "N9_I15_Thr", "N3_I15_Thr", "T3_I15_Thr", "T9_I15_Thr", "T15_I15_Thr", "T21_I15_Thr",
        "N15_I21_Thr", "N9_I21_Thr", "N3_I21_Thr", "T3_I21_Thr", "T9_I21_Thr", "T15_I21_Thr",
        "N9_I27_Thr", "N3_I27_Thr", "T3_I27_Thr", "T9_I27_Thr",
        "STATPAC_STATUS", "LOW_PATIENT_RELIABILITY_STATUS", "GHT", "MD", "MD_PROBABILITY",
        "PSD", "PSD_PROBABILITY", "CPSD", "CPSD_PROBABILITY", "SF_PROBABILITY", "VFI",
        "FOVEAL_THRESHOLD_PROBABILITY", "NUM_TOTAL_DEV_VALUE_POINTS",
        "N9_S27_TD", "N3_S27_TD", "T3_S27_TD", "T9_S27_TD", "N15_S21_TD", "N9_S21_TD",
        "N3_S21_TD", "T3_S21_TD", "T9_S21_TD", "T15_S21_TD", "N21_S15_TD", "N15_S15_TD",
        "N9_S15_TD", "N3_S15_TD", "T3_S15_TD", "T9_S15_TD", "T15_S15_TD", "T21_S15_TD",
        "N27_S9_TD", "N21_S9_TD", "N15_S9_TD", "N9_S9_TD", "N3_S9_TD", "T3_S9_TD",
        "T9_S9_TD", "T15_S9_TD", "T21_S9_TD", "T27_S9_TD", "N27_S3_TD", "N21_S3_TD",
        "N15_S3_TD", "N9_S3_TD", "N3_S3_TD", "T3_S3_TD", "T9_S3_TD", "T15_S3_TD",
        "T21_S3_TD", "T27_S3_TD", "N27_I3_TD", "N21_I3_TD", "N15_I3_TD", "N9_I3_TD",
        "N3_I3_TD", "T3_I3_TD", "T9_I3_TD", "T15_I3_TD", "T21_I3_TD", "T27_I3_TD",
        "N27_I9_TD", "N21_I9_TD", "N15_I9_TD", "N9_I9_TD", "N3_I9_TD", "T3_I9_TD",
        "T9_I9_TD", "T15_I9_TD", "T21_I9_TD", "T27_I9_TD", "N21_I15_TD", "N15_I15_TD",
        "N9_I15_TD", "N3_I15_TD", "T3_I15_TD", "T9_I15_TD", "T15_I15_TD", "T21_I15_TD",
        "N15_I21_TD", "N9_I21_TD", "N3_I21_TD", "T3_I21_TD", "T9_I21_TD", "T15_I21_TD",
        "N9_I27_TD", "N3_I27_TD", "T3_I27_TD", "T9_I27_TD",
        "NUM_PATTERN_DEV_VALUE_POINTS",
        "N9_S27_PD", "N3_S27_PD", "T3_S27_PD", "T9_S27_PD", "N15_S21_PD", "N9_S21_PD",
        "N3_S21_PD", "T3_S21_PD", "T9_S21_PD", "T15_S21_PD", "N21_S15_PD", "N15_S15_PD",
        "N9_S15_PD", "N3_S15_PD", "T3_S15_PD", "T9_S15_PD", "T15_S15_PD", "T21_S15_PD",
        "N27_S9_PD", "N21_S9_PD", "N15_S9_PD", "N9_S9_PD", "N3_S9_PD", "T3_S9_PD",
        "T9_S9_PD", "T15_S9_PD", "T21_S9_PD", "T27_S9_PD", "N27_S3_PD", "N21_S3_PD",
        "N15_S3_PD", "N9_S3_PD", "N3_S3_PD", "T3_S3_PD", "T9_S3_PD", "T15_S3_PD",
        "T21_S3_PD", "T27_S3_PD", "N27_I3_PD", "N21_I3_PD", "N15_I3_PD", "N9_I3_PD",
        "N3_I3_PD", "T3_I3_PD", "T9_I3_PD", "T15_I3_PD", "T21_I3_PD", "T27_I3_PD",
        "N27_I9_PD", "N21_I9_PD", "N15_I9_PD", "N9_I9_PD", "N3_I9_PD", "T3_I9_PD",
        "T9_I9_PD", "T15_I9_PD", "T21_I9_PD", "T27_I9_PD", "N21_I15_PD", "N15_I15_PD",
        "N9_I15_PD", "N3_I15_PD", "T3_I15_PD", "T9_I15_PD", "T15_I15_PD", "T21_I15_PD",
        "N15_I21_PD", "N9_I21_PD", "N3_I21_PD", "T3_I21_PD", "T9_I21_PD", "T15_I21_PD",
        "N9_I27_PD", "N3_I27_PD", "T3_I27_PD", "T9_I27_PD",
        "NUM_TOTAL_DEV_PROB_POINTS",
        "N9_S27_TDP", "N3_S27_TDP", "T3_S27_TDP", "T9_S27_TDP", "N15_S21_TDP", "N9_S21_TDP",
        "N3_S21_TDP", "T3_S21_TDP", "T9_S21_TDP", "T15_S21_TDP", "N21_S15_TDP", "N15_S15_TDP",
        "N9_S15_TDP", "N3_S15_TDP", "T3_S15_TDP", "T9_S15_TDP", "T15_S15_TDP", "T21_S15_TDP",
        "N27_S9_TDP", "N21_S9_TDP", "N15_S9_TDP", "N9_S9_TDP", "N3_S9_TDP", "T3_S9_TDP",
        "T9_S9_TDP", "T15_S9_TDP", "T21_S9_TDP", "T27_S9_TDP", "N27_S3_TDP", "N21_S3_TDP",
        "N15_S3_TDP", "N9_S3_TDP", "N3_S3_TDP", "T3_S3_TDP", "T9_S3_TDP", "T15_S3_TDP",
        "T21_S3_TDP", "T27_S3_TDP", "N27_I3_TDP", "N21_I3_TDP", "N15_I3_TDP", "N9_I3_TDP",
        "N3_I3_TDP", "T3_I3_TDP", "T9_I3_TDP", "T15_I3_TDP", "T21_I3_TDP", "T27_I3_TDP",
        "N27_I9_TDP", "N21_I9_TDP", "N15_I9_TDP", "N9_I9_TDP", "N3_I9_TDP", "T3_I9_TDP",
        "T9_I9_TDP", "T15_I9_TDP", "T21_I9_TDP", "T27_I9_TDP", "N21_I15_TDP", "N15_I15_TDP",
        "N9_I15_TDP", "N3_I15_TDP", "T3_I15_TDP", "T9_I15_TDP", "T15_I15_TDP", "T21_I15_TDP",
        "N15_I21_TDP", "N9_I21_TDP", "N3_I21_TDP", "T3_I21_TDP", "T9_I21_TDP", "T15_I21_TDP",
        "N9_I27_TDP", "N3_I27_TDP", "T3_I27_TDP", "T9_I27_TDP",
        "NUM_PATTERN_DEV_PROB_POINTS",
        "N9_S27_PDP", "N3_S27_PDP", "T3_S27_PDP", "T9_S27_PDP", "N15_S21_PDP", "N9_S21_PDP",
        "N3_S21_PDP", "T3_S21_PDP", "T9_S21_PDP", "T15_S21_PDP", "N21_S15_PDP", "N15_S15_PDP",
        "N9_S15_PDP", "N3_S15_PDP", "T3_S15_PDP", "T9_S15_PDP", "T15_S15_PDP", "T21_S15_PDP",
        "N27_S9_PDP", "N21_S9_PDP", "N15_S9_PDP", "N9_S9_PDP", "N3_S9_PDP", "T3_S9_PDP",
        "T9_S9_PDP", "T15_S9_PDP", "T21_S9_PDP", "T27_S9_PDP", "N27_S3_PDP", "N21_S3_PDP",
        "N15_S3_PDP", "N9_S3_PDP", "N3_S3_PDP", "T3_S3_PDP", "T9_S3_PDP", "T15_S3_PDP",
        "T21_S3_PDP", "T27_S3_PDP", "N27_I3_PDP", "N21_I3_PDP", "N15_I3_PDP", "N9_I3_PDP",
        "N3_I3_PDP", "T3_I3_PDP", "T9_I3_PDP", "T15_I3_PDP", "T21_I3_PDP", "T27_I3_PDP",
        "N27_I9_PDP", "N21_I9_PDP", "N15_I9_PDP", "N9_I9_PDP", "N3_I9_PDP", "T3_I9_PDP",
        "T9_I9_PDP", "T15_I9_PDP", "T21_I9_PDP", "T27_I9_PDP", "N21_I15_PDP", "N15_I15_PDP",
        "N9_I15_PDP", "N3_I15_PDP", "T3_I15_PDP", "T9_I15_PDP", "T15_I15_PDP", "T21_I15_PDP",
        "N15_I21_PDP", "N9_I21_PDP", "N3_I21_PDP", "T3_I21_PDP", "T9_I21_PDP", "T15_I21_PDP",
        "N9_I27_PDP", "N3_I27_PDP", "T3_I27_PDP", "T9_I27_PDP"]

In [7]:
# Read the csv file of results
result = pd.read_csv(os.path.join(fp, "keyValues.csv"))

# Create a dictionary of the output
keys   = result["key"].str.strip(": ")
values = result["value"].str.strip(", ")

temp = dict(zip(keys, values))

In [8]:
# Initialize pandas DataFrame
output = pd.DataFrame(columns=cols, index=[0])

# Important dates
DOB = pd.to_datetime(temp["Date of Birth"])
VIS = pd.to_datetime(temp["Date"])

# Patient Info
output["PATIENT_ID"] = temp["Patient ID"]
output["FULL_NAME"]  = temp["Patient"]
output["LAST_NAME"]  = temp["Patient"]
output["BIRTH_DATE"] = str(DOB.month) + "/" + str(DOB.day) + "/" + str(DOB.year)[-2:]

# Visit Info
output["EXAM_TIME"]     = temp["Time"].split()[0]
output["VISIT_DATE"]    = str(VIS.month) + "/" + str(VIS.day) + "/" + str(VIS.year)[-2:]
output["TEST_STRATEGY"] = temp["Strategy"]
output["EXAM_DURATION"] = temp["Test Duration"]

# Eye Info
output["VA"] = temp["Visual Acuity"]
output["MD"] = temp["MD"].strip(" dB")
output["PSD"] = temp["PSD"].strip(" dB")

# Other Info
output["FIXATION_TARGET"]  = temp["Fixation Target"]
output["FIXATION_MONITOR"] = temp["Fixation Monitor"]
output["STIMULUS_SIZE"]    = temp["Stimulus"].split(", ")[0]
output["STIMULUS_COLOR"]   = temp["Stimulus"].split()[1]
output["BACKGROUND_COLOR"] = output["STIMULUS_COLOR"] + " (" + temp["Background"] + ")"
output["PUPIL_DIAMETER"]   = temp["Pupil Diameter"].split()[0]
output["FOVEAL_THRESHOLD"] = temp["Fovea"].strip(" dB")
output["GHT"]              = temp["GHT"]
output["VFI"]              = temp["VFI"].strip("%")

output["FIXATION_CHECK_ERRORS"]  = temp["Fixation Losses"].split("/")[0]
output["FIXATION_CHECK_TRIALS"]  = temp["Fixation Losses"].split("/")[1]
output["FALSE_POSITIVE_PERCENT"] = temp["False POS Errors"].strip("%")
output["FALSE_NEGATIVE_PERCENT"] = temp["False NEG Errors"].strip("%")
output["INSTRUMENT_MANUFACTURER"] = "Carl Zeiss Meditec"

#### Cleaning the Result: Raw Text Document

In [9]:
# Read in the raw text document
textfile = os.path.join(fp, "rawText.txt")

with open(textfile) as f:
    lines = f.readlines()

In [10]:
# Find the thr points as well as other important information
THR = []
for i, x in enumerate(lines):
    
    point = re.search("^\d{2}\\n$", x)
    eye   = re.search("(^OD$|^OS$)", x)
    test  = re.search("\s\d{2}-\d\s", x)
    inst  = re.search("HFA", x)
    vers  = re.search("Version", x)
    rx    = re.search("Rx:\s.*", x)
    field = re.search("°", x)
    
    if point:
        THR.append(point.group(0).strip("\n"))
        
    if eye:
        SITE = eye.group(0).strip("\n")
        
    if test:
        words = x.split()
        
        TEST_PATTERN = " ".join([words[0], words[1]])
        TEST_TYPE    = words[2]
        
    if inst:
        inst_info = x.split()
        
        INSTRUMENT_NAME = " ".join([inst_info[0], inst_info[1]])
        INSTRUMENT_MODEL_NUMBER = inst_info[-1].split("-")[0]
        INSTRUMENT_SERIAL_NUMBER = inst_info[-1].split("-")[1][:4]
        INSTRUMENT_SOFTWARE_VERSION = inst_info[-1].split("/")[-1]
        
    if vers:
        VERSION = x.strip("\n").split()[1]  
        
    if rx:
        TRIAL_RX_SPHERE = rx.group(0).split()[1]
        
    if field:
        FIELD_SIZE = x.strip()
        
# Remove age
THR = THR[1:]

In [11]:
# Find the TD and PD Points
TDPD = []
for i, x in enumerate(lines):
    
    point = re.search("(^\-\d\n|^\d\n|^\-\d\s[^A-z]+\n|\d\s[^A-z]+\n)$", x)
    
    if point:
        value = point.group(0).strip("\n").split()
        TDPD  = TDPD + value
           
# Split into TD and PD
TD = TDPD[:4] + TDPD[8:14] + TDPD[20:28] + TDPD[36:44] + TDPD[52:60] + TDPD[68:76] + TDPD[84:90] + TDPD[96:100]
PD = TDPD[4:8] + TDPD[14:20] + TDPD[28:36] + TDPD[44:52] + TDPD[60:68] + TDPD[76:84] + TDPD[90:96] + TDPD[100:]

In [12]:
if SITE == "OS":
    ORDER = ['T9_S21', 'T3_S21', 'N3_S21', 'N9_S21', 'T15_S15',
             'T9_S15', 'T3_S15', 'N3_S15', 'N9_S15', 'N15_S15',
             'T21_S9', 'T15_S9', 'T9_S9', 'T3_S9', 'N3_S9','N9_S9',
             'N15_S9', 'N21_S9', 'T21_S3', 'T15_S3', 'T9_S3','T3_S3',
             'N3_S3', 'N9_S3', 'N15_S3', 'N21_S3', 'N27_S3', 
             'T21_I3', 'T15_I3', 'T9_I3', 'T3_I3', 'N3_I3', 'N9_I3',
             'N15_I3', 'N21_I3', 'N27_I3', 'T21_I9', 'T15_I9',
             'T9_I9', 'T3_I9', 'N3_I9', 'N9_I9', 'N15_I9', 'N21_I9',
             'T15_I15', 'T9_I15', 'T3_I15', 'N3_I15', 'N9_I15',
             'N15_I15', 'T9_I21', 'T3_I21', 'N3_I21', 'N9_I21'
            ]
else:
    ORDER = ['N9_S21', 'N3_S21', 'T3_S21', 'T9_S21', 'N15_S15',
                 'N9_S15', 'N3_S15', 'T3_S15', 'T9_S15', 'T15_S15',
                 'N21_S9', 'N15_S9', 'N9_S9', 'N3_S9', 'T3_S9',
                 'T9_S9', 'T15_S9', 'T21_S9', 'N27_S3', 'N21_S3',
                 'N15_S3', 'N9_S3', 'N3_S3', 'T3_S3', 'T9_S3',
                 'T15_S3', 'T21_S3', 'N27_I3', 'N21_I3', 'N15_I3',
                 'N9_I3', 'N3_I3', 'T3_I3', 'T9_I3', 'T15_I3',
                 'T21_I3', 'N21_I9', 'N15_I9', 'N9_I9', 'N3_I9',
                 'T3_I9', 'T9_I9', 'T15_I9', 'T21_I9', 'N15_I15',
                 'N9_I15', 'N3_I15', 'T3_I15', 'T9_I15', 'T15_I15',
                 'N9_I21', 'N3_I21', 'T3_I21', 'T9_I21']
    
thr_order = [x + "_Thr" for x in ORDER]
td_order  = [x + "_TD" for x in ORDER]
pd_order  = [x + "_PD" for x in ORDER]

td_order.remove("T15_S3_TD")
td_order.remove("T15_I3_TD")

pd_order.remove("T15_S3_PD")
pd_order.remove("T15_I3_PD")

In [13]:
# Populate the data frame with thr points
output["SITE"] = SITE
output["NUM_THRESHOLD_POINTS"] = len(THR)

output[thr_order] = THR

# Populate the data frame with td points
output["NUM_TOTAL_DEV_VALUE_POINTS"] = len(TD)

output[td_order] = TD

# Populate the data frame with pd points
output["NUM_PATTERN_DEV_VALUE_POINTS"] = len(PD)

output[pd_order] = PD

In [14]:
# Populate the data frame with other acquired info
output["TEST_TYPE"]       = TEST_TYPE
output["TEST_PATTERN"]    = TEST_PATTERN
output["TRIAL_RX_SPHERE"] = TRIAL_RX_SPHERE
output["FIELD_SIZE"]      = FIELD_SIZE

output["INSTRUMENT_NAME"] = INSTRUMENT_NAME
output["INSTRUMENT_MODEL_NUMBER"] = INSTRUMENT_MODEL_NUMBER
output["INSTRUMENT_SERIAL_NUMBER"] = INSTRUMENT_SERIAL_NUMBER
output["INSTRUMENT_SOFTWARE_VERSION"] = INSTRUMENT_SOFTWARE_VERSION

#### Write the output to a csv

In [15]:
output.to_csv("test_24-2.csv", index=False)

#### Attempt to read the symbols

In [18]:
import PIL
import pytesseract


In [19]:
img = np.array(PIL.Image.open("/Users/nicolebrye/Desktop/out.png"))

In [39]:
crop1 = img[:50]

In [40]:
text = pytesseract.image_to_string(crop1)

In [52]:
a = pd.DataFrame(img[0])

In [56]:
any(a == 256)

True

In [62]:
all(a[3] == 255)

True