In [1]:
import boto3
from collections import Counter
import re
import datetime as dt

In [15]:
# Start up a client instance for textract
client = boto3.client('textract')
# All the sample pathways to test SN generation
paths = ['catalina-4-19-21.jpg', 'catalina-10-5-21.jpg', 'indoor-5-17-21.jpg', 'mini-3-5-2021.jpg',
         'mini-4-22-20.jpg', 'mini-5-31-21.jpg', 'mini-10-5-21.jpg', 'mini-12-21-20.jpg', 'mini-26-4-21.jpg']

In [3]:
# Function to extract text from an image
def analyzeDocument(path):
    with open(path, 'rb') as raw_image: # Open the image from path
        tempImage = raw_image.read() # Read in the image
        byteImage = bytearray(tempImage) # Convert to a byte array
    response = client.detect_document_text(Document = {'Bytes': byteImage}) # Call text detection method
    return response


In [16]:
# Keep only the tabular data
'''
Intuition for the filter
- We know that the SN has to be of a sufficient length (with some buffer in case some characters are filtered out)
- Some queries will include excessive spaces (we know that in theory the SN should be one big blob)
    - Can remove any queries that don't have any digits or alphabetical characters since SN should contain both
'''

# Helper method to make the batch number based off of the text info
def getBatchNumber(tableData):
    formattedDate = '' # Get the formatted date to create the batch number
    lineVal = '' # Get the line value from the textract text blob
    try:
        dateVal = tableData[tableData.index('Date:')+1]
        if '/' in dateVal:
            try:
                formattedDate = dt.datetime.strptime(dateVal, '%m/%d/%Y').strftime('%Y%m%d')
            except Exception as e:
                print('error with date formatting:', e)
        elif '-' in dateVal:
            try:
                formattedDate = dt.datetime.strptime(dateVal, '%Y-%m-%d').strftime('%Y%m%d')
            except Exception as e:
                print('error with date formatting:', e)
        else:
            formattedDate = ''
    except Exception as e:
        pass
        
    try:
        lineVal = tableData[tableData.index('Line:')+1]
    except Exception as e:
        pass
    batchNumber = 'NA' + formattedDate + '-' + lineVal
    return batchNumber

# Helper method to get the device type
def getDeviceType(tableData):
    for s in tableData:
        if 'Blink' in s:
            return s
    return 'Device type not found'

# Helper method to find the "majority" ruling
def mostCommonElement(lst):
    temp = Counter(lst).most_common(1) # Get the most frequent element + its count
    return temp[0][0] # Return the most frequently appearing element

# Entries that are deemed suspicious
def flagEntries(lst):
    prefixes = [s[:7] for s in lst] # Get the first 7 characters
    lengths = [len(s) for s in lst] # Get the length of the sequence
    isDuplicate = [lst.count(s) > 1 for s in lst] # Check to see if they're are duplicates
    allowedChars = set('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-') # Only possible characters in SN
    containsIrreg = [bool(re.search('[^A-za-z0-9-]', s)) for s in lst] # Look for suspicious characters
    mcp = mostCommonElement(prefixes) # Get most common prefix
    mcl = mostCommonElement(lengths) # Get most commonly appearing length
    suspicious = [prefixes[i] != mcp or lengths[i] != mcl or isDuplicate[i] or containsIrreg[i] for i in range(len(lst))] # Corresponds to suspicious entries
    return suspicious

def testRelevant():
    allFormatted = []
    cleaned = []
    allText = []
    batchNumbers = []
    deviceType = []
    for path in paths: # Iterate across all paths
        results = analyzeDocument(path) # Analyze the given image
        blocks = results['Blocks'] # Extract only relevant information
        tableData = [x['Text'] for x in blocks if x['BlockType'] == 'LINE'] # Only keep data that is a "line"
        cleanedData = [] # Use a list b/c algorithm can produce duplicates
        for t in tableData:
            if len(t) > 8 and any(c.isdigit() for c in t) and any(c.isalpha() for c in t) and 'Date' not in t and t.count(' ') <= 2:
                cleanedData.append(t) # Only include non-filtered out data
        batchNumber = getBatchNumber(tableData)
        deviceType = getDeviceType(tableData)
        suspicion = flagEntries(cleanedData)
        #return (allText, cleaned)
        formattedDict = {
            'SerialNumbers': cleanedData,
            'All Text': tableData,
            'Suspicion': suspicion,
            'Batch Number': batchNumber,
            'Device Type': deviceType
        }
        allFormatted.append(formattedDict)
    return allFormatted

allDicts = testRelevant()
for d in allDicts:
    for key in d:
        print(key + ':', d[key], '\n')
    print('\n\n\n')
# print(serialNums[0]) # Sanity Check
    

SerialNumbers: ['G8T1JX00114208BW', 'G8T1JX00114208C4', '8T1JX00114208C5', 'G8T1JX00114208C2', 'G8T1JX00114209JA', 'G8T1JX00114208BF', 'G8T1JX00114208BH', 'G8T1JX00114208BR', 'G8T1JX00114208C6', 'G8T1JX00114208BV', 'G8T1JX00114208BT', 'G8T1JX00114208BD'] 

All Text: ['OR', 'Date:', '4/19/2021', 'Line:', 'A17', 'Shift:', 'Day', 'Blink Catalina Camera US', 'No', 'SN', 'No', 'SN', '1', 'G8T1JX00114208BW', '7', 'G8T1JX00114208C4', '2', '8T1JX00114208C5', '8', 'G8T1JX00114208C2', '3', 'G8T1JX00114209JA', '9', 'G8T1JX00114208BF', '4', 'G8T1JX00114208BH', '10', 'G8T1JX00114208BR', '5', 'G8T1JX00114208C6', '11', 'G8T1JX00114208BV', '6', 'G8T1JX00114208BT', '12', 'G8T1JX00114208BD'] 

Suspicion: [False, False, True, False, False, False, False, False, False, False, False, False] 

Batch Number: NA20210419-A17 

Device Type: Blink Catalina Camera US 





SerialNumbers: ['G8V1GH001095MP09', 'G8V1GH001095MPOA', 'G8V1GH001095MPOR', 'G8V1GH001095MNXU', 'G8V1GH001095MP04', 'G8V1GH001095MPOE', 'G8V1GH

In [38]:
# Check to see what the "sloppy" data looks like
for rawText in allText:
    print(rawText, '\n\n')

['OR', 'Date:', '4/19/2021', 'Line:', 'A17', 'Shift:', 'Day', 'Blink Catalina Camera US', 'No', 'SN', 'No', 'SN', '1', 'G8T1JX00114208BW', '7', 'G8T1JX00114208C4', '2', '8811XX00114208C5', '8', 'G8T1JX00114208C2', '3', 'G8T1JX00114209JA', '9', 'G8T1JX00114208BF', '4', 'G8T1JX00114208BH', '10', 'G8T1JX00114208BR', '5', 'G8T1JX00114208C6', '11', 'G8T1JX00114208BV', '6', 'G8T1JX00114208BT', '12', 'G8T1JX00114208BD'] 


['Date:', '10/5/2021', 'Line:', '3', 'Shift:', 'Day', 'Blink Catalina', 'No', 'SN', 'No', 'SN', 'No', 'SN', 'x', 'G8V1GH001095MP09', 't', 'G8V1GH001095MPOA', 'X', 'G8V1GH001095MPOR', 'G8V1GH001095MNXU', 'G8V1GH001095MP04', 'A', 'G8V1GH001095MPOE', 'G8V1GH001095MP07', 'G8V1GH001095MPOB', 'X', 'G8V1GH001095MP08', 'h', 'G8V1GH001095MNXX', '&', 'G8V1GH001095MP05', 'X', 'G8V1GH001095MNXT'] 


['T', 'Date: 2021-05-17 Line: A9 Shift:', 'Day', 'NA Indoor Camcra', 'No', 'SN', 'No', 'SN', '1', 'G8T1JX0011733TT0', '7', 'G8T1JX0011733TSM', '2', 'G8T1JX0011733TSK', '8', 'G8T1JX0011733TS

In [13]:
# Alternate method that uses the documnet analysis method (for "table" data)
def extractText(path):
    with open(path, 'rb') as raw_image:
        tempImage = raw_image.read()
        byteImage = bytearray(tempImage)
    response = client.analyze_document(Document = {'Bytes': byteImage}, FeatureTypes = ['TABLES'])
    return response

In [15]:
extractText(paths[0])

{'DocumentMetadata': {'Pages': 1},
 'Blocks': [{'BlockType': 'PAGE',
   'Geometry': {'BoundingBox': {'Width': 0.7846524119377136,
     'Height': 0.49072569608688354,
     'Left': 0.0723995491862297,
     'Top': 0.1815500408411026},
    'Polygon': [{'X': 0.0723995491862297, 'Y': 0.1994531750679016},
     {'X': 0.8570519685745239, 'Y': 0.1815500408411026},
     {'X': 0.8486262559890747, 'Y': 0.672275722026825},
     {'X': 0.08544255048036575, 'Y': 0.6709693670272827}]},
   'Id': '5c22d99d-3307-43e0-866e-3888fbdf011f',
   'Relationships': [{'Type': 'CHILD',
     'Ids': ['dceffc17-4bbb-46e8-a3f9-8e9173f7ba2e',
      '7a00a641-1313-4f3f-a6be-d25963048b9e',
      '8c883c07-25d5-4d93-ba97-0d5dbfb7d1d2',
      '6c2f0898-b389-4467-ab93-bdbeb9045f1b',
      'a230fcf6-c821-4fa2-aeee-39a82a3c2724',
      'af74a08f-ecd4-4b5e-8263-38df773d50a4',
      '99a6014c-74fe-4990-a5b9-e4bef1e67541',
      '21ef0744-5dff-4883-9d2a-ebe23b0bd954',
      '86bf2c69-85ea-49e8-bf8b-31282b4a93e2',
      'a83b639e-92