### Line-by-Line Extraction with AWS Textract

In [1]:
import warnings
# Silence every Python warning for the remainder of the kernel session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings raised by the
# libraries imported below; consider narrowing the filter to a specific category.
warnings.filterwarnings('ignore')

In [3]:
import boto3
import pandas as pd
from PIL import Image, ImageDraw

In [36]:
# Load the AWS access keys for Textract from a local CSV file; the first row is
# read when the Textract client is built.
# NOTE(review): this is an absolute, machine-specific path, so the notebook will
# not run on another machine as-is. The execution counter of this cell (36) is
# also higher than that of the cells that consume `keys` — confirm the notebook
# survives Restart Kernel -> Run All.
keys = pd.read_csv('/Users/yiminglin/Documents/Codebase/credentials/textract_accessKeys.csv')

In [6]:
# Module-level Textract client shared by get_text_from_path.
# Credentials are taken from the first row of `keys`, using the
# 'Access key ID' and 'Secret access key' columns; the region is fixed to us-west-1.
client = boto3.client('textract',
                      region_name='us-west-1',
                      aws_access_key_id=keys.iloc[0]['Access key ID'],
                      aws_secret_access_key=keys.iloc[0]['Secret access key']
                     )

In [7]:
def get_img(file_path):
    """Read a file from disk and return its raw contents as a mutable byte buffer.

    Args:
        file_path: Path of the file to read; it is opened in binary mode.

    Returns:
        bytearray: The complete contents of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle deterministically instead of
    # leaving an open file object for the garbage collector to clean up.
    with open(file_path, 'rb') as img_file:
        return bytearray(img_file.read())

In [8]:
def get_text_from_path(file_path):
    """Run Textract text detection on the file stored at ``file_path``.

    The file's bytes are loaded with ``get_img`` and sent inline in the request
    through the module-level ``client``.

    Args:
        file_path: Path of the image file to analyse.

    Returns:
        The value returned by ``client.detect_document_text``; callers in this
        notebook read its ``'Blocks'`` entry.
    """
    document = {'Bytes': get_img(file_path)}
    response = client.detect_document_text(Document=document)
    return response

In [22]:
def get_lines(image, blocks):
    """Collect the LINE blocks of a scanned page together with pixel coordinates.

    Args:
        image: Object exposing ``width`` and ``height``; these scale the
            polygon coordinates of each block.
        blocks: Iterable of block dicts carrying ``'BlockType'`` and, for LINE
            blocks, ``'Text'`` and ``'Geometry'['Polygon']`` (a sequence of
            ``{'X': ..., 'Y': ...}`` points).

    Returns:
        list: One ``[text, [x1, y1, x2, y2]]`` entry per LINE block, in input
        order, where the coordinates come from polygon points 0 and 2.
    """
    img_w, img_h = image.width, image.height
    line_blocks = (blk for blk in blocks if blk['BlockType'] == 'LINE')

    extracted = []
    for blk in line_blocks:
        corners = [
            [pt['X'] * img_w, pt['Y'] * img_h]
            for pt in blk['Geometry']['Polygon']
        ]
        # Points 0 and 2 are presumably opposite corners of the polygon
        # (top-left and bottom-right) — verify against the Textract docs.
        box = corners[0] + corners[2]
        extracted.append([blk['Text'], box])

    return extracted

def get_doc_lines(doc_path, num_pages):
    """Extract every text line from the page images of a document.

    Page images are read from ``doc_path + '<i>.jpg'`` for ``i`` in
    ``range(num_pages)``, i.e. file names are 0-based. Each file path is
    printed before it is processed.

    Args:
        doc_path: Prefix (typically a folder path ending in ``/``) to which the
            0-based page index and ``.jpg`` are appended.
        num_pages: Number of page images to process.

    Returns:
        list: One ``[page, text, x1, y1, x2, y2]`` row per detected line, where
        ``page`` is 1-based and the coordinates are those produced by
        ``get_lines``.
    """
    doc_lines = []
    for page in range(num_pages):
        file_path = doc_path + str(page) + '.jpg'
        print(file_path)
        # Only the image dimensions are needed; the context manager releases
        # the file handle instead of leaving one open per page.
        with Image.open(file_path) as spec_image:
            text = get_text_from_path(file_path)
            lines = get_lines(spec_image, text['Blocks'])
        # Prefix each line with its 1-based page number and flatten the box.
        doc_lines += [[page + 1, phrase] + coords for phrase, coords in lines]

    return doc_lines

In [24]:
from pdf2image import convert_from_path
def pdf_2_image(path, page_num, out_folder):
    """Render the first ``page_num`` pages of a PDF and save them as JPEG files.

    Pages are written to ``out_folder + '<i>.jpg'`` with a 0-based index ``i``,
    which is the naming scheme ``get_doc_lines`` reads.

    Args:
        path: Path of the PDF file to convert.
        page_num: Number of pages to render, counted from the first page.
        out_folder: Prefix (typically a folder path ending in ``/``) for the
            output files; it must already exist.

    Returns:
        list: The rendered page images, in page order.
    """
    images = convert_from_path(path, first_page=1, last_page=page_num)
    # Iterate over what was actually rendered rather than range(page_num), so
    # a PDF with fewer pages than requested does not raise IndexError.
    for i, image in enumerate(images):
        # save() returns None, so its result must not be stored back into the
        # list that is returned to the caller.
        image.save(out_folder + str(i) + '.jpg')
    return images



In [33]:
import os

def create_folder(folder_path):
    """Ensure ``folder_path`` exists and report on stdout what was done.

    Args:
        folder_path: Path to create; missing parent folders are created too.

    Returns:
        None. Prints one message saying whether the path was created or was
        already present.
    """
    # Nothing to create when the path is already there.
    if os.path.exists(folder_path):
        print(f"Folder '{folder_path}' already exists.")
        return

    os.makedirs(folder_path)
    print(f"Folder '{folder_path}' created successfully.")


In [28]:
# Document 1: a UIUC PD use-of-force report (judging by the path).
# NOTE(review): absolute, machine-specific path; in this cell it is only
# referenced by the commented-out conversion call below.
file_path = '/Users/yiminglin/Documents/Codebase/Pdf_reverse/data/raw/complaints & use of force/UIUC PD Use of Force/22-274.releasable.pdf'
file_name = '22-274-releasable'
number_of_pages = 2
out_folder_path = 'images/' + file_name + '/'
# The PDF-to-image conversion is disabled here, so get_doc_lines depends on
# 0.jpg and 1.jpg already being present in out_folder_path from an earlier run.
#pdf_2_image(file_path,number_of_pages,out_folder_path)
doc_lines = get_doc_lines(out_folder_path, number_of_pages)

images/22-274-releasable/0.jpg
images/22-274-releasable/1.jpg


In [29]:
# Tabulate the extracted lines: one row per line with its page number, text and
# the four coordinates returned by get_doc_lines.
doc_lines_df = pd.DataFrame(doc_lines, columns=['Page', 'Phrase', 'x1', 'y1', 'x2', 'y2'])
# Written relative to the current working directory; to_csv is called without
# index=False, so the row index is stored as the first column.
doc_lines_df.to_csv(file_name+'.csv')
doc_lines_df

Unnamed: 0,Page,Phrase,x1,y1,x2,y2
0,1,Report Criteria: Incidents Between: 1/1/2021 A...,102.745377,64.590409,744.831992,93.214014
1,1,Officer Detail Reports #A-12,101.123903,109.581639,460.924653,138.448624
2,1,University of Illinois Police Department,905.795936,105.985259,1287.557272,128.671549
3,1,Page 1 of 27,1951.695256,108.225834,2080.546490,132.466597
4,1,Use Of Force Full Details,770.516073,145.599498,1420.933767,188.797024
...,...,...,...,...,...,...
124,2,Included On Alert: Yes,1842.086684,844.710926,2078.593598,864.656128
125,2,Narrative: Officers responded to 35 E. Green t...,101.521851,890.217033,2066.269888,915.693278
126,2,a felony stop. Occupants of the vehilce were o...,228.044121,918.455783,1744.922647,943.502856
127,2,L.E.A. Data Technologies ADMINISTRATIVE Databa...,101.450550,1599.914634,998.461539,1623.478306


In [32]:
# Document 2: a Connecticut decertified-officers list (judging by the path).
# NOTE(review): absolute, machine-specific path. This cell repeats the same
# convert -> extract -> tabulate -> save steps as the other document cells;
# a shared helper function would remove the duplication.
file_path = '/Users/yiminglin/Documents/Codebase/Pdf_reverse/data/raw/certification/CT/DecertifiedOfficersRev_9622 Emilie Munson.pdf'
file_name = 'Munson'
number_of_pages = 2
out_folder_path = 'images/' + file_name + '/'
# The output folder must exist before the page images are saved into it.
create_folder(out_folder_path)
pdf_2_image(file_path,number_of_pages,out_folder_path)
doc_lines = get_doc_lines(out_folder_path, number_of_pages)
doc_lines_df = pd.DataFrame(doc_lines, columns=['Page', 'Phrase', 'x1', 'y1', 'x2', 'y2'])
# Written relative to the current working directory, with the row index as the
# first column (no index=False).
doc_lines_df.to_csv(file_name+'.csv')
doc_lines_df

images/Munson/0.jpg
images/Munson/1.jpg


Unnamed: 0,Page,Phrase,x1,y1,x2,y2
0,1,LIST OF DECERTIFIED OFFICERS BY,866.398007,68.040251,1333.327603,91.540628
1,1,POLICE OFFICER STANDARDS AND TRAINING COUNCIL,725.428760,112.070008,1473.556948,135.244154
2,1,AND PREDECESSOR,965.933579,155.434877,1232.383430,178.153415
3,1,MUNICIPAL POLICE TRAINING COUNCIL,828.594673,198.248383,1371.728563,221.427393
4,1,YEAR,83.302242,326.838626,150.116825,347.958881
...,...,...,...,...,...,...
219,2,3841,1086.051548,1508.781797,1140.282416,1528.423095
220,2,9/5/2013,1274.698102,1507.317305,1379.332018,1531.779379
221,2,Making False Statements,1657.960129,1508.472037,1938.230121,1532.880509
222,2,Any police officer whose certification is canc...,49.574283,1574.699849,2142.182982,1600.527114


In [34]:
# Document 3: a Vermont certification report (judging by the path).
# NOTE(review): absolute, machine-specific path. This cell repeats the same
# convert -> extract -> tabulate -> save steps as the other document cells;
# a shared helper function would remove the duplication.
file_path = '/Users/yiminglin/Documents/Codebase/Pdf_reverse/data/raw/certification/VT/Invisible Institue Report.pdf'
file_name = 'Invisible'
number_of_pages = 2
out_folder_path = 'images/' + file_name + '/'
# The output folder must exist before the page images are saved into it.
create_folder(out_folder_path)
pdf_2_image(file_path,number_of_pages,out_folder_path)
doc_lines = get_doc_lines(out_folder_path, number_of_pages)
doc_lines_df = pd.DataFrame(doc_lines, columns=['Page', 'Phrase', 'x1', 'y1', 'x2', 'y2'])
# Written relative to the current working directory, with the row index as the
# first column (no index=False).
doc_lines_df.to_csv(file_name+'.csv')
doc_lines_df

Folder 'images/Invisible/' created successfully.
images/Invisible/0.jpg
images/Invisible/1.jpg


Unnamed: 0,Page,Phrase,x1,y1,x2,y2
0,1,"AUG 19,2019",140.644336,115.026231,308.355582,140.449348
1,1,VT Criminal Justice Training Council,556.880283,112.756785,1137.407154,145.492928
2,1,Page: 1,1412.336791,115.316914,1529.738533,143.018369
3,1,07:41AM,140.980352,156.657650,250.595362,178.754665
4,1,Employee History,710.502696,154.773341,989.977294,188.383442
...,...,...,...,...,...,...
118,2,Pos/Rank: Deputy,917.338669,579.752070,1153.798664,607.032335
119,2,Level:,243.406528,615.695715,320.364609,637.510735
120,2,Class: Part-Time,969.726914,614.550883,1187.047082,637.263227
121,2,Shift:,252.347018,650.204337,320.909855,672.631210
