In [None]:
%%javascript
// Ask the notebook front end to restart this notebook's session, passing
// 'policy-toolkit' as the requested kernel name.
// NOTE(review): relies on the global `Jupyter` object, which presumably only
// exists in the classic Notebook UI — confirm before running under JupyterLab.
Jupyter.notebook.session.restart({kernel_name: 'policy-toolkit'})

In [320]:
from PIL import Image
from wand.image import Image as wimage
import os
import io
import pytesseract
import numpy as np
import re

# Location of the PDF handed to Document below; the path is relative, so it is
# resolved against the kernel's current working directory.
filepath = "../data/raw/Forest Conservation and Management Act.pdf"

class Document:
    """OCR pipeline that turns a PDF into a list of sentence-delimited paragraphs.

    The methods are meant to be called in order, each one reading the
    attributes written by the previous stage:

    1. ``get_images``        -> ``self.images``
    2. ``process_images``    -> ``self.text``, ``self.pages``
    3. ``split_lines``       -> ``self.lines``
    4. ``remove_titles``     -> ``self.lines`` (filtered and re-indexed)
    5. ``combine_sentences`` -> ``self.paragraphs``
    """

    # Rasterisation density passed to wand when the PDF is read.
    RESOLUTION = 200
    # Lines longer than this are never treated as titles.
    MAX_TITLE_LENGTH = 50

    def __init__(self, path, titles = None):
        """Remember where the PDF lives.

        Args:
            path: Path of the PDF file to read.
            titles: Stored on the instance as-is; none of the methods in this
                class read it.
        """
        self.path = path
        self.titles = titles

    def get_images(self):
        """Render every page of the PDF to a JPEG-backed PIL image.

        The result is stored in ``self.images``, one entry per page, in page
        order.
        """
        page_images = []
        with wimage(filename=self.path, resolution=self.RESOLUTION) as pdf:
            for page_seq in pdf.sequence:
                # Each per-page wand image owns native resources, so release it
                # as soon as the JPEG bytes have been extracted.
                with wimage(page_seq) as page:
                    jpeg_bytes = page.make_blob(format="jpeg")
                # The in-memory buffer stays referenced by the PIL image, which
                # reads from it when the pixel data is needed.
                page_images.append(Image.open(io.BytesIO(jpeg_bytes)))
        self.images = page_images

    def process_images(self):
        """Run pytesseract on every page image.

        Sets ``self.text`` (numpy array holding one string per page) and
        ``self.pages`` (list of page indices ``0..n-1``).
        """
        self.text = np.array([pytesseract.image_to_string(image) for image in self.images])
        self.pages = list(range(len(self.images)))

    def split_lines(self):
        """Split the page texts into individual lines.

        Sets ``self.lines`` to a dict that maps a running line number
        (contiguous across pages, starting at 0) to ``[line_text, page_index]``.
        """
        self.lines = {}
        line_no = 0
        for page_index in self.pages:
            for line in self.text[page_index].split('\n'):
                self.lines[line_no] = [line, page_index]
                line_no += 1

    def remove_titles(self):
        """Drop lines that look like headings and re-index the remainder.

        A line is removed when all of the following hold:

        * it contains at least one upper-case character,
        * it is at most ``MAX_TITLE_LENGTH`` characters long,
        * the preceding line is empty or ends with a period.

        Expects ``self.lines`` to be keyed ``0..n-1`` (as produced by
        ``split_lines``). Prints the number of removed lines.
        """
        def looks_like_title(text):
            return text.lower() != text and len(text) <= self.MAX_TITLE_LENGTH

        def follows_break(key):
            # NOTE(review): for key 0 the clamp makes the line its own
            # predecessor, so the first line is only removed when it ends with
            # a period itself. Kept as-is; confirm whether that is intended.
            previous = self.lines[max(0, key - 1)][0]
            return previous.endswith('.') or previous == ''

        titles = [key for key in self.lines
                  if looks_like_title(self.lines[key][0]) and follows_break(key)]
        print("Removing {} titles".format(len(titles)))
        for key in titles:
            del self.lines[key]
        # Close the gaps left by the deletions so keys are contiguous again.
        self.lines = dict(enumerate(self.lines.values()))

    def combine_sentences(self):
        """Join consecutive lines into paragraphs that end at a period.

        Lines are accumulated until one ends with ``'.'``; the accumulated
        lines are then joined with single spaces (runs of whitespace are
        collapsed) and appended to ``self.paragraphs``. Text left over after
        the last period is kept as a final paragraph instead of being
        discarded, unless it is whitespace only.
        """
        self.paragraphs = []
        pending = []
        for text, _page in self.lines.values():
            pending.append(text)
            if text.endswith('.'):
                self.paragraphs.append(' '.join(' '.join(pending).split()))
                pending = []
        # Keep any unterminated tail rather than silently losing it.
        leftover = ' '.join(' '.join(pending).split())
        if leftover:
            self.paragraphs.append(leftover)

In [321]:
# Run the OCR pipeline end to end. Order matters: every stage reads the
# attributes that the stage before it stored on the instance.
doc1 = Document(path=filepath)
for stage in (
    doc1.get_images,
    doc1.process_images,
    doc1.split_lines,
    doc1.remove_titles,
    doc1.combine_sentences,
):
    stage()
# TODO: Identify and remove page headers and footers
# TODO: Pass over the paragraphs and join danglers
# TODO: Context-aware spell correction
# TODO: Remove [], ()
# TODO: Export as CSV :)

Removing 447 titles


In [None]:
# Dump every reconstructed paragraph; the extra '\n' argument leaves a blank
# line after each one.
for paragraph in doc1.paragraphs:
    print(paragraph, '\n')