# Extract, transform, and load raw PDF data

This notebook seeks to replicate the functionality of the R package `retrieveR` in a Python 3 environment. Subjectively, it performs about 80% as well as its R counterpart; certain types of documents still need additional special-case handling.

**NOTE**: Use the R counterpart for now. This notebook is not production-ready, and other sections of the pipeline use `retrieveR`.

Last Updated: July 23, 2019

In [None]:
%%javascript
// Restart the notebook session, requesting the kernel named 'policy-toolkit'.
Jupyter.notebook.session.restart({kernel_name: 'policy-toolkit'})

In [387]:
from PIL import Image
from wand.image import Image as wimage
import os
import io
import pytesseract
import numpy as np
import re
from itertools import islice
from functools import reduce
import string

# Path to a single PDF in the raw-data directory. None of the code visible in
# this notebook reads this variable.
filepath = "../data/raw/Forest Conservation and Management Act.pdf"

class Document:
    """OCR-based text extraction pipeline for a single PDF file.

    Each method stores its result on the instance for the next step to read,
    so the methods are meant to be called in this order:

    ``get_images`` -> ``process_images`` -> ``split_lines`` ->
    ``remove_titles`` -> ``combine_sentences`` -> ``remove_headers`` ->
    ``clean_data`` -> ``export_data``
    """

    def __init__(self, path, titles = None):
        """Remember which PDF to process.

        Args:
            path: Path to the source PDF file.
            titles: Accepted for backward compatibility; currently unused.
        """
        self.path = path
        self.name = self.path

    def get_images(self):
        """Render every page of the PDF as a PIL image.

        Pages are read at a resolution of 200, converted to JPEG in memory
        and stored, in page order, in ``self.images``.
        """
        page_images = []
        with wimage(filename=self.path, resolution=200) as pdf:
            for page_seq in pdf.sequence:
                # Close each per-page wand image as soon as its JPEG bytes
                # have been copied out, instead of leaving one open native
                # image per page until garbage collection.
                with wimage(page_seq) as page_wand_image:
                    page_jpeg_bytes = page_wand_image.make_blob(format="jpeg")
                page_images.append(Image.open(io.BytesIO(page_jpeg_bytes)))
        self.images = page_images

    def process_images(self):
        """Run OCR on every page image.

        Sets ``self.text`` (numpy array with one OCR string per page) and
        ``self.pages`` (list of page indices ``0 .. len(self.images) - 1``).
        """
        self.text = np.array([pytesseract.image_to_string(x) for x in self.images])
        self.pages = list(range(len(self.images)))

    def split_lines(self):
        """Split the OCR text of every page into individual lines.

        Sets ``self.lines``: a dict mapping a running line number (counted
        across all pages) to ``[line_text, page_number]``.
        """
        self.lines = {}
        pages = [re.split('\n', self.text[i]) for i in self.pages]
        line_id = 0
        for page_number, page in enumerate(pages):
            for line in page:
                self.lines[line_id] = [line, page_number]
                line_id += 1

    def remove_titles(self):
        """Drop lines that look like section titles and renumber the rest.

        A line is treated as a title when it contains at least one
        upper-case character, is at most 50 characters long, and the line
        before it either ends with a full stop or is empty. For line 0 the
        "previous" line is line 0 itself.
        """
        candidates = [
            key for key, (text, _page) in self.lines.items()
            if text.lower() != text and len(text) <= 50
        ]
        titles = []
        for key in candidates:
            previous = self.lines[max(0, key - 1)][0]
            if previous.endswith('.') or previous == '':
                titles.append(key)
        print("Removing {} titles".format(len(titles)))
        # All titles are selected before anything is deleted, so the
        # "previous line" checks above always see the original numbering.
        for key in titles:
            del self.lines[key]
        # Renumber the surviving lines 0..n-1, keeping their order.
        self.lines = dict(enumerate(self.lines.values()))

    def combine_sentences(self):
        """Join consecutive lines into full-stop-terminated paragraphs.

        Every line ending in ``'.'`` closes a paragraph. The lines of a
        paragraph are joined with single spaces and runs of whitespace are
        collapsed. The result is stored in ``self.paragraphs``.

        NOTE: lines that come after the last line ending in ``'.'`` are not
        part of any paragraph and are therefore left out.
        """
        self.paragraphs = []
        # Paragraph boundaries: the start of the text plus the position just
        # after every line that ends a sentence.
        ends = [0] + [i + 1 for i in self.lines if self.lines[i][0].endswith('.')]
        for start, stop in zip(ends, ends[1:]):
            lines = [self.lines.get(x)[0] for x in range(start, stop)]
            self.paragraphs.append(' '.join(' '.join(lines).split()))

    def remove_headers(self):
        """Strip running headers derived from the document's file name.

        The file path is split into words (ignoring ``..``, ``data`` and
        ``raw`` and dropping ``.pdf``). Every run of three or more
        consecutive words, longest runs first, is removed from each
        paragraph. The result is stored in ``self.clean``.
        """
        def _window(seq, n):
            """Yield a sliding window of width ``n`` over ``seq``.

            s -> (s0, s1, ..., s[n-1]), (s1, s2, ..., sn), ...
            """
            it = iter(seq)
            result = tuple(islice(it, n))
            if len(result) == n:
                yield result
            for elem in it:
                result = result[1:] + (elem,)
                yield result

        words = [x.replace(".pdf", "") for x in self.path.replace("/", " ").split(" ")
                 if x not in ["..", "data", "raw"]]

        # Longest phrases first, so a full title is removed before any of
        # its shorter sub-phrases get a chance to match.
        locs = []
        for size in reversed(range(3, len(words) + 1)):
            # Fixed: the helper is named ``_window``; the previous call to
            # ``window`` raised NameError on every invocation.
            for run in _window(words, n = size):
                locs.append(' '.join(run))

        self.clean = [reduce(lambda item, loc: item.replace(loc, ''), locs, item)
                      for item in self.paragraphs]

    def clean_data(self):
        """Normalize the text in ``self.clean`` in place.

        Removes bracketed or parenthesized spans (shortest match, on one
        line), lower-cases the text and strips all ASCII punctuation.
        """
        # Build the translation table once rather than once per paragraph.
        punctuation_table = str.maketrans('', '', string.punctuation)
        self.clean = [
            re.sub(r"[\(\[].*?[\)\]]", "", x).lower().translate(punctuation_table)
            for x in self.clean
        ]

    def export_data(self):
        """Write ``self.clean`` to a text file, one paragraph per line.

        The export path mirrors ``self.path`` with ``raw`` replaced by
        ``processed`` in the directory part and a trailing ``.pdf`` replaced
        by ``.txt``; it is stored in ``self.export_path``. The file name
        itself is left untouched, so a name that happens to contain "raw"
        is not mangled.
        """
        # Split at the last path separator (either style) by hand so the
        # resulting string keeps exactly the separators of the input path.
        cut = max(self.path.rfind("/"), self.path.rfind("\\")) + 1
        directory, filename = self.path[:cut], self.path[cut:]
        if filename.endswith(".pdf"):
            filename = filename[:-len(".pdf")] + ".txt"
        self.export_path = directory.replace("raw", "processed") + filename
        # OCR output may contain non-ASCII characters; an explicit encoding
        # keeps the export independent of the platform default.
        with open(self.export_path, "w", encoding="utf-8") as text_file:
            text_file.writelines(x + '\n' for x in self.clean)

In [388]:
# Names (not full paths) of all entries in the raw-data directory that end
# in ".pdf".
filepaths = [entry for entry in os.listdir("../data/raw/") if entry.endswith(".pdf")]
# Bare expression so the notebook displays the resulting list.
filepaths

['DRAFT ASAL POLICY.pdf',
 'Charcoal Rules.pdf',
 'National Land Policy.pdf',
 'Constitution.pdf',
 'Agriculture Rules.pdf',
 'Community Land Act.pdf',
 'National Climate Responses Strategy.pdf',
 'Agriculture Sectoral Development Strategy.pdf',
 'Forest Conservation and Management Act.pdf',
 'Vision 2030.pdf',
 'Fifth Report to Conference of Parties.pdf',
 'The Environmental Management and Coordination Act.pdf']

In [None]:
# Run the extraction steps, in order, on every file name in `filepaths`,
# printing a progress message after each stage.
for i in filepaths:
    print(i)
    # `filepaths` holds bare file names, so prefix the raw-data directory.
    doc1 = Document(path = "../data/raw/" + i)
    # Render the PDF pages as images, then OCR them.
    doc1.get_images()
    doc1.process_images()
    print("Images processed")
    # Split into lines, drop title-like lines, then merge lines into paragraphs.
    doc1.split_lines()
    doc1.remove_titles()
    doc1.combine_sentences()
    print("Sentences combined")
    # Strip file-name-derived headers, then normalize case and punctuation.
    doc1.remove_headers()
    doc1.clean_data()
    print("Text cleaned")
    # Write the cleaned paragraphs to disk; the method sets `export_path`.
    doc1.export_data()
    print("Exported data to: {}".format(doc1.export_path))