In [1]:
import os
import re
import sys
import glob
import math
import logging
from pathlib import Path
from pprint import pprint

import numpy as np
import scipy as sp
import sklearn

import spacy
import tika
from tika import parser

# Load IPython's autoreload extension and enable mode 2, which reloads imported
# modules before each cell runs so that edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

import matplotlib as mpl
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook's cell output.
%matplotlib inline
# Ask the inline backend to emit figures in the high-resolution 'retina' format.
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
# Configure the whole seaborn theme in a single call.
# `sns.set(...)` always (re)applies a plotting context and defaults to
# "notebook", so a separate `sns.set_context("poster")` issued *before* it is
# silently overridden. Passing `context` and `style` here makes the poster
# context actually take effect, together with the whitegrid style and the
# 16x9 default figure size.
sns.set(context="poster", style="whitegrid", rc={'figure.figsize': (16, 9.)})

import pandas as pd
# Let pandas display up to 120 rows and 120 columns before truncating output.
pd.options.display.max_rows = 120
pd.options.display.max_columns = 120

# Route log records of level INFO and above to stdout.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
)

In [2]:
from wb_nlp.processing import document

In [3]:
## Hints

# nlp = spacy.load('en_core_web_sm')

This notebook contains examples of how the `PDFDoc2Txt` class can be used to convert PDF documents into formatted text. Additional methods implemented in this class can also be applied to raw text extracted from PDFs.

We start by creating an instance of the `PDFDoc2Txt` class, which we name `pdf_parser`.

In [4]:
# Create the parser instance (no constructor arguments) whose methods are
# inspected in the cells below.
pdf_parser = document.PDFDoc2Txt()

# Parsing a PDF file

Parsing a PDF file starts with the `parse` method. This method accepts either a bytes buffer or a string containing a URL or a file path. The `source_type` argument must be set to match the kind of source provided so that the parser can process it correctly.

In [5]:
# Display the signature and source code of `parse` using IPython's `??` introspection.
??pdf_parser.parse

[0;31mSignature:[0m [0mpdf_parser[0m[0;34m.[0m[0mparse[0m[0;34m([0m[0msource[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mbytes[0m[0;34m,[0m [0mstr[0m[0;34m][0m[0;34m,[0m [0msource_type[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'buffer'[0m[0;34m)[0m [0;34m->[0m [0mstr[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
    [0;32mdef[0m [0mparse[0m[0;34m([0m[0mself[0m[0;34m,[0m [0msource[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mbytes[0m[0;34m,[0m [0mstr[0m[0;34m][0m[0;34m,[0m [0msource_type[0m[0;34m:[0m [0mstr[0m[0;34m=[0m[0;34m'buffer'[0m[0;34m)[0m [0;34m->[0m [0mstr[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;34m"""Parse a PDF document to text from different source types.[0m
[0;34m[0m
[0;34m        Args:[0m
[0;34m            source:[0m
[0;34m                Source of the PDF that needs to be converted.[0m
[0;34m                The source could be a url, a path, or a buffer/file-like object[0m
[0;

## Processing a single page

In [6]:
# Display the signature and source code of `process_page` using IPython's `??` introspection.
??pdf_parser.process_page

[0;31mSignature:[0m [0mpdf_parser[0m[0;34m.[0m[0mprocess_page[0m[0;34m([0m[0mpage[0m[0;34m:[0m [0mbs4[0m[0;34m.[0m[0melement[0m[0;34m.[0m[0mTag[0m[0;34m)[0m [0;34m->[0m [0mstr[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mSource:[0m   
    [0;34m@[0m[0mstaticmethod[0m[0;34m[0m
[0;34m[0m    [0;32mdef[0m [0mprocess_page[0m[0;34m([0m[0mpage[0m[0;34m:[0m [0mbs4[0m[0;34m.[0m[0melement[0m[0;34m.[0m[0mTag[0m[0;34m)[0m [0;34m->[0m [0mstr[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0mparagraphs[0m [0;34m=[0m [0;34m[[0m[0;34m][0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m        [0;32mfor[0m [0mp[0m [0;32min[0m [0mpage[0m[0;34m.[0m[0mfind_all[0m[0;34m([0m[0;34m'p'[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m            [0mparagraph[0m [0;34m=[0m [0mPDFDoc2Txt[0m[0;34m.[0m[0mconsolidate_paragraph[0m[0;34m([0m[0mp[0m[0;34m.[0m[0mtext[0m[0;34m)[0m[0;34m

# Paragraph consolidation algorithm

In [7]:
# Display the signature and source code of `consolidate_paragraph` using IPython's `??` introspection.
??pdf_parser.consolidate_paragraph

[0;31mSignature:[0m
[0mpdf_parser[0m[0;34m.[0m[0mconsolidate_paragraph[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtext_paragraph[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_fragment_len[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m3[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0mstr[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
    [0;34m@[0m[0mstaticmethod[0m[0;34m[0m
[0;34m[0m    [0;32mdef[0m [0mconsolidate_paragraph[0m[0;34m([0m[0mtext_paragraph[0m[0;34m:[0m [0mstr[0m[0;34m,[0m [0mmin_fragment_len[0m[0;34m:[0m [0mint[0m[0;34m=[0m[0;36m3[0m[0;34m)[0m [0;34m->[0m [0mstr[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;34m"""Consolidate a `text_paragraph` with possible multiple newlines into one logical paragraph.[0m
[0;34m[0m
[0;34m        Tika provides access to extracted text by paragraph. These paragraphs, however, may contain[0m
[0;34m        multiple newlines that br