In [1]:
import re
import nltk

from sumy.nlp.tokenizers import Tokenizer

from milnlp.converters.pdf_to_text import PdfConverter, create_text, create_sumy_dom
from milnlp.converters.new_text_utils import RawTextProcessing
from milnlp.collection.utils.qol import doc_to_text

In [2]:
# Tokenizer used by the filtering and DOM cells further down; built first since
# neither this constructor nor the PDF conversion below consumes the other's result.
tokenizer = Tokenizer('English')

# Sample PDF to push through the pipeline. Other samples that can be swapped in:
#   "pdf_examples/powerpoint.pdf"
#   "pdf_examples/glossary.pdf"
#   "pdf_examples/challenge.pdf"
#   "pdf_examples/graphene.pdf"
filepath = "pdf_examples/normal_document.pdf"

# Run the conversion and keep the result in the notebook namespace.
# NOTE(review): to_file=False presumably returns the extracted text in memory
# instead of writing it to disk -- confirm against PdfConverter.convert_pdf.
converter = PdfConverter()
raw_document = converter.convert_pdf(filepath, to_file=False)

In [17]:
import re
import nltk

# Sentence extraction: split the raw converter output into pages -> chunks ->
# sentences, clean each sentence, and keep only those that pass the filters below.
# Results:
#   all_sentences / document_sentences : flat list of every accepted sentence
#   processed_pages                    : pages -> chunks -> sentences (chunks with
#                                        fewer than TUNABLE_MIN_SPC sentences removed)

# Regex Patterns
page_pattern = re.compile(r"\f")  # form feed, treated here as the page delimiter
unicode_pattern = re.compile(r"[^\x00-\x7F]")  # any single non-ASCII character
# Lowercase word (3+ letters) fused with a trailing number / bracketed reference, e.g. "parnel28"
wordref_pattern = re.compile(r"([a-z]{3,})[\[]?[\d]+[,-]?[\d]*[\]]?")  # https://regexr.com/4213c
multispace_pattern = re.compile(r" {2,}")  # runs of spaces left behind once newlines are removed

# Sets for lookup
# Plain string literals without "\(" / "\)": those are invalid escape sequences that
# raise a warning and leave a stray backslash character inside the set.
normal_chars = set(":,.?!-\"'()")
normal_punct = set(".?!\"')")
basic_pos = {'JJ','JJR','JJS','NN','NNP','NNS'}  # these POS alone are not enough for a valid "natural language" sentence

TUNABLE_MAX_NUPW = 0.05  # the maximum number of unicode symbols per words (freq) for a sentence before the sentence is excluded
TUNABLE_MIN_SPC = 2  # the minimum number of sentences per 1 chunk before the chunk is discarded.
# NOTE(review): TUNABLE_MAX_CPU is not referenced anywhere in this cell; presumably it
# is meant to drive the {3,} quantifier in wordref_pattern -- confirm before wiring it in.
TUNABLE_MAX_CPU = 2  # the maximum number of expected characters in any unit (cm^3 = 2) used to discard reference indices (something23)
TUNABLE_MIN_WPS = 6  # the minimum number of nltk tokenized 'words' in a sentence before the sentence is thrown out
TUNABLE_MAX_WPS = 50  # the maximum number of nltk tokenized 'words' in a sentence before the sentence is thrown out

raw_text = raw_document

# Break document into pages.
# Splitting (rather than iterating over the form-feed matches) keeps any text that
# follows the last form feed, and still yields one page when there is no form feed.
pages = page_pattern.split(raw_text)
if not pages[-1].strip():
    # Nothing but whitespace after the final form feed: not a real page.
    pages.pop()

all_sentences = []
processed_pages = []
for page_text in pages:

    # Break page into chunks (blank-line separated)
    processed_chunks = []
    for chunk in page_text.split('\n\n'):
        # Remove newlines from chunk, then collapse every run of spaces to one space.
        # todo this is merging headers into sentences. Replace with intelligent check
        #      ex: normal_document
        chunk_text = multispace_pattern.sub(' ', chunk.replace('\n', ' '))

        # Break chunk into sentences
        processed_sentences = []
        for sentence in nltk.sent_tokenize(chunk_text):

            # Clean up words+ref (i.e. parnel28 -> parnel). A single regex substitution
            # rewrites exactly the matched spans, independent of match order.
            sentence = wordref_pattern.sub(r"\1", sentence)

            # Get words and POS-tagged words for next steps
            words = nltk.word_tokenize(sentence)
            if not words:
                # No tokens: nothing to score, and the ratio below would divide by zero.
                continue
            num_valid_words = sum(tokenizer._is_word(word) for word in words)
            # todo '|' char is being tagged as word instead of symbol (i.e. JJ, NN)
            # todo likely a unicode error (nltk not trained on unicode for english?)
            pos_words = nltk.pos_tag(words)

            # Test number of unicode symbols in the sentence
            nu = len(unicode_pattern.findall(sentence))  # nu = number of unicode chars
            nupw = nu / len(words)
            # Test number of NN+JJ to make sure sentence is 'natural': collect the tags
            # that are neither punctuation tags nor plain noun/adjective tags.
            explanatory_pos = {pos for _, pos in pos_words} - (normal_chars | basic_pos)

            # Add to sentences if passing checks
            passes_checks = (
                nupw <= TUNABLE_MAX_NUPW
                and TUNABLE_MAX_WPS > num_valid_words >= TUNABLE_MIN_WPS
                and explanatory_pos
            )
            if passes_checks:
                # Make sure every kept sentence ends in terminal punctuation.
                if sentence.strip()[-1] not in normal_punct:
                    sentence += '.'
                processed_sentences.append(sentence)
                # NOTE(review): all_sentences receives the sentence even when its chunk
                # is later dropped by the TUNABLE_MIN_SPC check -- confirm this is intended.
                all_sentences.append(sentence)

        # Add to chunks if passing checks
        if len(processed_sentences) >= TUNABLE_MIN_SPC:
            processed_chunks.append(processed_sentences)

    processed_pages.append(processed_chunks)

document_sentences = all_sentences

In [18]:
# Add to DOM (since that is how the tool does it): hand the filtered sentences and
# the tokenizer to create_sumy_dom.
# NOTE(review): presumably this returns a sumy document object -- confirm in
# milnlp.converters.pdf_to_text.
document = create_sumy_dom(document_sentences, tokenizer)

# Run the resulting document through doc_to_text.
# NOTE(review): presumably this flattens the DOM back into plain text -- confirm in
# milnlp.collection.utils.qol.
doc_text = doc_to_text(document)

## Testing

In [19]:
# Echo the list of accepted sentences for manual inspection.
document_sentences

['MOTIVATION The goal is to provide decision makers with an agile and defensible approach to making system acquisition and system employment decisions.',
 'Primarily to allow them to tradeoff potential EW concepts and identify S&T investment strategies.',
 'This proposed implementation is intended to provide decision makers with an ability to interactively assess the value (from a functional and capability standpoint) of aggregations of EW systems and suites for various airframes.',
 'The notional example presented is based on a rapid prototype developed by GTRI to illustrate how such a process could be implemented quickly as an Excel tradeoff tool to evaluate various EW suites for a USAF combat aircraft.',
 'The process and tool could be extended to allow for the comparison of multiple types of aircraft flying independent and joint missions.',
 'The process is intended to integrate the world-view and concerns from two groups of stakeholders, namely (1) the domain subject matter expert