In [1]:
from milnlp.converters.pdf_to_text import create_text
from milnlp.tokenizers import Tokenizer
from milnlp.converters.text_utils import *

In [2]:
document_text = create_text(r".\example.pdf", Tokenizer("english"), to_file=True)
print(document_text)

Using default NLTK tokenizer not english. Custom language tokenizers not available.
2


In [3]:
lines = process_lines(document_text)
print("Original number of lines: ", len(lines))
lines = remove_short_lines(lines)
print(f"Number of lines with at least 5 chars: ", len(lines))
lines = merge_likely_sentences(lines)
print("Number of lines after merging: ", len(lines))                
lines = split_likely_sentences(lines)
print("Number of lines after splitting: ", len(lines))
lines = remove_unlikely_sentences(lines)
print("Trimmed number of lines: ", len(lines))
lines = remove_nonascii(lines)

Original number of lines:  74
Number of lines with at least 5 chars:  70
Number of lines after merging:  41
Number of lines after splitting:  54
Trimmed number of lines:  37


In [4]:
print('\n'.join(lines))

UAV and aircraft 3D passive tracking
3D TDOA for accurate tracking and geolocation of UAVs and  aircraft over wide areas
TDOA is a well proven technique for 2D geolocation which can be used to provide highly accurate tracking  of RF-emitting targets over wide areas.
The increasingly three-dimensional nature of threats has prompted  a need for a 3D geolocation solution.
The use of UAVs around sites such as airports and secure government  compounds, and the spoofing of aircraft ADSB transmissions, are just two examples of such threats..
In response, CRFS has developed an enhanced TDOA technology that can provide accurate geolocation  of emitters in three dimensions using a ground-based network of nodes.
This network can track multiple  ground-based and airborne transmitters, including aircraft and UAVs, in real time to provide latitude,  longitude, altitude and speed data..
Our 3D TDOA solution also delivers excellent accuracy; we have demonstrated tracking of aircraft with errors  as lo

In [5]:
# Example of adding to a sumyplus document object model
from sumy.models.dom import Sentence, Paragraph, ObjectDocumentModel
from sumy.nlp.tokenizers import Tokenizer

paragraphs = []
paragraph = []
for ii, line in enumerate(lines):
    if line[0] != ' ' and ii>0:  # Last line was the last one in paragraph
        paragraphs.append(Paragraph(paragraph))  # Dump paragraph
        paragraph = [] # start new paragraph going forward
    # Process current line
    paragraph.append(Sentence(line, Tokenizer("english")))
    
document = ObjectDocumentModel(tuple(paragraphs))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zwelz3\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
document

<DOM with 27 paragraphs>

In [7]:
# Example of processing the document words into key phrases
print(len(document.words))
document.words

492


('UAV',
 'and',
 'aircraft',
 'passive',
 'tracking',
 'TDOA',
 'for',
 'accurate',
 'tracking',
 'and',
 'geolocation',
 'of',
 'UAVs',
 'and',
 'aircraft',
 'over',
 'wide',
 'areas',
 'TDOA',
 'is',
 'a',
 'well',
 'proven',
 'technique',
 'for',
 'geolocation',
 'which',
 'can',
 'be',
 'used',
 'to',
 'provide',
 'highly',
 'accurate',
 'tracking',
 'of',
 'targets',
 'over',
 'wide',
 'areas',
 'The',
 'increasingly',
 'nature',
 'of',
 'threats',
 'has',
 'prompted',
 'a',
 'need',
 'for',
 'a',
 'geolocation',
 'solution',
 'The',
 'use',
 'of',
 'UAVs',
 'around',
 'sites',
 'such',
 'as',
 'airports',
 'and',
 'secure',
 'government',
 'compounds',
 'and',
 'the',
 'spoofing',
 'of',
 'aircraft',
 'ADSB',
 'transmissions',
 'are',
 'just',
 'two',
 'examples',
 'of',
 'such',
 'In',
 'response',
 'CRFS',
 'has',
 'developed',
 'an',
 'enhanced',
 'TDOA',
 'technology',
 'that',
 'can',
 'provide',
 'accurate',
 'geolocation',
 'of',
 'emitters',
 'in',
 'three',
 'dimensions'

## Add Key-Phrase Extraction

In [35]:
from milnlp.mining.phrases import extract_candidate_words, score_keyphrases_by_textrank

In [36]:
print("Sentence: ", document.sentences[0]._text)
candidates = extract_candidate_words(document.sentences[0]._text)
print(f"Using example sentence, {len(candidates)} unicode candidates were extracted from the text.")
print(candidates)

Sentence:  UAV and aircraft 3D passive tracking
Using example sentence, 4 unicode candidates were extracted from the text.
['uav', 'aircraft', 'passive', 'tracking']


In [49]:
doc_text = ' '.join([sentence._text for sentence in document.sentences])
candidates = set(extract_candidate_words(doc_text))
print(f"Using textrank, {len(candidates)} unicode candidates were extracted from the text.")

Using textrank, 166 unicode candidates were extracted from the text.


In [54]:
results = score_keyphrases_by_textrank(doc_text, n_keywords=0.6)  # if n < 1 it returns the percentage of results
len(results)

131

In [56]:
dict(results)

{'tdoa': 0.024064045283170334,
 'rfeye': 0.023401737076219318,
 'rfeye nodes': 0.022487303818679853,
 'nodes': 0.02157287056114039,
 'geolocation': 0.018163297195970013,
 'tdoa network': 0.0175653170650293,
 'tdoa software': 0.017318017662085565,
 'aircraft': 0.016473469884194545,
 'rfeye site': 0.015897960163413466,
 'rfeye range': 0.01575783673108655,
 'tdoa solution': 0.015648716661895965,
 'crfs': 0.015579667373858092,
 'rfeye node': 0.015418078161838454,
 'many nodes': 0.014254322296785384,
 'tdoa results': 0.014183123958024887,
 'enhanced tdoa': 0.01404229892858654,
 'spectrum monitoring': 0.013505087392309153,
 'monitoring': 0.013479379188470529,
 'outdoor kit rfeye nodes': 0.01333398685687006,
 'geolocation solution': 0.012698342618295805,
 'emitter geolocation': 0.012668740416176168,
 'accurate geolocation': 0.01253341271883765,
 'cambridge': 0.0125139763738359,
 'wide areas tdoa': 0.012088588025002038,
 'state-of-the-art rfeye site software': 0.01162981275216654,
 'real-time 