In [1]:
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.summarizers.lsa import LsaSummarizer as Summarizer  # use this summarizer for naive metadata summary
#
from milnlp.tokenizers import Tokenizer
from milnlp.collection.collection import Collection
from milnlp.collection.collection import get_items

# Shared NLP objects for the notebook: stemmer, stop-word-aware summarizer and tokenizer,
# all configured for the same language.
LANGUAGE = "english"
stemmer = Stemmer(LANGUAGE)
summarizer = Summarizer(stemmer)  # `Summarizer` is sumy's LsaSummarizer (see the import alias)
summarizer.stop_words = get_stop_words(LANGUAGE)
token = Tokenizer(LANGUAGE)  # milnlp tokenizer; handed to create_composite_document in later cells
#
# Tuning constants.
# NOTE(review): neither constant is referenced in the cells visible here; the summary cells
# pass a literal 4 as the sentence count -- confirm whether num_sentences was meant to be used.
term_frequency_threshold = 0.5  
num_sentences = 8  # for composite summary

Using default NLTK tokenizer not english. Custom language tokenizers not available.


In [2]:
import os
import re

from itertools import chain

In [3]:
# Root folder of the whole collection, and the sub-folder this notebook queries.
# NOTE(review): both are absolute, machine-specific Windows paths.
collection_path = r"C:\Users\zwelz3\Documents\GTRI_Projects\ECCT_EW_EMS\Market Research"
query_path = r"C:\Users\zwelz3\Documents\GTRI_Projects\ECCT_EW_EMS\Market Research\Concepts, Processes, Approaches\Autonomous Ops\Adaptive Cognitive EW"

# Build the collection on the query folder, then attach the two results of get_items
# (stored as the collection's flist and dlist -- presumably file and directory listings).
cobj = Collection(query_path)
discovered_flist, discovered_dlist = get_items(query_path, set(), {query_path: 0})
cobj.flist = discovered_flist
cobj.dlist = discovered_dlist

-> Discovered file 'AFRL RY Meeting, Cognitive Adaptive EW, 18 Oct 17.pdf'
-> Discovered file 'AFRL RY Meeting, Cognitive Adaptive EW, 18 Oct 17.txt'
-> Discovered file 'Basic Cognitive EW Concepts.pdf'
-> Discovered file 'Basic Cognitive EW Concepts.txt'
-> Discovered file 'Cognitive Electronic Warfare, Adaptive Radars.pdf'
-> Discovered file 'Cognitive Electronic Warfare, Adaptive Radars.txt'
-> Discovered file 'Cognitive EW - Countering Threats Posed By Adaptive Radars.pdf'
-> Discovered file 'Cognitive EW - Countering Threats Posed By Adaptive Radars.txt'
-> Discovered file 'Cognitive EW Explanation.pdf'
-> Discovered file 'Cognitive EW Explanation.txt'
-> Discovered file 'Difference Between Adaptive and Cognitive EW.pdf'
-> Discovered file 'Difference Between Adaptive and Cognitive EW.txt'
-> Discovered file 'Handheld Cognitive EW Technology (BAE).pdf'
-> Discovered file 'Handheld Cognitive EW Technology (BAE).txt'
-> Discovered file 'Lt Col Corbell's Cognitive EW Definition.pdf'


In [4]:
# Split the part of query_path that lies below collection_path into folder levels,
# then keep only the trailing level(s) as the descriptive words for the query.

# The slice below strips len(collection_path) leading characters, so query_path must
# *start with* collection_path -- a substring match elsewhere in the path is not enough.
assert query_path.startswith(collection_path), (
    f"query_path must be located under collection_path: "
    f"{query_path!r} does not start with {collection_path!r}"
)

# Path relative to the collection root, split on the Windows separator; empty pieces
# (e.g. from the leading backslash) are dropped.
relative_query_path = query_path[len(collection_path):]
query_branch = [level for level in relative_query_path.split('\\') if level]

print(f"Number of levels in query path: {len(query_branch)}")
level_thresh = -1  # negative slice start: -1 keeps only the deepest folder level
levels = query_branch[level_thresh:]
print(f"Number of descriptive levels used to build query: {len(levels)}")
print("-> ", levels)

Number of levels in query path: 3
Number of descriptive levels used to build query: 1
->  ['Adaptive Cognitive EW']


In [5]:
# Join the selected folder levels into one space-separated query string.
raw_query_string = ' '.join(levels)

In [6]:
# Worked example of the clean-up applied to query strings:
# first drop parentheses and commas, then collapse " - " into a single space.
example_raw_query_string = re.sub(
    ' - ',
    ' ',
    re.sub(
        '[(),]',
        '',
        'Autonomous Ops Adaptive Cognitive EW (Electronic, Warfare), Capability (Upgradeability - OSA), F-35',
    ),
)
example_raw_query_string

'Autonomous Ops Adaptive Cognitive EW Electronic Warfare Capability Upgradeability OSA F-35'

In [7]:
# Same clean-up as the example cell, applied to the real query string:
# strip parentheses and commas first, then turn " - " into a single space.
raw_query_string = re.sub(
    ' - ',
    ' ',
    re.sub('[(),]', '', raw_query_string),
)
raw_query_string

'Adaptive Cognitive EW'

In [8]:
# Build the query: one single-element tuple per word, each holding a regex that matches
# the word with a non-word character on both sides.
#
# split() (no argument) is used instead of split(' ') so that repeated spaces do not
# produce empty words; an empty word would yield the pattern "[\W][\W]", which matches
# any two adjacent non-word characters rather than a query term.
query_words = raw_query_string.split()

# Raw f-string: "\W" is not a valid string escape, so a non-raw literal triggers an
# invalid-escape warning; the resulting pattern text is unchanged.
# NOTE(review): words are interpolated unescaped, so regex metacharacters in a folder
# name would be interpreted as regex syntax -- confirm whether that is intended.
query = [(rf"[\W]{word}[\W]",) for word in query_words]
query

[('[\\W]Adaptive[\\W]',), ('[\\W]Cognitive[\\W]',), ('[\\W]EW[\\W]',)]

In [9]:
# Run the query against the collection (the call logs "Performing union query...").
# The next cell reads results[0] as a mapping of query term -> mapping keyed by document path.
results = cobj.make_query(query)

Performing union query...
Done!


In [10]:
# Report how many documents each term hit, and collect the union of all hit documents.
hits_by_term = results[0]
unique_docs1 = set()
for term, docs_for_term in hits_by_term.items():
    print("The term", term, "shows up in", len(docs_for_term), "documents.")
    # Iterating the per-term mapping yields its keys (the document paths).
    unique_docs1.update(docs_for_term)

print(f"\nThe unioned terms show up in {len(unique_docs1)} unique documents.")
unique_docs1

The term [\W]Adaptive[\W] shows up in 6 documents.
The term [\W]Cognitive[\W] shows up in 9 documents.
The term [\W]EW[\W] shows up in 9 documents.

The unioned terms show up in 9 unique documents.


{'C:\\Users\\zwelz3\\Documents\\GTRI_Projects\\ECCT_EW_EMS\\Market Research\\Concepts, Processes, Approaches\\Autonomous Ops\\Adaptive Cognitive EW\\AFRL RY Meeting, Cognitive Adaptive EW, 18 Oct 17.txt',
 'C:\\Users\\zwelz3\\Documents\\GTRI_Projects\\ECCT_EW_EMS\\Market Research\\Concepts, Processes, Approaches\\Autonomous Ops\\Adaptive Cognitive EW\\Basic Cognitive EW Concepts.txt',
 'C:\\Users\\zwelz3\\Documents\\GTRI_Projects\\ECCT_EW_EMS\\Market Research\\Concepts, Processes, Approaches\\Autonomous Ops\\Adaptive Cognitive EW\\Cognitive EW - Countering Threats Posed By Adaptive Radars.txt',
 'C:\\Users\\zwelz3\\Documents\\GTRI_Projects\\ECCT_EW_EMS\\Market Research\\Concepts, Processes, Approaches\\Autonomous Ops\\Adaptive Cognitive EW\\Cognitive EW Explanation.txt',
 'C:\\Users\\zwelz3\\Documents\\GTRI_Projects\\ECCT_EW_EMS\\Market Research\\Concepts, Processes, Approaches\\Autonomous Ops\\Adaptive Cognitive EW\\Cognitive Electronic Warfare, Adaptive Radars.txt',
 'C:\\Users\\zwel

In [11]:
# Summarize the query results twice -- once per composite-document method -- and print
# each summary as a bullet list, with a blank gap between the two summaries.
# NOTE(review): the sentence count is the literal 4, not the num_sentences constant.
for run_index, composite_method in enumerate(('full', 'reduced')):
    if run_index > 0:
        # Separator between the first and second summary.
        print('\n\n')

    composite_doc = cobj.create_composite_document(query, token, method=composite_method)
    summary = cobj.summarize_composite(composite_doc, summarizer, 4)
    for sentence in summary:
        print('* ', str(sentence))

Performing union query...
Done!
Creating a composite document using the full constituent documents...
 - Reading in file 1/9
 - Reading in file 2/9
 - Reading in file 3/9
 - Reading in file 4/9
 - Reading in file 5/9
 - Reading in file 6/9
 - Reading in file 7/9
 - Reading in file 8/9
 - Reading in file 9/9
Done!
*  ?0/Scalable, Multi-Protocol ConnectivityCompact Rugged Avionics Interface ComputerScalable, Multi-Protocol Connectivity High Density Computing & Connectivity Intel Atom Architecture  Expanded Scalable Capabilities USB 3.0 SupportSWaP-C Optimized System Rugged Deployable Compact Enclosure Low Power Computing Performance  MIL-STD-810G Shock, Vibration & Immersion / MIL-STD-461F EMIMulti-Protocol Flexibility Ethernet, MIL-STD-1553, ARINC 429/717, CANbus 2.0/ARINC 825,     RS-232/422/485 & Avionics/Digital Discrete I/O 3 modes (Remote Access, Protocol Conversion & Standalone) Expandable: (2) Mini-PCIe sites & (1) I/O Expansion Module53years of serviceTo learn more, visit  a t a

### Make a summary without a query (using the path passed to the collection)

In [12]:
# No query this time: None is passed in place of the query. Per the log output, the
# composite document is then built from all files in the sub-collection.
composite_doc = cobj.create_composite_document(None, token)
composite_doc

Sub-collection:  C:\Users\zwelz3\Documents\GTRI_Projects\ECCT_EW_EMS\Market Research\Concepts, Processes, Approaches\Autonomous Ops\Adaptive Cognitive EW
Creating a composite document using all files in sub-collection...
 - Reading in file 1/9
 - Reading in file 2/9
 - Reading in file 3/9
 - Reading in file 4/9
 - Reading in file 5/9
 - Reading in file 6/9
 - Reading in file 7/9
 - Reading in file 8/9
 - Reading in file 9/9
Done!


<DOM with 153 paragraphs>

In [13]:
# Summarize the query-less composite document and print one bullet per sentence.
# print() applies str() to each argument itself, so no explicit conversion is needed.
summary = cobj.summarize_composite(composite_doc, summarizer, 4)
for sentence in summary:
    print('* ', sentence)

*  ?0/Scalable, Multi-Protocol ConnectivityCompact Rugged Avionics Interface ComputerScalable, Multi-Protocol Connectivity High Density Computing & Connectivity Intel Atom Architecture  Expanded Scalable Capabilities USB 3.0 SupportSWaP-C Optimized System Rugged Deployable Compact Enclosure Low Power Computing Performance  MIL-STD-810G Shock, Vibration & Immersion / MIL-STD-461F EMIMulti-Protocol Flexibility Ethernet, MIL-STD-1553, ARINC 429/717, CANbus 2.0/ARINC 825,     RS-232/422/485 & Avionics/Digital Discrete I/O 3 modes (Remote Access, Protocol Conversion & Standalone) Expandable: (2) Mini-PCIe sites & (1) I/O Expansion Module53years of serviceTo learn more, visit  a t a   D e v i c e  c o r p o r a t i o noutside  prior  known  bounds    todays systems have very limited ability to char-acterize  the  threat  or  turn  around  an appropriate response.
*  Future cognitive EWInterestingly, in terms of the hardware involved, Lockheed Martin is finding that a commercial off-the-shelf