In [1]:
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.summarizers.lex_rank import LexRankSummarizer as Summarizer  # use this summarizer for naive metadata summary
#
from milnlp.collection.collection import Collection
from milnlp.tokenizers import Tokenizer

LANGUAGE = "english"
stemmer = Stemmer(LANGUAGE)
summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)
token = Tokenizer(LANGUAGE)
#
term_frequency_threshold = 0.5  
num_sentences = 8  # for composite summary
#
collectionpath = r"C:\Users\zwelz3\Documents\GTRI_Projects\ECCT_EW_EMS\Market Research Sample"

Using default NLTK tokenizer not english. Custom language tokenizers not available.


In [2]:
cobj = Collection(collectionpath)
# at this point, the collection object is just a path container
cobj.process_collection()
# at this point, the collection object contains a list of all files in the collection (full paths) 
#    and a list of directories at their respective levels (i.e. 0 == top level) 
cobj.generate_metadata(summarizer, token, term_frequency_threshold)
# at this point, all the (super)metadata files are created and populated for the collection

Processing collection at path ...\Market Research Sample: 


Discovered directory 'Congested Spectrum'
-> Discovered file 'Cognitive Radio.pdf'
-> Discovered file 'Cognitive Radio.txt'
-> Discovered file 'DARPA opens Shared Spectrum Access for Radar and Communications.pdf'
-> Discovered file 'DARPA opens Shared Spectrum Access for Radar and Communications.txt'
-> Discovered file 'DYSE.pdf'
-> Discovered file 'DYSE.txt'
-> Discovered file 'Electromagnetic spectrum sharing.pdf'
-> Discovered file 'Electromagnetic spectrum sharing.txt'
-> Discovered file 'ESCE for DEWSAR Feb 2018 v2 (FOUO).pdf'
-> Discovered file 'ESCE for DEWSAR Feb 2018 v2 (FOUO).txt'
-> Discovered file 'ESCE SOO.pdf'
-> Discovered file 'ESCE SOO.txt'
-> Discovered file 'SAIJ.pdf'
-> Discovered file 'SAIJ.txt'
-> Discovered file 'Summary - DARPA's Shared Spectrum Access for Radar and Communications.pdf'
-> Discovered file 'Summary - DARPA's Shared Spectrum Access for Radar and Communications.txt'

Discovered directory '

In [3]:
# Make queries independent of the summary, returns the results and the method (union or intersect)
cobj.make_query([("hypersonic",),("hypervelocity",)])

Performing union query...
Done!


({'hypersonic': {'C:\\Users\\zwelz3\\Documents\\GTRI_Projects\\ECCT_EW_EMS\\Market Research Sample\\Hypersonic Weapons\\DARPA Chief-National Hypersonics Needed - summary n source.txt': {1,
    2,
    3},
   'C:\\Users\\zwelz3\\Documents\\GTRI_Projects\\ECCT_EW_EMS\\Market Research Sample\\Hypersonic Weapons\\How Hypersonic Weapons Could Completely Change Face of War.txt': {1,
    2,
    3,
    4,
    5},
   'C:\\Users\\zwelz3\\Documents\\GTRI_Projects\\ECCT_EW_EMS\\Market Research Sample\\Hypersonic Weapons\\Lockheed Martin tasked with developing hypersonic missiles.txt': {1,
    2}},
  'hypervelocity': {'C:\\Users\\zwelz3\\Documents\\GTRI_Projects\\ECCT_EW_EMS\\Market Research Sample\\Hypersonic Weapons\\SCO aims to transition hypervelocity weapon next year.txt': {1,
    2},
   'C:\\Users\\zwelz3\\Documents\\GTRI_Projects\\ECCT_EW_EMS\\Market Research Sample\\Hypersonic Weapons\\Army creating new POR for hypervelocity projectile.txt': {1}}},
 'union')

In [4]:
#query = [("hypersonic",),("hypervelocity",)]  # example union query
query = [[("hypersonic",),("hypervelocity",)],[('American',)]]  # example intersect query
#query = [("predictable",)]  # example reduced query

### Create composite document with specified resolution (i.e. document or page)

In [5]:
cobj.make_query(query)

Performing intersect query...
Number of files matching all queries up to and including query #1:  5
Number of files matching all queries up to and including query #2:  2
Done!


({'C:\\Users\\zwelz3\\Documents\\GTRI_Projects\\ECCT_EW_EMS\\Market Research Sample\\Hypersonic Weapons\\Lockheed Martin tasked with developing hypersonic missiles.txt': {1,
   2},
  'C:\\Users\\zwelz3\\Documents\\GTRI_Projects\\ECCT_EW_EMS\\Market Research Sample\\Hypersonic Weapons\\How Hypersonic Weapons Could Completely Change Face of War.txt': {1,
   2,
   4}},
 'intersect')

In [6]:
method = "full"  # reduced=page appearance only, full=entire document
composite_doc_full = cobj.create_composite_document(query, token, method=method)
print(f"Using method '{method}' results in: ", composite_doc_full)

print('')

method = "reduced"  # reduced=page appearance only, full=entire document
composite_doc_reduced = cobj.create_composite_document(query, token, method=method)
print(f"Using method '{method}' results in: ", composite_doc_reduced)

Performing intersect query...
Number of files matching all queries up to and including query #1:  5
Number of files matching all queries up to and including query #2:  2
Done!
Creating a composite document using the full constituent documents...
 - Reading in file 1/2
 - Reading in file 2/2
Done!
Using method 'full' results in:  <DOM with 119 paragraphs>

Performing intersect query...
Number of files matching all queries up to and including query #1:  5
Number of files matching all queries up to and including query #2:  2
Done!
Creating a composite document using only the relevant pages from constituent documents...
 - Reading in file 1/2
  -> Matching pages: {1, 2}
 - Reading in file 2/2
  -> Matching pages: {1, 2, 4}
Done!
Using method 'reduced' results in:  <DOM with 84 paragraphs>


### Create summary with different composite documents

In [7]:
import time

In [8]:
from sumy.summarizers.lex_rank import LexRankSummarizer as Summarizer  # use this summarizer for naive metadata summary
summarizer = Summarizer(stemmer)

st = time.time()
full_summary = cobj.summarize_composite(composite_doc_full, summarizer, num_sentences)
et = time.time()-st
print("Time to generate summary of full composite document: ", et)
for sentence in full_summary:
    print('* ', sentence._text)

print('')

st = time.time()
reduced_summary = cobj.summarize_composite(composite_doc_reduced, summarizer, num_sentences)
et = time.time()-st
print("Time to generate summary of reduced composite document: ", et)
for sentence in reduced_summary:
    print('* ', sentence._text)

Time to generate summary of full composite document:  0.45433950424194336
*  While the United States has worked in the past with Australia on experimental hypersonic missile  platforms, it seems only now, as the threat posed by the nations competitors is already at hand, that  the Defense Department is ready to seriously pursue the development of Americas own  technological response.
*  This is the  potential threat posed by so-called hypersonic weapons, which can fly as fast as a mile a  second and low enough to evade many existing defenses
*  On May 30, 2017, the U.S.
*  But more importantly, this air-breathing engine generates a very different signature from a rocket  motor, meaning space-based surveillance assets might not be able to spot one as quickly or keep  tracking it during flight, or even spot it at all for that matter.
*  All of these features have made the concept attractive within the Pentagon, as potential  opponents field increasingly more powerful radars, surface-to-a

In [9]:
from sumy.summarizers.lsa import LsaSummarizer as Summarizer  # use this summarizer for naive metadata summary
summarizer = Summarizer(stemmer)

st = time.time()
full_summary = cobj.summarize_composite(composite_doc_full, summarizer, num_sentences)
et = time.time()-st
print("Time to generate summary of full composite document: ", et)
for sentence in full_summary:
    print('* ', sentence._text)

print('')

st = time.time()
reduced_summary = cobj.summarize_composite(composite_doc_reduced, summarizer, num_sentences)
et = time.time()-st
print("Time to generate summary of reduced composite document: ", et)
for sentence in reduced_summary:
    print('* ', sentence._text)

Time to generate summary of full composite document:  0.19649434089660645
*  Though there is much attention given to the expanding threat of ballistic missiles, especially  from countries such as Iran and North Korea, American military officials are increasingly  concerned about hypersonic weapons.
*  As the name suggests,  these missiles largely follow a ballistic arc, boosting very high up before falling back down on  their target.
*  Hypersonic weapons could be the key to  breaking through these protective layers, including knocking down an enemy's integrated air
*  Air Force launched Boeing's X-51 Waverider experimental  hypersonic aircraft from a decidedly dated and non-stealthy B-52H bomber.
*  XS-1,  or  more  likely  a  production  craft  that  springs  from  it,  couldn't  be  used  for  deploying  suborbital hypersonic payloads, possibly on short notice.
*  We estimate it will cost less than $1 billion to develop, build  and fly a demonstrator aircraft the size of an F-22.
* 

In [10]:
from sumy.summarizers.edmundson_cue import EdmundsonCueMethod as Summarizer
bonus_words = set()
stigma_words = set()  # I do not want passive (i.e. active)
summarizer = Summarizer(stemmer,bonus_words,stigma_words)


st = time.time()
full_summary = cobj.summarize_composite(composite_doc_full, summarizer, num_sentences, 5, -1)
et = time.time()-st
print("Time to generate summary of full composite document: ", et)
for sentence in full_summary:
    print('* ', sentence._text)

print('')

st = time.time()
reduced_summary = cobj.summarize_composite(composite_doc_reduced, summarizer, num_sentences, 5, -1)
et = time.time()-st
print("Time to generate summary of reduced composite document: ", et)
for sentence in reduced_summary:
    print('* ', sentence._text)

Time to generate summary of full composite document:  0.03788948059082031
*  Lockheed Martin tasked with billion dollar initiative to  develop American hypersonic missiles
*  By Alex Hollings 04.25.2018   https://sofrep.com/102418/lockheed-martin-tasked-with-billion-dollar-initiative-to-develop- american-hypersonic-missiles/
*  Last month, Russian President Vladimir Putin delivered a national address that included talk about a  number of new missile platforms the Russian military is rapidly fielding.
*  For defense analysts, none  of the missiles he discussed were particularly new, as most were actually leftover Cold War era  projects the Kremlin only recently put into production  but one type of missile technology was  different.
*  While many of the platforms Putin discussed offered strategic novelty moreso than  functionality, hypersonic missile platforms really do pose a threat to Americas defensive apparatus
*  Russia isnt the only nation working to field missiles that can exceed 