### Normally, the extract_candidate_words function is only used on pre-processed text, not raw text.

In [41]:
from sumy.nlp.tokenizers import Tokenizer

from milnlp.converters.pdf_to_text import PdfConverter
from milnlp.converters.text_utils import process_raw_into_lines
from milnlp.converters.pdf_to_text import create_sumy_dom
from milnlp.mining.phrases import extract_candidate_words, score_keyphrases_by_textrank

In [35]:
# Pull the raw text out of the source PDF.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# on a machine where this exact file exists.
pdf_path = r"C:\Users\zwelz3\Documents\GTRI_Projects\ECCT_EW_EMS\Market Research\Concepts, Processes, Approaches\Aircraft Design\Materials\2) Graphene Meta-Material Absorber.pdf"
converter = PdfConverter()
text = converter.convert_pdf(pdf_path)

In [37]:
# convert raw text to pre-processed text
# `text` is the value returned by converter.convert_pdf; the cleaned result is
# what gets turned into a sumy document later on.
# NOTE(review): presumably the helper returns line-oriented text - confirm
# against milnlp.converters.text_utils.
document_text = process_raw_into_lines(text)

In [44]:
# Build a sumy document from the pre-processed text, then flatten it back
# into a single string with one space between consecutive sentences.
token = Tokenizer("english")
document = create_sumy_dom(document_text, token)
doc_text = " ".join(sentence._text for sentence in document.sentences)

In [46]:
# extract candidates from document text
# `doc_text` is the space-joined sentence text of the sumy document, so the
# candidates come from pre-processed text rather than the raw PDF output.
candidates = extract_candidate_words(doc_text)

In [47]:
# Report which phrases of interest (POI) are missing from the candidates.
# No multi-word candidates are expected here: those are created later, in the
# scoring algorithm, so the two "graphene ..." phrases should be reported.
#
# A plain membership test is used instead of try/assert/except: assert
# statements are stripped when Python runs with -O, which would make this cell
# silently report nothing. A tuple (not a set) keeps the report order stable
# from run to run.
for poi in ("nanoribbon", "nanoribbons", "graphene nanoribbon", "graphene nanoribbons"):
    if poi not in candidates:
        print(poi, "not found")

graphene nanoribbon not found
graphene nanoribbons not found


In [59]:
# Indices of every sentence whose text contains the phrase of interest
# (case-sensitive substring match).
sis = [
    index
    for index, candidate_sentence in enumerate(document.sentences)
    if "graphene nanoribbon" in candidate_sentence._text
]

print(f"The POI showed up {len(sis)} times even though it was not selected as a candidate.")

The POI showed up 17 times even though it was not selected as a candidate.


### Deconstruct the extraction process

In [19]:
import itertools, nltk, string
from itertools import takewhile, tee
import networkx

In [55]:
# Punctuation characters and English stop words, kept as sets.
stop_words = set(nltk.corpus.stopwords.words("english"))
punct = set(string.punctuation)
# Sentence-split the raw converter output (`text`, not the pre-processed
# document), word-tokenize each sentence, and POS-tag the result.
tagged_sentences = nltk.pos_tag_sents(
    [nltk.word_tokenize(raw_sentence) for raw_sentence in nltk.sent_tokenize(text)]
)

### Looking at the following two cells, superscripts/subscripts (e.g. the citation marker in `absorption28`) end up fused onto the word they follow, which lets malformed candidates pass through.

In [81]:
# Display the raw sentence at index 25 so it can be compared with
# tagged_sentences[25]. Index directly instead of looping until a break: the
# loop silently left `sent` as the LAST sentence when the text had fewer than
# 26 sentences, whereas indexing raises IndexError in that case.
ss = 25
sent = nltk.sent_tokenize(text)[ss]

sent

'mode of the nanodisks together with multi-reflection from the assistants of total internal reflection and \nmetal reflection can result in a complete optical absorption28.'

In [79]:
# POS tags for sentence index 25, the same index displayed as raw text above
tagged_sentences[25]

[('mode', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('nanodisks', 'NNS'),
 ('together', 'RB'),
 ('with', 'IN'),
 ('multi-reflection', 'NN'),
 ('from', 'IN'),
 ('the', 'DT'),
 ('assistants', 'NNS'),
 ('of', 'IN'),
 ('total', 'JJ'),
 ('internal', 'JJ'),
 ('reflection', 'NN'),
 ('and', 'CC'),
 ('metal', 'JJ'),
 ('reflection', 'NN'),
 ('can', 'MD'),
 ('result', 'VB'),
 ('in', 'IN'),
 ('a', 'DT'),
 ('complete', 'JJ'),
 ('optical', 'JJ'),
 ('absorption28', 'NN'),
 ('.', '.')]

In [67]:
# indices of the document sentences that contain "graphene nanoribbon"
sis

[4, 7, 31, 36, 37, 40, 45, 46, 49, 61, 64, 84, 85, 90, 94, 95, 99]

### There are also a significant number of broken, single-letter candidates that should be removed. Better pre-processing should alleviate these cases, but additional filtering of the raw candidates may still be needed.

## Testing how the NLTK POS tagger handles sentences with broken elements

In [90]:
# Small fixture: one clean sentence followed by six variants with a stray
# upper- or lower-case "A" injected at different positions.
# NOTE: this rebinds `text`, which earlier cells use for the raw PDF text.
text = """
A big dog jumped.
A big A dog jumped.
A big a dog jumped. 
A big dog A jumped.
A big dog a jumped.
A dog A jumped. 
A dog a jumped. 
"""
# Sentence-split, word-tokenize and POS-tag the fixture.
tok_sent = nltk.pos_tag_sents(
    [nltk.word_tokenize(line) for line in nltk.sent_tokenize(text)]
)

In [91]:
# One numbered line per tagged sentence.
for ii, sent in enumerate(tok_sent):
    print(f"{ii}. {sent}")

0. [('A', 'DT'), ('big', 'JJ'), ('dog', 'NN'), ('jumped', 'VBD'), ('.', '.')]
1. [('A', 'DT'), ('big', 'JJ'), ('A', 'NN'), ('dog', 'NN'), ('jumped', 'NN'), ('.', '.')]
2. [('A', 'DT'), ('big', 'JJ'), ('a', 'DT'), ('dog', 'NN'), ('jumped', 'NN'), ('.', '.')]
3. [('A', 'DT'), ('big', 'JJ'), ('dog', 'NN'), ('A', 'DT'), ('jumped', 'NN'), ('.', '.')]
4. [('A', 'DT'), ('big', 'JJ'), ('dog', 'NN'), ('a', 'DT'), ('jumped', 'NN'), ('.', '.')]
5. [('A', 'DT'), ('dog', 'NN'), ('A', 'NNP'), ('jumped', 'NN'), ('.', '.')]
6. [('A', 'DT'), ('dog', 'NN'), ('a', 'DT'), ('jumped', 'NN'), ('.', '.')]


Note the different behavior depending on where the broken text shows up and the case of the text. 
0. normal sentence
1. UC injected between JJ and NN -> NN (VBD now NN)
2. LC injected between JJ and NN -> DT (VBD now NN)
3. UC injected between JJ+NN and VBD -> DT (VBD now NN)
4. LC injected between JJ+NN and VBD -> DT (VBD now NN)
5. UC injected between NN and VBD -> NNP (VBD now NN)
6. LC injected between NN and VBD -> DT (VBD now NN)

Especially note that in every broken case (1-6) there is now no verb in the sentence because of the corrupt text: `jumped` is re-tagged from VBD to NN each time. This causes issues later on when TextRank attempts to score sentences. 

*Food-for-thought. If it gets this messed up with a single broken character, the behavior is unpredictable for n-number of broken characters and even worse for multiple broken characters in adjacent sentences that are also broken (i.e. assuming the sentence tokenizer fails).*

# Testing additional sentences to see how they get tokenized and tagged

In [97]:
# Excerpt copied as-is from the converted paper: an author list with fused
# affiliation digits, body text, and fragments of a broken equation. The
# garbling is intentional; it is the input under test.
# NOTE: this rebinds `text` again.
text = """
Yuancheng Fan1, Zhe Liu2, Fuli Zhang1, Qian Zhao3, Zeyong Wei4, Quanhong Fu1, Junjie Li2,  Changzhi Gu2 & Hongqiang Li4

The graphene layer is considered as a sheet material modeled with complex surface conductivity (σg) 
since a one-atom-thick graphene sheet is sufficiently thin compared with the concerned wavelength. In 
the theoretical perspective based on random-phase-approximation (RPA)54–56, the complex conductivity 
1 , especially in heavily 
)−
of graphene can be described by the Drude model as 
doped  region  and  low  frequencies  (far  below  Fermi  energy),  where  EF  represents  the  Fermi  energy, 
2 is the relaxation rate with the mobility μ =  104 cm2V−1s−1 and Fermi velocity vF ≈  106 m/s.
µ
=
τ
F

The  complex  scattering  coefficients  (O±)  of  the  graphene  nanoribbon  meta-surface  can  be  related 
to the two input beams (I±, and in this paper the two input beams are set to be of equal amplitude I) 
through a scattering matrix, Sg, defined as:

"""
# Sentence-split, word-tokenize and POS-tag the excerpt.
tok_sent = nltk.pos_tag_sents(
    [nltk.word_tokenize(line) for line in nltk.sent_tokenize(text)]
)

In [99]:
# One numbered entry per tagged sentence, each followed by a blank line.
for ii, sent in enumerate(tok_sent):
    print(f"{ii}. {sent} \n")

0. [('Yuancheng', 'NNP'), ('Fan1', 'NNP'), (',', ','), ('Zhe', 'NNP'), ('Liu2', 'NNP'), (',', ','), ('Fuli', 'NNP'), ('Zhang1', 'NNP'), (',', ','), ('Qian', 'NNP'), ('Zhao3', 'NNP'), (',', ','), ('Zeyong', 'NNP'), ('Wei4', 'NNP'), (',', ','), ('Quanhong', 'NNP'), ('Fu1', 'NNP'), (',', ','), ('Junjie', 'NNP'), ('Li2', 'NNP'), (',', ','), ('Changzhi', 'NNP'), ('Gu2', 'NNP'), ('&', 'CC'), ('Hongqiang', 'NNP'), ('Li4', 'NNP'), ('The', 'DT'), ('graphene', 'NN'), ('layer', 'NN'), ('is', 'VBZ'), ('considered', 'VBN'), ('as', 'IN'), ('a', 'DT'), ('sheet', 'NN'), ('material', 'NN'), ('modeled', 'VBN'), ('with', 'IN'), ('complex', 'JJ'), ('surface', 'NN'), ('conductivity', 'NN'), ('(', '('), ('σg', 'NN'), (')', ')'), ('since', 'IN'), ('a', 'DT'), ('one-atom-thick', 'JJ'), ('graphene', 'NN'), ('sheet', 'NN'), ('is', 'VBZ'), ('sufficiently', 'RB'), ('thin', 'JJ'), ('compared', 'VBN'), ('with', 'IN'), ('the', 'DT'), ('concerned', 'JJ'), ('wavelength', 'NN'), ('.', '.')] 

1. [('In', 'IN'), ('the', 

In [104]:
# Tag a bare equation to see how the POS tagger treats non-language input.
# NOTE: this rebinds `text` again.
text = "µ = τ^34 + 10"
tok_sent = nltk.pos_tag_sents(
    [nltk.word_tokenize(line) for line in nltk.sent_tokenize(text)]
)
# One numbered entry per tagged sentence, each followed by a blank line.
for ii, sent in enumerate(tok_sent):
    print(f"{ii}. {sent} \n")

0. [('µ', 'JJ'), ('=', 'NNP'), ('τ^34', 'NNP'), ('+', 'VBD'), ('10', 'CD')] 



Notes:
- because the sentence tokenizer cannot split the text into sentences properly (e.g. the author list above is merged with the first body sentence), the POS tagger receives malformed input and assigns incorrect tags

- first, we must pre-process the text to ensure that sentences are tokenized correctly (different branch).
  > then merge into this branch
- second, we need to remove sentences that do not meet some criteria (i.e. there is not enough normal language structure)
  > this would get rid of sentences that are created from: labels (figure, section), lists (bullets, pptx), references, glossary/index, author lists, etc.