In [1]:
import re
import nltk

from sumy.nlp.tokenizers import Tokenizer

from milnlp.converters.pdf_to_text import PdfConverter, create_text, create_sumy_dom
from milnlp.converters.new_text_utils import RawTextProcessing
from milnlp.collection.utils.qol import doc_to_text

In [2]:
# Sample PDFs available for this experiment; leave exactly one `filepath` active.
# filepath = "pdf_examples/normal_document.pdf"
# filepath = "pdf_examples/powerpoint.pdf"
# filepath = "pdf_examples/glossary.pdf"
# filepath = "pdf_examples/challenge.pdf"
filepath = "pdf_examples/graphene.pdf"

# Convert the selected PDF. `to_file=False` presumably returns the extracted
# text in memory instead of writing it to disk -- confirm against PdfConverter.
converter = PdfConverter()
raw_document = converter.convert_pdf(filepath, to_file=False)

In [3]:
# sumy loads its NLTK sentence-tokenizer data by language name, and those
# resources are named in lowercase. The capitalised 'English' only resolves on
# case-insensitive file systems, so use the canonical lowercase form.
token = Tokenizer('english')

# Pre-process: turn the raw converter output into a list of sentence strings.
document_sentences = RawTextProcessing.process_raw_into_lines(raw_document, token)

# Add to DOM (since that is how the tool does it)
document = create_sumy_dom(document_sentences, token)

# Use doc_to_text to get the document text that the keyphrase cells consume.
doc_text = doc_to_text(document)

In [4]:
# Display the pre-processed sentences for a visual sanity check.
document_sentences

['received: 01 June 2015 Accepted: 13 August 2015 Published: 24 September 2015',
 'Tunable mid-infrared coherent perfect absorption in a graphene meta-surface',
 'Yuancheng Fan1, Zhe Liu2, Fuli Zhang Qian Zhao Zeyong Wei4, Quanhong Fu1, Junjie Li2, Changzhi Gu2 & Hongqiang Li4',
 'Graphene has drawn considerable attention due to its intriguing properties in photonics and optoelectronics.',
 'However, its interaction with light is normally rather weak.',
 'Meta-surfaces, artificial structures with single planar function-layers, have demonstrated exotic performances in boosting light-matter interactions, e.g., for absorption enhancement.',
 'Graphene based high efficiency absorber is desirable for its potential applications in optical detections and signal modulations.',
 'Here we exploit graphene nanoribbons based meta-surface to realize coherent perfect absorption (CPA) in the mid-infrared regime.',
 'It was shown that quasi-CPA frequencies, at which CPA can be demonstrated with proper

## Testing

In [15]:
from collections import OrderedDict

from sumy.nlp.stemmers import Stemmer
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer

from milnlp.mining.phrases import extract_candidate_words, score_keyphrases_by_textrank
from milnlp.converters.pdf_to_text import create_sumy_dom

In [6]:
# Score keyphrases with the second argument set to 22 (presumably the number of
# top-ranked keywords to keep -- confirm in milnlp.mining.phrases), and keep the
# returned order by wrapping the result in an OrderedDict.
words = OrderedDict(score_keyphrases_by_textrank(doc_text, 22))
words

OrderedDict([('graphene', 0.03870796398622391),
             ('graphene meta-surface', 0.024831037105342717),
             ('graphene meta-surface cpa', 0.019527442530358066),
             ('meta-surface', 0.010954110224461524),
             ('absorption', 0.009027854783621742),
             ('cpa', 0.00892025338038876),
             ('optical absorption', 0.008899763358326042),
             ('optical', 0.008771671933030342),
             ('coherent absorption', 0.007791597246621432),
             ('tunable terahertz meta-surface', 0.007218875048829193),
             ('light', 0.006770993069882863),
             ('tunable', 0.006654943647240086),
             ('coherent', 0.006555339709621122),
             ('mid-infrared tunable optical', 0.006503987173691572),
             ('mid-infrared cpa', 0.006502799660596524),
             ('tunable terahertz optical', 0.006491395618352133),
             ('optical beams', 0.0064321726302814675),
             ('fig', 0.005904076568071254),
     

In [7]:
# Same scoring call with the second argument raised to 50 (presumably the
# keyword count -- confirm in milnlp.mining.phrases) to compare against the
# 22-keyword run.
words = OrderedDict(score_keyphrases_by_textrank(doc_text, 50))
words

OrderedDict([('graphene', 0.03870796398622391),
             ('graphene meta-surface', 0.024831037105342717),
             ('graphene sheet', 0.02130032334704954),
             ('graphene meta-surfaces', 0.020864984532167976),
             ('graphene ’', 0.02078903261282892),
             ('periodic graphene', 0.02076944070411192),
             ('graphene nanoribbon', 0.02070201674222438),
             ('graphene meta-surface cpa', 0.019527442530358066),
             ('thin graphene meta-surface', 0.017626456755416676),
             ('graphene nanoribbon meta-surface', 0.01745271456963676),
             ('periodic graphene nanoribbon structures', 0.011980439718553846),
             ('meta-surface', 0.010954110224461524),
             ('absorption', 0.009027854783621742),
             ('cpa', 0.00892025338038876),
             ('optical absorption', 0.008899763358326042),
             ('optical', 0.008771671933030342),
             ('coherent absorption', 0.007791597246621432),
        

In [9]:
# Same scoring call with the second argument raised to 100 (presumably the
# keyword count -- confirm in milnlp.mining.phrases) to compare against the
# smaller runs.
words = OrderedDict(score_keyphrases_by_textrank(doc_text, 100))
words

OrderedDict([('graphene', 0.03870796398622391),
             ('graphene meta-surface', 0.024831037105342717),
             ('graphene sheet', 0.02130032334704954),
             ('graphene meta-surfaces', 0.020864984532167976),
             ('graphene ’', 0.02078903261282892),
             ('graphene nanoribbon', 0.02070201674222438),
             ('two-dimensional graphene', 0.02049708817013562),
             ('graphene nanoribbons', 0.02042564068993582),
             ('monolayer graphene', 0.020366221080287336),
             ('graphene nanodisks', 0.020303173655868843),
             ('graphene meta-surface cpa', 0.019527442530358066),
             ('thin graphene meta-surface', 0.017626456755416676),
             ('graphene nanoribbon meta-surface', 0.01745271456963676),
             ('monolayer graphene sheet', 0.014875041622816615),
             ('graphene nanoribbon meta-atoms', 0.014605064316684775),
             ('periodic graphene nanodisks', 0.014479088244579205),
             

In [8]:
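# Observation: phrases that appear in several runs keep identical scores, but
# raising the keyword count surfaces higher-scoring candidates that were missing
# from the smaller runs (e.g. 'graphene sheet' is listed with 50 keywords but not
# with 22). Why? Presumably multi-word phrases are only assembled from the top-N
# single keywords -- TODO confirm in score_keyphrases_by_textrank.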

## Validating stability of TextRank scoring

In [18]:
import itertools, nltk, string
from itertools import takewhile, tee
import networkx

text = doc_text
n_keywords = 10

# Lower-cased token stream of the whole document.
# NOTE(review): nothing in this cell reads `words` after this assignment, and it
# rebinds the `words` name used by the earlier keyphrase cells. It is kept only
# so the notebook namespace stays unchanged.
words = [word.lower()
         for sent in nltk.sent_tokenize(text)
         for word in nltk.word_tokenize(sent)]
candidates = extract_candidate_words(text)

# Build the graph with one node per unique candidate.
# De-duplicate with dict.fromkeys() rather than set(): a set of strings iterates
# in hash-randomised order, which changes from one kernel to the next, whereas
# dict.fromkeys() keeps first-occurrence order. That makes the node order (and
# therefore the ordering of equal-scored entries in `ranks`) reproducible.
graph = networkx.Graph()
graph.add_nodes_from(dict.fromkeys(candidates))

# iterate over word-pairs, add unweighted edges into graph
def pairwise(iterable):
    """Pair each element with its successor.

    For input s0, s1, s2, s3, ... the result iterates over
    (s0, s1), (s1, s2), (s2, s3), ...; fewer than two elements gives no pairs.
    """
    leading, trailing = tee(iterable)
    # Advance the second copy by one element so it runs one step ahead.
    next(trailing, None)
    return zip(leading, trailing)

# Link every pair of adjacent candidates with an unweighted edge.
for w1, w2 in pairwise(candidates):
    if w2:  # skip pairs whose second element is falsy (e.g. an empty string)
        graph.add_edge(*sorted([w1, w2]))

# score nodes using default pagerank algorithm, sort by score, keep top n_keywords
# todo stability is broken. returns incorrect top scorers if n_keywords < # items
# NOTE(review): the inspection cells that follow show the ten values in
# `word_ranks` matching the ten largest values in `ranks`, so the top-n slice
# itself looks correct -- confirm before removing the todo.
ranks = networkx.pagerank(graph)

# A fractional n_keywords is interpreted as a share of the candidate count.
if 0 < n_keywords < 1:
    n_keywords = int(round(len(candidates) * n_keywords))

# Keep the top n_keywords by descending score. Ties are broken alphabetically so
# the selection at the cut-off does not depend on graph node order.
word_ranks = dict(
    sorted(ranks.items(), key=lambda item: (-item[1], item[0]))[:n_keywords]
)
keywords = set(word_ranks)

In [19]:
# Display the PageRank score of every node in the candidate graph.
ranks

{'magnetic': 0.0019002072171262626,
 'transmission': 0.001142546981762006,
 'array': 0.00120982083068151,
 'resonators': 0.0008755694364506184,
 'yan': 0.0006165731619410799,
 'cpa': 0.00892025338038876,
 'stack': 0.001117901704684865,
 'e.g.': 0.0008226144366074865,
 'channel': 0.0008057510519501663,
 'competing': 0.0011447618171299078,
 'junjie': 0.0014320121315170872,
 'pt': 0.0008448009345631362,
 'propagation': 0.0008522448388764705,
 'recent': 0.0009624326236099329,
 'financial': 0.00186891587922855,
 'discrete': 0.0008492982626334407,
 'terahertz': 0.004047571274785968,
 'special': 0.0009592800366228335,
 'unit': 0.0010199488928561262,
 'vapour': 0.001052419309481886,
 'functionality': 0.0009293969063612164,
 'b': 0.001372717567458763,
 'nonlinear': 0.0010019676173012577,
 'boundary': 0.0017148985769888366,
 'electrostatic': 0.0008893428106178885,
 'nanostructures': 0.00089696745239014,
 'exotic': 0.0011209828024214895,
 'quencies': 0.0008463441282761095,
 'ministry': 0.00117856

In [20]:
# Display the top `n_keywords` entries selected from `ranks`.
word_ranks

{'graphene': 0.03870796398622391,
 'meta-surface': 0.010954110224461524,
 'absorption': 0.009027854783621742,
 'cpa': 0.00892025338038876,
 'optical': 0.008771671933030342,
 'light': 0.006770993069882863,
 'tunable': 0.006654943647240086,
 'coherent': 0.006555339709621122,
 'fig': 0.005904076568071254,
 'metamaterials': 0.005287006070407155}

In [23]:
# Ten highest PageRank scores, to compare against the values in `word_ranks`.
sorted(ranks.values(), reverse=True)[:10]

[0.03870796398622391,
 0.010954110224461524,
 0.009027854783621742,
 0.00892025338038876,
 0.008771671933030342,
 0.006770993069882863,
 0.006654943647240086,
 0.006555339709621122,
 0.005904076568071254,
 0.005287006070407155]