In [109]:
# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumyplus.parsers.parser import DocumentParser
from sumyplus.parsers.html import HtmlParser
from sumyplus.parsers.plaintext import PlaintextParser
from sumyplus.nlp.tokenizers import Tokenizer
from sumyplus.summarizers.lsa import LsaSummarizer as Summarizer
from sumyplus.nlp.stemmers import Stemmer
from sumyplus.utils import get_stop_words

In [2]:
# Settings shared by the tokenizer, stemmer and stop-word lookup, plus the
# number of sentences requested from the summarizer.
LANGUAGE = "english"
SENTENCES_COUNT = 5

# Configure the LSA summarizer with a stemmer and stop words for LANGUAGE.
stemmer = Stemmer(LANGUAGE)
summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

# Download the page and parse it into a document.
url = "https://en.wikipedia.org/wiki/Semantic_Web"
parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

# Alternative input source for plain text files:
# parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))

# Print each selected sentence followed by a blank line.
for sentence in summarizer(parser.document, SENTENCES_COUNT):
    print(sentence, '\n')

Compared to the public Semantic Web there are lesser requirements on scalability and the information circulating within a company can be more trusted in general; privacy is less of an issue outside of handling of customer data. 

Critics question the basic feasibility of a complete or even partial fulfillment of the Semantic Web, pointing out both difficulties in setting it up and a lack of general-purpose usefulness that prevents the required effort from being invested. 

[…] cost-benefit tradeoffs can work in favor of specially-created Semantic Web metadata directed at weaving together sensible well-structured domain-specific information resources; close attention to user/customer needs will drive these federations if they are to be successful. 

Another argument in defense of the feasibility of semantic web is the likely falling price of human intelligence tasks in digital labor markets, such as Amazon 's Mechanical Turk . 

The GRDDL (Gleaning Resource Descriptions from Dialects of

In [3]:
from sumyplus.utils import get_stop_words

In [4]:
# Peek at the parser's private `_article.main_text`. As the output below shows,
# it is a list of paragraphs, each a tuple of (text, tags) fragments where
# tags is None for plain text.
parser._article.main_text

[(('The', None),
  ('Semantic Web', ('b',)),
  ('is an extension of the', None),
  ('World Wide Web', ('a',)),
  ('through standards by the', None),
  ('World Wide Web Consortium', ('a',)),
  ('(W3C).', None),
  ('[1]', ('a', 'sup')),
  ('The standards promote common data formats and exchange protocols on the Web, most fundamentally the',
   None),
  ('Resource Description Framework', ('a',)),
  ('(RDF). According to the W3C, "The Semantic Web provides a common framework that allows data to be shared and reused across application, enterprise, and community boundaries".',
   None),
  ('[2]', ('a', 'sup')),
  ('The Semantic Web is therefore regarded as an integrator across different content, information applications and systems.',
   None)),
 (('The term was coined by', None),
  ('Tim Berners-Lee', ('a',)),
  ('for a web of data (or', None),
  ('data web', ('b',)),
  (')', None),
  ('[3]', ('a', 'sup')),
  ('that can be processed by machines', None),
  ('[4]', ('a', 'sup')),
  ('—that is

In [5]:
# Grab the first paragraph's raw (text, tags) fragments by index. Unlike an
# immediate-break loop, this fails right here if main_text is empty instead of
# leaving `paragraph` unbound for later cells.
paragraph = parser._article.main_text[0]

In [6]:
# Display the (text, tags) fragments of the first paragraph.
paragraph

(('The', None),
 ('Semantic Web', ('b',)),
 ('is an extension of the', None),
 ('World Wide Web', ('a',)),
 ('through standards by the', None),
 ('World Wide Web Consortium', ('a',)),
 ('(W3C).', None),
 ('[1]', ('a', 'sup')),
 ('The standards promote common data formats and exchange protocols on the Web, most fundamentally the',
  None),
 ('Resource Description Framework', ('a',)),
 ('(RDF). According to the W3C, "The Semantic Web provides a common framework that allows data to be shared and reused across application, enterprise, and community boundaries".',
  None),
 ('[2]', ('a', 'sup')),
 ('The Semantic Web is therefore regarded as an integrator across different content, information applications and systems.',
  None))

In [7]:
# `parser.document` is the parser's object document model (ODM); it is the only input the summarizer needs.
# A Summarizer can be instantiated once and reused for many different ODMs.

In [8]:
# Show the parsed document model; its repr reports the paragraph count.
parser.document

<DOM with 57 paragraphs>

In [121]:
# First paragraph as raw (text, tags) fragments from the private article structure.
parser._article.main_text[0]

(('The', None),
 ('Semantic Web', ('b',)),
 ('is an extension of the', None),
 ('World Wide Web', ('a',)),
 ('through standards by the', None),
 ('World Wide Web Consortium', ('a',)),
 ('(W3C).', None),
 ('[1]', ('a', 'sup')),
 ('The standards promote common data formats and exchange protocols on the Web, most fundamentally the',
  None),
 ('Resource Description Framework', ('a',)),
 ('(RDF). According to the W3C, "The Semantic Web provides a common framework that allows data to be shared and reused across application, enterprise, and community boundaries".',
  None),
 ('[2]', ('a', 'sup')),
 ('The Semantic Web is therefore regarded as an integrator across different content, information applications and systems.',
  None))

In [130]:
parser.document.paragraphs[0].sentences
# TODO: strip obvious footer/footnote hyperlinks such as the "[1]" and "[2]" markers
#       visible in the output (e.g. when a hyperlink fragment's text, split on ' ',
#       contains only symbols and numbers).
# TODO: auto-extract keywords when they appear in bold (format dependent, so this
#       would require a per-format data entry).

(<Sentence: The Semantic Web is an extension of the World Wide Web through standards by the World Wide Web Consortium (W3C).>,
 <Sentence: [1] The standards promote common data formats and exchange protocols on the Web, most fundamentally the Resource Description Framework (RDF).>,
 <Sentence: According to the W3C, "The Semantic Web provides a common framework that allows data to be shared and reused across application, enterprise, and community boundaries".>,
 <Sentence: [2] The Semantic Web is therefore regarded as an integrator across different content, information applications and systems.>)

In [60]:
import io

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage


def convert_pdf_to_txt(path, password="", maxpages=0, pagenos=None):
    """Extract the text of a PDF file as one string using pdfminer.

    Args:
        path: Filesystem path of the PDF to read (opened in binary mode).
        password: Password forwarded to ``PDFPage.get_pages``. Defaults to
            the empty string.
        maxpages: Page limit forwarded to ``PDFPage.get_pages``. Defaults to
            0, the value that was previously hard-coded.
        pagenos: Optional iterable of page numbers forwarded to
            ``PDFPage.get_pages``. Defaults to an empty set, as before.

    Returns:
        The text written by the ``TextConverter`` for every processed page.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(path, 'rb') as fp:
            pages = PDFPage.get_pages(fp, set(pagenos or ()),
                                      maxpages=maxpages,
                                      password=password,
                                      caching=True,
                                      check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before the finally clause closes it.
        return retstr.getvalue()
    finally:
        # Release the converter and buffer even when a page fails to parse.
        device.close()
        retstr.close()

In [122]:
def find_first_nonspace(string, char=' '):
    """Return the index of the first character of ``string`` that is not ``char``.

    Args:
        string: Text to scan from the left.
        char: Character to skip over. Defaults to a single space.

    Returns:
        The index of the first character that differs from ``char``. When
        every character equals ``char`` (including the empty string),
        ``len(string)`` is returned, so the result is always an integer and
        ``string[result:]`` is always a valid (possibly empty) slice.
    """
    for ii, letter in enumerate(string):
        if letter != char:
            return ii
    # Nothing but `char` found: return an int rather than an implicit None,
    # which would break arithmetic on the result.
    return len(string)
        
def in_ignores(char):
    """Return True when ``char`` is NOT one of the ignored values.

    Despite the name, the result is inverted so the function can serve
    directly as a ``filter`` predicate that keeps wanted items. The ignored
    values are the form-feed character (pdf print next sheet flag) and the
    empty string.
    """
    ignored = {'\x0c', ''}
    if char in ignored:
        return False
    return True


# Glyphs that introduce a bulleted line, and characters that end a sentence.
bullets = {u"\u2022"}
punctuation = set('.?!')

# The extracted text has one entry per laid-out line; stitch wrapped lines
# back together until a line ends in sentence punctuation.
sentences = convert_pdf_to_txt("pdf-sample.pdf").split('\n')
prx_sentences = []
cont_sent = ''
for sentence in sentences:
    # Drop a leading bullet together with the spaces that follow it.
    if sentence[:1] in bullets:
        sentence = sentence[1:].lstrip(' ')

    # Skip empty lines, including lines that held nothing but a bullet
    # (these used to crash on `None + 1`).
    if not sentence:
        continue

    # Prepend any pending fragment carried over from earlier lines.
    if cont_sent:
        sentence = cont_sent + ' ' + sentence

    if sentence[-1] in punctuation:
        prx_sentences.append(sentence)
        cont_sent = ''
    else:
        cont_sent = sentence

# Flush trailing text that never reached terminal punctuation, so the end of
# the document is not silently dropped.
if cont_sent:
    prx_sentences.append(cont_sent)
    cont_sent = ''

# Remove any sentences with weird unicode issues
prx_sentences = list(filter(in_ignores, prx_sentences))
for si, sentence in enumerate(prx_sentences):
    print(si, sentence, '\n')


0 Adobe Acrobat PDF Files Adobe® Portable Document Format (PDF) is a universal file format that preserves all of the fonts, formatting, colours and graphics of any source document, regardless of the application and platform used to create it. 

1 Adobe PDF is an ideal format for electronic document distribution as it overcomes the problems commonly encountered with electronic file sharing. 

2 Anyone, anywhere can open a PDF file. All you need is the free Adobe Acrobat Reader. Recipients of other file formats sometimes can't open files because they don't have the applications used to create the documents. 

3 PDF files always print correctly on any printing device. 

4 PDF  files  always  display  exactly  as  created,  regardless  of  fonts,  software,  and operating systems. Fonts, and graphics are not lost due to platform, software, and version incompatibilities. 

5 The  free  Acrobat  Reader  is  easy  to  download  and  can  be  freely  distributed  by anyone. 

6 Compact  PDF  f

In [114]:
from sumyplus.models.dom import Sentence, Paragraph, ObjectDocumentModel

In [137]:
# Tokenizer for LANGUAGE; passed to create_odm below, which hands it to every Sentence.
token = Tokenizer(LANGUAGE)
#base_docparse = DocumentParser(token)

def create_odm(text, tokenizer):
    """Build a single-paragraph ObjectDocumentModel from sentence strings.

    Args:
        text: Iterable of items; each one is wrapped in a ``Sentence``.
        tokenizer: Tokenizer passed to every ``Sentence`` that is created.

    Returns:
        An ``ObjectDocumentModel`` holding exactly one ``Paragraph`` that
        contains all the sentences, in input order.
    """
    wrapped = [Sentence(raw, tokenizer) for raw in text]
    return ObjectDocumentModel([Paragraph(wrapped)])

# Wrap the sentences rebuilt from the PDF in a document model and display its repr.
dom = create_odm(prx_sentences, token)
dom

<DOM with 1 paragraphs>

In [138]:
# Inspect the model's instance attributes (debugging aid).
dom.__dict__

{'_paragraphs': (<Paragraph with 0 headings & 7 sentences>,)}

In [144]:
# Run the already-configured summarizer on the PDF-derived document model,
# requesting a 3-sentence summary, and print each sentence with a blank line.
pdf_summary = summarizer(dom, 3)
for sentence in pdf_summary:
    print(sentence, '\n')

Adobe Acrobat PDF Files Adobe® Portable Document Format (PDF) is a universal file format that preserves all of the fonts, formatting, colours and graphics of any source document, regardless of the application and platform used to create it. 

PDF  files  always  display  exactly  as  created,  regardless  of  fonts,  software,  and operating systems. Fonts, and graphics are not lost due to platform, software, and version incompatibilities. 

The  free  Acrobat  Reader  is  easy  to  download  and  can  be  freely  distributed  by anyone. 



In [145]:
# List every Sentence held by the PDF-derived document model.
dom.sentences

(<Sentence: Adobe Acrobat PDF Files Adobe® Portable Document Format (PDF) is a universal file format that preserves all of the fonts, formatting, colours and graphics of any source document, regardless of the application and platform used to create it.>,
 <Sentence: Adobe PDF is an ideal format for electronic document distribution as it overcomes the problems commonly encountered with electronic file sharing.>,
 <Sentence: Anyone, anywhere can open a PDF file. All you need is the free Adobe Acrobat Reader. Recipients of other file formats sometimes can't open files because they don't have the applications used to create the documents.>,
 <Sentence: PDF files always print correctly on any printing device.>,
 <Sentence: PDF  files  always  display  exactly  as  created,  regardless  of  fonts,  software,  and operating systems. Fonts, and graphics are not lost due to platform, software, and version incompatibilities.>,
 <Sentence: The  free  Acrobat  Reader  is  easy  to  download  and  