In [1]:
import re
import urllib.request
#
from bs4 import BeautifulSoup
from collections import Counter
#
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Zach\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#shortword = re.compile(r'\W*\b\w{1,2}\b')
#shortword.sub('', g)

In [3]:
class stopWords(object):
    """Filter English stopwords out of space-delimited text.

    The lookup set holds every word returned by ``stopwords.words('english')``
    in four casings: as returned, ``str.title()``, ``str.capitalize()`` and
    ``str.upper()``. Matching is an exact, case-sensitive membership test
    against those variants only, so an oddly cased token such as "tHe" is kept.

    Attributes:
        stopword_set: set of str holding all the variants described above.
    """

    def __init__(self):
        # Read the corpus list once instead of once per casing.
        base_words = stopwords.words('english')

        variants = set(base_words)
        for word in base_words:
            variants.add(word.title())
            # str.title() treats an apostrophe as a word boundary
            # ("don't" -> "Don'T"), so a sentence-initial contraction such as
            # "Don't" is only recognised through its capitalize() form.
            variants.add(word.capitalize())
            variants.add(word.upper())
        self.stopword_set = variants

    def remove_stopwords(self, text):
        """Return ``text`` with every stopword token dropped.

        Tokens are produced by splitting on single space characters only, so
        a stopword glued to punctuation ("the.") or separated by other
        whitespace is not matched. Surviving tokens are re-joined with single
        spaces.

        Args:
            text: str to filter.

        Returns:
            str containing the remaining tokens in their original order.
        """
        kept = [token for token in text.split(' ')
                if token not in self.stopword_set]
        return ' '.join(kept)

In [4]:
class Document(object):
    """Paragraph text and word counts extracted from one document.

    For ``source_type='html'`` the constructor downloads ``source``, collects
    the text of every ``<p>`` element and pre-processes it. Any other
    ``source_type`` leaves the document empty; ``process_paragraphs`` and
    ``create_counters`` then produce empty results instead of failing.

    Attributes:
        source: value passed by the caller (a page url for 'html').
        source_type: str, how ``source`` is interpreted.
        raw_doc: parsed BeautifulSoup page, or None until collected.
        paragraphs: list of non-empty paragraph strings, or None.
        processed_paragraphs: paragraphs with banned characters and
            stopwords removed, parallel to ``paragraphs``.
        counters: one ``Counter`` of lower-cased words per paragraph.
        wordcount: ``Counter`` summed over all paragraphs.
        banned_strs: characters deleted from every paragraph. Digits are
            included, so e.g. "W3C" becomes "WC".
        stopwords: ``stopWords`` helper used for stopword removal.
    """

    def __init__(self, source, source_type='html'):
        self.source = source
        self.source_type = source_type

        # Placeholders, filled in by the methods below
        self.raw_doc = None
        self.paragraphs = None
        self.processed_paragraphs = None
        self.counters = None
        self.wordcount = None

        # Static elements  # TODO move to module
        self.banned_strs = '()[]{}@#$%^&*+=,`\"\'\\1234567890'
        self.stopwords = stopWords()

        if source_type == 'html':
            # Source is a page url
            self.collect_paragraphs()
            self.process_paragraphs()

    def collect_paragraphs(self):
        """Download ``self.source`` and store the text of its <p> elements.

        Sets ``raw_doc`` and ``paragraphs`` (empty strings dropped), and
        resets ``processed_paragraphs`` and ``counters`` to ``None``-filled
        lists of the same length.
        """
        with urllib.request.urlopen(self.source) as page:
            self.raw_doc = BeautifulSoup(page, "html.parser")

        # Keep only paragraphs whose text is a non-empty string
        texts = (tag.text for tag in self.raw_doc.find_all('p'))
        self.paragraphs = [text for text in texts if text]

        self.processed_paragraphs = [None]*len(self.paragraphs)
        self.counters = [None]*len(self.paragraphs)

    def process_paragraphs(self):
        """Strip banned characters and stopwords from every paragraph.

        Rebuilds ``processed_paragraphs`` from ``paragraphs``. A document
        whose paragraphs were never collected is treated as empty.
        """
        # Build the deletion table once rather than once per paragraph
        strip_table = str.maketrans('', '', self.banned_strs)
        self.processed_paragraphs = [
            self.stopwords.remove_stopwords(paragraph.translate(strip_table))
            for paragraph in (self.paragraphs or [])
        ]

    def create_counters(self):
        """Count lower-cased words per paragraph and for the whole document.

        Sentence punctuation ('.', '?', '!') is deleted, the text is split on
        single spaces, and empty tokens are dropped. Sets ``counters`` (one
        ``Counter`` per processed paragraph) and ``wordcount`` (their sum).
        """
        # Run the pre-processing first if it has not produced real text yet
        # (never run, or only the None placeholders are present).
        if self.processed_paragraphs is None or None in self.processed_paragraphs:
            self.process_paragraphs()

        punct_table = str.maketrans('', '', '.?!')
        self.counters = []
        total = Counter()
        for processed in self.processed_paragraphs:
            tokens = processed.translate(punct_table).split(' ')
            counter = Counter(word.lower() for word in tokens if word)
            self.counters.append(counter)
            # In-place update avoids copying the running total each step
            total.update(counter)

        self.wordcount = total

In [5]:
# Page to analyse and how its source should be interpreted; both values are
# passed to Document in the next cell.
page_url = 'https://en.wikipedia.org/wiki/Semantic_Web'
page_type = 'html'

In [6]:
# For 'html' sources the constructor fetches the page over the network and
# pre-processes its paragraphs.
document = Document(page_url, source_type=page_type)

In [7]:
# Inspect the paragraphs after banned characters and stopwords were removed.
document.processed_paragraphs

['Semantic Web extension World Wide Web standards World Wide Web Consortium WC. standards promote common data formats exchange protocols Web fundamentally Resource Description Framework RDF. According WC Semantic Web provides common framework allows data shared reused across application enterprise community boundaries. Semantic Web therefore regarded integrator across different content information applications systems.',
 'term coined Tim Berners-Lee web data data web processed machines—that one much meaning machine-readable. critics questioned feasibility proponents argue applications industry biology human sciences research already proven validity original concept.',
 'Berners-Lee originally expressed vision Semantic Web follows:',
 'dream Web computers become capable analyzing data Web\xa0– content links transactions people computers. Semantic Web makes possible yet emerge day-to-day mechanisms trade bureaucracy daily lives handled machines talking machines. intelligent agents peopl

In [8]:
# Build the per-paragraph Counters and the document-wide wordcount.
document.create_counters()

In [9]:
# The 15 most frequent words across the whole document.
document.wordcount.most_common(15)

[('web', 79),
 ('semantic', 47),
 ('data', 30),
 ('knowledge', 14),
 ('html', 13),
 ('information', 11),
 ('one', 11),
 ('content', 10),
 ('example', 10),
 ('documents', 9),
 ('world', 8),
 ('wide', 8),
 ('wc', 8),
 ('berners-lee', 8),
 ('research', 8)]

In [24]:
# Now that we have word counts, trim the rare words. Eventually the cutoff
# should depend on the document length; for now it is static: words seen
# fewer than MIN_INSTANCES times (i.e. at most 2 times) are dropped.
MIN_INSTANCES = 3

# NOTE: this is the total number of characters in the raw paragraphs
# (len() of each string), not a word count. It is only printed for reference.
document_length = sum(len(paragraph) for paragraph in document.paragraphs)
print(document_length)

# Build the filtered mapping directly; insertion order follows the Counter.
wordcount_dict = {
    word: num_instances
    for word, num_instances in document.wordcount.items()
    if num_instances >= MIN_INSTANCES
}

wordcount_dict

18630


{'semantic': 47,
 'web': 79,
 'world': 8,
 'wide': 8,
 'standards': 4,
 'consortium': 3,
 'wc': 8,
 'common': 3,
 'data': 30,
 'formats': 3,
 'resource': 5,
 'description': 3,
 'framework': 3,
 'rdf': 6,
 'according': 3,
 'across': 3,
 'application': 3,
 'content': 10,
 'information': 11,
 'applications': 6,
 'term': 5,
 'coined': 3,
 'tim': 5,
 'berners-lee': 8,
 'one': 11,
 'much': 6,
 'meaning': 3,
 'machine-readable': 5,
 'feasibility': 3,
 'human': 7,
 'research': 8,
 'concept': 4,
 'computers': 5,
 'people': 6,
 'machines': 5,
 'agents': 3,
 'described': 4,
 'markup': 7,
 'following': 5,
 'example': 10,
 'text': 5,
 'graph': 4,
 'using': 7,
 'triples': 3,
 'triple': 5,
 'edge': 6,
 'element': 4,
 'second': 3,
 'eg': 3,
 'result': 4,
 'given': 5,
 'dereferenced': 3,
 'linked': 3,
 'uri': 4,
 'document': 3,
 'edges': 4,
 'http://schemaorg/person': 3,
 'documents': 9,
 'rdfa': 3,
 'owl': 4,
 'semantics': 3,
 'network': 3,
 'knowledge': 14,
 'pages': 5,
 'metadata': 4,
 'automated': 

In [10]:
# We also need to create a list of n-word phrases so that phrases like
#   "Semantic Web" and "World Wide Web" are extracted and counted separately
#   from single words like "web" on its own.

# I think this can be done early on by tracking which words are removed.
# If stopwords separate runs of higher-value words, each run becomes a phrase.
# For example (Note sw=stopword, vw=valueword):
#    "The Semantic Web is an extension of the World Wide Web"
#      sw   vw      vw sw sw    vw     sw  sw   vw   vw   vw
# Extracted phrases:
#    - Semantic Web
#    - extension (removed as singular word)
#    - World Wide Web