In [187]:
# Objective: Take in the URL of a newspaper article (from the Washington Post) [aside: I'm so trying this with the DM as well], 
# and automatically summarise it in three sentences.
#
# HOW? Using the NLTK, of course!

In [188]:
# So what steps will we take?
#
# 1. Download the contents of the URL.
# 2. Extract the Article from all the other HTML that is in the webpage.
# 3. Figure out which the 3 most important sentences in the article are.
#    i.  Find the most common words in the article, excluding stopwords.
#    ii. Find the sentence in which those most common words occur most often.

In [189]:
import nltk
from collections import defaultdict
from urllib.request import urlopen
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
from heapq import nlargest

In [190]:
def getPageText( url, token ):
    
    try:
        page = urlopen(url).read().decode('utf8')
    except:
        return( None, None )
    
    soup = BeautifulSoup( page, 'html.parser' )
    if soup is None:
        return( None, None )
    
    text = ""
    if soup.find_all(token) is not None:
        text  = ' '.join( map(lambda p: p.text.replace(u'\xa0', ' '), soup.find_all(token)) )
        soup2 = BeautifulSoup( text )
        if soup2.find_all('p') is not None:
            text = ''.join( map(lambda p: p.text, soup.find_all('p')) )
            
    return text, soup.title.text

In [191]:
class FrequencySummarizer:
    
    def __init__( self, min_cut=0.1, max_cut=0.9 ):
        self._min_cut = min_cut
        self._max_cut = max_cut
        self._stopwords = set( stopwords.words('english')
                             + list(punctuation) 
                             + [u"'s",'"']
                             + ['‘','’','-','–','“','”'] ) # these were added as a result of a successful run!
        self._freq_output = None
        
    def _compute_frequencies( self, word_sent, customStopWords=None ):
        freq = defaultdict(int)

        if customStopWords is None:
            stopwords = set( self._stopwords )
        else:
            stopwords = set( customStopWords ).union( self._stopwords )

        for sentence in word_sent:
            for word in sentence:
                if word not in stopwords:
                    freq[word] += 1
        m = float( max(freq.values()) )
        for words in freq.keys():
            freq[word] = freq[word]/m
            if freq[word] >= self._max_cut or freq[word] <= self._min_cut:
                del freq[word]
        
        self._freq_output = freq
        return freq

    def summarize( self, article, n ):
        text  = article[0]
        title = article[1]

        sentences = sent_tokenize( text )
        word_sent = [ word_tokenize(sent.lower()) for sent in sentences ]

        self._freq = self._compute_frequencies( word_sent )
        ranking    = defaultdict(int)
        
        for i, sentence in enumerate(word_sent):
            for word in sentence:
                if word in self._freq:
                    ranking[i] += self._freq[word]
                    
        sentences_index = nlargest( n, ranking, key=ranking.get )
        return [ sentences[j] for j in sentences_index ]

    def get_freq_output( self ):
        return self._freq_output
        
    

In [192]:
# a slight diversion, as we need to go to python3, and the Washington Post is now behind a paywall.
# because of twitter - we have a replacement! Thanks, Lucy..!
html = urlopen('http://www.lucygoesdating.com/2018/01/the-one-with-long-hair-part-1-soulmates/').read().decode('utf8') 
soup = BeautifulSoup( html, 'html.parser' )

article = soup.find('div', { 'class': 'entry-content' })

# this is one way to do the extract and join...
paras   = ' '.join([ p.text.replace(u'\xa0', '') for p in article.find_all('p') ])
#print( paras )

# and this is another...
text    = ' '.join(map( lambda p: p.text.replace(u'\xa0', ' ').replace('  ', ' '), article.find_all('p') ))
print( text )

# and it works! bloody marvellous.

It’s now been nearly three years since The Ex walked out, and Lucy is fucking fed up with dating apps. All she ever seems to do is spend a hundred hours a day swiping, waiting for responses from guys who can’t be arsed, and going on mediocre dates with morons with whom she has nothing in common. Lucy is about 99% convinced that dating apps are going to cause the obliteration of the entire human race. So she decides to try a different tack, and give paid-for dating websites another chance instead. After all, she met The Ex on Match.com, and that worked out ok… at least, until it didn’t. And in the first year after he left, she joined Guardian Soulmates and met two nice guys. Neither lasted long term, but at least they were good enough to play with for a few months – and she even got some actual sex into the bargain. The apps, on the other hand, have been an unmitigated disaster. In the last 18 months Lucy has probably been on close to 50 first dates, and not a single one has got anywher

In [197]:
# right, so we'll skip over the crawler function for now, given we have some source material...
# does our summarizer work?

s = FrequencySummarizer()

# I'm reluctant to even run this...
key3 = s.summarize( [text, 'Article Title'], 3 )
print( ' '.join(key3) )

Lucy is able to filter by age, height, religion, level of education, whether he has kids, even things like hair colour and eye colour – it’s a bit like one of those Build-A-Bear workshops for kids, except instead of customising a stuffed animal, Lucy is designing a toy for more adult-style cuddles. Start dropping hints early, as it were, so that later on when they’ve been dating for a while and she asks him to cut his hair before introducing him to any of her friends, it doesn’t come as quite so much of a shock…  But Jules doesn’t actually seem too bothered by Lucy’s confession. The lack of information, the lack of commitment, the inane pointless messages, the ghosting – in short, the mind-blowing numbers of total fuckboys lurking therein… Nope, she thinks, dating apps are the seventh circle of hell and I would rather chop off both my thumbs than swipe even one more time.


In [194]:
# I'm curious to know what our dictionary determined was important...
dictionary  = s.get_freq_output()
sortedwords = [(k, dictionary[k]) for k in sorted(dictionary, key=dictionary.get, reverse=True)] 
for key, value in sortedwords:
    if value > 5:
        print("%s: %s" % (key, value ))
# this actually taught us that we need to remove some more punctuation... 
# update: and holy shit, it actually affected the summary above! good stuff...

lucy: 27
hair: 11
dating: 10
long: 9
profile: 8
jules: 8
apps: 6
man: 6
message: 6
