In [2]:
# Objective: Take in the URL of a newspaper article (from the Washington Post) [aside: I'm so trying this with the DM as well], 
# and automatically summarise it in three sentences.
#
# HOW? Using the NLTK, of course!

In [4]:
# So what steps will we take?
#
# 1. Download the contents of the URL.
# 2. Extract the Article from all the other HTML that is in the webpage.
# 3. Figure out which 3 sentences in the article are the most important.
#    i.  Find the most common words in the article, excluding stopwords.
#    ii. Find the sentences in which those most common words occur most often.

In [8]:
from collections import defaultdict
from heapq import nlargest
from string import punctuation
from urllib.request import urlopen

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

In [10]:
def getWashingtonPostText( url, token ):
    """Download a web page and return its article text and page title.

    Parameters
    ----------
    url : str
        Address of the page to download.
    token : str
        Name of the HTML tag that wraps the article body; it is passed
        straight to ``find_all``.

    Returns
    -------
    tuple
        ``(text, title)``. ``text`` is the concatenated text of every
        ``token`` element (an empty string when the page has none) and
        ``title`` is the text of the page's ``<title>`` element, or ``None``
        when the page has no title. ``(None, None)`` is returned when the
        page cannot be downloaded or decoded as UTF-8.
    """
    try:
        # The context manager closes the HTTP response even if read/decode fails.
        with urlopen(url) as response:
            page = response.read().decode('utf8')
    except Exception:
        # Best effort: any download or decoding problem is reported as "no
        # article". Unlike a bare ``except:``, this lets KeyboardInterrupt
        # and SystemExit propagate.
        return( None, None )

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup( page, 'html.parser' )

    text = ""
    # find_all() returns an empty result (never None) when nothing matches,
    # so test for emptiness rather than comparing against None.
    containers = soup.find_all(token)
    if containers:
        text = ''.join( element.text for element in containers )
        # Second pass over the extracted text: if it still contains <p>
        # markup, keep only the paragraph contents. The paragraphs must be
        # looked up in soup2 (the extracted article), not in the whole page.
        soup2 = BeautifulSoup( text, 'html.parser' )
        paragraphs = soup2.find_all('p')
        if paragraphs:
            text = ''.join( p.text for p in paragraphs )

    # A page without a <title> element has soup.title set to None.
    title = soup.title.text if soup.title is not None else None
    return text, title

In [13]:
class FrequencySummarizer:
    """Extractive summarizer that scores sentences by the frequency of their words.

    Word counts are normalised by the count of the most frequent word. Words
    whose normalised frequency is ``>= max_cut`` or ``<= min_cut`` are
    discarded, and each sentence is scored by summing the frequencies of the
    remaining words it contains.
    """

    def __init__( self, min_cut=0.1, max_cut=0.9 ):
        """Store the frequency cut-offs and build the default stop-word set.

        Parameters
        ----------
        min_cut : float
            Words with a normalised frequency at or below this value are dropped.
        max_cut : float
            Words with a normalised frequency at or above this value are dropped.
        """
        self._min_cut = min_cut
        self._max_cut = max_cut
        self._stopwords = set( stopwords.words('english')
                             + list(punctuation)
                             + [u"'s", '"'] )

    def _compute_frequencies( self, word_sent, customStopWords=None ):
        """Return the normalised frequency of each significant word.

        Parameters
        ----------
        word_sent : iterable of iterable of str
            The tokenised sentences, one sequence of words per sentence.
        customStopWords : iterable of str, optional
            Extra words to ignore in addition to the default stop words.

        Returns
        -------
        collections.defaultdict
            Maps each kept word to ``count / max_count``. Words that are stop
            words, or whose normalised frequency is ``>= max_cut`` or
            ``<= min_cut``, are absent. Empty when no word is counted.
        """
        freq = defaultdict(int)

        if customStopWords is None:
            excluded = self._stopwords
        else:
            excluded = set( customStopWords ).union( self._stopwords )

        for sentence in word_sent:
            for word in sentence:
                if word not in excluded:
                    freq[word] += 1

        # Nothing was counted (empty input or only stop words): max() below
        # would raise ValueError on an empty sequence.
        if not freq:
            return freq

        m = float( max(freq.values()) )
        # Iterate over a snapshot of the keys because entries are deleted
        # from the dict inside the loop.
        for word in list(freq):
            freq[word] = freq[word]/m
            if freq[word] >= self._max_cut or freq[word] <= self._min_cut:
                del freq[word]

        return freq

    def summarize( self, article, n ):
        """Return up to ``n`` of the highest-scoring sentences of an article.

        Parameters
        ----------
        article : sequence
            ``article[0]`` is the article text. Any further elements (such as
            a title) are not used.
        n : int
            Maximum number of sentences to return.

        Returns
        -------
        list of str
            The selected sentences, highest score first. Sentences that
            contain no scored word are never selected, so fewer than ``n``
            sentences may be returned. An empty list is returned when the
            article text is empty or ``None``.
        """
        text = article[0]
        if not text:
            # Covers an article of (None, None) as well as an empty text.
            self._freq = defaultdict(int)
            return []

        sentences = sent_tokenize( text )
        word_sent = [ word_tokenize(sent.lower()) for sent in sentences ]

        self._freq = self._compute_frequencies( word_sent )
        ranking    = defaultdict(int)

        for i, sentence in enumerate(word_sent):
            for word in sentence:
                if word in self._freq:
                    ranking[i] += self._freq[word]

        sentences_index = nlargest( n, ranking, key=ranking.get )
        # Map the winning indices back to the original (non-lowercased) sentences.
        return [ sentences[j] for j in sentences_index ]
                
        
    