In [55]:
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest
import urllib.request
from bs4 import BeautifulSoup

In [59]:
class FrequencySummarizer:
    """Extractive summarizer that scores sentences by normalized word frequency.

    Words are counted across the whole text (stop words and punctuation are
    ignored), counts are divided by the highest count, and words whose
    normalized frequency falls outside the open interval
    ``(min_cut, max_cut)`` are discarded. A sentence's score is the sum of the
    remaining frequencies of the words it contains.
    """

    def __init__(self, min_cut=0.1, max_cut=0.9):
        """Store the frequency cut-offs and build the set of ignored tokens.

        Args:
            min_cut: words with normalized frequency <= this value are dropped.
            max_cut: words with normalized frequency >= this value are dropped.
        """
        self._min_cut = min_cut
        self._max_cut = max_cut
        # English stop words plus every single punctuation character.
        self._stopwords = set(stopwords.words('english') + list(punctuation))

    def _compute_frequencies(self, word_sent):
        """Return the normalized frequency of each retained word.

        Args:
            word_sent: iterable of sentences, each an iterable of word tokens.

        Returns:
            A ``defaultdict`` mapping word -> count / highest count, holding
            only words whose normalized frequency is strictly between
            ``min_cut`` and ``max_cut``. Empty when no countable word exists.
        """
        freq = defaultdict(int)
        for sentence in word_sent:
            for word in sentence:
                if word not in self._stopwords:
                    freq[word] += 1

        # No countable words (empty text, or only stop words/punctuation):
        # max() below would raise ValueError on an empty sequence.
        if not freq:
            return freq

        highest = float(max(freq.values()))
        # Iterate over a snapshot of the keys because entries are deleted
        # while walking the mapping.
        for word in list(freq):
            freq[word] = freq[word] / highest
            if freq[word] >= self._max_cut or freq[word] <= self._min_cut:
                del freq[word]
        return freq

    def summarize(self, text, n):
        """Return the ``n`` highest-scoring sentences of ``text``.

        Args:
            text: the document to summarize.
            n: number of sentences wanted; must not exceed the number of
                sentences found in ``text``.

        Returns:
            A list of ``n`` original sentences ordered from highest to lowest
            score (ties keep document order).

        Raises:
            AssertionError: if ``n`` is larger than the number of sentences.
        """
        sents = sent_tokenize(text)
        assert n <= len(sents), (
            "requested %d sentences but the text only has %d" % (n, len(sents))
        )

        # Scoring is case-insensitive; the returned sentences keep their case.
        word_sent = [word_tokenize(s.lower()) for s in sents]

        # Kept on the instance, as before, so it can be inspected afterwards.
        self._freq = self._compute_frequencies(word_sent)

        # Seed every sentence with a zero score so that sentences without any
        # retained word can still be selected; otherwise fewer than n
        # sentences could be returned.
        ranking = dict.fromkeys(range(len(sents)), 0)
        for i, words in enumerate(word_sent):
            for w in words:
                if w in self._freq:
                    ranking[i] += self._freq[w]

        sents_idx = nlargest(n, ranking, key=ranking.get)
        return [sents[j] for j in sents_idx]

In [64]:
def get_only_text_washington_post_url(url):
    """Download a page and return its title and article text.

    Args:
        url: address of the page to fetch.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the text of the page's
        ``<title>`` element, or ``''`` when the page has none. ``text`` is the
        space-joined text extracted from the page's ``<article>`` elements.
    """
    # Close the HTTP response deterministically once the body has been read.
    with urllib.request.urlopen(url) as response:
        page = response.read().decode('utf8')
    soup = BeautifulSoup(page, "lxml")

    # Tag-free text of every <article> element, joined by single spaces.
    text = ' '.join(map(lambda p: p.text, soup.find_all('article')))

    # NOTE(review): `text` has already had its tags stripped by `.text`, so any
    # <p> element found in this second parse would come from the parser rather
    # than from the page. Presumably lxml wraps the bare text in one; confirm
    # that this is the intended way to obtain the body.
    soup2 = BeautifulSoup(text, "lxml")
    text = ' '.join(map(lambda p: p.text, soup2.find_all('p')))

    # A page without a <title> element yields soup.title == None.
    title = soup.title.text if soup.title is not None else ''
    return title, text

In [65]:
# Washington Post article to summarize; replace with the URL of another article.
someUrl = "https://www.washingtonpost.com/politics/trump-sought-release-of-classified-russia-memo-putting-him-at-odds-with-justice-department/2018/01/27/a00f2a4c-02bb-11e8-9d31-d72cf78dbeee_story.html?hpid=hp_hp-top-table-main_trumpreconstruct546pm%3Ahomepage%2Fstory&utm_term=.9e123bc03dc7"
print("\nURL :", someUrl, "\n", sep="")

# Fetch the page; the helper's result is printed as returned.
textOfUrl = get_only_text_washington_post_url(someUrl)
print("\nText :", textOfUrl, "\n", sep="")

fs = FrequencySummarizer()
# Summarize element 1 of the fetched result; 3 is the number of sentences requested.
summary = fs.summarize(textOfUrl[1], 3)
print("\nSummary :", summary, "\n", sep="")


URL :https://www.washingtonpost.com/politics/trump-sought-release-of-classified-russia-memo-putting-him-at-odds-with-justice-department/2018/01/27/a00f2a4c-02bb-11e8-9d31-d72cf78dbeee_story.html?hpid=hp_hp-top-table-main_trumpreconstruct546pm%3Ahomepage%2Fstory&utm_term=.9e123bc03dc7


Text :('Trump sought release of classified Russia memo, putting him at odds with Justice Department - The Washington Post', 'On Wednesday, as Republicans were clamoring to make public a secret document they think will undercut the investigation into Russian meddling, President Trump made clear his desire: Release the memo. Trump’s directive was at odds with his own Justice Department, which had warned that releasing the classified memo written by congressional Republicans would be “extraordinarily reckless” without an official review. Nevertheless, White House Chief of Staff John F. Kelly relayed the president’s view to Attorney General Jeff Sessions — although the decision to release the document ultim