## The purpose of this exercise is to download an article from a Washington Post URL and use natural language processing (NLP) to summarize it in three sentences.

In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize # Split text into sentences, and sentences into words.
from nltk.corpus import stopwords # Used to filter out common words ("and", "but", "I", "the", etc.)
from string import punctuation # Used to filter out punctuation.
from collections import defaultdict # Dictionary that creates entries for new keys instead of throwing a KeyError.
from heapq import nlargest # Returns the 'n' largest items from a list based on a method.

# Uses ntlk to summarize a set of text based on word frequency.
class FrequencySummarizer:
    
    def __init__(self, min_cut=0.1, max_cut=0.9):
        # Words with a frequency term outside of the range between min_cut and max_cut will be ignored.
        self._min_cut = min_cut
        self._max_cut = max_cut
        # Common words and symbols to be ignored for summarization.
        self._stopwords = set(stopwords.words('english')+list(punctuation))
    
    # Accepts a list of sentences. 
    # Returns a dictionary of word:frequency pairs.
    def _compute_frequencies(self, word_sent):
        # Create a defaultdict (see import at top) with 0 as the default value.
        freq = defaultdict(int)
        # Loop through each word of each sentence to count their frequencies.
        for sentence in word_sent:
            for word in sentence:
                if word not in self._stopwords:
                    # Note: Using the regular dictionary would require checking if the key exists first.
                    # because freq is a defaultdict, it will create an entry the first time a word is added.
                    freq[word] += 1
        # Normalize the frequencies so they fall between 0 and 1 by dividing the frequency of every word by
        # the largest frequency in the dictionary.
        max_n = float(max(freq.values()))
        for word in list(freq.keys()):
            freq[word] = freq[word]/max_n 
            # Frequencies outside of max_cut and min_cut are also filtered out in this step.
            if freq[word] >= self._max_cut or freq[word] <= self._min_cut:
                del freq[word]
        # Return the dictionary of frequencies.
        return freq
    
    # Accepts text and the number of sentences the summary should contain.
    # Returns a summary of the text.
    def summarize(self, text, n):
        # Split the text into sentences (see import at top)
        sentences = sent_tokenize(text)
        # Sanity check to make sure the summary is less than the length of the article.
        assert n <= len(sentences)
        # Compile each word of each sentence into a single dictionary.
        # For consistency, they're all converted to lowercase.
        word_sent = [word_tokenize(sentence.lower()) for sentence in sentences]
        # Compute the word frequencies (see above method) and store in member variable _freq.
        self._freq = self._compute_frequencies(word_sent)
        # Create a defaultdict to rank sentences by frequency.
        rankings = defaultdict(int)
        # First, enumerate each of the stentences for easier ranking.
        for i,sentence in enumerate(word_sent):
            # For each word in a sentence...
            for word in sentence:
                # ...if that word wasn't a stopword...
                if word in self._freq:
                    # ...add its frequency to the ranking.
                    rankings[i] += self._freq[word]
        # Fetch the indexes of the largest n sentences.
        # nlargest (see import at top) is given ranking.get() to know which values to rank.
        sentences_index = nlargest(n, rankings, key=rankings.get)
        # Return a list containing the top sentences.
        return [sentences[j] for j in sentences_index]

In [2]:
import urllib.request # Downloads URLs.
from bs4 import BeautifulSoup # Parses html pages in an easy-to-use way.

# Accepts a URL for a Washington Post article.
# Returns a title,body pair containing only text from the article.
def get_text_from_wapo_url(url):
    # Download the URL. Assumed to use utf-8 encoding.
    page = urllib.request.urlopen(url).read().decode('utf8')
    # Initialize a BeautifulSoup (see imports above) with url's text.
    soup = BeautifulSoup(page, 'html.parser')
    # WaPo wraps all of their articles in <article> tags, which is used
    # to search for them here via BeautifulSoup.
    text = ' '.join(map(lambda p: p.text, soup.find_all('article')))
#     print(text)
    # Since other stuff is usually included between the <article> tags,
    # another pass is necessary to filter to just article text. This is
    # done by searching that text for <p> (paragraph) tags.
#     soup2 = BeautifulSoup(text, 'html.parser')
#     text = ' '.join(map(lambda p: p.text, soup2.find_all('p')))
#     print(text)
    #Return the title of the article and its text.
    return soup.title.text, text

In [6]:
someUrl = input()
urlText = get_text_from_wapo_url(someUrl)
fs = FrequencySummarizer()
summary = fs.summarize(urlText[1], 3)
for sentence in summary:
    print(sentence)

https://www.washingtonpost.com/opinions/global-opinions/after-a-mere-25-years-the-triumph-of-the-west-is-over/2016/12/01/deebe24c-b7f7-11e6-959c-172c82123976_story.html?hpid=hp_no-name_opinion-card-d%3Ahomepage%2Fstory
And even as Europe tires of the sanctions imposed on Russia for its rape of Ukraine, President Obama’s much-touted “isolation” of Russia has ignominiously dissolved, as our secretary of state repeatedly goes cap in hand to Russia to beg for mercy in Syria.
Read more here:   Fred Hiatt: The U.S. steps back from the world stage, and the consensus for leadership dissolves   Anne Applebaum: After Brussels, the West must reject dangerous isolationism   Jim Hoagland: Obama has ignored our European allies for too long   David Ignatius: What President Trump’s foreign policy will look like    
Obama ordered retreat because he’s always felt the U.S. was not good enough for the world, too flawed to have earned the moral right to be the world hegemon.
