# Part 1: Autosummarise a Washington Post news article using rule-based methods

In [3]:
import requests
from bs4 import BeautifulSoup

In [17]:
def getTextWaPo(url):
    r = requests.get(url)
    r.encoding = 'utf-8'
    html = r.text
    soup = BeautifulSoup(html,"lxml")
    text = " ".join(map(lambda p:p.text, soup.find_all('article')))
    return text

url = "https://www.washingtonpost.com/politics/trump-punts-vote-on-health-care-bill-until-after-next-years-elections/2019/04/02/068305d2-552c-11e9-8ef3-fbd41a2ce4d5_story.html?utm_term=.843f517b2f18"
getTextWaPo(url)

'      By  John Wagner and          John Wagner National reporter leading The Post\'s breaking political news team  Email  Bio  Follow         Erica Werner          Erica Werner Congressional reporter focusing on economic policy  Email  Bio  Follow         April 2 at 3:02 PM  President Trump abandoned plans to press for a vote on a bill to replace the Affordable Care Act ahead of next year’s elections following a conversation with Senate Majority Leader Mitch McConnell, the Kentucky Republican said Tuesday. McConnell told reporters that he and Trump had “a good conversation” Monday afternoon in which he said that Senate Republicans had no intention of trying to overhaul President Obama’s signature health-care law during a campaign season — a move many in the GOP saw as politically perilous, given that the issue helped Democrats in last year’s midterm elections. “I made it clear to him we were not going to be doing that in the Senate,” McConnell said, also pointing out the difficulty in

In [14]:
# soup.find() only returns the first element that matches the <article> tag
# soup.findall() returns all

# there is one paragraph of html code here: I should figure out a way to remove it

# if(typeof window.powaBoot==="function")window.powaBoot();else{window.powaBootBoot=window.powaBootBoot||function(){if(null===document.querySelector(\'script[src*\\x3d"powaBoot.js"]\')){var script=document.createElement("script");var scripts=document.getElementsByTagName("script")[0];script.src="https://d1pz6dax0t5mop.cloudfront.net/v/1.5.6/powaBoot.js";script.async=true;scripts.parentNode.insertBefore(script,scripts);window.havePowaBoot=true}};window.powaBootBoot()}; 

In [18]:
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from string import punctuation

In [39]:
sents = sent_tokenize(text)
word_sent = word_tokenize(text.lower())

# remove stopwords
_stopwords = set(stopwords.words("english") + list(punctuation)+['’','“','”','—'])
word_sent = [word for word in word_sent if word not in _stopwords]

In [40]:
# find the most frequent words in the article
from nltk.probability import FreqDist
freq = FreqDist(word_sent)

FreqDist({'trump': 22, 'said': 19, 'senate': 11, 'republicans': 10, 'house': 10, 'president': 9, 'would': 9, '—': 8, "''": 8, 'elections': 7, ...})

In [41]:
from heapq import nlargest
# use nlargest to find the top 10 most frequent keywords in the article
nlargest(10,freq,key=freq.get)

['trump',
 'said',
 'senate',
 'republicans',
 'house',
 'president',
 'would',
 '—',
 "''",
 'elections']

In [45]:
from collections import defaultdict
ranking  = defaultdict(int)

# find the most sentences with the most frequent words, and store result into a defaultdict where the keys are the indices of the sentences, and the values are the significance scores for the sentences (which is the sum of the importance of words in that sentence)
for i, sent in enumerate(sents):
    for w in word_tokenize(sent.lower()):
        if w in freq:
            ranking[i] +=freq[w]
            
# find the top 4 sentences from the ranking dictionary

sent_index = nlargest(4, ranking,key=ranking.get)
sent_index
[sents[j] for j in sorted(sent_index)]


["      By  John Wagner and          John Wagner National reporter leading The Post's breaking political news team  Email  Bio  Follow         Erica Werner          Erica Werner Congressional reporter focusing on economic policy  Email  Bio  Follow         April 2 at 3:02 PM  President Trump abandoned plans to press for a vote on a bill to replace the Affordable Care Act ahead of next year’s elections following a conversation with Senate Majority Leader Mitch McConnell, the Kentucky Republican said Tuesday.",
 'McConnell told reporters that he and Trump had “a good conversation” Monday afternoon in which he said that Senate Republicans had no intention of trying to overhaul President Obama’s signature health-care law during a campaign season — a move many in the GOP saw as politically perilous, given that the issue helped Democrats in last year’s midterm elections.',
 '“It will be truly great HealthCare that will work for America.” [For Trump’s ‘Party of Healthcare,’ there is no health

In [49]:
# putting everything together in one function:
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from string import punctuation
from collections import defaultdict
from heapq import nlargest

def summarize(text,n):
    sents = sent_tokenize(text)
    
    assert n <= len(sents) # this checks whether the # of summary lines is smaller than the # of sentences in the article
    word_sent = word_tokenize(text.lower())
    _stopwords = set(stopwords.words("english") + list(punctuation)+['’','“','”','—'])
    
    word_sent = [word for word in word_sent if word not in _stopwords]
    freq = FreqDist(word_sent)
    
    ranking = defaultdict(int)
    
    for i, sent in enumerate(sents):
        for w in word_tokenize(sent.lower()):
            if w in freq:
                ranking[i]+= freq[w]
    
    sent_index = nlargest(4, ranking,key=ranking.get)
    return [sents[j] for j in sorted(sent_index)]
    
    

In [50]:
summarize(text,3)

["      By  John Wagner and          John Wagner National reporter leading The Post's breaking political news team  Email  Bio  Follow         Erica Werner          Erica Werner Congressional reporter focusing on economic policy  Email  Bio  Follow         April 2 at 3:02 PM  President Trump abandoned plans to press for a vote on a bill to replace the Affordable Care Act ahead of next year’s elections following a conversation with Senate Majority Leader Mitch McConnell, the Kentucky Republican said Tuesday.",
 'McConnell told reporters that he and Trump had “a good conversation” Monday afternoon in which he said that Senate Republicans had no intention of trying to overhaul President Obama’s signature health-care law during a campaign season — a move many in the GOP saw as politically perilous, given that the issue helped Democrats in last year’s midterm elections.',
 '“It will be truly great HealthCare that will work for America.” [For Trump’s ‘Party of Healthcare,’ there is no health

# Part 2: Classifying a text using Machine Learning

In [None]:
# Feature extraction with bag of words
# k-means clustering on themes

# Objective: to build text corpus through collecting artciles from a blog

## Step 1: create a link of all posts in this blogspot site

In [1]:
import requests
from bs4 import BeautifulSoup

def getAllDoxyDonkeyPosts(url,links):
    r = requests.get(url)
    r.encoding = "utf-8"
    html = r.text
    soup = BeautifulSoup(html,"lxml")
    for a in soup.findAll("a"): # find all links on the blog pages
        try:
            url = a["href"]
            title = a["title"]
            if title == "Older Posts":
                print(title, url)
                links.append(url)
                getAllDoxyDonkeyPosts(url,links)
        except:
            title = ""
    return links


blogURL = "https://doxydonkey.blogspot.com/"
links = []
getAllDoxyDonkeyPosts(blogURL, links)
                




Older Posts https://doxydonkey.blogspot.com/search?updated-max=2017-05-23T19:53:00-07:00&max-results=7
Older Posts https://doxydonkey.blogspot.com/search?updated-max=2017-05-14T19:02:00-07:00&max-results=7&start=7&by-date=false
Older Posts https://doxydonkey.blogspot.com/search?updated-max=2017-05-02T19:43:00-07:00&max-results=7&start=14&by-date=false
Older Posts https://doxydonkey.blogspot.com/search?updated-max=2017-04-17T19:26:00-07:00&max-results=7&start=21&by-date=false
Older Posts https://doxydonkey.blogspot.com/search?updated-max=2017-04-10T18:56:00-07:00&max-results=7&start=28&by-date=false
Older Posts https://doxydonkey.blogspot.com/search?updated-max=2017-03-30T19:57:00-07:00&max-results=7&start=35&by-date=false
Older Posts https://doxydonkey.blogspot.com/search?updated-max=2017-03-20T19:47:00-07:00&max-results=7&start=42&by-date=false
Older Posts https://doxydonkey.blogspot.com/search?updated-max=2017-03-02T17:42:00-08:00&max-results=7&start=49&by-date=false
Older Posts http

Older Posts https://doxydonkey.blogspot.com/search?updated-max=2015-05-04T20:23:00-07:00&max-results=7&start=455&by-date=false
Older Posts https://doxydonkey.blogspot.com/search?updated-max=2015-04-23T20:19:00-07:00&max-results=7&start=462&by-date=false
Older Posts https://doxydonkey.blogspot.com/search?updated-max=2015-04-14T19:40:00-07:00&max-results=7&start=469&by-date=false
Older Posts https://doxydonkey.blogspot.com/search?updated-max=2015-04-05T20:22:00-07:00&max-results=7&start=476&by-date=false
Older Posts https://doxydonkey.blogspot.com/search?updated-max=2015-03-24T20:12:00-07:00&max-results=7&start=483&by-date=false
Older Posts https://doxydonkey.blogspot.com/search?updated-max=2015-03-15T20:41:00-07:00&max-results=7&start=490&by-date=false
Older Posts https://doxydonkey.blogspot.com/search?updated-max=2015-03-03T19:30:00-08:00&max-results=7&start=497&by-date=false
Older Posts https://doxydonkey.blogspot.com/search?updated-max=2015-02-22T19:55:00-08:00&max-results=7&start=50

['https://doxydonkey.blogspot.com/search?updated-max=2017-05-23T19:53:00-07:00&max-results=7',
 'https://doxydonkey.blogspot.com/search?updated-max=2017-05-14T19:02:00-07:00&max-results=7&start=7&by-date=false',
 'https://doxydonkey.blogspot.com/search?updated-max=2017-05-02T19:43:00-07:00&max-results=7&start=14&by-date=false',
 'https://doxydonkey.blogspot.com/search?updated-max=2017-04-17T19:26:00-07:00&max-results=7&start=21&by-date=false',
 'https://doxydonkey.blogspot.com/search?updated-max=2017-04-10T18:56:00-07:00&max-results=7&start=28&by-date=false',
 'https://doxydonkey.blogspot.com/search?updated-max=2017-03-30T19:57:00-07:00&max-results=7&start=35&by-date=false',
 'https://doxydonkey.blogspot.com/search?updated-max=2017-03-20T19:47:00-07:00&max-results=7&start=42&by-date=false',
 'https://doxydonkey.blogspot.com/search?updated-max=2017-03-02T17:42:00-08:00&max-results=7&start=49&by-date=false',
 'https://doxydonkey.blogspot.com/search?updated-max=2017-02-21T19:13:00-08:00&m

## Step 2: crawl content of all posts on this site

In [None]:
def getDoxyDonkeyText(URL):
    r = requests.get(URL)
    r.encoding = "utf-8"
    html = r.text
    soup = BeautifulSoup(html,"lxml")
    divs = soup.findAll("div",{"class":"post-body"}) #find all divs on the blog post page which has the class name "post-body"
    posts = []
    for div in divs:
        for i in div.findAll("li"):
            posts.append(i)
        #posts += list(map(lambda p:p.text.encode("ascii",errors="replace").replace("?"," "),div.findAll("li")))
    return posts


allPosts = []

for link in links:
    allPosts +=getDoxyDonkeyText(link)

In [None]:
allPosts