## Python 3 version of the Pluralsight course: Getting Started with Natural Language Processing with Python

# Part 1: Autosummarise a Washington Post news article using rule-based methods

In [22]:
import requests
from bs4 import BeautifulSoup

In [38]:
def getTextWaPo(url):
    r = requests.get(url)
    r.encoding = 'utf-8'
    html = r.text
    soup = BeautifulSoup(html,"lxml")
    text = " ".join(map(lambda p:p.text, soup.find_all('article')))
    return text

url = "https://www.washingtonpost.com/technology/2019/05/02/facebook-bans-extremist-leaders-including-louis-farrakhan-alex-jones-milo-yiannopoulos-being-dangerous/?utm_term=.c8fe12bd52c7"
text = getTextWaPo(url)

**soup.find()** only returns the first element that matches the **article** tag
    
**soup.findall()** returns all

In [39]:
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from string import punctuation

In [40]:
sents = sent_tokenize(text)
word_sent = word_tokenize(text.lower())

# remove stopwords
_stopwords = set(stopwords.words("english") + list(punctuation)+['’','“','”','—',"window.powaBoot"])
word_sent = [word for word in word_sent if word not in _stopwords]

In [41]:
# find the most frequent words in the article
from nltk.probability import FreqDist
freq = FreqDist(word_sent)

In [42]:
from heapq import nlargest
# use nlargest to find the top 10 most frequent keywords in the article
nlargest(10,freq,key=freq.get)

['facebook',
 'said',
 'hate',
 'banned',
 'jones',
 'white',
 'company',
 'infowars',
 'speech',
 'platforms']

In [43]:
from collections import defaultdict
from heapq import nlargest
ranking  = defaultdict(int)

# find the most sentences with the most frequent words, and store result into a defaultdict where the keys are the indices of the sentences, and the values are the significance scores for the sentences (which is the sum of the importance of words in that sentence)
for i, sent in enumerate(sents):
    for w in word_tokenize(sent.lower()):
        if w in freq:
            ranking[i] +=freq[w]
            
# find the top 3 sentences from the ranking dictionary

sent_index = nlargest(3, ranking,key=ranking.get)
sent_index
[sents[j] for j in sorted(sent_index)]


['Facebook said on Thursday it has permanently banned several far-right and anti-Semitic figures and organizations, including Nation of Islam leader Louis Farrakhan, Infowars host Alex Jones, Milo Yiannopoulos and Laura Loomer, for being “dangerous,” a sign that the social network is more aggressively enforcing its hate speech policies under pressure from civil rights groups.',
 'Angelo Carusone, president of Media Matters, an organization that has long advocated for more enforcement against white supremacists, said Facebook has been lax against enforcing its policies against hate speech on these accounts because the company doesn’t want to deal with the right-wing blowback.',
 'Madihha Ahussain, special counsel for anti-Muslim bigotry with the advocacy group Muslim Advocates, said that individuals like Loomer, Jones and Yiannopoulos have used social media platforms to broadcast dangerous hate speech and conspiracies targeting Muslims, Jews and others.']

In [44]:
# putting everything together in one function:
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from string import punctuation
from collections import defaultdict
from heapq import nlargest

def summarize(text,n):
    sents = sent_tokenize(text)
    
    assert n <= len(sents) # this checks whether the number of summary lines is smaller than the # of sentences in the article
    word_sent = word_tokenize(text.lower())
    _stopwords = set(stopwords.words("english") + list(punctuation)+['’','“','”','—'])
    
    word_sent = [word for word in word_sent if word not in _stopwords]
    freq = FreqDist(word_sent)
    
    ranking = defaultdict(int)
    
    for i, sent in enumerate(sents):
        for w in word_tokenize(sent.lower()):
            if w in freq:
                ranking[i]+= freq[w]
    
    sent_index = nlargest(4, ranking,key=ranking.get)
    return [sents[j] for j in sorted(sent_index)]
    
    

In [45]:
summarize(text,3)

['Facebook said on Thursday it has permanently banned several far-right and anti-Semitic figures and organizations, including Nation of Islam leader Louis Farrakhan, Infowars host Alex Jones, Milo Yiannopoulos and Laura Loomer, for being “dangerous,” a sign that the social network is more aggressively enforcing its hate speech policies under pressure from civil rights groups.',
 'Facebook had removed the accounts, fan pages, and groups affiliated with these individuals after it reevaluated the content that they had posted previously, or had reexamined their activities outside of Facebook, the company said.',
 'Angelo Carusone, president of Media Matters, an organization that has long advocated for more enforcement against white supremacists, said Facebook has been lax against enforcing its policies against hate speech on these accounts because the company doesn’t want to deal with the right-wing blowback.',
 'Madihha Ahussain, special counsel for anti-Muslim bigotry with the advocacy g