# Text Summarization - Support Both English & Chinese Inputs
### www.KudosData.com
#### By: Sam GU Zhan
#### March, 2017

### A possible error: Failed loading english.pickle with nltk.data.load

Resource 'tokenizers/punkt/english.pickle' not found.  Please use the NLTK Downloader to obtain the resource:  >>>

http://stackoverflow.com/questions/4867197/failed-loading-english-pickle-with-nltk-data-load

In [13]:
# import nltk
# nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     G:\Tool_PGM\Anaconda3\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest

In [2]:
# Candidate frequency thresholds, named after FrequencySummarizer's keyword
# arguments. The upper bound was previously assigned to `min_cut` as well,
# which silently overwrote the lower bound.
# NOTE(review): the cells shown here never pass these to the summarizer,
# which uses its own defaults (0.1 / 0.9) — confirm whether they are needed.
min_cut = 0.01
max_cut = 0.99

In [3]:
class FrequencySummarizer:
    """Extractive summarizer that scores sentences by normalized word frequency."""

    def __init__(self, min_cut=0.1, max_cut=0.9):
        """
        Initialize the text summarizer.

        Words whose normalized frequency is lower than or equal to min_cut,
        or higher than or equal to max_cut, will be ignored.
        English stopwords and punctuation characters are always ignored.
        """
        self._min_cut = min_cut
        self._max_cut = max_cut
        self._stopwords = set(stopwords.words('english') + list(punctuation))

    def _compute_frequencies(self, word_sent):
        """
        Compute the normalized frequency of each word.

        Input:
            word_sent, a list of sentences already tokenized.
        Output:
            freq, a dictionary where freq[w] is the count of w divided by
            the count of the most frequent word. Stopwords and words
            outside the (min_cut, max_cut) range are left out. The
            dictionary is empty when no countable word is found.
        """
        freq = defaultdict(int)
        for s in word_sent:
            for word in s:
                if word not in self._stopwords:
                    freq[word] += 1

        if not freq:
            # No input, or nothing but stopwords/punctuation: max() below
            # would raise ValueError on an empty sequence.
            return freq

        # Frequency normalization and filtering.
        m = float(max(freq.values()))
        # Iterate over a snapshot of the keys because entries are deleted
        # from the dictionary inside the loop.
        for w in list(freq):
            freq[w] = freq[w] / m
            if freq[w] >= self._max_cut or freq[w] <= self._min_cut:
                del freq[w]
        return freq

    def summarize(self, text, n):
        """
        Return a list of at most n sentences which represent the summary
        of text, ordered from the highest to the lowest score.

        A sentence is scored by summing the frequencies of its retained
        words. Sentences containing no retained word are never ranked, so
        fewer than n sentences may be returned.

        Raises AssertionError when n is larger than the number of
        sentences found in text.
        """
        sents = sent_tokenize(text)
        assert n <= len(sents)
        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i, sent in enumerate(word_sent):
            for w in sent:
                if w in self._freq:
                    ranking[i] += self._freq[w]
        sents_idx = self._rank(ranking, n)
        return [sents[j] for j in sents_idx]

    def _rank(self, ranking, n):
        """Return the keys of the n entries of ranking with the highest score."""
        return nlargest(n, ranking, key=ranking.get)

In [4]:
# import urllib2
from urllib.request import urlopen
from bs4 import BeautifulSoup

def get_only_text(url):
    """
    Return the title and the text of the article at the specified url.

    The page is downloaded, decoded as UTF-8 and parsed with the "lxml"
    parser. The text is the content of every <p> element joined by single
    spaces.

    Returns a (title, text) tuple; title is '' when the page has no
    <title> element.
    """
    # The context manager closes the HTTP response even if read() fails.
    with urlopen(url) as response:
        page = response.read().decode('utf8')
    soup = BeautifulSoup(page, "lxml")
    text = ' '.join(p.text for p in soup.find_all('p'))
    # soup.title is None for a page without a <title> element.
    title = soup.title.text if soup.title is not None else ''
    return title, text

In [5]:
# Download the BBC News RSS feed; the context manager closes the HTTP
# response once its body has been read.
with urlopen('http://feeds.bbci.co.uk/news/rss.xml') as response:
    feed_xml = response.read()
feed = BeautifulSoup(feed_xml.decode('utf8'), "lxml")
# The text of each <guid> element is kept as an article URL to summarize.
to_summarize = [guid.text for guid in feed.find_all('guid')]

In [6]:
# Display the article URLs collected from the feed (notebook cell output).
to_summarize

['http://www.bbc.co.uk/news/world-us-canada-39388815',
 'http://www.bbc.co.uk/news/uk-39377883',
 'http://www.bbc.co.uk/news/world-middle-east-39383989',
 'http://www.bbc.co.uk/news/uk-politics-39380606',
 'http://www.bbc.co.uk/news/entertainment-arts-39355410',
 'http://www.bbc.co.uk/news/world-europe-39383988',
 'http://www.bbc.co.uk/news/business-39383535',
 'http://www.bbc.co.uk/news/education-39381899',
 'http://www.bbc.co.uk/news/uk-england-hereford-worcester-39380779',
 'http://www.bbc.co.uk/news/uk-england-stoke-staffordshire-39379053',
 'http://www.bbc.co.uk/news/world-middle-east-39382250',
 'http://www.bbc.co.uk/news/uk-england-kent-39377133',
 'http://www.bbc.co.uk/news/uk-39385125',
 'http://www.bbc.co.uk/news/blogs-the-papers-39388352',
 'http://www.bbc.co.uk/news/uk-39355108',
 'http://www.bbc.co.uk/news/magazine-39346157',
 'http://www.bbc.co.uk/sport/39360295',
 'http://www.bbc.co.uk/news/uk-england-lancashire-39386044',
 'http://www.bbc.co.uk/news/world-us-canada-3937

### Note: The step below requires internet access!

In [8]:
# Print a two-sentence summary for each of the first two feed articles,
# preceded by a separator line and the article title.
fs = FrequencySummarizer()
for article_url in to_summarize[:2]:
    title, text = get_only_text(article_url)
    print('----------------------------------', title, sep='\n')
    for s in fs.summarize(text, 2):
        print(f'* {s}')

----------------------------------
Trump blames Democrats for failed healthcare bill - BBC News
* Speaking to the Washington Post, Mr Trump said "We couldn't get one Democratic vote, and we were a little bit shy, very little, but it was still a little bit shy, so we pulled it."
* "We have to let Obamacare go its own way for a little while," he told reporters at the Oval Office, adding that if the Democrats were "civilised and came together" the two parties could work out a "great healthcare bill".
----------------------------------
London attack: Police appeal for information on Khalid Masood - BBC News
* Those still in custody are: Earlier, in appealing for information from the public, Mark Rowley, of the Metropolitan Police, said they would investigate whether Masood "acted totally alone inspired by terrorist propaganda, or if others have encouraged, supported or directed him".
* Mr Toumi said Masood had been "friendly and smiley", while the hotel receptionist noted on their system h