In [1]:
print('Text summary with NLTK: Word frequency Algorithm')

Text summary with NLTK: Word frequency Algorithm


In [2]:
import sys
print(sys.executable)

/opt/anaconda3/bin/python


In [3]:
!pip install --upgrade pip



In [4]:
# web scrapping dependencies
!pip install beautifulsoup4



In [5]:
# dependencies for text processing

In [6]:
!pip install nltk



In [7]:
import bs4 as bs
import urllib.request
import re
import nltk

In [8]:
# fetch data content from url and read
fetched_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/A_Game_of_Thrones')

article = fetched_data.read()
print('first {0} characters of the article: {1}'.format(100, article[:100]))

first 100 characters of the article: b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title'


In [9]:
# parse article with either html parser/lxml/xml
parsed_article = bs.BeautifulSoup(article,'html.parser')

In [10]:
# select all <p> tags, discard the rest
article_paragraphs = parsed_article.find_all('p')

In [11]:
print(type(article_paragraphs))

<class 'bs4.element.ResultSet'>


In [12]:
article_content = ''.join([paragraph.text for paragraph in article_paragraphs])
print('first {0} characters of the article_conetnt: {1}'.format(100, article_content[:100]))

first 100 characters of the article_conetnt: A Game of Thrones is the first novel in A Song of Ice and Fire, a series of fantasy novels by the Am


In [13]:
# replace sqaure brackets w/w/o references: [1],[] w/ extra spaces
article_content = re.sub(r'\[[0-9]*\]', ' ', article_content)
# replace extra spaces
article_content = re.sub(r'\s+', ' ', article_content)
# replace digits and special char w/ extra spaces
formatted_article_content = re.sub('[^a-zA-Z]', ' ', article_content )
formatted_article_content = re.sub(r'\s+', ' ', article_content)

print('first {0} characters of the article_conetnt: {1}'.format(100, article_content[:100]))
print('last {0} characters of the article_conetnt: {1}'.format(100, article_content[-100:]))

first 100 characters of the article_conetnt: A Game of Thrones is the first novel in A Song of Ice and Fire, a series of fantasy novels by the Am
last 100 characters of the article_conetnt: mber 5, 2019, the BBC News listed A Game of Thrones on its list of the 100 most influential novels. 


In [14]:
# get sentences w/ nltk
sentence_list = nltk.sent_tokenize(article_content)
stopwords = nltk.corpus.stopwords.words('english')

In [15]:
print('first {0} sentences of the sentence_list: {1}'.format(5, sentence_list[:5]))
# print(len(sentence_list))

first 5 sentences of the sentence_list: ['A Game of Thrones is the first novel in A Song of Ice and Fire, a series of fantasy novels by the American author George R. R. Martin.', 'It was first published on August 1, 1996.', 'The novel won the 1997 Locus Award and was nominated for both the 1997 Nebula Award and the 1997 World Fantasy Award.', 'The novella Blood of the Dragon, comprising the Daenerys Targaryen chapters from the novel, won the 1997 Hugo Award for Best Novella.', 'In January 2011, the novel became a New York Times Bestseller and reached No.']


In [16]:
# find the frequency of occurrence of each word excluding stop words
def create_word_frequency_table(text) -> dict:
    # stopwords to ignore
    stop_words = set(stopwords)
    # get words from the provided text string
    words = nltk.word_tokenize(text)
    # normalise w/ stemmer to avoid redundant meanings
    ps = nltk.stem.PorterStemmer()
    
    # create word frequencies
    # skip stop words
    # increase value/freq on each 
    # successful key search iteration
    word_frequency_table = {}
    for word in words:
        word = ps.stem(word)
        if word in stop_words:
            continue
        if word in word_frequency_table:
            word_frequency_table[word] += 1
        else:
            word_frequency_table[word] = 1
    
    return word_frequency_table

In [17]:
# calc frequency of words in each sentence, exclude stop words
# calc sentence score by adding up frequency of its words
def calc_sentence_scores(senetence_list, word_frequency_table) -> dict:
    sentence_scores = {}

    for sentence in senetence_list:
        
        num_of_words = len(nltk.word_tokenize(sentence))
        
        for word in word_frequency_table:
            if word in sentence.lower():
                if sentence[:10] in sentence_scores:
                    sentence_scores[sentence[:10]] += word_frequency_table[word]
                else:
                    sentence_scores[sentence[:10]] = word_frequency_table[word]

    return sentence_scores

In [18]:
# calc avg sentence score of the available scores
def find_threshold_score(sentence_scores) -> int:
    return sum(sentence_scores.values()) / len(sentence_scores)

In [19]:
# concatenate sentences whose score is greater than the provided threshold
def generate_summary(sentence_list, sentence_scores, threshold):
    return ''.join([sentence + ' ' for sentence in sentence_list if sentence[:10] in sentence_scores and sentence_scores[sentence[:10]] > threshold])

In [20]:
word_frequency_table = create_word_frequency_table(formatted_article_content)
sentence_scores = calc_sentence_scores(sentence_list, word_frequency_table)
threshold = find_threshold_score(sentence_scores)
print('Summary:\n', generate_summary(sentence_list, sentence_scores, threshold))

Summary:
 A Game of Thrones is the first novel in A Song of Ice and Fire, a series of fantasy novels by the American author George R. R. Martin. The novel won the 1997 Locus Award and was nominated for both the 1997 Nebula Award and the 1997 World Fantasy Award. In the novel, recounting events from various points of view, Martin introduces the plot-lines of the noble houses of Westeros, the Wall, and the Targaryens. The novel has inspired several spin-off works, including several games. It is also the namesake and basis for the first season of Game of Thrones, an HBO television series that premiered in April 2011. A Game of Thrones follows three principal storylines simultaneously. Upon the death of Lord Jon Arryn, the principal advisor to King Robert Baratheon, Robert recruits his childhood friend Eddard "Ned" Stark, now Lord of the North, to replace Arryn as Hand of the King, and to betroth his daughter Sansa to Robert's son Joffrey. Shortly thereafter, Ned's son Bran discovers Cerse