# Text Summarization

In [1]:
import urllib.request  
import bs4 as BeautifulSoup
import nltk
import re
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize 
from heapq import nlargest

In [None]:
#nltk.download("stopwords")

In [2]:
# Find the number links by looking on Project Gutenberg in the address bar for a book.
books = {'Pride and Prejudice': '1342'}

book = books['Pride and Prejudice']

# Load text from Project Gutenberg URL
import requests
url_template = 'https://www.gutenberg.org/cache/epub/%s/pg%s.txt'

response = requests.get(url_template % (book, book), 'r')
text = response.text

# See the number of characters and the first 60 characters to confirm it is there    
print(len(text), ',', text[:60] , '...')

717572 , ï»¿The Project Gutenberg EBook of Pride and Prejudice, by Jane ...


In [3]:
import re
words = re.split('[^A-Za-z]+', text.lower())
words = list(filter(None, words)) # Remove empty strings

# Print length of list
print(len(words))

125897


In [4]:
# Parsing the URL content 
book_parsed = BeautifulSoup.BeautifulSoup(text,'html.parser')

In [5]:
#Pretify our beautiful soup book and get it back as string but easy to be read
book = book_parsed.prettify()

In [6]:
stop_words = stopwords.words('english')

In [7]:
#Function to get chapter from book
def getChapter( s, first, last ):
    try:
        start = s.index( first ) + len( first )
        end = s.index( last, start )
        return s[start:end]
    except ValueError:
        return ""

In [None]:
#The concept is that we will try to get each chapter,then tokenize the words, find the frequency of each word, build sentences from these rules.

In [9]:
#Iterate through the book chapters (61) so as to get summary of each chapter.We will store them in a dictionary with key the chapter index and value the chapter content itself.
dictChapters={'key': 'value'}

for i in range(1,61):
    chapter = getChapter(book,"Chapter " + str(i),"Chapter " + str(i+1))
    dictChapters[i] = chapter

In [11]:
word_frequencies = {}
sentence_scores = {}
counter=0
sent2score = {}
dictSummary={'key': 'value'}

for chapter_no,chapter in dictChapters.items():
    #print("******* Chapter " + str(chapter_no) + " *******")
    counter+=1
    tokens = word_tokenize(chapter)
    punctuation = punctuation + '\n'
    for word in tokens: 
        #Calculate word frequency   
        if word.lower() not in stop_words:
            if word.lower() not in punctuation:
                if word not in word_frequencies.keys():
                    word_frequencies[word] = 1
                else:
                    word_frequencies[word] += 1
    for word in word_frequencies.keys():
        word_frequencies[word] = word_frequencies[word]/max(word_frequencies.values())
    #Tokenize sentences
    sent_token = sent_tokenize(chapter)
    for sentence in sent_token:
        #Score each sentence
        for word in nltk.word_tokenize(sentence.lower()):
            if word in word_frequencies.keys():
                if len(sentence.split(' ')) < 25:
                    if sentence not in sent2score.keys():
                        sent2score[sentence] = word_frequencies[word]
                    else:
                        sent2score[sentence] += word_frequencies[word]

    # We will keep the 1/3 as sentence length
    select_length = int(len(sent_token)*0.3)

    #Use nlargest method (part of heap queue priority algorithm) to get best sentences based on frequency score 
    summary = nlargest(select_length, sent2score, key = sent2score.get)

    #Store each summary to a dictionary for later use
    dictSummary[counter] = summary

In [12]:
#Get all summary chapters one by one
for chapter,summ in dictSummary.items():
    print("******* Chapter " + str(chapter) + " *******")
    print(" ")
    print(summ)
    print(" ")

t, thin sort of inclination, I\r\nam convinced that one good sonnet will starve it entirely away."', '"Perhaps," said Darcy, "I should have judged better, had I sought an\r\nintroduction; but I am ill-qualified to recommend myself to strangers."', '"Let me first see how he behaves," said she; "it will then be early\r\nenough for expectation."', '"And that," said Mrs. Reynolds, pointing to another of the miniatures,\r\n"is my master--and very like him.', '"You are considering how insupportable it would be to pass many evenings\r\nin this manner--in such society; and indeed I am quite of your opinion.', '"That is not an unnatural surmise," said Fitzwilliam, "but it is a\r\nlessening of the honour of my cousin\'s triumph very sadly."', '"To walk three miles, or four miles, or five miles, or whatever it is,\r\nabove her ankles in dirt, and alone, quite alone!']
 
******* Chapter 58 *******
 
['"Oh!', '"Kitty has no discretion in her coughs," said her father; "she times\r\nthem ill."\r\n\r\

In [13]:
#Retrieve summary by providing the chapter number to dictSummary dictionary (Example Summary of Chapter 7)
dictSummary[7]

['"Kitty has no discretion in her coughs," said her father; "she times\r\nthem ill."\r\n\r\n"I do not cough for my own amusement," replied Kitty fretfully.',
 "The rest of the evening was spent in conjecturing how soon he would\r\nreturn Mr. Bennet's visit, and determining when they should ask him to\r\ndinner.",
 'said Lydia stoutly, "I am not afraid; for though I _am_ the\r\nyoungest, I\'m the tallest."',
 '"But I hope you will get over it, and live to see many young men of four\r\nthousand a year come into the neighbourhood."',
 '"We are not in a way to know _what_ Mr. Bingley likes," said her mother\r\nresentfully, "since we are not to visit."',
 'Miss Bennet was therefore\r\nestablished as a sweet girl, and their brother felt authorized by such\r\ncommendation to think of her as he chose.',
 '"That is very true," replied Elizabeth, "and I could easily forgive\r\n_his_ pride, if he had not mortified _mine_."',
 '"He is, indeed; but, considering the inducement, my dear Miss Eliza,\r

In [14]:
#Summary of Chapter 23
dictSummary[23]

['"Oh!',
 '"Kitty has no discretion in her coughs," said her father; "she times\r\nthem ill."\r\n\r\n"I do not cough for my own amusement," replied Kitty fretfully.',
 "The rest of the evening was spent in conjecturing how soon he would\r\nreturn Mr. Bennet's visit, and determining when they should ask him to\r\ndinner.",
 'said Lydia stoutly, "I am not afraid; for though I _am_ the\r\nyoungest, I\'m the tallest."',
 '"But I hope you will get over it, and live to see many young men of four\r\nthousand a year come into the neighbourhood."',
 '"You used us abominably ill," answered Mrs. Hurst, "running away without\r\ntelling us that you were coming out."',
 '"We are not in a way to know _what_ Mr. Bingley likes," said her mother\r\nresentfully, "since we are not to visit."',
 'Miss Bennet was therefore\r\nestablished as a sweet girl, and their brother felt authorized by such\r\ncommendation to think of her as he chose.',
 '"There is, I believe, in every disposition a tendency to some pa