## Extractive Summarization

Attempt to summarize articles by selecting a subset of words that retain the most important points. Weights the important part of sentences and uses the same to form the summary.

In [1]:
# pip install PyPDF2

In [2]:
import PyPDF2
from PyPDF2 import PdfReader
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

In [3]:
def extract_text_from_pdf(pdf_file):
    with open(pdf_file, 'rb') as pdf:
        reader = PdfReader(pdf, strict=False)
        pdf_text = []

        for page in reader.pages:
            content = page.extract_text()
            pdf_text.append(content)
    return pdf_text

In [4]:
book='Justice_Sandel.pdf'
first_page = 8

extracted_text = extract_text_from_pdf(book)
extracted_text = extracted_text[first_page:]
raw = ""
for text in extracted_text:
    raw += text
    
# print(raw)

In [5]:
first_line = "1. DOING THE RIGHT THING"
last_line = "a more promising basis for a just society."

start = raw.find(first_line)
end = raw.find(last_line)
print(end)
# print(raw[:528273])
if (start >= 0) & (end > 0):
    extracted_chapter = raw[start:end+len(last_line)]
extracted_chapter


528273


'1. DOING THE RIGHT THING\nIn the summer of 2004,\n Hurricane Charley roared out of the Gulf of Mexico and swept across Florida\nto the Atlantic Ocean. The storm claimed twenty-two lives and caused $11 billion in damage.\n1\n It also left\nin its wake a debate about price gouging.\nAt a gas station in Orlando, they were selling two-dollar bags of ice for ten dollars. Lacking power for\nrefrigerators or air-conditioning in the middle of August, many people had little choice but to pay up.\nDowned trees heightened demand for chain saws and roof repairs. Contractors offered to clear two\ntrees off a homeowner’s roof—for $23,000. Stores that normally sold small household generators for\n$250 were now asking $2,000. A seventy-seven-year-old woman fleeing the hurricane with her elderly\nhusband and handicapped daughter was charged $160 per night for a motel room that normally goes for\n$40.\n2\nMany Floridians were angered by the inflated prices. “After Storm Come the Vultures,” read a headl

In [6]:
chapter = extracted_chapter.replace("\n", " ").split(". ")
# print(chapter)
sentences = []

for sentence in chapter:
#     print(sentence)
    sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
#     print(sentences)
sentences.pop() 
print(sentences)


[['1'], ['DOING', 'THE', 'RIGHT', 'THING', 'In', 'the', 'summer', 'of', '2004,', '', 'Hurricane', 'Charley', 'roared', 'out', 'of', 'the', 'Gulf', 'of', 'Mexico', 'and', 'swept', 'across', 'Florida', 'to', 'the', 'Atlantic', 'Ocean'], ['The', 'storm', 'claimed', 'twenty-two', 'lives', 'and', 'caused', '$11', 'billion', 'in', 'damage'], ['1', '', 'It', 'also', 'left', 'in', 'its', 'wake', 'a', 'debate', 'about', 'price', 'gouging'], ['At', 'a', 'gas', 'station', 'in', 'Orlando,', 'they', 'were', 'selling', 'two-dollar', 'bags', 'of', 'ice', 'for', 'ten', 'dollars'], ['Lacking', 'power', 'for', 'refrigerators', 'or', 'air-conditioning', 'in', 'the', 'middle', 'of', 'August,', 'many', 'people', 'had', 'little', 'choice', 'but', 'to', 'pay', 'up'], ['Downed', 'trees', 'heightened', 'demand', 'for', 'chain', 'saws', 'and', 'roof', 'repairs'], ['Contractors', 'offered', 'to', 'clear', 'two', 'trees', 'off', 'a', 'homeowner’s', 'roof—for', '$23,000'], ['Stores', 'that', 'normally', 'sold', 's

In [7]:
new_sentences=[]
new_sentences = [[x for x in sentence if x] for sentence in sentences]

# sentences = [x for sentence in sentences for x in sentence if x != '']

print(new_sentences)

[['1'], ['DOING', 'THE', 'RIGHT', 'THING', 'In', 'the', 'summer', 'of', '2004,', 'Hurricane', 'Charley', 'roared', 'out', 'of', 'the', 'Gulf', 'of', 'Mexico', 'and', 'swept', 'across', 'Florida', 'to', 'the', 'Atlantic', 'Ocean'], ['The', 'storm', 'claimed', 'twenty-two', 'lives', 'and', 'caused', '$11', 'billion', 'in', 'damage'], ['1', 'It', 'also', 'left', 'in', 'its', 'wake', 'a', 'debate', 'about', 'price', 'gouging'], ['At', 'a', 'gas', 'station', 'in', 'Orlando,', 'they', 'were', 'selling', 'two-dollar', 'bags', 'of', 'ice', 'for', 'ten', 'dollars'], ['Lacking', 'power', 'for', 'refrigerators', 'or', 'air-conditioning', 'in', 'the', 'middle', 'of', 'August,', 'many', 'people', 'had', 'little', 'choice', 'but', 'to', 'pay', 'up'], ['Downed', 'trees', 'heightened', 'demand', 'for', 'chain', 'saws', 'and', 'roof', 'repairs'], ['Contractors', 'offered', 'to', 'clear', 'two', 'trees', 'off', 'a', 'homeowner’s', 'roof—for', '$23,000'], ['Stores', 'that', 'normally', 'sold', 'small', '

In [8]:
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
 
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
        
    if np.isnan(1 - cosine_distance(vector1, vector2)):
        return 0
    return 1 - cosine_distance(vector1, vector2)

In [9]:
def build_similarity_matrix(sentences, stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: #ignore if both are same sentences
                continue 
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)
    return similarity_matrix

In [None]:
stop_words = stopwords.words('english')
# print(stop_words)
summarize_text = []
        
# Step 2 - Generate Similarity Matrix across sentences
sentence_similarity_matrix = build_similarity_matrix(new_sentences, stop_words)

# Step 3 - Rank sentences in similarity matrix
sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
scores = nx.pagerank(sentence_similarity_graph)
# print(sentence_similarity_graph)
# print(scores)

# for i,s in enumerate(sentences):
#     print(scores[i],s)

# Step 4 - Sort the rank and pick top sentences
ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(new_sentences)), reverse=True)    
print("Indexes of top ranked_sentence order are ", ranked_sentence)


  return 1 - (numpy.dot(u, v) / (sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v))))


In [None]:
# number of sentences to combine
for i in range(3):
    summarize_text.append(" ".join(ranked_sentence[i][1]))
    # Step 5 - Offcourse, output the summarize texr
print("Summarize Text: \n", ". ".join(summarize_text))

## Topic Modelling