# Extractive Summarization

This notebook attempts to summarize articles by selecting a subset of their sentences that retains the most important points. Each sentence is weighted by its importance, and the highest-weighted sentences are used to form the summary.

In [25]:
import gensim
from gensim import corpora
from gensim import similarities
from gensim import models
from gensim.models import CoherenceModel

import preprocess
import os
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.cluster.util import cosine_distance

import numpy as np
import networkx as nx

##  Extract Data

In [97]:
# Directory holding the chapter .txt files of the selected book.
# 5827, 31671  (presumably other book ids that were tried - TODO confirm)
books_directory = "Data/Test/Chapters/65145"

# Parallel lists: the raw text of each chapter and the chapter's name
# (the file name without its extension).
book_texts = []
chapters_name = []

# os.listdir returns entries in arbitrary order, so sort the names to make
# chapter indices (such as selected_chap) reproducible between runs and machines.
# Note the order is lexicographic, not numeric (e.g. 'IX' sorts before 'V').
for filename in sorted(os.listdir(books_directory)):
    if not filename.endswith('.txt'):
        continue
    with open(os.path.join(books_directory, filename), "r", encoding="utf8", errors='ignore') as file:
        book_texts.append(file.read())
    # Strip only the trailing extension; str.replace would also remove a
    # '.txt' occurring in the middle of the name.
    chapters_name.append(os.path.splitext(filename)[0])

In [99]:
# Display the chapter names in the order their files were loaded.
chapters_name

['I', 'II', 'III', 'IV', 'V', 'VI']

In [149]:
# Position (in book_texts / chapters_name) of the chapter to summarise.
selected_chap = 4
# Split the raw text of that chapter into individual sentences.
sentences = sent_tokenize(book_texts[selected_chap])

## Model 1 - Using Sentence Similarity

In [150]:
# Flatten line breaks inside each sentence, then split it into words on single spaces.
book_sentences = [sen.replace("\n", " ").split(" ") for sen in sentences]

In [151]:
# Splitting on single spaces yields empty strings wherever spaces repeat;
# keep only the non-empty words of every sentence.
new_sentences = [list(filter(None, words)) for words in book_sentences]
print(new_sentences)

[['V', 'PHILOSOPHIC', 'ANTS:', 'A', 'BIOLOGIC', 'FANTASY', 'PHILOSOPHIC----ANTS?'], ['Amœba', 'has', 'her', 'picture', 'in', 'the', 'book,', 'Proud', 'Protozoon!--Yet', 'beware', 'of', 'pride.'], ['All', 'she', 'can', 'do', 'is', 'fatten', 'and', 'divide;', 'She', 'cannot', 'even', 'read,', 'or', 'sew,', 'or', 'cook....'], ['The', 'Worm', 'can', 'crawl--but', 'has', 'no', 'eyes', 'to', 'look:', 'The', 'Jelly-fish', 'can', 'swim--but', 'lacks', 'a', 'bride:', 'The', 'Fly’s', 'a', 'very', 'Ass', 'personified:', 'And', 'speech', 'is', 'absent', 'even', 'from', 'the', 'Rook.'], ['The', 'Ant', 'herself', 'cannot', 'philosophize--', 'While', 'Man', 'does', 'that,', 'and', 'sees,', 'and', 'keeps', 'a', 'wife,', 'And', 'flies,', 'and', 'talks,', 'and', 'is', 'extremely', 'wise....', 'Will', 'our', 'Philosophy', 'to', 'later', 'Life', 'Seem', 'but', 'a', 'crudeness', 'of', 'the', 'planet’s', 'youth,', 'Our', 'Wisdom', 'but', 'a', 'parasite', 'of', 'Truth?'], ['PHILOSOPHIC', 'ANTS:', 'A', 'BIOLO

In [152]:
def sentence_similarity(sent1, sent2, stopwords=None):
    """Return the cosine similarity of two tokenised sentences.

    Both sentences are lower-cased, turned into word-count vectors over their
    combined vocabulary (words listed in ``stopwords`` are not counted), and
    compared with the cosine of the angle between the two vectors.

    Parameters
    ----------
    sent1, sent2 : sequence of str
        The words of each sentence.
    stopwords : iterable of str, optional
        Words to leave out of the count vectors. They are compared against
        the lower-cased words. Inside this function the parameter shadows the
        module-level ``stopwords`` import.

    Returns
    -------
    float
        The cosine similarity, or 0.0 when either count vector is all zeros
        (an empty sentence, or a sentence made only of stop words).
    """
    # A set gives constant-time membership tests for every word.
    ignored = set(stopwords) if stopwords is not None else set()

    words1 = [w.lower() for w in sent1]
    words2 = [w.lower() for w in sent2]

    # Assign each distinct word a vector position once, instead of searching
    # a list for every word occurrence.
    position = {word: idx for idx, word in enumerate(set(words1 + words2))}

    vector1 = np.zeros(len(position))
    vector2 = np.zeros(len(position))

    # build the vector for the first sentence
    for w in words1:
        if w not in ignored:
            vector1[position[w]] += 1

    # build the vector for the second sentence
    for w in words2:
        if w not in ignored:
            vector2[position[w]] += 1

    # A zero-length vector has no direction; report "no similarity" explicitly
    # rather than dividing by zero and testing the result for NaN.
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if norm_product == 0:
        return 0.0
    return float(np.dot(vector1, vector2) / norm_product)

In [153]:
def build_similarity_matrix(sentences, stop_words):
    """Return the pairwise similarity matrix of ``sentences``.

    Parameters
    ----------
    sentences : sequence of sequences of str
        The tokenised sentences to compare.
    stop_words : iterable of str or None
        Words passed to ``sentence_similarity`` to be left out of the comparison.

    Returns
    -------
    numpy.ndarray
        A ``len(sentences) x len(sentences)`` array whose entry ``[i][j]`` is
        ``sentence_similarity(sentences[i], sentences[j], stop_words)``.
        The diagonal (a sentence compared with itself) is left at 0.
    """
    count = len(sentences)
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((count, count))

    # Build the stop-word lookup once rather than scanning a list for every word.
    stop_lookup = set(stop_words) if stop_words is not None else None

    # sentence_similarity is a cosine similarity, which is symmetric, so each
    # unordered pair is scored once and mirrored across the diagonal.
    for idx1 in range(count):
        for idx2 in range(idx1 + 1, count):
            score = sentence_similarity(sentences[idx1], sentences[idx2], stop_lookup)
            similarity_matrix[idx1][idx2] = score
            similarity_matrix[idx2][idx1] = score
    return similarity_matrix

In [154]:
# Step 1 - English stop words that are ignored when comparing sentences
stop_words = stopwords.words('english')
summarize_text = []

# Step 2 - Similarity of every sentence with every other sentence
sentence_similarity_matrix = build_similarity_matrix(new_sentences, stop_words)

# Step 3 - Treat the matrix as a weighted graph and score each sentence with PageRank
sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
scores = nx.pagerank(sentence_similarity_graph)

# Step 4 - Pair each sentence with its score and order the pairs from highest to lowest
ranked_sentence = [(scores[position], words) for position, words in enumerate(new_sentences)]
ranked_sentence.sort(reverse=True)
print("Indexes of top ranked_sentence order are ", ranked_sentence)


Indexes of top ranked_sentence order are  [(0.012750186381706767, ['It', 'was', 'all', 'strange:', 'but', 'of', 'one', 'thing', 'he', 'was', 'sure--that', 'if', 'only', 'he', 'could', 'find', 'a', 'way', 'of', 'nourishing', 'and', 'maintaining', 'himself', 'in', 'this', 'new', 'state,', 'he', 'would', 'be', 'able,', 'as', 'a', 'child', 'does', 'in', 'the', 'first', 'few', 'years', 'of', 'life,', 'to', 'correlate', 'his', 'puzzling', 'new', 'sensations,', 'and', 'that', 'when', 'he', 'had', 'done', 'this', 'he', 'would', 'obtain', 'a', 'different', 'and', 'more', 'direct', 'view', 'of', 'reality', 'than', 'any', 'he', 'had', 'ever', 'obtained', 'or', 'thought', 'of', 'obtaining', 'before.']), (0.011869537456328919, ['I', 'think', 'the', 'newspaper', 'man', 'would', 'take', 'his', 'opportunity', 'to', 'slink', 'off', 'into', 'the', 'laboratory', 'and', 'get', 'on', 'the', 'machine', 'with', 'the', 'idea', 'of', 'making', 'a', 'scoop', 'for', 'his', 'paper;', '...', 'and', 'then', 'he', '

In [155]:
# Number of top-ranked sentences to combine into the summary
num_summary_sentences = 3

# Slicing (instead of indexing positions 0..2) also works when fewer sentences
# than requested are available.
summarize_text = [" ".join(words) for _score, words in ranked_sentence[:num_summary_sentences]]

# Step 5 - Output the summarized text.
# The sentences already carry their own end punctuation, so they are joined with
# a space. Joining with ". " and then collapsing ".." would also shorten genuine
# ellipses ("..." became ".."). A full stop is added only where one is missing.
closing_marks = '"\'”’)]'
summary_parts = []
for text in summarize_text:
    if text.rstrip(closing_marks).endswith(('.', '!', '?')):
        summary_parts.append(text)
    else:
        summary_parts.append(text + ".")
print("Summarize Text: \n", " ".join(summary_parts))

Summarize Text: 
 It was all strange: but of one thing he was sure--that if only he could find a way of nourishing and maintaining himself in this new state, he would be able, as a child does in the first few years of life, to correlate his puzzling new sensations, and that when he had done this he would obtain a different and more direct view of reality than any he had ever obtained or thought of obtaining before. I think the newspaper man would take his opportunity to slink off into the laboratory and get on the machine with the idea of making a scoop for his paper; .. and then he would put the lever in too violently, and be thrown backwards. If we could manage to follow their history, we should find that after a time they would separate, and seek new partners, of the same or of different species.


## Model 2 - Using LDA Model Topic Distribution

In [156]:
wnl = WordNetLemmatizer()

# Lower-cased word tokens of every sentence
lower = [[token.lower() for token in word_tokenize(text)] for text in sentences]

# Keep only tokens made up entirely of the letters a-z (drops punctuation and numbers)
punc = [[token for token in tokens if re.search('^[a-z]+$', token)] for tokens in lower]

# Lemmatise each remaining word, passing the lower-cased first letter of its
# part-of-speech tag as the lemmatiser's POS argument; words whose tag starts
# with any other letter are dropped.
# NOTE(review): with tagset='universal' the tags are presumably names such as
# NOUN/VERB/ADJ/ADV/ADP, so 'a' would cover ADJ, ADV and ADP alike and 'r' would
# never occur - confirm this matches how the LDA training text was preprocessed
# before changing it.
doc_lemmed = [
    [
        wnl.lemmatize(word, tag[0].lower())
        for word, tag in pos_tag(tokens, tagset='universal')
        if tag[0].lower() in ['a', 'r', 'n', 'v']
    ]
    for tokens in punc
]

In [157]:
# Dictionary (word <-> id mapping) saved alongside the final model
id2word = corpora.Dictionary.load('Model/finalmodel_Dictionary')

# The lemmatised sentences of the selected chapter, one bag-of-words vector each
chapter = doc_lemmed
new_vecs = list(map(id2word.doc2bow, chapter))
print("Number of sentences: ",len(new_vecs))

Number of sentences:  234


In [158]:
#Find most dominant topic for each of the sentences

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Load the previously saved LDA model from disk.
lda_disk = models.LdaModel.load("Model/finalmodel_5Topics")

def format_topics_sentences(ldamodel, corpus, data):
    """Build a table holding the dominant topic of every document in ``corpus``.

    Parameters
    ----------
    ldamodel : object
        Must support ``ldamodel[corpus]``, yielding for each document a
        sequence of ``(topic_id, probability)`` pairs, and
        ``ldamodel.show_topic(topic_id)``, yielding ``(word, weight)`` pairs.
    corpus : iterable
        The documents handed to ``ldamodel[...]``.
    data : sequence
        One entry per document; attached as the last column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per document with the columns ``Dominant_Topic``,
        ``Perc_Contribution`` (rounded to 4 decimals) and ``Topic_Keywords``
        (the topic's words joined by ", "), followed by a column holding
        ``data`` (labelled ``0``, since the series is unnamed).
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Keyword strings are the same for every document sharing a topic, so
    # compute them once per topic.
    keywords_by_topic = {}

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic.
    records = []
    for row in ldamodel[corpus]:
        if not row:
            # Keep a placeholder so the rows stay aligned with `data`.
            records.append((float('nan'), float('nan'), ""))
            continue

        # Dominant topic = the pair with the highest probability
        # (the first such pair when several tie).
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _weight in ldamodel.show_topic(topic_num)
            )
        records.append((topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num]))

    sent_topics_df = pd.DataFrame(records, columns=columns)

    # Add original text to the end of the output
    contents = pd.Series(data)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

# NOTE(review): an earlier comment here said the chosen model (model_list[2]) has
# 6 topics, but the model file loaded above is named "finalmodel_5Topics" -
# confirm which is correct.

# Dominant topic, its contribution and its keywords for every sentence of the chapter
df_topic_sents_keywords = format_topics_sentences(lda_disk, new_vecs, chapter)

# Turn the row index into an explicit sentence-number column and label all columns
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Sentence_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Sentences with the highest dominant-topic contribution come first
sorted_df = df_dominant_topic.sort_values("Topic_Perc_Contrib", ascending=False)

In [159]:
# selected = sorted_df.head(int(sorted_df.shape[0]*0.1))
# selected.head()

In [160]:
# sen_list = sorted(selected['Sentence_No'].to_list())
# sen_list

In [161]:
# Sentence numbers ordered by descending topic contribution
sen_list = sorted_df['Sentence_No'].to_list()

# Original text of the four top sentences, with line breaks flattened to spaces
result_list = [sentences[i].replace('\n', ' ') for i in sen_list[0:4]]

# `summary` is the separator placed between the selected sentences
summary = ' '
print("Summarize Text: \n" + summary.join(result_list))

Summarize Text: 
If we could conceivably enter into a single inorganic molecule, we should find ourselves one of a moving host of similar objects: and we should further perceive that these objects were themselves complex, some like double stars, others star-clusters, others single suns, and all again built of lesser units held in a definite plan, in an architecture reminding us (if we still had memory) of a solar system _in petto_. Or again, to choose an example that depends more on size than rhythm, how very difficult it is to remember that the pressure of air on our bodies is not the uniform gentle embrace of some homogeneous substance, but the bombardment of an infinity of particles. They turn on all sides      Their shining eyes,      And see below them      The earth and men.”   This being so, what is to prevent us from believing that, once certain adjustments are made in the mental sausage-machine, we shall discover that what we once found impossibly tough meat will pass smoothly