# Extractive Summarization

This notebook attempts to summarize a text by selecting the subset of its existing sentences that retains the most important points. Each sentence is given an importance score, and the highest-scoring sentences are combined to form the summary.

In [25]:
import gensim
from gensim import corpora
from gensim import similarities
from gensim import models
from gensim.models import CoherenceModel

import preprocess
import os
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.cluster.util import cosine_distance

import numpy as np
import networkx as nx

##  Extract Data

In [26]:
# Define the directory where the books are located
# 5827, 31671
books_directory = "Data/Test/Chapters/5827"

# Text and name of every chapter; the two lists share the same index.
book_texts = []
chapters_name = []

# os.listdir() returns entries in arbitrary order, so sort the names first:
# a chapter is later picked by its position in these lists, and that position
# must refer to the same chapter on every machine and every run.
for filename in sorted(os.listdir(books_directory)):
    if not filename.endswith('.txt'):
        continue
    chapter_path = os.path.join(books_directory, filename)
    with open(chapter_path, "r", encoding="utf8", errors='ignore') as file:
        book_text = file.read()
    book_texts.append(book_text)
    # Strip only the trailing extension, not a '.txt' occurring mid-name.
    chapters_name.append(filename[:-len('.txt')])

In [27]:
# Show the chapter names in load order; book_texts shares this ordering and a
# chapter is selected below by its position.
chapters_name

['CHAPTER_III__THE_NATURE_OF_MATTER',
 'CHAPTER_II__THE_EXISTENCE_OF_MATTER',
 'CHAPTER_IV__IDEALISM',
 'CHAPTER_IX__THE_WORLD_OF_UNIVERSALS',
 'CHAPTER_I__APPEARANCE_AND_REALITY',
 'CHAPTER_VIII__HOW__A_PRIORI__KNOWLEDGE_IS_POSSIBLE',
 'CHAPTER_VII__ON_OUR_KNOWLEDGE_OF_GENERAL_PRINCIPLES',
 'CHAPTER_VI__ON_INDUCTION',
 'CHAPTER_V__KNOWLEDGE_BY_ACQUAINTANCE_AND_KNOWLEDGE_BY_DESCRIPTION',
 'CHAPTER_XIII__KNOWLEDGE,_ERROR,_AND_PROBABLE_OPINION',
 'CHAPTER_XII__TRUTH_AND_FALSEHOOD',
 'CHAPTER_XIV__THE_LIMITS_OF_PHILOSOPHICAL_KNOWLEDGE',
 'CHAPTER_XI__ON_INTUITIVE_KNOWLEDGE',
 'CHAPTER_X__ON_OUR_KNOWLEDGE_OF_UNIVERSALS']

In [28]:
# Position (within chapters_name / book_texts) of the chapter to summarize.
selected_chap = 4
# Split the raw text of that chapter into sentences.
sentences = sent_tokenize(book_texts[selected_chap])

## Model 1 - Using Sentence Similarity

In [29]:
# Put each sentence on a single line, then cut it at every single space.
# Runs of spaces leave empty-string tokens behind at this stage.
book_sentences = [
    raw_sentence.replace("\n", " ").split(" ")
    for raw_sentence in sentences
]

In [30]:
# Discard the empty-string tokens so every sentence is a list of real words.
new_sentences = [list(filter(None, tokens)) for tokens in book_sentences]
print(new_sentences)

[['CHAPTER', 'I.'], ['APPEARANCE', 'AND', 'REALITY', 'Is', 'there', 'any', 'knowledge', 'in', 'the', 'world', 'which', 'is', 'so', 'certain', 'that', 'no', 'reasonable', 'man', 'could', 'doubt', 'it?'], ['This', 'question,', 'which', 'at', 'first', 'sight', 'might', 'not', 'seem', 'difficult,', 'is', 'really', 'one', 'of', 'the', 'most', 'difficult', 'that', 'can', 'be', 'asked.'], ['When', 'we', 'have', 'realized', 'the', 'obstacles', 'in', 'the', 'way', 'of', 'a', 'straightforward', 'and', 'confident', 'answer,', 'we', 'shall', 'be', 'well', 'launched', 'on', 'the', 'study', 'of', 'philosophy--for', 'philosophy', 'is', 'merely', 'the', 'attempt', 'to', 'answer', 'such', 'ultimate', 'questions,', 'not', 'carelessly', 'and', 'dogmatically,', 'as', 'we', 'do', 'in', 'ordinary', 'life', 'and', 'even', 'in', 'the', 'sciences,', 'but', 'critically,', 'after', 'exploring', 'all', 'that', 'makes', 'such', 'questions', 'puzzling,', 'and', 'after', 'realizing', 'all', 'the', 'vagueness', 'and'

In [31]:
def sentence_similarity(sent1, sent2, stopwords=None):
    """Cosine similarity between two tokenised sentences.

    Both sentences are lower-cased and turned into word-count vectors over
    their combined vocabulary; words listed in ``stopwords`` are not counted.

    Parameters
    ----------
    sent1, sent2 : list of str
        The word tokens of each sentence.
    stopwords : iterable of str, optional
        Lower-case words to leave out of the vectors. Defaults to none.

    Returns
    -------
    float
        Cosine similarity of the two count vectors, or 0.0 when either
        vector is all zeros (empty sentence or stop words only), where the
        cosine is undefined.
    """
    # A set gives constant-time membership tests inside the loops below.
    ignored = set(stopwords) if stopwords else set()

    tokens1 = [w.lower() for w in sent1]
    tokens2 = [w.lower() for w in sent2]

    # Give every distinct word a fixed vector position (dict lookup instead
    # of a linear list.index() search per token).
    position = {word: i for i, word in enumerate(set(tokens1 + tokens2))}

    vector1 = np.zeros(len(position))
    vector2 = np.zeros(len(position))

    # build the vector for the first sentence
    for w in tokens1:
        if w not in ignored:
            vector1[position[w]] += 1

    # build the vector for the second sentence
    for w in tokens2:
        if w not in ignored:
            vector2[position[w]] += 1

    # Guard the zero-vector case explicitly instead of dividing 0 by 0 and
    # testing the resulting NaN afterwards.
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if norm_product == 0:
        return 0.0
    return float(np.dot(vector1, vector2) / norm_product)

In [32]:
def build_similarity_matrix(sentences, stop_words=None):
    """Build the pairwise sentence-similarity matrix.

    Parameters
    ----------
    sentences : list of list of str
        Tokenised sentences.
    stop_words : iterable of str, optional
        Words passed on to ``sentence_similarity`` to be ignored.

    Returns
    -------
    numpy.ndarray
        Square matrix of shape ``(len(sentences), len(sentences))`` where
        entry ``[i][j]`` is the similarity of sentences ``i`` and ``j``.
        The diagonal is left at 0 (a sentence is not compared with itself).
    """
    # Convert once so every pairwise call gets fast membership tests.
    stop_words = set(stop_words) if stop_words else None

    count = len(sentences)
    similarity_matrix = np.zeros((count, count))

    # Cosine similarity is symmetric, so score each unordered pair once and
    # mirror the value instead of computing both [i][j] and [j][i].
    for idx1 in range(count):
        for idx2 in range(idx1 + 1, count):
            score = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)
            similarity_matrix[idx1][idx2] = score
            similarity_matrix[idx2][idx1] = score
    return similarity_matrix

In [33]:
# English stop words are left out when sentences are compared.
stop_words = stopwords.words('english')
summarize_text = []

# Step 2 - Pairwise similarity between all tokenised sentences
sentence_similarity_matrix = build_similarity_matrix(new_sentences, stop_words)

# Step 3 - Build a graph from the matrix and score every sentence with PageRank
sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
scores = nx.pagerank(sentence_similarity_graph)

# Step 4 - Pair each sentence with its score and order from highest to lowest
ranked_sentence = [
    (scores[position], tokens)
    for position, tokens in enumerate(new_sentences)
]
ranked_sentence.sort(reverse=True)
print("Indexes of top ranked_sentence order are ", ranked_sentence)


Indexes of top ranked_sentence order are  [(0.021269226618516497, ['And', 'what', 'we', 'see', 'is', 'constantly', 'changing', 'in', 'shape', 'as', 'we', 'move', 'about', 'the', 'room;', 'so', 'that', 'here', 'again', 'the', 'senses', 'seem', 'not', 'to', 'give', 'us', 'the', 'truth', 'about', 'the', 'table', 'itself,', 'but', 'only', 'about', 'the', 'appearance', 'of', 'the', 'table.']), (0.018794214634999407, ['This', 'colour', 'is', 'not', 'something', 'which', 'is', 'inherent', 'in', 'the', 'table,', 'but', 'something', 'depending', 'upon', 'the', 'table', 'and', 'the', 'spectator', 'and', 'the', 'way', 'the', 'light', 'falls', 'on', 'the', 'table.']), (0.018259934723661053, ['Hence,', 'two', 'very', 'difficult', 'questions', 'at', 'once', 'arise;', 'namely,', '(1)', 'Is', 'there', 'a', 'real', 'table', 'at', 'all?']), (0.01762965941822886, ['I', 'believe', 'that,', 'if', 'any', 'other', 'normal', 'person', 'comes', 'into', 'my', 'room,', 'he', 'will', 'see', 'the', 'same', 'chairs

In [34]:
# Number of top-ranked sentences to combine into the summary.
num_summary_sentences = 3

# Step 5 - Rebuild the text of the best sentences. Slicing (instead of
# indexing 0..2) keeps this working when fewer sentences are available.
summarize_text = [
    " ".join(tokens) for _, tokens in ranked_sentence[:num_summary_sentences]
]

# Sentences normally carry their own closing punctuation, so a full stop is
# added between sentences only where one is missing. This avoids "?." joins
# and leaves ellipses inside a sentence untouched.
closed_sentences = [
    text if text.endswith(('.', '!', '?')) else text + '.'
    for text in summarize_text[:-1]
] + summarize_text[-1:]
print("Summarize Text: \n", " ".join(closed_sentences))

Summarize Text: 
 And what we see is constantly changing in shape as we move about the room; so that here again the senses seem not to give us the truth about the table itself, but only about the appearance of the table. This colour is not something which is inherent in the table, but something depending upon the table and the spectator and the way the light falls on the table. Hence, two very difficult questions at once arise; namely, (1) Is there a real table at all?


## Model 2 - Using LDA Model Topic Distribution

In [35]:
# Lemmatizer used to reduce each kept word to its base form.
wnl = WordNetLemmatizer()
# Lower-cased word tokens for every sentence of the selected chapter.
lower = [[w.lower() for w in nltk.word_tokenize(sen)] for sen in sentences]
# Keep only tokens made up entirely of the letters a-z; anything containing
# punctuation, digits or other characters is dropped.
punc = [[w for w in sen if re.search('^[a-z]+$', w)] for sen in lower]
# POS-tag each sentence with the 'universal' tagset. A word is kept only when
# the lower-cased first letter of its tag is 'a', 'r', 'n' or 'v', and that
# letter is passed to the lemmatizer as the part of speech.
# NOTE(review): in the universal tagset ADJ, ADV and ADP all appear to start
# with 'a' and no tag starts with 'r', so adverbs/adpositions are presumably
# handled as adjectives here - confirm this matches the preprocessing used to
# build the saved dictionary and model before changing it.
doc_lemmed = [[wnl.lemmatize(word, tag[0].lower()) for word, tag in pos_tag(sen, tagset='universal') if tag[0].lower() in ['a', 'r', 'n', 'v']] for sen in punc]

In [36]:
# Load the saved gensim dictionary from disk.
id2word = corpora.Dictionary.load('Model/finalmodel_Dictionary')
# The chapter as lemmatized sentences, one token list per sentence.
chapter = doc_lemmed
# Convert every sentence into its bag-of-words vector.
new_vecs = list(map(id2word.doc2bow, chapter))
print("Number of sentences: ", len(new_vecs))

Number of sentences:  102


In [37]:
#Find most dominant topic for each of the sentences

import numpy as np
import pandas as pd
import warnings
# Silence every warning from this point on.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below - consider narrowing it to the specific category.
warnings.filterwarnings('ignore')

# Load the saved LDA model from disk.
# NOTE(review): gensim's load is presumably pickle-based - only load model
# files that come from a trusted source.
lda_disk = gensim.models.ldamodel.LdaModel.load("Model/finalmodel_5Topics")

def format_topics_sentences(ldamodel, corpus, data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(data)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

# Dominant topic of every sentence in the chapter, using the LDA model loaded from disk.
df_topic_sents_keywords = format_topics_sentences(lda_disk, new_vecs, chapter)

# Turn the row index into an explicit sentence-number column and label all columns.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Sentence_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
# Strongest topic contribution first.
sorted_df = df_dominant_topic.sort_values("Topic_Perc_Contrib", ascending=False)

In [38]:
# selected = sorted_df.head(int(sorted_df.shape[0]*0.1))
# selected.head()

In [39]:
# sen_list = sorted(selected['Sentence_No'].to_list())
# sen_list

In [40]:
# Number of top-ranked sentences to include in the summary.
num_lda_summary_sentences = 4

# Sentence numbers ordered by topic contribution, strongest first.
sen_list = sorted_df['Sentence_No'].to_list()
# Look up the original (un-lemmatized) sentences and put each on one line.
result_list = [
    sentences[i].replace('\n', ' ')
    for i in sen_list[:num_lda_summary_sentences]
]

# Join once and keep the result; previously the joined text was computed on a
# line of its own and thrown away, and `summary` held only the separator.
summary = ' '.join(result_list)
print("Summarize Text: \n" + summary)

Summarize Text: 
It has appeared that, if we take any common object of the sort that is supposed to be known by the senses, what the senses _immediately_ tell us is not the truth about the object as it is apart from us, but only the truth about certain sense-data which, so far as we can see, depend upon the relations between us and the object. But the sensation we obtain depends upon how hard we press the table and also upon what part of the body we press with; thus the various sensations due to various pressures or various parts of the body cannot be supposed to reveal _directly_ any definite property of the table, but at most to be _signs_ of some property which perhaps _causes_ all the sensations, but is not actually apparent in any of them. In fact, almost all philosophers seem to be agreed that there is a real table: they almost all agree that, however much our sense-data--colour, shape, smoothness, etc.--may depend upon us, yet their occurrence is a sign of something existing ind