# Extractive Summarization

Attempt to summarize articles by selecting a subset of words that retain the most important points. Weights the important part of sentences and uses the same to form the summary.

In [1]:
import gensim
from gensim import corpora
from gensim import similarities
from gensim import models
from gensim.models import CoherenceModel

import preprocess
import os
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.cluster.util import cosine_distance

import numpy as np
import networkx as nx

##  Preprocessing

In [2]:
# Define the directory where the books are located
# 5827, 31671
books_directory = "Data/Test/Chapters/5827"

# Create a list to store the text data of each book
book_texts = []
chapters_name = []

# Loop through each file in the directory
for filename in os.listdir(books_directory):
    if filename.endswith('.txt'):
        with open(os.path.join(books_directory, filename), "r", encoding="utf8", errors='ignore') as file:
            book_text = file.read()
            book_texts.append(book_text)
            chapters_name.append(filename.replace('.txt',''))

In [17]:
chapters_name

['CHAPTER_III__THE_NATURE_OF_MATTER',
 'CHAPTER_II__THE_EXISTENCE_OF_MATTER',
 'CHAPTER_IV__IDEALISM',
 'CHAPTER_IX__THE_WORLD_OF_UNIVERSALS',
 'CHAPTER_I__APPEARANCE_AND_REALITY',
 'CHAPTER_VIII__HOW__A_PRIORI__KNOWLEDGE_IS_POSSIBLE',
 'CHAPTER_VII__ON_OUR_KNOWLEDGE_OF_GENERAL_PRINCIPLES',
 'CHAPTER_VI__ON_INDUCTION',
 'CHAPTER_V__KNOWLEDGE_BY_ACQUAINTANCE_AND_KNOWLEDGE_BY_DESCRIPTION',
 'CHAPTER_XIII__KNOWLEDGE,_ERROR,_AND_PROBABLE_OPINION',
 'CHAPTER_XII__TRUTH_AND_FALSEHOOD',
 'CHAPTER_XIV__THE_LIMITS_OF_PHILOSOPHICAL_KNOWLEDGE',
 'CHAPTER_XI__ON_INTUITIVE_KNOWLEDGE',
 'CHAPTER_X__ON_OUR_KNOWLEDGE_OF_UNIVERSALS']

In [7]:
selected_chap = 4
sentences = nltk.sent_tokenize(book_texts[selected_chap])

In [9]:
sentences

['\nCHAPTER I.',
 'APPEARANCE AND REALITY\n\nIs there any knowledge in the world which is so certain that no\nreasonable man could doubt it?',
 'This question, which at first sight might\nnot seem difficult, is really one of the most difficult that can\nbe asked.',
 'When we have realized the obstacles in the way of a\nstraightforward and confident answer, we shall be well launched on the\nstudy of philosophy--for philosophy is merely the attempt to answer\nsuch ultimate questions, not carelessly and dogmatically, as we do in\nordinary life and even in the sciences, but critically, after exploring\nall that makes such questions puzzling, and after realizing all the\nvagueness and confusion that underlie our ordinary ideas.',
 'In daily life, we assume as certain many things which, on a closer\nscrutiny, are found to be so full of apparent contradictions that only a\ngreat amount of thought enables us to know what it is that we really may\nbelieve.',
 'In the search for certainty, it is

In [8]:
book_texts[selected_chap]

"\nCHAPTER I. APPEARANCE AND REALITY\n\nIs there any knowledge in the world which is so certain that no\nreasonable man could doubt it? This question, which at first sight might\nnot seem difficult, is really one of the most difficult that can\nbe asked. When we have realized the obstacles in the way of a\nstraightforward and confident answer, we shall be well launched on the\nstudy of philosophy--for philosophy is merely the attempt to answer\nsuch ultimate questions, not carelessly and dogmatically, as we do in\nordinary life and even in the sciences, but critically, after exploring\nall that makes such questions puzzling, and after realizing all the\nvagueness and confusion that underlie our ordinary ideas.\n\nIn daily life, we assume as certain many things which, on a closer\nscrutiny, are found to be so full of apparent contradictions that only a\ngreat amount of thought enables us to know what it is that we really may\nbelieve. In the search for certainty, it is natural to begin 

## Model 1 - Using Sentence Similarity

In [19]:
book_sentences = []
for sen in sentences:
    cleaned_sen = sen.replace("\n", " ")
    book_sentences.append(cleaned_sen.split(" "))

book_sentences[0][0:20]

['', 'CHAPTER', 'I.']

In [20]:
new_sentences=[]
new_sentences = [[x for x in sentence if x] for sentence in book_sentences]
print(new_sentences)

[['CHAPTER', 'I.'], ['APPEARANCE', 'AND', 'REALITY', 'Is', 'there', 'any', 'knowledge', 'in', 'the', 'world', 'which', 'is', 'so', 'certain', 'that', 'no', 'reasonable', 'man', 'could', 'doubt', 'it?'], ['This', 'question,', 'which', 'at', 'first', 'sight', 'might', 'not', 'seem', 'difficult,', 'is', 'really', 'one', 'of', 'the', 'most', 'difficult', 'that', 'can', 'be', 'asked.'], ['When', 'we', 'have', 'realized', 'the', 'obstacles', 'in', 'the', 'way', 'of', 'a', 'straightforward', 'and', 'confident', 'answer,', 'we', 'shall', 'be', 'well', 'launched', 'on', 'the', 'study', 'of', 'philosophy--for', 'philosophy', 'is', 'merely', 'the', 'attempt', 'to', 'answer', 'such', 'ultimate', 'questions,', 'not', 'carelessly', 'and', 'dogmatically,', 'as', 'we', 'do', 'in', 'ordinary', 'life', 'and', 'even', 'in', 'the', 'sciences,', 'but', 'critically,', 'after', 'exploring', 'all', 'that', 'makes', 'such', 'questions', 'puzzling,', 'and', 'after', 'realizing', 'all', 'the', 'vagueness', 'and'

In [21]:
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
 
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
        
    if np.isnan(1 - cosine_distance(vector1, vector2)):
        return 0
    return 1 - cosine_distance(vector1, vector2)

In [22]:
def build_similarity_matrix(sentences, stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: #ignore if both are same sentences
                continue 
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)
    return similarity_matrix

In [23]:
stop_words = stopwords.words('english')
# print(stop_words)
summarize_text = []
        
# Step 2 - Generate Similarity Matrix across sentences
sentence_similarity_matrix = build_similarity_matrix(new_sentences, stop_words)

# Step 3 - Rank sentences in similarity matrix
sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
scores = nx.pagerank(sentence_similarity_graph)
# print(sentence_similarity_graph)
# print(scores)

# for i,s in enumerate(sentences):
#     print(scores[i],s)

# Step 4 - Sort the rank and pick top sentences
ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(new_sentences)), reverse=True)    
print("Indexes of top ranked_sentence order are ", ranked_sentence)


Indexes of top ranked_sentence order are  [(0.021269226618516497, ['And', 'what', 'we', 'see', 'is', 'constantly', 'changing', 'in', 'shape', 'as', 'we', 'move', 'about', 'the', 'room;', 'so', 'that', 'here', 'again', 'the', 'senses', 'seem', 'not', 'to', 'give', 'us', 'the', 'truth', 'about', 'the', 'table', 'itself,', 'but', 'only', 'about', 'the', 'appearance', 'of', 'the', 'table.']), (0.018794214634999407, ['This', 'colour', 'is', 'not', 'something', 'which', 'is', 'inherent', 'in', 'the', 'table,', 'but', 'something', 'depending', 'upon', 'the', 'table', 'and', 'the', 'spectator', 'and', 'the', 'way', 'the', 'light', 'falls', 'on', 'the', 'table.']), (0.01825993472366105, ['Hence,', 'two', 'very', 'difficult', 'questions', 'at', 'once', 'arise;', 'namely,', '(1)', 'Is', 'there', 'a', 'real', 'table', 'at', 'all?']), (0.01762965941822886, ['I', 'believe', 'that,', 'if', 'any', 'other', 'normal', 'person', 'comes', 'into', 'my', 'room,', 'he', 'will', 'see', 'the', 'same', 'chairs'

In [24]:
# number of sentences to combine
for i in range(3):
    summarize_text.append(" ".join(ranked_sentence[i][1]))
    # Step 5 - Offcourse, output the summarize text
print("Summarize Text: \n", ". ".join(summarize_text))

Summarize Text: 
 And what we see is constantly changing in shape as we move about the room; so that here again the senses seem not to give us the truth about the table itself, but only about the appearance of the table.. This colour is not something which is inherent in the table, but something depending upon the table and the spectator and the way the light falls on the table.. Hence, two very difficult questions at once arise; namely, (1) Is there a real table at all?


## Model 2 (yf) - Using LDA Model Topic Distribution

In [27]:
wnl = WordNetLemmatizer()
lower = [[w.lower() for w in nltk.word_tokenize(sen)] for sen in sentences]
punc = [[w for w in sen if re.search('^[a-z]+$', w)] for sen in lower]
doc_lemmed = [[wnl.lemmatize(word, tag[0].lower()) for word, tag in pos_tag(sen, tagset='universal') if tag[0].lower() in ['a', 'r', 'n', 'v']] for sen in punc]

In [28]:
id2word = corpora.Dictionary.load('Model/finalmodel_Dictionary')
# vectorise the words 
chapter = doc_lemmed
# new_dict = gensim.corpora.Dictionary(chapter)
new_vecs = [id2word.doc2bow(sen) for sen in chapter]
print("Number of sentences: ",len(new_vecs))

Number of sentences:  102


In [29]:
#Find most dominant topic for each of the sentences

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

lda_disk = gensim.models.ldamodel.LdaModel.load("Model/finalmodel_5Topics")

def format_topics_sentences(ldamodel, corpus, data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(data)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

#I choose model_list[2] where the number of topics is 6

df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_disk, corpus=new_vecs, data=chapter)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Sentence_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
sorted_df = df_dominant_topic.sort_values(by="Topic_Perc_Contrib", ascending = False)

In [30]:
selected = sorted_df.head(int(sorted_df.shape[0]*0.1))
selected.head()

Unnamed: 0,Sentence_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
54,54,2,0.9921,"sensation, image, belief, consciousness, physi...","[thus, whenever, see, colour, have, sensation,..."
63,63,2,0.9921,"sensation, image, belief, consciousness, physi...","[philosopher, first, bring, prominently, forwa..."
91,91,2,0.9921,"sensation, image, belief, consciousness, physi...","[have, appear, that, if, take, common, object,..."
67,67,2,0.9901,"sensation, image, belief, consciousness, physi...","[berkeley, retain, merit, of, have, show, that..."
10,10,4,0.9901,"dream, woman, child, sexual, girl, denote, lov...","[i, believe, that, if, other, normal, person, ..."


In [31]:
sen_list = sorted(selected['Sentence_No'].to_list())
sen_list

[10, 13, 27, 52, 54, 55, 58, 63, 67, 91]

In [32]:
result_list = []
for i in sen_list[0:4]:
    sentence = sentences[i].replace('\n', ' ')
    result_list.append(sentence)

summary = ' '
summary.join(result_list)
print("Summarize Text: \n" + summary.join(result_list))

Summarize Text: 
I believe that, if any other normal person comes into my room, he will see the same chairs and tables and books and papers as I see, and that the table which I see is the same as the table which I feel pressing against my arm. To make our difficulties plain, let us concentrate attention on the table. But the other colours which appear under other conditions have just as good a right to be considered real; and therefore, to avoid favouritism, we are compelled to deny that, in itself, the table has any one particular colour. Let us give the name of 'sense-data' to the things that are immediately known in sensation: such things as colours, sounds, smells, hardnesses, roughnesses, and so on.
