In [30]:
# Simple preprocessing of url with pad sequences
from bs4 import BeautifulSoup
from urllib.request import urlopen
from nltk.tokenize import sent_tokenize
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from pprint import pprint

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


def get_only_text(url):
    """ 
    Return title and text of article defined 
    as url
    """
    page = urlopen(url)
    soup = BeautifulSoup(page, "lxml")
    text = ' '.join(map(lambda p: p.text, soup.find_all('p')))
  
    print ("=====================")
    print (text)
    print ("=====================")
 
    return soup.title.text, text

url = "https://www.iep.utm.edu/brainvat/"
text = get_only_text(url)

# Split by sentences 
sentences = []
for s in text:
    sentences.append(sent_tokenize(s))

sentences = [y for x in sentences for y in x]

print(len(sentences))
print(sentences[:10])

train_sentences = sentences[:130]
test_sentences = sentences[131:]

tokenizer = Tokenizer(num_words=1000, oov_token="OOV")
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# Loop through words
from collections import defaultdict

def sort_it(_dict, n):
    result = defaultdict(list)
    for name, num in _dict.items():
        result[num].append(name)
    return sorted(result.items(), reverse=False)[:n]

top3 = sort_it(word_index, 10)
print(top3)

sequences = tokenizer.texts_to_sequences(test_sentences)
padded = pad_sequences(sequences, maxlen=8)


('The Brain in a Vat thought-experiment is most commonly used to illustrate '
 'global or Cartesian skepticism.  You are told to imagine the possibility '
 'that at this very moment you are actually a brain hooked up to a '
 'sophisticated computer program that can perfectly simulate experiences of '
 'the outside world.  Here is the skeptical argument.  If you cannot now be '
 'sure that you are not a brain in a vat, then you cannot rule out the '
 'possibility that all of your beliefs about the external world are false.   '
 'Or, to put it in terms of knowledge claims, we can construct the following '
 'skeptical argument.  Let “P” stand for any belief or claim about the '
 'external world, say, that snow is white. The Brain in a Vat Argument is '
 'usually taken to be a modern version   of René Descartes’ argument (in the '
 'Meditations on First Philosophy) that centers on the possibility of an evil '
 'demon who systematically deceives us.  The hypothesis has been the premise '
 '

In [38]:
# Summarize text using Gensim
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim.summarization import summarize
from gensim.summarization import keywords
import urllib.request
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests

text = requests.get('https://www.iep.utm.edu/brainvat/').text

TextRank is graph based, ranking algorithm for text processing. In general, text based ranking algorithms are deciding on importance of a vertex of a graph by taking into account global information from entire graph. Connection from one vertex of a graph to another is based on voting. The higher number of votes the higher importance of vertex on whole graph. Algorithm takes initial, arbitrary value,  iterates through each node in the graph, assign score to each vertex that assign importance of the vertex to whole graph. Final score is independent from initial, assigned value. 

For more information go to:

https://arxiv.org/abs/1602.03606

In [39]:
print('Summary:')
print(summarize(text, ratio=0.01))

print('\nKeywords:')
print(keywords(text, ratio=0.01))

2020-03-08 13:01:43,606 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-03-08 13:01:43,614 : INFO : built Dictionary(878 unique tokens: ['argument', 'encyclopedia', 'internet', 'philosophi', 'vat\x94']...) from 412 documents (total 2960 corpus positions)
2020-03-08 13:01:43,618 : INFO : Building graph
2020-03-08 13:01:43,619 : INFO : Filling graph
2020-03-08 13:01:43,734 : INFO : Removing unreachable nodes of graph
2020-03-08 13:01:43,736 : INFO : Pagerank graph


'Summary:'


2020-03-08 13:01:43,776 : INFO : Sorting pagerank scores


('In his <em>Reason, Truth and History </em>(1981), Hilary Putnam first '
 'presented the argument that we cannot be brains in a vat, which has since '
 'given rise to a large discussion with repercussions for the realism debate '
 'and for central theses in the philosophy of language and mind.\n'
 'For Putnam’s metaphysical realist will also agree that truth and reality '
 'cannot be subject to “epistemically derived constraints.”  This general '
 'characterization of metaphysical realism is enough to provide a target for '
 'the Brains in a Vat argument.\n'
 '<p>On this construal of the truth-conditions, “We are brains in a vat” as '
 'uttered by a BIV would presumably be <em>false</em>, since a brain in a vat '
 'would <em>not</em> have sense-impressions of being a brain in a vat: recall '
 'a BIV’s notional world would be equivalent to the unenvatted, and he would '
 'appear to himself to be a normally embodied person with a real body etc.\n'
 'The metaphysical realist can claim th

In [32]:
# Summarize the text with sumy
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
 
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
 
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.edmundson import EdmundsonSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer

In [33]:
LANGUAGE = "english"
SENTENCES_COUNT = 10

In [35]:
url='https://www.iep.utm.edu/brainvat/'
parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))


Latent semantic analysis (LSA) is a technique in natural language processing, in particular distributional semantics, of analyzing relationships between a set of documents and the terms they contain by producing a set of concepts related to the documents and terms. LSA assumes that words that are close in meaning will occur in similar pieces of text (the distributional hypothesis). A matrix containing word counts per document (rows represent unique words and columns represent each document) is constructed from a large piece of text and a mathematical technique called singular value decomposition (SVD) is used to reduce the number of rows while preserving the similarity structure among columns. Documents are then compared by taking the cosine of the angle between the two vectors (or the dot product between the normalizations of the two vectors) formed by any two columns. Values close to 1 represent very similar documents while values close to 0 represent very dissimilar documents.

For more information go to:

https://github.com/iamprem/summarizer
http://www.kiv.zcu.cz/~jstein/publikace/isim2004.pdf


In [36]:
print ("--LsaSummarizer--")    
summarizer = LsaSummarizer()
summarizer = LsaSummarizer(Stemmer(LANGUAGE))
summarizer.stop_words = get_stop_words(LANGUAGE)
for sentence in summarizer(parser.document, SENTENCES_COUNT):
    print(sentence)

'--LsaSummarizer--'
<Sentence: The hypothesis has been the premise behind the movie The Matrix , in which the entire human race has been placed into giant vats and fed a virtual reality at the hands of malignant artificial intelligence (our own creations, of course).>
<Sentence: In his Reason, Truth and History (1981), Hilary Putnam first presented the argument that we cannot be brains in a vat, which has since given rise to a large discussion with repercussions for the realism debate and for central theses in the philosophy of language and mind.>
<Sentence: As Thomas Nagel puts it, “realism makes skepticism intelligible,” (1986, 73) because once we open the gap between truth and epistemology, we must countenance the possibility that all of our beliefs, no matter how well justified, nevertheless fail to accurately depict the world as it really is.>
<Sentence: The Brain in a Vat scenario is just an illustration of this kind of global skepticism: it depicts a situation where all our beli


Luhn introduced a method to extract salient sentences from the text using features such as word and phrase frequency. They proposed to weight the sentences of a document as a function of high frequency words, ignoring very high frequency common words.

Algorithm psuedocode :

1) Calculate signficant words in the text by means of a min and maximum
   ratio of occurence i.e ignore most frequent words and least frequent ones.  
2) For each sentence in the text calculate its weight based on the number of keywords squared
   divided by the windows size which is the maximum distance between two significant words.  
3) sort sentences in descending order based on their weight and output the first n of them.

For more information go to:

https://github.com/mohammadKhalifa/Luhn-s-summarizer
https://arxiv.org/pdf/1707.02268.pdf


In [40]:
print ("--LuhnSummarizer--")     
summarizer = LuhnSummarizer() 
summarizer = LuhnSummarizer(Stemmer(LANGUAGE))
summarizer.stop_words = ("I", "am", "the", "you", "are", "me", "is", "than", "that", "this")
for sentence in summarizer(parser.document, SENTENCES_COUNT):
    print(sentence)

'--LuhnSummarizer--'
<Sentence: There are, however, many self-professed metaphysical realists who are not happy with Putnam’s definition; it saddles the realist with the classical difficulty of matching words to objects and of providing for a correspondence relation between sentences and mind-independent “facts.” The metaphysical realist is forced to construe her thesis ontologically, as an adherence to some fixed furniture of objects in the world, which ignores the possibility that ontological commitment may be specified not as a commitment to a set of entities but rather to the truth of a class of sentences or even of whole theories of the world.>
<Sentence: A brain in a vat however would not be able to refer to trees since there are no trees (and even if there were trees there would not be the appropriate causal relation between its tokenings of “tree” and real trees, unless we bring back the standard fantasy and assume it picked up the terms from the mad scientist).>
<Sentence: Ass


Edmundson stressed the importance of title-words for summarization and was the first to employ stop-lists in order to filter uninformative words of low semantic content (e.g. most grammatical words such as "of", "the", "a"). He also distinguished between bonus words and stigma words, i.e. words that probably occur together with important (e.g. the word form "significant") or unimportant information. His idea of using key-words, i.e. words which occur significantly frequently in the document, is still one of the core heuristics of today's summarizers. With large linguistic corpora available today, the tf–idf value which originated in information retrieval, can be successfully applied to identify the key words of a text: If for example the word "cat" occurs significantly more often in the text to be summarized (TF = "term frequency") than in the corpus (IDF means "inverse document frequency"; here the corpus is meant by "document"), then "cat" is likely to be an important word of the text; the text may in fact be a text about cats.

For more information go to:

https://github.com/miso-belica/sumy/blob/master/sumy/summarizers/edmundson.py
http://martijnwieling.nl/files/wielingvisser05automaticsummarization.pdf


In [41]:
print ("--EdmundsonSummarizer--")     
summarizer = EdmundsonSummarizer() 
words1 = ("brain", "vat", "true", "false")
summarizer.bonus_words = words1
     
words2 = ("Putnam", "metaphysical", "realist", "case")
summarizer.stigma_words = words2
    
words3 = ("philosophers", "theory", " of", "meaning")
summarizer.null_words = words3
for sentence in summarizer(parser.document, SENTENCES_COUNT):
    print(sentence)

'--EdmundsonSummarizer--'
<Sentence: Skepticism and Realism Putnam’s argument Reconstructions of the Argument Brains in a Vat and Self-Knowledge Significance of the Argument References and Further Reading>
<Sentence: There are, however, many self-professed metaphysical realists who are not happy with Putnam’s definition; it saddles the realist with the classical difficulty of matching words to objects and of providing for a correspondence relation between sentences and mind-independent “facts.” The metaphysical realist is forced to construe her thesis ontologically, as an adherence to some fixed furniture of objects in the world, which ignores the possibility that ontological commitment may be specified not as a commitment to a set of entities but rather to the truth of a class of sentences or even of whole theories of the world.>
<Sentence: Assume we are brains in a vat If we are brains in a vat, then “brain” does not refer to brain, and “vat” does not refer to vat (via CC) If “brain 


LexRank is an unsupervised approach to text summarization based on graph-based centrality scoring of sentences. The main idea is that sentences “recommend” other similar sentences to the reader. Thus, if one sentence is very similar to many others, it will likely be a sentence of great importance. The importance of this sentence also stems from the importance of the sentences “recommending” it. Thus, to get ranked highly and placed in a summary, a sentence must be similar to many sentences that are in turn also similar to many other sentences. This makes intuitive sense and allows the algorithms to be applied to any arbitrary new text.

For more information go to:

https://pypi.org/project/lexrank/
https://arxiv.org/pdf/1109.2128.pdf


In [42]:
print ("--LexRankSummarizer--")   
summarizer = LexRankSummarizer()
summarizer = LexRankSummarizer(Stemmer(LANGUAGE))
summarizer.stop_words = ("I", "am", "the", "you", "are", "me", "is", "than", "that", "this")
for sentence in summarizer(parser.document, SENTENCES_COUNT):
    print(sentence)

'--LexRankSummarizer--'
<Sentence: If metaphysical realism is true, then global skepticism is possible If global skepticism is possible, then we can be brains in a vat But we cannot be brains in a vat Thus, metaphysical realism is false (1,2,3)>
<Sentence: Assume we are brains in a vat If we are brains in a vat, then “brain” does not refer to brain, and “vat” does not refer to vat (via CC) If “brain in a vat” does not refer to brains in a vat, then “we are brains in a vat” is false Thus, if we are brains in a vat, then the sentence “We are brains in a vat” is false (1,2,3)>
<Sentence: From (CC) we know that “brains in a vat” does not refer to brain in a vat.>
<Sentence: On this construal of the truth-conditions, “We are brains in a vat” as uttered by a BIV would presumably be false , since a brain in a vat would not have sense-impressions of being a brain in a vat: recall a BIV’s notional world would be equivalent to the unenvatted, and he would appear to himself to be a normally embod