### This Notebook Uses the Gutenberg Module in Python to Obtain Full-Text Documents From Project Gutenberg

In [34]:
import time
import requests
from bs4 import BeautifulSoup
from unidecode import unidecode
import re
import urllib
from string import punctuation
import sys
import os
sys.path.append(os.path.abspath('../code'))
from dataframes import Philosophers, Documents
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

In [58]:
# Build the two project dataframe wrappers from their CSV paths (philosophers first, then documents)
phils = Philosophers(filepath='../data/philosophers.csv')
docs = Documents(filepath='../data/documents.csv')

### Using Functions Created in `gutenberg_failed_scrape.py` to Obtain Document Info (document, title, year made, link)

In [12]:
def get_document_info(tags):
    '''
    Extract document titles and links from a list of bookshelf anchor tags.

    The first tag is always skipped. The hard-coded positions used below are
    indices into tags[1:] and are tied to the layout of the page this was
    written against.

    Parameters
    ----------
    tags : sequence of anchor-like objects
        Each item must expose its link text as `.string` and its target as
        `item['href']`.

    Returns
    -------
    doc_titles : list of str
        One title per kept link, including the rebuilt titles for the
        multi-volume / multi-part works.
    doc_mask : list of str
        Only the titles taken verbatim from the link text (the rebuilt titles
        are left out). Used for creating the authors list later.
    cleaned_links : list of str
        'https:'-prefixed link for every entry of doc_titles, in the same order.
    '''
    # Initialize document title variables
    doc_titles = []
    doc_mask = []  # Used for creating authors list later
    tpt_part = 1

    # Initialize link variables
    cleaned_links = []

    # Run through each title and link in the list
    for i, entry in enumerate(tags[1:]):
        try:
            # Link text up to the first parenthesis is the title of the document
            x = re.split(r'[()]', unidecode(entry.string))[0]

            # Only trailing slashes are trimmed. A plain strip('/') is applied to
            # the href before 'https:' is prepended, so it would also remove the
            # leading '//' of a protocol-relative href and yield 'https:www...'.
            link = 'https:' + unidecode(entry['href']).rstrip('/')

        # Skip links that aren't for philosophical documents (presumably anchors
        # without a plain-text string) and anchors that carry no href at all
        except (AttributeError, KeyError):
            continue

        # Used for inconsistencies: volume/edition links whose text lacks the work's name
        if x == 'vol. 1' and i == 48:
            title = 'A System Of Logic, Ratiocinative And Inductive, 3rd ed. ' + x.strip()

        # NOTE(review): the `or` sends whatever sits at position 49 to the 7th ed.
        # as well; kept as-is because it looks deliberate -- confirm against the page
        elif x == 'vol. 2' or i == 49:
            title = 'A System Of Logic, Ratiocinative And Inductive, 7th ed. ' + x.strip()

        elif x == '8th ed.':
            title = 'A System Of Logic, Ratiocinative And Inductive ' + x.strip()

        # Positions 105-108 are numbered as consecutive parts of one treatise
        elif i in range(105, 109):
            title = 'Theologico-Political Treatise -- ' + 'part ' + str(tpt_part)
            tpt_part += 1

        # Alternate translations, audio versions and loose "part ..." links are dropped
        elif x == 'Butcher trans.' or x == 'audio' or x == 'audiobook' or x[:4] == 'part':
            continue

        else:
            # Ordinary entry: the link text is the title, and it also goes in the mask
            title = x.strip()
            doc_mask.append(title)

        # Every kept entry contributes exactly one title and one link
        doc_titles.append(title)
        cleaned_links.append(link)

    return doc_titles, doc_mask, cleaned_links

In [13]:
def gutenberg_info():
    '''
    Creates and returns work titles with corresponding links and authors

    Downloads the Project Gutenberg Philosophy bookshelf page, takes titles and
    links from its 'a.extiw' anchors (via get_document_info) and derives the
    author of each work from the page's list items.

    Returns
    -------
    authors, doc_titles, links : lists
        Index-aligned lists; when a title occurs more than once only its last
        occurrence is kept.

    Raises
    ------
    requests.exceptions.RequestException
        If the page cannot be fetched in time or the server answers with an
        error status.
    ValueError
        If an expected title / list entry is missing from the page (raised by
        the list .index / .remove calls below).
    '''
    # Base url for retrieving information
    url = 'https://www.gutenberg.org/wiki/Philosophy_(Bookshelf)'
    time.sleep(5)

    # Fail loudly on a hung connection or an HTTP error page instead of
    # parsing whatever came back
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')

    # Obtain titles of documents and links to each one
    titles_links = soup.select('a.extiw')

    doc_titles, doc_mask, links = get_document_info(titles_links)

    # Drop idx of item in order to prevent doubles
    drop_idx = []
    drop_idx.append(doc_titles.index('A System Of Logic, Ratiocinative And Inductive, 3rd ed. vol. 1'))

    # Get the names associated with each document
    names = soup.select('li')

    # Get the cleaned names list
    # NOTE(review): the 15:126 window is tied to the page layout -- presumably it
    # brackets the bookshelf entries among all <li> elements; confirm if the page changes
    authors_and_docs = [unidecode(x.get_text()).strip() for x in names[15:126]]
    authors_and_docs = [re.split(r'[()]', name)[0].strip() for i, name in enumerate(authors_and_docs) if i not in drop_idx]
    authors_and_docs.remove('Benedictus de Spinoza Theologico-Political Treatise -- Part 1')

    # Each remaining entry is "<author> <title>"; removing the matching verbatim
    # title (doc_mask is expected to line up with authors_and_docs) leaves the author
    authors = [entry.replace(doc_mask[i], '').strip() for i, entry in enumerate(authors_and_docs)]

    # Insert authors for books where need be (i.e. multiple volumes or parts)
    for i in range(len(doc_titles)):
        # Multiple edition/volumes of below work
        if doc_titles[i][:46] == 'A System Of Logic, Ratiocinative And Inductive':
            authors.insert(i, 'John Stuart Mill')

        # 4-part Theologico-Political Treatise
        elif '--' in doc_titles[i]:
            authors.insert(i, 'Benedictus de Spinoza')

    # Account for duplicate documents: keep only the last occurrence of each title.
    # Done by index on fresh lists -- popping from doc_titles while iterating over
    # it skips elements and can leave duplicates behind.
    last_position = {title: idx for idx, title in enumerate(doc_titles)}
    keep = [idx for idx, title in enumerate(doc_titles) if last_position[title] == idx]
    authors = [authors[idx] for idx in keep]
    doc_titles = [doc_titles[idx] for idx in keep]
    links = [links[idx] for idx in keep]

    return authors, doc_titles, links

In [14]:
# Scrape the bookshelf page once; the cells below index these three lists in parallel
authors, doc_titles, links = gutenberg_info()

INFO:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): www.gutenberg.org


In [56]:
# Strip every non-digit from each link; what is left is used as the etext id below.
# Raw string: '\D' in a plain string literal is an invalid escape sequence.
ids = [int(re.sub(r'\D', '', link)) for link in links]

# Eyeball the (author, (title, id)) pairing
for x in zip(authors, zip(doc_titles, ids)):
    print(x)

# All four lists are indexed in parallel later, so these counts should match
print(len(doc_titles))
print(len(authors))
print(len(ids))
print(len(links))

('Emile Faguet', ('Initiation into Philosophy', 9304))
('George S. Fullerton', ('An Introduction to Philosophy', 16406))
('Ralph Barton Perry', ('The Approach to Philosophy', 25110))
('John Marshall', ('A Short History of Greek Philosophy', 20500))
('St. George William Joseph Stock', ('Guide to Stoicism', 7514))
('Mary Mills Patrick', ('Sextus Empiricus and Greek Scepticism', 17556))
('Richard Falckenberg', ('History of Modern Philosophy From Nicolas of Cusa to the Present Time', 11100))
('George Santayana', ('Some Turns of Thought in Modern Philosophy: Five Essays', 16712))
('John Alexander Gunn', ('Modern French Philosophy: a Study of the Development Since Comte', 5246))
('Anicius Manlius Severinus Boethius', ('The Theological Tractates and The Consolation of Philosophy', 13316))
('Friedrich Nietzsche', ('Also sprach Zarathustra', 7205))
('Friedrich Nietzsche', ('Thus Spake Zarathustra', 1998))
('Abel J. Jones', ('Rudolph Eucken: a philosophy of life', 14357))
('David Starr Jordan', 

In [45]:
# Try the download path on the first id only: fetch the etext, pass it through
# strip_headers and trim surrounding whitespace
text = strip_headers(load_etext(ids[0])).strip()

### Get the Text for Each Document and Add It to the Documents Dataframe (Only When Its Author Is in the Philosophers Dataframe)

In [60]:
def _trim_to_body(text, start_marker='PREFACE', end_marker='INDEX OF NAMES'):
    '''
    Return the part of text from start_marker up to (not including) end_marker.

    A marker that is not found leaves that side untrimmed, and end_marker is
    only searched for after start_marker so the slice can never be inverted.
    '''
    start = text.find(start_marker)
    if start == -1:
        start = 0
    end = text.find(end_marker, start)
    if end == -1:
        end = len(text)
    return text[start:end]


# Looked up once instead of on every iteration
philosopher_names = phils.df.name.values

for i in range(len(ids)):
    # Normalise the scraped values before they are compared and stored
    author = authors[i].lower().strip()
    title = doc_titles[i].lower().strip()
    url = links[i].lower().strip()
    year = 0  # placeholder: no year is scraped in this notebook

    if author in philosopher_names:
        try:
            print('\nGetting text for document {} by {}'.format(title, author))
            text = strip_headers(load_etext(ids[i])).strip()

            # Keep only the span between the two markers. The previous
            # text[beg_end:] + text[:end_start] glued two overlapping slices
            # together and, on a find() result of -1, rotated the text by one
            # character instead of leaving it alone.
            text = _trim_to_body(text)

            print('Adding document')
            docs.add_document(author, title, year, text, url)
        except ValueError:
            # Reported as an unsupported download; note that any ValueError
            # raised inside the try block lands here
            print('Download URI for {} not supported'.format(ids[i]))
    else:
        print('\nThat Author isn\'t one of the philosophers!')


That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

Getting text for document also sprach zarathustra by friedrich nietzsche
Adding document
Document also sprach zarathustra already exists!

Getting text for document thus spake zarathustra by friedrich nietzsche
Adding document
Document thus spake zarathustra already exists!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

Getting text for document the categories by aristotle
Adding document
Document the categories already exists!

That Author isn't one of the philosophers!

That Author isn't one of the philosopher

INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document
Document the analysis of mind already exists!

Getting text for document tractatus logico-philosophicus by ludwig wittgenstein


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Download URI for 5740 not supported

Getting text for document a system of logic, ratiocinative and inductive, 3rd ed. vol. 1 by john stuart mill


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document

Getting text for document a system of logic, ratiocinative and inductive, 7th ed. vol. 1 by john stuart mill


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document

Getting text for document a system of logic, ratiocinative and inductive, 7th ed. vol. 2 by john stuart mill


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document

Getting text for document a system of logic, ratiocinative and inductive 8th ed. by john stuart mill


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us



That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

Getting text for document ion by plato


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document
Document ion already exists!

Getting text for document poetics by aristotle


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

Getting text for document die geburt der tragodie by friedrich nietzsche


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document

Getting text for document the case of wagner, nietzsche contra wagner, and selected aphorisms by friedrich nietzsche


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document

Getting text for document homer and classical philology by friedrich nietzsche


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document

That Author isn't one of the philosophers!

Getting text for document an enquiry concerning the principles of morals by david hume


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document
Document an enquiry concerning the principles of morals already exists!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

Getting text for document beyond good and evil by friedrich nietzsche


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document

Getting text for document utilitarianism by john stuart mill


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document
Document utilitarianism already exists!

That Author isn't one of the philosophers!

Getting text for document apology by plato


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document
Document apology already exists!

Getting text for document the republic by plato


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document

Getting text for document politics by aristotle


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document

Getting text for document the prince by niccolo machiavelli


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document
Document the prince already exists!

Getting text for document second treatise of government by john locke


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document
Document second treatise of government already exists!

That Author isn't one of the philosophers!

Getting text for document selected essays by karl marx


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document

Getting text for document on liberty by john stuart mill


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document
Document on liberty already exists!

Getting text for document considerations on representative government by john stuart mill


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document

Getting text for document essays on some unsettled questions of political economy by john stuart mill


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document

Getting text for document proposed roads to freedom by bertrand russell


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document

Getting text for document political ideals by bertrand russell


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document

That Author isn't one of the philosophers!

Getting text for document literary and philosophical essays: french, german and italian by michel de montaigne


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document

Getting text for document democracy and education: an introduction to the philosophy of education by john dewey


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document

Getting text for document moral principles in education by john dewey


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document

Getting text for document euthyphro by plato


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document
Document euthyphro already exists!

Getting text for document dialogues concerning natural religion by david hume


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document
Document dialogues concerning natural religion already exists!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

Getting text for document mysticism and logic and other essays by bertrand russell


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

Getting text for document the antichrist by friedrich nietzsche


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

Getting text for document achtundvierzig briefe von johann gottlieb fichte und seinen verwandten by fichte by johann gottlieb fichte


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document

Getting text for document autobiography by john stuart mill


INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Adding document

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!

That Author isn't one of the philosophers!


In [65]:
# Persist the documents dataframe, including the rows added above
# (presumably back to the CSV it was loaded from -- confirm in dataframes.Documents)
docs.save_df()