### This notebook merges multi-part documents in the existing corpus, then scrapes the philosophy texts listed at http://sacred-texts.com/phi/

In [1]:
import numpy as np
import pandas as pd
import requests
import urllib
from bs4 import BeautifulSoup
from collections import defaultdict
import sys
import os
sys.path.append(os.path.abspath('../scrapers'))
from dataframes import Philosophers, Documents
import re
import os
from string import punctuation

### Loading Dataframes and Cleaning Documents

In [21]:
# Load the two project frames from their CSV files (paths are relative to
# this notebook's directory).
phils, docs = (
    Philosophers(filepath='../data/philosophers.csv'),
    Documents(filepath='../data/documents.csv'),
)

In [22]:
# Preview the first five philosopher rows.
phils.df.head(n=5)

Unnamed: 0,name,year_born,year_died,century,time_period,era,image_path
0,thales of miletos,-624,-546,-500,presocratic,ancient,/Users/tylerlazoen/philosophy_capstone/images/...
1,anaximander,-610,-546,-500,presocratic,ancient,/Users/tylerlazoen/philosophy_capstone/images/...
2,anaximenes,-585,-525,-500,presocratic,ancient,/Users/tylerlazoen/philosophy_capstone/images/...
3,pythagoras,-570,-490,-500,presocratic,ancient,/Users/tylerlazoen/philosophy_capstone/images/...
4,heraclitus,-535,-475,-500,presocratic,ancient,/Users/tylerlazoen/philosophy_capstone/images/...


In [23]:
# Report the document frame's (rows, columns), then preview its first rows.
print(docs.df.shape)
docs.df.head(n=5)

(170, 6)


Unnamed: 0,title,author,year,text,url,filepath
0,the prince,niccolo machiavelli,1532,the prince niccol machiavelli copyright 201020...,http://www.earlymoderntexts.com/assets/pdfs/ma...,/Users/tylerlazoen/philosophy_capstone/pdfs/ma...
1,the new organon,francis bacon,1620,or true directions concerning the interpretati...,http://www.earlymoderntexts.com/assets/pdfs/ba...,/Users/tylerlazoen/philosophy_capstone/pdfs/ba...
2,selected correspondence,rene descartes,1619,selected correspondence of descartes ren desca...,http://www.earlymoderntexts.com/assets/pdfs/de...,/Users/tylerlazoen/philosophy_capstone/pdfs/de...
3,discourse on the method,rene descartes,1637,discourse on the method of rightly conducting ...,http://www.earlymoderntexts.com/assets/pdfs/de...,/Users/tylerlazoen/philosophy_capstone/pdfs/de...
4,meditations on first philosophy,rene descartes,1641,in which are demonstrated the existence of god...,http://www.earlymoderntexts.com/assets/pdfs/de...,/Users/tylerlazoen/philosophy_capstone/pdfs/de...


In [86]:
# Print every title that contains the first eight characters of the title on
# the row just before it. Consecutive rows that share such a prefix are
# candidates for works stored as several "part N" / "book N" / "essay N" rows.
title_values = docs.df.title.values
previous_title = title_values[0]
for position in range(1, len(title_values)):
    current_title = title_values[position]
    if previous_title[:8] in current_title:
        print(current_title)
    previous_title = current_title

# Works identified as multi-part:
#   leviathan, essay concerning human understanding,
#   new essays on human understanding,
#   essays on the intellectual powers of man,
#   essays on the active powers of man, a treatise of human nature,
#   an inquiry into the nature and causes of the wealth of nations,
#   the critique of pure reason, a system of logic

leviathan part 2
leviathan part 3
leviathan part 4
objections to the meditations, and descartes' replies
second treatise of government
essay concerning human understanding part 2
essay concerning human understanding part 3
essay concerning human understanding part 4
new essays on human understanding book 2
new essays on human understanding book 3
new essays on human understanding book 4
exchange of papers with leibniz
an inquiry concerning virtue and morality
essays on the intellectual powers of man essay 2
essays on the intellectual powers of man essay 3
essays on the intellectual powers of man essay 4
essays on the intellectual powers of man essay 5
essays on the intellectual powers of man essay 6
essays on the intellectual powers of man essay 7
essays on the active powers of man essay 1
essays on the active powers of man essay 2
essays on the active powers of man essay 3
essays on the active powers of man essay 4
essays on the active powers of man essay 5
a treatise of human nature 

In [99]:
# Base titles of the works that are stored as several consecutive rows; each
# one is matched as a title prefix when the parts are merged.
part_texts = [
    'leviathan',
    'essay concerning human understanding',
    'new essays on human understanding',
    'essays on the intellectual powers of man',
    'essays on the active powers of man',
    'a treatise of human nature',
    'an inquiry into the nature and causes of the wealth of nations',
    'the critique of pure reason',
    'a system of logic',
]

In [108]:
# Collapse every multi-part work into a single row of docs.df.
#
# For each base title in part_texts, all rows whose title starts with that
# base title are replaced by one merged row:
#   title    -> the base title
#   author   -> taken from the first matching row
#   year     -> taken from the first matching row
#   text     -> the parts' texts joined in row order
#   url      -> list of the parts' urls
#   filepath -> list of the parts' file paths
#
# NOTE: running this cell again on an already merged frame matches the merged
# row itself and wraps its url/filepath lists in another list.
for text_name in part_texts:
    # Literal prefix match; rows with a missing title never match.
    is_part = docs.df.title.str.startswith(text_name, na=False)
    parts = docs.df[is_part]
    if parts.empty:
        # No row carries this base title -- skip it rather than failing on
        # an empty selection.
        continue

    merged_entry = {'title': text_name,
                    'author': parts['author'].iloc[0],
                    'year': parts['year'].iloc[0],
                    # Join with a space so the last word of one part and the
                    # first word of the next are not fused together.
                    'text': ' '.join(parts['text']),
                    'url': list(parts['url']),
                    'filepath': list(parts['filepath'])}

    # Remove the part rows *before* the index is renumbered. Appending with
    # ignore_index=True first and dropping by the old labels afterwards
    # deletes the wrong rows as soon as the index has gaps, i.e. from the
    # second merged work onward.
    docs.df = pd.concat([docs.df[~is_part], pd.DataFrame([merged_entry])],
                        ignore_index=True)

In [111]:
# Persist the merged frame through the project's Documents.save_df() helper
# (presumably back to the CSV passed as `filepath` at load time -- TODO confirm).
docs.save_df()

### Scraping Sacred Texts

In [132]:
# Download the philosophy index page and parse it with the lxml parser.
base_url = 'http://sacred-texts.com/phi/'
# The timeout keeps the cell from hanging indefinitely on an unresponsive
# server; raise_for_status() stops here on an HTTP error instead of letting
# an error page be parsed as if it were the index.
r = requests.get(base_url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, 'lxml')

In [236]:
# Build four parallel lists -- new_names, titles, dates, links -- with one
# element per text linked from the index page.
#
# Author names are read from the <h3> headings (the last heading is
# discarded), and each text from a <span class="c_e"> whose text has the form
# "Title [date]" and which contains the link. Links are grouped by their
# first path component: consecutive links sharing it are credited to the
# same heading. NOTE(review): this presumes headings and link groups appear
# in the same order on the page -- confirm against the live markup.
names = [x.get_text().strip() for x in soup.select('h3')][:-1]
# 'Epicurus' is taken out of the heading list and re-attached by position
# below: entry 9 (epi/letter.txt in the output) shares the 'epi' directory
# with the two links before it, so directory-based grouping alone would
# credit it to the preceding heading.
names.remove('Epicurus')
titles_dates = [x.get_text() for x in soup.select('span.c_e')]
links = [x['href'].strip() for x in soup.select('span.c_e a')]

new_names = []
titles = []
dates = []
i = 0  # position of the next unprocessed entry in links / titles_dates
for name in names:

    # Stop once every link has been consumed (there can be more headings
    # than link groups).
    if i >= len(links):
        break
    link_base = links[i].split('/')[0]

    for entry in titles_dates[i:]:

        # Leave the group at the first link from another directory, and never
        # index past the end of links.
        if i >= len(links) or link_base not in links[i]:
            break

        if i == 9:
            name = 'Epicurus'
        new_names.append(name)

        # "Title [date]" -> ['Title ', 'date', ...]; no brackets -> ['Title'].
        pieces = re.split(r'[\[\]]', entry)
        title = pieces[0].strip()
        if title == 'Discourse on the Method of Rightly Conducting the Reason, and Seeking Truth in the Sciences':
            title = 'Discourse on the Method'
        titles.append(title)

        # Use the first run of digits as the year, so a range such as
        # "1739-40" yields 1739 rather than 173940. A missing date, or a
        # bracket without digits, is recorded as 0. Years marked BCE are
        # stored as negative numbers.
        date_text = pieces[1] if len(pieces) > 1 else ''
        year_match = re.search(r'\d+', date_text)
        if year_match is None:
            dates.append(0)
        else:
            year_value = int(year_match.group())
            dates.append(-year_value if 'BCE' in date_text else year_value)
        i += 1

# Make the links absolute; hrefs that climb out of /phi/ ("../...") are
# rebased onto the site root.
links = ['http://sacred-texts.com/phi/' + x for x in links]
links = [x.replace('phi/../', '') for x in links]

# Drop every entry whose link contains 'spinoza', keeping the four lists
# aligned. Filtering with a mask (rather than popping at links.index(...))
# stays correct even if a link occurs twice.
keep = ['spinoza' not in x for x in links]
new_names = [value for value, kept in zip(new_names, keep) if kept]
titles = [value for value, kept in zip(titles, keep) if kept]
dates = [value for value, kept in zip(dates, keep) if kept]
links = [value for value, kept in zip(links, keep) if kept]

print(len(new_names))
print(len(titles))
print(len(dates))
links

67
67
67


['http://sacred-texts.com/phi/berkeley/three.txt',
 'http://sacred-texts.com/phi/berkeley/treatise.txt',
 'http://sacred-texts.com/phi/desc/disc.txt',
 'http://sacred-texts.com/phi/desc/med.txt',
 'http://sacred-texts.com/phi/emerson/trans.txt',
 'http://sacred-texts.com/phi/emerson/essay1.txt',
 'http://sacred-texts.com/phi/emerson/essay2.txt',
 'http://sacred-texts.com/phi/epi/enchir.txt',
 'http://sacred-texts.com/phi/epi/disc.txt',
 'http://sacred-texts.com/phi/epi/letter.txt',
 'http://sacred-texts.com/phi/hobbes/leviath.txt',
 'http://sacred-texts.com/phi/hume/of1.txt',
 'http://sacred-texts.com/phi/hume/of5.txt',
 'http://sacred-texts.com/phi/hume/of6.txt',
 'http://sacred-texts.com/phi/hume/of4.txt',
 'http://sacred-texts.com/phi/hume/of7.txt',
 'http://sacred-texts.com/phi/hume/enquiry.txt',
 'http://sacred-texts.com/phi/hume/letter.txt',
 'http://sacred-texts.com/phi/hume/essays.txt',
 'http://sacred-texts.com/phi/hume/natural.txt',
 'http://sacred-texts.com/phi/hume/of2.txt'

### Add Documents to Dataframe

In [267]:
# Download every scraped text whose (lower-cased) title is not yet in
# docs.df, store it under ~/philosophy_capstone/text_files/, clean it with
# docs.clean_text() and register it through docs.add_document().
text_dir = os.path.join(os.path.expanduser('~'), 'philosophy_capstone', 'text_files')
if not os.path.isdir(text_dir):
    # Create the target directory on first use instead of failing on the
    # first download.
    os.makedirs(text_dir)

for i, (url, author, title, year) in enumerate(zip(links, new_names, titles, dates)):
    author = author.lower()
    title = title.lower()

    # Titles already present in the frame are skipped, not downloaded again.
    if title in docs.df.title.values:
        continue

    print('\n{} documents remaining'.format(len(links) - i))
    # The title doubles as the file name; a path separator inside it would
    # otherwise point into a non-existent sub-directory.
    filepath = os.path.join(text_dir, title.replace(os.sep, '-'))
    print('Downloading file for {}'.format(title))
    # requests is used for the download because urllib.urlretrieve only
    # exists on Python 2; the timeout and status check keep a dead or broken
    # link from hanging the loop or saving an error page as a document.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open(filepath, 'wb') as f:
        f.write(response.content)
    with open(filepath) as f:
        text = f.read()
    text = docs.clean_text(text)
    print('Adding Document')
    docs.add_document(author, title, year, text, url, filepath=filepath)


66 documents remaining
Downloading file for a treatise concerning the principles of human knowledge
Adding Document

63 documents remaining
Downloading file for the transcendalist
Adding Document

62 documents remaining
Downloading file for essays, first series
Adding Document

61 documents remaining
Downloading file for essays, second series
Adding Document

60 documents remaining
Downloading file for the enchiridion
Adding Document

59 documents remaining
Downloading file for the discourses
Adding Document

58 documents remaining
Downloading file for letter to menoeceus
Adding Document

56 documents remaining
Downloading file for of superstition and enthusiasm
Adding Document

55 documents remaining
Downloading file for of the delicacy of taste and passion
Adding Document

54 documents remaining
Downloading file for of the liberty of the press
Adding Document

53 documents remaining
Downloading file for of essay writing
Adding Document

52 documents remaining
Downloading file for of

In [268]:
# Report the frame's (rows, columns) after the additions, then show its last
# rows.
print(docs.df.shape)
docs.df.tail(n=5)

(158, 6)


Unnamed: 0,title,author,year,text,url,filepath
153,a plea for captain john brown,henry david thoreau,1853,1853 a plea for captain john brown by henry da...,http://sacred-texts.com/phi/thoreau/plea.txt,/Users/tylerlazoen/philosophy_capstone/text_fi...
154,slavery in massachusetts,henry david thoreau,1854,1854 slavery in massachusetts by henry david t...,http://sacred-texts.com/phi/thoreau/slavery.txt,/Users/tylerlazoen/philosophy_capstone/text_fi...
155,life without principle,henry david thoreau,1863,1863 life without principle by henry david tho...,http://sacred-texts.com/phi/thoreau/life.txt,/Users/tylerlazoen/philosophy_capstone/text_fi...
156,civil disobedience,henry david thoreau,1849,1849 civil disobedience by henry david thoreau...,http://sacred-texts.com/phi/thoreau/civil.txt,/Users/tylerlazoen/philosophy_capstone/text_fi...
157,"walden, or life in the woods",henry david thoreau,1854,1854 walden or life in the woods by henry davi...,http://sacred-texts.com/phi/thoreau/walden.txt,/Users/tylerlazoen/philosophy_capstone/text_fi...


In [271]:
# Persist the frame, now including the scraped texts, through the project's
# Documents.save_df() helper (destination is defined there -- TODO confirm).
docs.save_df()