### This Notebook scrapes http://www.earlymoderntexts.com/ 
### And Creates Dataframes/CSV Files For Later Use

In [1]:
import requests
from bs4 import BeautifulSoup
from unidecode import unidecode
from collections import defaultdict
from string import punctuation
import urllib
import os
import re
import time
import sys
import os
sys.path.append(os.path.abspath('../scrapers'))
import philosopher_profile_data as ppd

### Add New Philosophers on Page to Philosophers Dictionary

In [2]:
philosophers = ppd.western_philosophers()

In [3]:
url = 'http://www.earlymoderntexts.com/texts'
time.sleep(1)
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')

In [4]:
authors = [unidecode(x.string) for x in soup.select('li b')]
authors.remove('Isaac Newton (1642-1727)')
authors.remove('Sophie de Grouchy (1764-1822)')
years = [filter(str.isdigit, x) for x in authors]
lifespans = [(x[:4], x[4:]) for x in years]
authors = [re.split(r'[()]', x)[0].strip() for x in authors]
idx = authors.index('Anthony Ashley Cooper, third Earl of Shaftesbury')
authors[idx] = 'Anthony Ashley Cooper'
birth_years = [int(x[0]) for x in lifespans]
death_years = [int(x[1]) for x in lifespans]

In [5]:
print(len(authors))
print(len(birth_years))
print(len(death_years))
for i in range(len(authors)):
    print('\nAuthor: {}, Year Born: {}, Year Died: {}'.format(authors[i], birth_years[i], death_years[i]))

31
31
31

Author: Niccolo Machiavelli, Year Born: 1469, Year Died: 1527

Author: Francis Bacon, Year Born: 1561, Year Died: 1626

Author: Thomas Hobbes, Year Born: 1588, Year Died: 1679

Author: Rene Descartes, Year Born: 1596, Year Died: 1650

Author: Robert Boyle, Year Born: 1627, Year Died: 1691

Author: Anne Conway, Year Born: 1631, Year Died: 1679

Author: Baruch Spinoza, Year Born: 1632, Year Died: 1677

Author: John Locke, Year Born: 1632, Year Died: 1704

Author: Nicolas Malebranche, Year Born: 1638, Year Died: 1715

Author: Gottfried Wilhelm Leibniz, Year Born: 1646, Year Died: 1716

Author: Anthony Ashley Cooper, Year Born: 1671, Year Died: 1713

Author: Samuel Clarke, Year Born: 1675, Year Died: 1729

Author: George Berkeley, Year Born: 1685, Year Died: 1753

Author: Joseph Butler, Year Born: 1692, Year Died: 1752

Author: Francis Hutcheson, Year Born: 1694, Year Died: 1746

Author: Jonathan Edwards, Year Born: 1703, Year Died: 1758

Author: Julien Offray de La Mettrie, Year

In [6]:
print(len(authors))
print(len(philosophers))
print(len([author for author in authors if author not in philosophers.keys()]))

31
83
16


In [7]:
for i in range(len(authors)):
    if authors[i] not in philosophers.keys():
        philosophers = ppd.add_new(philosophers, authors[i], birth_years[i], death_years[i])

In [8]:
print(len(philosophers))
print(len([x for x in philosophers if philosophers[x]['time_period'] == None]))

99
0


In [9]:
philosophers

DefaultOrderedDict([('Thales of Miletos',
                     {'Nationality': 'Greek',
                      'image_path': '/Users/tylerlazoen/philosophy_capstone/images/thales_of_miletos.jpg',
                      'time_period': 'presocratic',
                      'year_born': -624,
                      'year_died': -546}),
                    ('Anaximander',
                     {'Nationality': 'Greek',
                      'image_path': '/Users/tylerlazoen/philosophy_capstone/images/anaximander.jpg',
                      'time_period': 'presocratic',
                      'year_born': -610,
                      'year_died': -546}),
                    ('Anaximenes',
                     {'Nationality': 'Greek',
                      'image_path': '/Users/tylerlazoen/philosophy_capstone/images/anaximenes.jpg',
                      'time_period': 'presocratic',
                      'year_born': -585,
                      'year_died': -525}),
                    ('Pythagoras'

### Create Documents Dictionary and Define Necessary Functions

In [10]:
documents = defaultdict(dict)

In [11]:
def add_document(dct, author, title, year, url, filepath):
    idx = len(dct)
    dct[idx]['title'] = title
    dct[idx]['author'] = author
    dct[idx]['year'] = year
    dct[idx]['pdf_url'] = url
    dct[idx]['pdf_file'] = filepath
    
    return dct

In [12]:
def get_pdfs(author):
    base_link = 'http://www.earlymoderntexts.com/authors/'
    author_link = None

    if author == 'Julien Offray de La Mettrie':
        author_link = base_link + 'lamettrie'
        
    elif author == 'Anthony Ashley Cooper':
        author_link = base_link + 'shaftesbury'
        
    else:
        name_split = author.split()
        last_name = name_split[-1].lower().strip()
        author_link = base_link + last_name
    
    url = author_link
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')

    text_author = unidecode(soup.select('div.content h2')[0].get_text())
    text_author = re.split(r',', text_author)[0]

    titles = [unidecode(x.get_text()) for x in soup.select('li b')]
    
    if author == 'Thomas Hobbes':
        title = titles[0]
        titles = []
        for i in range(1, 5):
            titles.append(title + ' Part {}'.format(i))
    
    elif author == 'John Locke':
        idx = titles.index('Essay Concerning Human Understanding')
        for i in range(1, 5):
            title = 'Essay Concerning Human Understanding' + ' Part {}'.format(i)
            titles.insert(idx + (i-1), title)
        titles.remove('Essay Concerning Human Understanding')
    
    elif author == 'Gottfried Wilhelm Leibniz':
        idx = titles.index('New Essays on Human Understanding')
        for i in range(1, 5):
            title = 'New Essays on Human Understanding' + ' Book {}'.format(i)
            titles.insert(idx + (i-1), title)
        titles.remove('New Essays on Human Understanding')
        
    elif author == 'Thomas Reid':
        idx1 = titles.index('Essays on the Intellectual Powers of Man')
        
        for i in range(1, 8):
            title = 'Essays on the Intellectual Powers of Man' + ' Essay {}'.format(i)
            titles.insert(idx1 + (i-1), title)

        idx2 = titles.index('Essays on the Active Powers of Man')
        
        for i in range(1, 6):
            title = 'Essays on the Active Powers of Man' + ' Essay {}'.format(i)
            titles.insert(idx2 + (i-1), title)
            
        titles.remove('Essays on the Intellectual Powers of Man')
        titles.remove('Essays on the Active Powers of Man')
    
    elif author == 'David Hume':
        idx = titles.index('A Treatise of Human Nature')
        books = ['I', 'II', 'III']
        
        for i, book in enumerate(books):
            title = 'A Treatise of Human Nature' + ' Book ' + book
            titles.insert(idx + (i), title)
            
        titles.remove('A Treatise of Human Nature')
    
    elif author == 'Adam Smith':
        idx = titles.index('An Inquiry into the Nature and Causes of the Wealth of Nations')
        books = ['Book I', 'Book II', 'Books III and IV', 'Book V']
        
        for i, book in enumerate(books):
            title = 'An Inquiry into the Nature and Causes of the Wealth of Nations' + ' ' + book
            titles.insert(idx + (i), title)
        
        titles.remove('An Inquiry into the Nature and Causes of the Wealth of Nations')
    
    elif author == 'Immanuel Kant':
        idx = titles.index('The Critique of Pure Reason')
        parts = ['First Part', 'Second Part']
        
        for i, part in enumerate(parts):
            title = 'The Critique of Pure Reason' + ' ' + part
            titles.insert(idx + (i), title)
        
        titles.remove('The Critique of Pure Reason')
    
    elif author == 'John Stuart Mill':
        idx1 = titles.index('A System of Logic')
        books = ['I', 'II', 'III', 'IV']
        
        for i, book in enumerate(books):
            title = 'A System of Logic' + ' Book ' + book
            titles.insert(idx1 + (i), title)
        
        idx2 = titles.index('Three Essays on Religion')
        books = ['Helen Taylor\'s Introduction to the Essays', 'Nature', 'The Usefulness of Religion', \
                'Theism']
        
        for i, book in enumerate(books):
            title = book
            titles.insert(idx2 + (i), title)
        
        titles.remove('A System of Logic')
        titles.remove('Three Essays on Religion')
            
    long_titles = [unidecode(x.get_text()) for x in soup.select('li')]
    year_pub = []
    
    for title in titles:
        
        if author == 'Thomas Hobbes':
            year_pub.append(1651)
            continue
        elif title[:36] == 'Essay Concerning Human Understanding' and author == 'John Locke':
            year_pub.append(1690)
            continue
        
        elif title[:33] == 'New Essays on Human Understanding' and author == 'Gottfried Wilhelm Leibniz':
            year_pub.append(1705)
            continue
        
        elif author == 'Thomas Reid':
            if title[:34] == 'Essays on the Active Powers of Man':
                year_pub.append(1788)
                continue
                
            elif title[:40] == 'Essays on the Intellectual Powers of Man':
                year_pub.append(1785)
                continue
            else:
                title = title
            
        elif author == 'David Hume' and title[:26] == 'A Treatise of Human Nature':
            year_pub.append(1739)
            continue
        
        elif author == 'Adam Smith' and 'An Inquiry into the Nature and Causes of the Wealth of Nations':
            year_pub.append(1776)
            continue
        
        elif author == 'Immanuel Kant':
            if title == 'The Critique of Pure Reason First Part':
                year_pub.append(1781)
                continue
                
            elif title == 'The Critique of Pure Reason Second Part':
                year_pub.append(1787)
                continue
            else:
                title = title
            
        elif author == 'John Stuart Mill':
            if title[:17] == 'A System of Logic':
                year_pub.append(1843)
                continue
            elif title == 'Liberty' or title == 'Utilitarianism' or title == 'The Subjection of Women':
                title = title
            else:
                year_pub.append(1873)
                continue
                
        for long_title in long_titles:
            if title in long_title:
                components = re.split(r'[(),]', long_title)
                for component in components:
                    year = filter(str.isdigit, component)
                    if len(year) >= 3:
                        year_pub.append(int(year[:4]))
                        break

    base_pdf_link = 'http://www.earlymoderntexts.com'

    links = [unidecode(x['href']) for x in soup.select('li a')][5:]
    first_pdf = links[0].split('/')[-1]

    pdf_links = [base_pdf_link + links[0]]
    filepaths = [os.path.expanduser('~') + '/philosophy_capstone/pdfs/' + first_pdf]

    last_doc = links[0]
    for link in links[1:]:
        if last_doc[:-4] in link:
            continue
        else:
            text_link = base_pdf_link + link
            pdf_name = link.split('/')[-1]
            filepaths.append(os.path.expanduser('~') + '/philosophy_capstone/pdfs/' + pdf_name)
            pdf_links.append(text_link)
            last_doc = link
    
    return text_author, titles, year_pub, pdf_links, filepaths

In [13]:
documents

defaultdict(dict, {})

### Add Documents to Document Dictionary

In [14]:
for author in authors:
    print('\nReceiving PDF\'s for {}'.format(author))
    text_author, titles, year_pub, pdf_links, filepaths = get_pdfs(author)

    for i in range(len(pdf_links)):
        print('Downloading PDF {} for {}'.format(i, author))
#         urllib.urlretrieve(pdf_links[i], filepaths[i])
#         print('Adding Document')
        documents = add_document(documents, text_author, titles[i], year_pub[i], pdf_links[i], filepaths[i])
#         time.sleep(5)


Receiving PDF's for Niccolo Machiavelli
Adding Document

Receiving PDF's for Francis Bacon
Adding Document

Receiving PDF's for Thomas Hobbes
Adding Document
Adding Document
Adding Document
Adding Document

Receiving PDF's for Rene Descartes
Adding Document
Adding Document
Adding Document
Adding Document
Adding Document
Adding Document
Adding Document
Adding Document

Receiving PDF's for Robert Boyle
Adding Document
Adding Document

Receiving PDF's for Anne Conway
Adding Document

Receiving PDF's for Baruch Spinoza
Adding Document
Adding Document
Adding Document

Receiving PDF's for John Locke
Adding Document
Adding Document
Adding Document
Adding Document
Adding Document
Adding Document
Adding Document

Receiving PDF's for Nicolas Malebranche
Adding Document

Receiving PDF's for Gottfried Wilhelm Leibniz
Adding Document
Adding Document
Adding Document
Adding Document
Adding Document
Adding Document
Adding Document
Adding Document
Adding Document
Adding Document
Adding Document
Adding

In [15]:
documents

defaultdict(dict,
            {0: {'author': 'Niccolo Machiavelli',
              'pdf_file': '/Users/tylerlazoen/philosophy_capstone/pdfs/machiavelli1532.pdf',
              'pdf_url': 'http://www.earlymoderntexts.com/assets/pdfs/machiavelli1532.pdf',
              'title': 'The Prince',
              'year': 1532},
             1: {'author': 'Francis Bacon',
              'pdf_file': '/Users/tylerlazoen/philosophy_capstone/pdfs/bacon1620.pdf',
              'pdf_url': 'http://www.earlymoderntexts.com/assets/pdfs/bacon1620.pdf',
              'title': 'The New Organon',
              'year': 1620},
             2: {'author': 'Thomas Hobbes',
              'pdf_file': '/Users/tylerlazoen/philosophy_capstone/pdfs/hobbes1651part1.pdf',
              'pdf_url': 'http://www.earlymoderntexts.com/assets/pdfs/hobbes1651part1.pdf',
              'title': 'Leviathan Part 1',
              'year': 1651},
             3: {'author': 'Thomas Hobbes',
              'pdf_file': '/Users/tylerlazoen/ph

### PDF Text Extraction Functions

In [16]:
from cStringIO import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument

In [17]:
def convert(fname, pages=None):
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    infile = file(fname, 'rb')
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()
    output.close
    return text 

In [19]:
def get_text(pdf_file, author):
    text = convert(pdf_file)
    
    start = text.find('Copyright')
    end = text[start:].find(author) + len(text[:start])
    
    if not (start == -1 or end == -1):
        text = text[:start] + text[end-1:]
        name_first = text.find(author)
        idx = name_first + len(author)
        text = text[:idx] + text[idx:].replace(author, '')
    
    text = text.strip(punctuation)
    
    if not(text.isalpha()):
        text_lst = text.split()
        
        for i in range(len(text_lst)):
            if not(text_lst[i].isalpha()):
                text_lst[i] = filter(str.isalnum, text_lst[i])
        
        text = ' '.join(word for word in text_lst)
    
    return text

### Make Documents Dictionary Pandas Dataframe and Add Text

In [20]:
import pandas as pd
lst = []
for key in documents.keys():
    lst.append(documents[key])

In [21]:
documents_df = pd.DataFrame(lst)

In [26]:
import numpy as np
documents_df['text'] = np.nan
documents_df.head()

Unnamed: 0,author,pdf_file,pdf_url,title,year,text
0,Niccolo Machiavelli,/Users/tylerlazoen/philosophy_capstone/pdfs/ma...,http://www.earlymoderntexts.com/assets/pdfs/ma...,The Prince,1532,
1,Francis Bacon,/Users/tylerlazoen/philosophy_capstone/pdfs/ba...,http://www.earlymoderntexts.com/assets/pdfs/ba...,The New Organon,1620,
2,Thomas Hobbes,/Users/tylerlazoen/philosophy_capstone/pdfs/ho...,http://www.earlymoderntexts.com/assets/pdfs/ho...,Leviathan Part 1,1651,
3,Thomas Hobbes,/Users/tylerlazoen/philosophy_capstone/pdfs/ho...,http://www.earlymoderntexts.com/assets/pdfs/ho...,Leviathan Part 2,1651,
4,Thomas Hobbes,/Users/tylerlazoen/philosophy_capstone/pdfs/ho...,http://www.earlymoderntexts.com/assets/pdfs/ho...,Leviathan Part 3,1651,


In [29]:
for i in range(documents_df.shape[0]):
    author = documents_df.loc[i, 'author']
    pdf_file = documents_df.loc[i, 'pdf_file']
    print('\nGetting Text for {}'.format(documents_df.loc[i, 'title']))
    documents_df.loc[i, 'text'] = get_text(pdf_file, author)


Getting Text for The Prince

Getting Text for The New Organon

Getting Text for Leviathan Part 1

Getting Text for Leviathan Part 2

Getting Text for Leviathan Part 3

Getting Text for Leviathan Part 4

Getting Text for Selected Correspondence

Getting Text for Discourse on the Method

Getting Text for Meditations on First Philosophy

Getting Text for Objections to the Meditations, and Descartes' Replies

Getting Text for Correspondence with Princess Elisabeth

Getting Text for Principles of Philosophy

Getting Text for Conversation with Burman

Getting Text for Passions of the Soul

Getting Text for The Origin of Forms and Qualities, Part 1

Getting Text for The Grounds for and Excellence of the Corpuscular or Mechanical Philosophy

Getting Text for The Principles of the Most Ancient and Modern Philosophy

Getting Text for Correspondence

Getting Text for Ethics Demonstrated in Geometrical Order

Getting Text for Treatise on Theology and Politics

Getting Text for Second Treatise of 

In [69]:
documents_df = pd.read_csv('data/documents.csv')
cols = list(documents_df)
cols.insert(0, cols.pop(cols.index('title')))
cols.insert(1, cols.pop(cols.index('author')))
cols.insert(2, cols.pop(cols.index('year')))
cols.insert(3, cols.pop(cols.index('text')))
# cols.pop(cols.index('pdf_url'))
cols.insert(4, cols.pop(cols.index('url')))
cols.append('filepath')
cols.pop(cols.index('pdf_file'))
cols = [x.lower() for x in cols]
documents_df = documents_df.ix[:, cols]
documents_df['url'] = [x['pdf_url'] for x in lst]
documents_df['filepath'] = [x['pdf_file'] for x in lst]
documents_df.drop('unnamed: 0', axis=1, inplace=True)
documents_df.head()

Unnamed: 0,title,author,year,text,url,filepath
0,The Prince,Niccolo Machiavelli,1532,The Prince Niccol Machiavelli Copyright 201020...,http://www.earlymoderntexts.com/assets/pdfs/ma...,/Users/tylerlazoen/philosophy_capstone/pdfs/ma...
1,The New Organon,Francis Bacon,1620,or True Directions Concerning the Interpretati...,http://www.earlymoderntexts.com/assets/pdfs/ba...,/Users/tylerlazoen/philosophy_capstone/pdfs/ba...
2,Leviathan Part 1,Thomas Hobbes,1651,Leviathan Part 1 Man Thomas Hobbes Chapter 3 T...,http://www.earlymoderntexts.com/assets/pdfs/ho...,/Users/tylerlazoen/philosophy_capstone/pdfs/ho...
3,Leviathan Part 2,Thomas Hobbes,1651,Leviathan Part 2 Commonwealth Thomas Hobbes Ch...,http://www.earlymoderntexts.com/assets/pdfs/ho...,/Users/tylerlazoen/philosophy_capstone/pdfs/ho...
4,Leviathan Part 3,Thomas Hobbes,1651,Leviathan Part 3 A Christian Commonwealth Thom...,http://www.earlymoderntexts.com/assets/pdfs/ho...,/Users/tylerlazoen/philosophy_capstone/pdfs/ho...


### Make Philosophers Dictionary a Pandas Dataframe

In [63]:
philosophers

DefaultOrderedDict([('Thales of Miletos',
                     {'Nationality': 'Greek',
                      'image_path': '/Users/tylerlazoen/philosophy_capstone/images/thales_of_miletos.jpg',
                      'name': 'Thales of Miletos',
                      'time_period': 'presocratic',
                      'year_born': -624,
                      'year_died': -546}),
                    ('Anaximander',
                     {'Nationality': 'Greek',
                      'image_path': '/Users/tylerlazoen/philosophy_capstone/images/anaximander.jpg',
                      'name': 'Anaximander',
                      'time_period': 'presocratic',
                      'year_born': -610,
                      'year_died': -546}),
                    ('Anaximenes',
                     {'Nationality': 'Greek',
                      'image_path': '/Users/tylerlazoen/philosophy_capstone/images/anaximenes.jpg',
                      'name': 'Anaximenes',
                      'time_p

In [35]:
phil_list = []
for key in philosophers.keys():
    philosophers[key]['name'] = key
    phil_list.append(philosophers[key])

In [37]:
philosophers_df = pd.DataFrame(phil_list)

In [38]:
philosophers_df.head()

Unnamed: 0,Nationality,image_path,name,time_period,year_born,year_died
0,Greek,/Users/tylerlazoen/philosophy_capstone/images/...,Thales of Miletos,presocratic,-624,-546
1,Greek,/Users/tylerlazoen/philosophy_capstone/images/...,Anaximander,presocratic,-610,-546
2,Greek,/Users/tylerlazoen/philosophy_capstone/images/...,Anaximenes,presocratic,-585,-525
3,Greek,/Users/tylerlazoen/philosophy_capstone/images/...,Pythagoras,presocratic,-570,-490
4,Greek,/Users/tylerlazoen/philosophy_capstone/images/...,Heraclitus,presocratic,-535,-475


In [50]:
cols = list(philosophers_df)
cols.insert(0, cols.pop(cols.index('name')))
cols.insert(2, cols.pop(cols.index('time_period')))
cols.insert(3, cols.pop(cols.index('year_born')))
cols.insert(4, cols.pop(cols.index('year_died')))
cols = [x.lower() for x in cols]
philosophers_df = philosophers_df.ix[:, cols]

In [51]:
philosophers_df.head()

Unnamed: 0,name,nationality,time_period,year_born,year_died,image_path
0,Thales of Miletos,,presocratic,-624,-546,/Users/tylerlazoen/philosophy_capstone/images/...
1,Anaximander,,presocratic,-610,-546,/Users/tylerlazoen/philosophy_capstone/images/...
2,Anaximenes,,presocratic,-585,-525,/Users/tylerlazoen/philosophy_capstone/images/...
3,Pythagoras,,presocratic,-570,-490,/Users/tylerlazoen/philosophy_capstone/images/...
4,Heraclitus,,presocratic,-535,-475,/Users/tylerlazoen/philosophy_capstone/images/...


In [70]:
documents_path = '~/philosophy_capstone/data/documents.csv'
philosophers_path = '~/philosophy_capstone/data/philosophers.csv'

documents_df.to_csv(documents_path)
philosophers_df.to_csv(philosophers_path)