In [1]:
import bs4 as bs
import json

In [2]:
def count(text):
    """Return ``(paragraph_count, word_count)`` for a sequence of paragraphs.

    Each item of ``text`` is one paragraph; its words are the
    whitespace-separated tokens produced by ``str.split()``.
    """
    paragraph_total = len(text)
    word_total = 0
    for paragraph in text:
        word_total += len(paragraph.split())
    return paragraph_total, word_total

In [3]:
def print_titles(j_dict):
    """Print the number of records, a blank line, then one title per line.

    ``j_dict`` is a sized iterable of mappings that each have a ``'title'`` key.
    """
    # The trailing ' \n' reproduces the space and extra blank line of the header.
    print(f'Number of chapters: {len(j_dict)} \n')
    for record in j_dict:
        print(record['title'])

## Essays

In [4]:
# Parse the essays volume. The encoding is given explicitly so the result does
# not depend on the platform's default locale encoding.
with open('hume_xml/hume.07.xml', 'r', encoding='utf-8') as xml_file:
    soup = bs.BeautifulSoup(xml_file.read(), "xml")

chapters = soup.find_all('div', type='chapter')

j_dict = []
for chapter in chapters[3:]:  # first 3 are not essays
    title = chapter.find('head').text

    # Only paragraphs inside the chapter's first 'section' div are used; line
    # breaks inside a paragraph are flattened to spaces.
    section = chapter.find('div', type='section')
    text = [par.text.rstrip('\n').replace('\n', ' ') for par in section.find_all('p')]
    pc, wc = count(text)

    text_dict = {'genre': 'essay',
                 'title': title,
                 'word-count': wc,
                 'paragraph-count': pc,
                 'text': text}
    j_dict.append(text_dict)

print_titles(j_dict)

with open('hume_json/essays.json', 'w', encoding='utf-8') as json_file:
    json.dump(j_dict, json_file, indent=3)

Number of chapters: 49 

Essay 1: Of the Delicacy of Taste and Passion
Essay 2: Of the Liberty of the Press
Essay 3: That Politics may be Reduced to a Science
Essay 4: Of the First Principles of Government
Essay 5: Of the Origin of Government
Essay 6: Of the Independency of Parliament
Essay 7: Whether the British Government inclines more to Absolute Monarchy, or to a Republic
Essay 8: Of Parties in General
Essay 9: Of the Parties of Great Britain
Essay 10: Of Superstition and Enthusiasm
Essay 11: Of the Dignity or Meanness of Human Nature
Essay 12: Of Civil Liberty
Essay 13: Of Eloquence
Essay 14: Of the Rise and Progress of the Arts and Sciences
Essay 15: The Epicurean
Essay 16: The Stoic
Essay 17: The Platonist
Essay 18: The Sceptic
Essay 19: Of Polygamy and Divorces
Essay 20: Of Simplicity and Refinement in Writing
Essay 21: Of National Characters
Essay 22: Of Tragedy
Essay 23: Of the Standard of Taste
Essay 1: Of Commerce
Essay 2: Of Refinement in the Arts
Essay 3: Of Money
Essay 4

## History

In [5]:
def get_history(volume, j_dict):
    """Append the chapters of one History volume to ``j_dict``.

    Reads ``hume_xml/hume.1{volume}.xml``, visits every ``div`` of type
    ``chapter`` and appends one record per kept chapter (genre, title, word
    count, paragraph count, paragraph texts).

    Args:
        volume: Volume number, interpolated into the file name after the
            leading ``1`` (``3`` selects ``hume.13.xml``).
        j_dict: List that receives the records; it is modified in place.

    Returns:
        The same ``j_dict`` list that was passed in.
    """
    path = f'hume_xml/hume.1{volume}.xml'
    print(path)

    # Explicit encoding: do not depend on the platform's locale default.
    with open(path, 'r', encoding='utf-8') as xml_file:
        soup = bs.BeautifulSoup(xml_file.read(), "xml")

    for chapter in soup.find_all('div', type='chapter'):

        # The first line of the heading is used as the chapter title.
        number = chapter.find('head').text.split('\n')[0]

        # Keep 'tbindent' paragraphs that are not inside a footnote. ``.get`` is
        # used so a parent without a ``type`` attribute does not raise KeyError.
        # NOTE(review): only double newlines are replaced here, whereas the other
        # cells replace every '\n' — confirm this difference is intentional.
        text = [par.text.rstrip('\n').replace('\n\n', ' ')
                for par in chapter.find_all('p', rend="tbindent")
                if par.parent.get('type') != 'footnote']

        # Skip chapters without body text and those headed 'Table of Contents'
        # or ' An Historian at Work' (the leading space is part of the heading).
        if not text or number == 'Table of Contents' or number == ' An Historian at Work':
            continue

        pc, wc = count(text)

        text_dict = {'genre': 'history',
                     'title': number,
                     'word-count': wc,
                     'paragraph-count': pc,
                     'text': text}
        j_dict.append(text_dict)

    return j_dict

In [6]:
# Gather the chapters of History volumes 1-6 (files hume.11.xml .. hume.16.xml)
# into one list; get_history appends to the list it is given.
j_dict = []
for volume in range(1, 7):
    get_history(volume, j_dict)

print()  # blank line between the printed file names and the title listing
print_titles(j_dict)

with open('hume_json/history.json', mode='w') as out_file:
    json.dump(j_dict, fp=out_file, indent=3)

hume_xml/hume.11.xml
hume_xml/hume.12.xml
hume_xml/hume.13.xml
hume_xml/hume.14.xml
hume_xml/hume.15.xml
hume_xml/hume.16.xml

Number of chapters: 75 

I
II
III
APPENDIX I
IV
V
VI
VII
VIII 
IX
X
XI
APPENDIX II
XII
XIII
XIV
XV
XVI
XVII
XVIII
XIX
XX
XXI
XXII
XXIII
XXIV
XXV
XXVI
XXVII
XXVIII
XXIX
XXX
XXXI
XXXII
XXXIII
XXXIV
XXXV
XXXVI
XXXVII
XXXVIII
XXXIX
XL
XLI
XLII
XLIII
XLIV
APPENDIX III
XLV
XLVI
XLVII
XLVIII
XLIX
APPENDIX TO THE REIGN OF JAMES I{Va(1754), 116: [note]}^*
L
LI
LII
LIII
LIV
LV
LVI
LVII
LVIII 
LIX
LX
LXI
LXII
LXIII
LXIV
LXV
LXVI
LXVII
LXVIII
LXIX
LXX
LXXI


## Treatise

In [7]:
# Explicit encoding: do not depend on the platform's locale default.
with open('hume_xml/hume.02.xml', 'r', encoding='utf-8') as xml_file:
    soup = bs.BeautifulSoup(xml_file.read(), "xml")

# Known limitation (original author's note): only 'section' divs are collected,
# so the Introduction and the chapter/book division are missing from the output.
sections = soup.find_all('div', type='section')

j_dict = []
for section in sections:

    title = section.find('head').text

    # One string per paragraph, with inner line breaks flattened to spaces.
    text = [par.text.rstrip('\n').replace('\n', ' ') for par in section.find_all('p')]

    pc, wc = count(text)

    text_dict = {'genre': 'treatise',
                 'title': title,
                 'word-count': wc,
                 'paragraph-count': pc,
                 'text': text}

    j_dict.append(text_dict)

print_titles(j_dict)

with open('hume_json/treatise.json', 'w', encoding='utf-8') as json_file:
    json.dump(j_dict, json_file, indent=3)

Number of chapters: 90 

SECTION I: OF THE ORIGIN OF OUR IDEAS
SECTION II: DIVISION OF THE SUBJECT
SECTION III: OF THE IDEAS OF THE MEMORY AND IMAGINATION
SECTION IV: OF THE CONNEXION OR ASSOCIATION OF IDEAS
SECTION V: OF RELATIONS
SECTION VI: OF MODES AND SUBSTANCES
SECTION VII: OF ABSTRACT IDEAS
SECTION I: OF THE INFINITE DIVISIBILITY OF OUR IDEAS OF SPACE AND TIME
SECTION II: OF THE INFINITE DIVISIBILITY OF SPACE AND TIME
SECTION III: OF THE OTHER QUALITIES OF OUR IDEAS OF SPACE AND TIME
SECTION IV: OBJECTIONS ANSWERED
SECTION V: THE SAME SUBJECT CONTINUED
SECTION VI: OF THE IDEAS OF EXISTENCE, AND OF EXTERNAL EXISTENCE
SECTION I: OF KNOWLEDGE
SECTION II: OF PROBABILITY, AND OF THE IDEA OF CAUSE AND EFFECT
SECTION III: WHY A CAUSE IS ALWAYS NECESSARY
SECTION IV: OF THE COMPONENT PARTS OF OUR REASONINGS CONCERNING CAUSE AND EFFECT
SECTION V: OF THE IMPRESSIONS OF THE SENSES AND MEMORY
SECTION VI: OF THE INFERENCE FROM THE IMPRESSION TO THE IDEA
SECTION VII: OF THE NATURE OF IDEA OR B

## Enquiries

In [8]:
with open('hume_xml/hume.05.xml', 'r') as file:
    file = bs.BeautifulSoup(file.read(), "xml")

chapters = file.find_all('div', type='chapter')

j_dict = []

# ignore table of contents, publishing info, author's advertisment, footnotes, variants
for chapter in chapters[3:-2]: 
    
    title = chapter.find('head').text
    
    text = [par.text.rstrip('\n').replace('\n', ' ') for par in chapter.find_all('p')]
    
    pc, wc = count(text)
    
    text_dict = {'genre' : 'enquiry HU',
                 'title' : title,
                 'word-count': wc,
                 'paragraph-count':pc,
                 'text' : text}  
    
    j_dict.append(text_dict)

In [9]:
with open('hume_xml/hume.06.xml', 'r') as file:
    file = bs.BeautifulSoup(file.read(), "xml")
    
chapters = file.find_all('div', type='chapter')

# ignore table of contents, publishing info, author's advertisment, footnotes, variants, list of editions
for chapter in chapters[2:-3]: 
    
    title = chapter.find('head').text
    
    text = [par.text.rstrip('\n').replace('\n', ' ') for par in chapter.find_all('p')]
    
    pc, wc = count(text)
    
    text_dict = {'genre' : 'enquiry PoM',
                 'title' : title,
                 'word-count': wc,
                 'paragraph-count':pc,
                 'text' : text}  
    j_dict.append(text_dict)

In [10]:
# List the records collected by the two Enquiry cells above, then save them.
print_titles(j_dict)
with open('hume_json/enquiries.json', mode='w') as out_file:
    json.dump(j_dict, fp=out_file, indent=3)

Number of chapters: 26 

Section 1: Of the Different Species of Philosophy
Section 2: Of the Origin of Ideas
Section 3: Of the Association of Ideas
Section 4: Sceptical Doubts concerning the Operations of the Understanding
Section 5: Sceptical Solution of these Doubts
Section 6: Of Probability
Section 7: Of the Idea of Necessary Connexion
Section 8: Of Liberty and Necessity
Section 9: Of the Reason of Animals
Section 10: Of Miracles
Section 11: Of a Particular Providence and of a Future State
Section 12: Of the Academical or Sceptical Philosophy
Section 1: Of the General Principles of Morals
Section 2 : Of Benevolence
Section 3: Of Justice
Section 4: Of Political Society
Section 5: Why Utility Pleases
Section 6: Of Qualities useful to Ourselves
Section 7: Of Qualities immediately Agreeable to Ourselves
Section 8: Of Qualities immediately Agreeable to Others
Section 9: Conclusion
Appendix 1: Concerning Moral Sentiment
Appendix 2: Of Self-Love
Appendix 3: Some Farther Considerations with

## A Dissertation on the Passions

In [11]:
# Explicit encoding: do not depend on the platform's locale default.
with open('hume_xml/hume.08.xml', 'r', encoding='utf-8') as xml_file:
    soup = bs.BeautifulSoup(xml_file.read(), "xml")

chapters = soup.find_all('div', type='chapter')

j_dict = []

# ignore table of contents, publishing info, footnotes, variants
for chapter in chapters[2:-2]:
    title = chapter.find('head').text

    # One string per paragraph, with inner line breaks flattened to spaces.
    text = [par.text.rstrip('\n').replace('\n', ' ') for par in chapter.find_all('p')]

    pc, wc = count(text)

    text_dict = {'genre': 'dissertation',
                 'title': title,
                 'word-count': wc,
                 'paragraph-count': pc,
                 'text': text}

    j_dict.append(text_dict)

print_titles(j_dict)
with open('hume_json/dissertation.json', 'w', encoding='utf-8') as json_file:
    json.dump(j_dict, json_file, indent=3)

Number of chapters: 6 

Section 1
Section 2
Section 3
Section 4
Section 5
Section 6


## Natural History

In [12]:
# Explicit encoding: do not depend on the platform's locale default.
with open('hume_xml/hume.09.xml', 'r', encoding='utf-8') as xml_file:
    soup = bs.BeautifulSoup(xml_file.read(), "xml")

chapters = soup.find_all('div', type='chapter')

j_dict = []

# ignore table of contents, publishing info, footnotes, variants
for chapter in chapters[2:-2]:
    title = chapter.find('head').text

    # One string per paragraph, with inner line breaks flattened to spaces.
    text = [par.text.rstrip('\n').replace('\n', ' ') for par in chapter.find_all('p')]

    pc, wc = count(text)

    text_dict = {'genre': 'natural history',
                 'title': title,
                 'word-count': wc,
                 'paragraph-count': pc,
                 'text': text}

    j_dict.append(text_dict)

print_titles(j_dict)

with open('hume_json/natural history.json', 'w', encoding='utf-8') as json_file:
    json.dump(j_dict, json_file, indent=3)

Number of chapters: 16 

Introduction
Section 1: That Polytheism was the Primary Religion of Man
Section 2: Origin of Polytheism
Section 3: The Same Subject Continued
Section 4: Deities not considered as Creators or Formers of the World
Section 5: Various Forms of Polytheism: Allegory, Hero-Worship
Section 6: Origin of Theism from Polytheism
Section 7: Confirmation of this Doctrine
Section 8: Flux and Reflux of Polytheism and Theism
Section 9: Comparison of these Religions, with regard to Persecution and Toleration
Section 10: With regard to Courage or Abasement
Section 11: With regard to Reason or Absurdity
Section 12: With regard to Doubt or Conviction
Section 13: Impious Conceptions of the Divine Nature in Popular Religions of both Kinds
Section 14: Bad Influence of Popular Religions on Morality
Section 15: General Corollary


## Dialogues 

In [13]:
# Explicit encoding: do not depend on the platform's locale default.
with open('hume_xml/hume.10.xml', 'r', encoding='utf-8') as xml_file:
    soup = bs.BeautifulSoup(xml_file.read(), "xml")

chapters = soup.find_all('div', type='chapter')

j_dict = []

# ignore table of contents, publishing info, footnotes, variants
# NOTE(review): only one trailing chapter is dropped here (-1) although the
# comment names both footnotes and variants — confirm against the XML.
for chapter in chapters[2:-1]:
    title = chapter.find('head').text

    # One string per paragraph, with inner line breaks flattened to spaces.
    text = [par.text.rstrip('\n').replace('\n', ' ') for par in chapter.find_all('p')]

    pc, wc = count(text)

    text_dict = {'genre': 'dialogues',
                 'title': title,
                 'word-count': wc,
                 'paragraph-count': pc,
                 'text': text}

    j_dict.append(text_dict)

print_titles(j_dict)

with open('hume_json/dialogues.json', 'w', encoding='utf-8') as json_file:
    json.dump(j_dict, json_file, indent=3)

Number of chapters: 13 

Pamphilus to Hermippus
Part 1
Part 2
Part 3
Part 4
Part 5
Part 6
Part 7
Part 8
Part 9
Part 10
Part 11
Part 12


## An Abstract of A Treatise of Human Nature

In [14]:
# Explicit encoding: do not depend on the platform's locale default.
with open('hume_xml/hume.03.xml', 'r', encoding='utf-8') as xml_file:
    soup = bs.BeautifulSoup(xml_file.read(), "xml")

books = soup.find_all('div', type='book')

# The whole work is stored as a single record, so paragraphs from both 'book'
# divs are gathered into one list: one selected centered paragraph per book,
# followed by that book's 'tbindent' paragraphs.
paragraphs = []

paragraphs.append(books[0].find_all('p', rend='center')[-1])
paragraphs.extend(books[0].find_all('p', rend='tbindent'))

# [-2] rather than [-1]: per the original note this leaves out 'FINIS.' —
# presumably the last centered paragraph of the second book; verify in the XML.
paragraphs.append(books[1].find_all('p', rend='center')[-2])
paragraphs.extend(books[1].find_all('p', rend='tbindent'))

j_dict = []
title = 'An Abstract of A Treatise of Human Nature'

# One string per paragraph, with inner line breaks flattened to spaces.
text = [par.text.rstrip('\n').replace('\n', ' ') for par in paragraphs]

pc, wc = count(text)

text_dict = {'genre': 'abstract',
             'title': title,
             'word-count': wc,
             'paragraph-count': pc,
             'text': text}
j_dict.append(text_dict)

with open('hume_json/abstract.json', 'w', encoding='utf-8') as json_file:
    json.dump(j_dict, json_file, indent=3)

## A Letter from a Gentleman

In [15]:
# Explicit encoding: do not depend on the platform's locale default.
with open('hume_xml/hume.04.xml', 'r', encoding='utf-8') as xml_file:
    soup = bs.BeautifulSoup(xml_file.read(), "xml")

books = soup.find_all('div', type='book')

j_dict = []

title = 'A Letter from a Gentleman'

# Only the second 'book' div is used; the whole work becomes a single record.
# One string per paragraph, with inner line breaks flattened to spaces.
text = [par.text.rstrip('\n').replace('\n', ' ') for par in books[1].find_all('p')]

pc, wc = count(text)

text_dict = {'genre': 'letter',
             'title': title,
             'word-count': wc,
             'paragraph-count': pc,
             'text': text}
j_dict.append(text_dict)

with open('hume_json/letter.json', 'w', encoding='utf-8') as json_file:
    json.dump(j_dict, json_file, indent=3)

## Merge all files into one

In [16]:
# Concatenate the record lists of every file in hume_json/ (in file-name order)
# into all_raw.json. Done in Python instead of shelling out to `jq -s 'add'`,
# so the cell needs neither jq nor a POSIX shell.
import glob

merged_records = []
for json_path in sorted(glob.glob('hume_json/*')):
    with open(json_path, 'r', encoding='utf-8') as json_file:
        merged_records.extend(json.load(json_file))

with open('all_raw.json', 'w', encoding='utf-8') as merged_file:
    json.dump(merged_records, merged_file, indent=2)

In [17]:
import pandas as pd

In [18]:
# Load the merged records into a DataFrame (one row per record).
df = pd.read_json(path_or_buf='all_raw.json')

In [19]:
# Keep only the metadata columns: the paragraph texts are removed.
df = df.drop(columns=['text'])

In [20]:
# Save the metadata table without the DataFrame index column.
df.to_csv(path_or_buf='metadata.csv', index=False)