In [8]:
import json, re, time, os
import requests as req
from bs4 import BeautifulSoup
import bs4

In [2]:
# Read as UTF-8 explicitly so the result does not depend on the platform's
# default text encoding.
with open('data/pages.json', 'r', encoding='utf-8') as f:
    pages = json.load(f)

In [None]:
def parse_footnotes(footnotes_soup: 'bs4.element.Tag | None'):
    """Map each footnote's leading token to the rest of its text.

    Every non-string child of ``footnotes_soup`` is treated as one footnote.
    Its text is split on whitespace: the first token becomes the key and the
    remaining tokens, joined by single spaces, become the value.

    Args:
        footnotes_soup: The element whose children are the footnotes, or
            ``None`` when the page has no footnotes section.

    Returns:
        A dict of footnote key -> footnote text; empty when there is no
        footnotes section or no footnote carries any text.
    """
    notes = {}

    # `find` yields None for pages without a footnotes section.
    if footnotes_soup is None:
        return notes

    for fn in footnotes_soup.contents:
        # isinstance (rather than an exact type match) also skips
        # NavigableString subclasses such as bs4 comments.
        if isinstance(fn, bs4.element.NavigableString):
            continue

        # split() with no argument already treats newlines as separators.
        words = fn.text.split()
        if not words:
            # An element without text has no number to key on.
            continue

        fn_number, *fn_content = words
        notes[fn_number] = ' '.join(fn_content)

    return notes

# Various HTML elements contain usable text:
# - raw strings
# - italicized phrases
# - greek text
def is_text(tag):
    """Return True when ``tag`` carries text that belongs in a paragraph.

    That is the case for plain strings, ``<i>`` elements, and ``<span>``
    elements whose class list contains ``'greek'``. Everything else,
    including a ``<span>`` with no class attribute, is not text.
    """
    if type(tag) is bs4.element.NavigableString:
        return True

    if tag.name == 'i':
        return True

    if tag.name == 'span':
        # Not every <span> has a class attribute; a missing one must not raise.
        return 'greek' in (tag.attrs.get('class') or [])

    return False

def scrape_chapter(chapter_data):
    """Download one chapter page and extract its paragraphs and footnotes.

    Args:
        chapter_data: Mapping with a ``'url'`` key pointing at the chapter page.

    Returns:
        A dict with two keys: ``'paragraphs'``, a list of
        whitespace-normalised paragraph strings in which each footnote link
        is replaced by a ``<@...>`` marker built from the link's href, and
        ``'footnotes'``, the mapping produced by ``parse_footnotes`` (empty
        when the page has no footnotes section).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no ``<div class="text">`` element.
    """
    # A timeout keeps one stalled connection from hanging the whole scrape.
    content = req.get(chapter_data['url'], timeout=30)
    # Fail on the HTTP status instead of trying to parse an error page.
    content.raise_for_status()
    # NOTE(review): no parser is named, so bs4 chooses among the installed
    # ones; pin one once the parser the data was built with is confirmed.
    content_soup = BeautifulSoup(content.text)

    footnotes_section = content_soup.find(name='div', class_='footnotes')
    # Pages without a footnotes section simply have no notes.
    if footnotes_section is None:
        footnotes = {}
    else:
        footnotes = parse_footnotes(footnotes_section)

    text_div = content_soup.find(name='div', class_='text')
    if text_div is None:
        raise ValueError(f"No <div class='text'> found at {chapter_data['url']}")
    content_text_section = text_div.contents
    paragraphs = []

    paragraph_text = ''
    # Compress excess whitespace from replacing newlines into a single space
    big_ws_re = r'\s\s\s*'

    for c in content_text_section:
        if not is_text(c):
            if c.name == 'p' and len(c) == 0:
                # An empty <p> marks a paragraph boundary: flush what we have.
                p_text = re.sub(big_ws_re, ' ', paragraph_text).strip()
                paragraphs.append(p_text)
                paragraph_text = ''
            elif c.name == 'a':
                # <a> tags without an href (e.g. named anchors) are ignored.
                href = c.attrs.get('href') or ''
                if 'note' in href:
                    # presumably hrefs look like '#note12', so dropping the
                    # first five characters leaves the number; verify
                    footnote_num = href[5:]
                    paragraph_text += f'<@{footnote_num}>'

            continue

        # Exclude whitespace strings
        if not c.text.strip():
            continue

        c_text = c.text.replace('\n', ' ')
        paragraph_text += c_text

    # Text after the last empty <p> would otherwise be dropped.
    trailing_text = re.sub(big_ws_re, ' ', paragraph_text).strip()
    if trailing_text:
        paragraphs.append(trailing_text)

    return {'paragraphs': paragraphs, 'footnotes': footnotes}

In [None]:
n_pages = len(pages)
block_size = 100

# Index of the first block to scrape; raise it to resume after a failure.
block_offset = 0

# Ceiling division: a floor division plus one would add an empty trailing
# block (and an empty file) whenever n_pages is a multiple of block_size.
n_blocks = -(-n_pages // block_size)

# Create the checkpoint directory up front so a missing folder cannot
# discard a block that has already been scraped.
os.makedirs('data/info-blocks', exist_ok=True)

# Scrape and save chapters in blocks to serve as checkpoints
for n_block in range(block_offset, n_blocks):
    chapters_with_data = []
    start = n_block * block_size
    end = start + block_size

    print(f'Scraping block #{n_block + 1} ({start}-{end} of {n_pages})')

    for i, chapter in enumerate(pages[start:end]):
        print(f'>> Scraping page {i}/{block_size} [{i + start}/{n_pages}] [{chapter["chapter_name"]}, {chapter["book_title"]}]', end='\r')
        info = scrape_chapter(chapter)

        chapters_with_data.append({**chapter, **info})

        # Blank out the progress line before the next one is printed.
        print(' ' * 200, end='\r')

    print(f'Saving block #{n_block + 1}...', end='\r')
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding must be able to represent them on every platform.
    with open(f'data/info-blocks/info_{start}-{end}.json', 'w', encoding='utf-8') as f:
        json.dump(chapters_with_data, f, indent=2, ensure_ascii=False)

    print('Saved!' + ' ' * 200)

    # Sleep briefly between blocks to avoid spamming the site and potentially
    # getting rate-limited.
    # NOTE(review): this pause runs once per block, so the requests inside a
    # block are still sent back to back.
    time.sleep(0.5)

In [12]:
# Checkpoint files are named info_<start>-<end>.json; anything else in the
# directory is ignored.
block_name_re = re.compile(r'info_(\d+)-(\d+)\.json')

# os.listdir returns names in arbitrary order, and a plain string sort would
# put 'info_1000-...' before 'info_200-...', so order by the numeric start.
info_blocks = sorted(
    (name for name in os.listdir('data/info-blocks') if block_name_re.fullmatch(name)),
    key=lambda name: int(block_name_re.fullmatch(name).group(1)),
)

all_chapters = []

for block_file in info_blocks:
    with open(f'data/info-blocks/{block_file}', 'r', encoding='utf-8') as f:
        all_chapters.extend(json.load(f))

# ensure_ascii=False writes non-ASCII characters verbatim, so name the encoding.
with open('data/chapter-data.json', 'w', encoding='utf-8') as f:
    json.dump(all_chapters, f, indent=2, ensure_ascii=False)