In [1]:
import json
import os

from bs4 import BeautifulSoup

In [2]:
# Parse a single saved entry page so its structure can be inspected.
# The encoding is passed explicitly: without it open() decodes with the platform's
# locale default, which is not UTF-8 everywhere, and later cells read hanzi out of
# these pages.
# NOTE(review): assumes the saved pages are UTF-8 -- confirm against the scraper.
with open('brill/entries/i-000052', encoding='utf-8') as f:
    contents = f.read()

soup = BeautifulSoup(contents, 'html.parser')

In [3]:
# soup.find_all('div', {'class': 'middle-column clearfix'})[0]

In [4]:
# Parse every saved page and keep only its main column, keyed by file name.
entries = {}
for file in sorted(os.listdir('brill/entries/')):
    # The trailing carriage return makes each file name overwrite the previous one,
    # giving a one-line progress indicator.
    print(file, end='\r')
    # Explicit encoding: without it open() decodes with the platform's locale default.
    # NOTE(review): assumes the saved pages are UTF-8 -- confirm against the scraper.
    with open(os.path.join('brill', 'entries', file), encoding='utf-8') as f:
        contents = f.read()
    # The file is already closed here; parsing only needs the text.
    soup = BeautifulSoup(contents, 'html.parser')
    # [0] raises IndexError if a page has no such <div>.
    entries[file] = soup.find_all('div', {'class': 'middle-column clearfix'})[0]

i-008353

In [5]:
def next_element(elem):
    """Return the first following sibling of ``elem`` that has a ``name`` attribute.

    Siblings without a ``name`` attribute are stepped over (this is meant to skip
    NavigableString text nodes). Returns None when ``elem`` is None or when the
    sibling chain ends before such a node is found.
    """
    sibling = elem
    while sibling is not None:
        sibling = sibling.next_sibling
        if not hasattr(sibling, 'name'):
            # Either a nameless node or the end of the chain (None); the loop
            # condition sorts out which.
            continue
        return sibling
    return None
        
def get_definitions(content):
    """Split ``content`` into one HTML string per ``<h3>`` heading.

    ``content`` must expose ``find_all`` (the notebook passes a BeautifulSoup tag).
    Each returned string is the heading itself followed by every subsequent sibling
    element up to, but not including, the next ``<h3>``, joined with newlines.
    The result list follows the order in which ``find_all('h3')`` yields headings.
    """
    def section_nodes(heading):
        # Yield the heading, then walk its element siblings until the next <h3>
        # or the end of the sibling chain.
        yield heading
        node = next_element(heading)
        while node and node.name != 'h3':
            yield node
            node = next_element(node)

    return [
        '\n'.join(str(node) for node in section_nodes(heading))
        for heading in content.find_all('h3')
    ]

In [6]:
def _require(condition, page_id, problem):
    """Raise AssertionError naming the offending page when ``condition`` is false."""
    if not condition:
        raise AssertionError('{}: {}'.format(page_id, problem))


def _squeeze(text):
    """Collapse every run of whitespace in ``text`` to one space and trim both ends."""
    return ' '.join(text.split())


def _parse_definition(definition, page_id):
    """Convert one definition block (an HTML string from get_definitions) to a dict.

    Returns a dict with the keys
      'pinyin'     -- text of the block's <h3>,
      'MC_reconst' -- whitespace-normalised text of the block's first <p>,
      'senses'     -- inner HTML of each non-blank child of the first <ol type="1">.

    Raises AssertionError (tagged with ``page_id``) if a non-blank list child is not
    a bare ``<li>...</li>``, IndexError if the block has no <ol type="1">, and
    AttributeError if it has no <h3> or no <p>.
    """
    definition_soup = BeautifulSoup(definition, 'html.parser')
    pinyin = definition_soup.find('h3').text
    mc_reconst = _squeeze(definition_soup.find('p').text)

    senses = []
    sense_list = definition_soup.find_all('ol', {'type': '1'})[0]
    for sense in sense_list.children:
        # Children whose text is blank are skipped.
        if not sense.text.strip():
            continue
        sense_li = str(sense).strip()
        _require(sense_li.startswith('<li>') and sense_li.endswith('</li>'),
                 page_id,
                 'sense is not a bare <li> element: {!r}'.format(sense_li[:80]))
        # Keep the inner HTML only.
        senses.append(sense_li[len('<li>'):-len('</li>')].strip())

    return {'pinyin': pinyin, 'MC_reconst': mc_reconst, 'senses': senses}


def _parse_entry(page_id, entry):
    """Parse one page's main column into ``(character, entry_dict)``.

    Returns None for pages that do not contain exactly two ``span.hanzi`` elements;
    such pages are skipped. Structural expectations are checked with ``_require``,
    so a violation raises AssertionError that names ``page_id``.
    """
    span_hanzi = entry.find_all('span', {'class': 'hanzi'})
    if len(span_hanzi) != 2:
        return None
    _require(span_hanzi[0] == span_hanzi[1], page_id, 'the two hanzi spans differ')

    # title
    h1_book_title = entry.find_all('h1', {'class': 'book-title'})[0]
    character = h1_book_title.find('span', {'class': 'hanzi'}).text
    # Second whitespace-separated token of the heading; it is stored together with
    # its surrounding parentheses.
    pinyin_citation = h1_book_title.text.split()[1]
    _require(pinyin_citation[0] == '(' and pinyin_citation[-1] == ')',
             page_id, 'citation pinyin is not parenthesised: {!r}'.format(pinyin_citation))
    _require(len(character) == 1,
             page_id, 'headword is not a single character: {!r}'.format(character))

    # entry contents
    article = entry.find_all('article', {'class': 'content clearfix english_content'})
    _require(len(article) == 1, page_id, 'expected exactly one content <article>')
    content = article[0].find_all('div', {'class': 'english_content'})
    _require(len(content) == 1, page_id, 'expected exactly one english_content <div>')
    content = content[0]

    # structural (radical, strokes)
    structure_paragraphs = content.find_all('p', {'style': 'font-size:smaller;'})
    _require(len(structure_paragraphs) == 1,
             page_id, 'expected exactly one structural description <p>')
    structure_text = _squeeze(structure_paragraphs[0].text)
    # After dropping the words 'radical ' and ' strokes', the text is expected to
    # split on ' + ' into the radical and a stroke count.
    parts = structure_text.replace('radical ', '').replace(' strokes', '').split(' + ')
    _require(len(parts) >= 2 and len(parts[0]) == 1 and parts[1].isdigit(),
             page_id, 'unexpected structural description: {!r}'.format(structure_text))

    # individual entries
    definitions_list = [
        _parse_definition(definition, page_id)
        for definition in get_definitions(content)
    ]

    return character, {
        'pinyin_citation': pinyin_citation,
        'radical': parts[0],
        'additional_strokes': int(parts[1]),
        'definitions': definitions_list,
    }


DICTIONARY = {}
for page_id, entry in entries.items():
    parsed = _parse_entry(page_id, entry)
    if parsed is None:
        continue
    character, entry_dict = parsed
    # NOTE(review): if two pages share a headword, the later page silently replaces
    # the earlier one -- confirm that this is intended.
    DICTIONARY[character] = entry_dict

In [9]:
# Persist the parsed dictionary as JSON (json is imported in the notebook's import cell).
# Mode 'x' refuses to overwrite: re-running this cell raises FileExistsError while
# kroll_parse.json exists, so move or delete the old file first.
# json.dump escapes non-ASCII characters by default (ensure_ascii=True), so the
# output is plain ASCII whatever the platform's default encoding is.
with open('kroll_parse.json', 'x') as f:
    json.dump(DICTIONARY, f)