In [1]:
from pathlib import Path
import json
from lxml.etree import strip_tags, strip_elements, parse

# Simple Keyword Search and Extraction

In [2]:
# Namespace map and qualified tag patterns shared by every TEI lookup below.
_TEI_NS = {'tei': 'http://www.tei-c.org/ns/1.0'}
_TEI_REF = '{http://www.tei-c.org/ns/1.0}ref'
_TEI_ANY = '{http://www.tei-c.org/ns/1.0}*'


def _paragraph_texts(div):
    """Return the plain text of every direct ``tei:p`` child of ``div``.

    Side effect: the ``<p>`` elements are flattened in place (``<ref>``
    elements are removed, every other TEI tag is unwrapped), so the parsed
    tree is modified.
    """
    texts = []
    for p in div.findall('tei:p', namespaces=_TEI_NS):
        # get rid of the <ref> elements; with_tail=False keeps the text that
        # follows each <ref> so the sentence around it stays intact
        strip_elements(p, _TEI_REF, with_tail=False)
        # remove all other tags, but keep their content (e.g., <b>, etc)
        strip_tags(p, _TEI_ANY)
        # an empty <p/> has text None; store '' so consumers always get str
        texts.append(p.text or '')
    return texts


def extract_ch_json(doc_path, doc_type, doc_id):
    """Build a JSON-serialisable chapter structure from a Grobid TEI file.

    Top-level ``tei:div`` elements of the body that have a ``tei:head`` are
    grouped into chapters: a head without an ``n`` attribute is treated as a
    chapter heading; a head with an ``n`` attribute is treated as a subsection
    of the current chapter, and its heading (prefixed with ``n``) and
    paragraphs are appended to that chapter's paragraph list. Numbered
    sections that appear before the first chapter heading are collected in a
    chapter with an empty title.

    Args:
        doc_path: Path (or other source accepted by ``lxml.etree.parse``) of
            the TEI XML document.
        doc_type: Stored unchanged under the ``'type'`` key.
        doc_id: Stored unchanged under the ``'id'`` key.

    Returns:
        A dict ``{'id', 'type', 'title', 'chapters'}`` where ``chapters`` is a
        list of ``{'title': str, 'paragraphs': [str, ...]}``, or the integer
        ``0`` if the document could not be parsed (the error is printed).
    """
    try:
        doc = parse(doc_path)
    except Exception as e:
        # Best-effort batch processing: report the problem and signal failure
        # with the 0 sentinel that callers test for.
        print('%s: %s' % (type(e).__name__, e))
        return 0

    # initial dict for information storage
    out_dict = {'id': doc_id, 'type': doc_type, 'title': '', 'chapters': []}

    # get document title; the XPath result is a (possibly empty) list, and the
    # element's text may be None, so guard both before reading it
    titles = doc.xpath('/tei:TEI/tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title',
                       namespaces=_TEI_NS)
    if titles and titles[0].text:
        out_dict['title'] = titles[0].text

    # get all tei:div elements that have headings
    divs = doc.xpath('/tei:TEI/tei:text/tei:body/tei:div[tei:head]',
                     namespaces=_TEI_NS)

    chapters = out_dict['chapters']
    current = None  # chapter currently being filled, if any
    for div in divs:
        head = div.find('tei:head', namespaces=_TEI_NS)
        n = head.get('n')
        head_text = head.text or ''  # avoid the literal string 'None'

        if n is None:  # assume this is a chapter heading
            # Close the previous chapter. A chapter that collected nothing is
            # dropped here (only the final chapter is kept when empty).
            if current is not None and current['paragraphs']:
                chapters.append(current)
            current = {'title': head_text, 'paragraphs': _paragraph_texts(div)}
        else:  # assume this is a subsection heading
            if current is None:
                # numbered sections before any chapter heading
                current = {'title': '', 'paragraphs': []}
            current['paragraphs'].append(f'{n} {head_text}' if head_text else n)
            current['paragraphs'].extend(_paragraph_texts(div))

    # the last chapter is always emitted, even if it has no paragraphs
    if current is not None:
        chapters.append(current)

    return out_dict

## Sample Result 

In [3]:
docPath = "/Users/waingram/Desktop/gorbid_fulltext/theses/17262/Molinaro_NJ_T_2017.tei.xml"
# Use the enclosing directory name as the document id (same convention as the
# batch-processing cell) so the id cannot drift from the path being parsed.
print(json.dumps(extract_ch_json(docPath, "thesis", Path(docPath).parent.name), indent=4, sort_keys=True))


{
    "chapters": [
        {
            "paragraphs": [
                "Any fan or propeller that ingests any unsteady flow will produce noise. This is especially important in propeller aircraft and marine vehicles where turbulence is generated from appendages on the vehicle's body. This self-generated turbulence travels downstream and is eventually drawn into the propeller and produces noise. The broad study that the present work is a part of is concerned with understanding this ingestion noise problem so that the interaction can be better modeled and the sound produced can be predicted. To predict the sound produced by a fan or propeller ingesting turbulence, detailed information about the inflow condition is needed. In the present study the turbulence structure of the wake shed by a circular cylinder at 20 meters per second. The two-point velocity correlation in the wake serves as the complete inflow condition for the turbulence ingestion problem. The structure of the cylinder wa

## Process documents

In [4]:
# process documents
# Every sub-directory of source_path is one document; its name is the doc id.
# Each TEI file found inside is converted and written to dest_path as JSON.
source_path = "/Users/waingram/Desktop/gorbid_fulltext/theses/"
dest_path = "/Users/waingram/Google Drive File Stream/Team Drives/cs5984_etd_team16/autoExtraction_sample_thesis/thesis/"

# sorted() makes the processing order (and therefore the numeric suffixes
# below) deterministic across runs and platforms
pathlist = sorted(Path(source_path).glob('*'))
extractedDoc = {}
for path in pathlist:
    if not path.is_dir():
        continue  # skip .DS_Store, etc
    docId = path.name
    cnt = 0  # JSON files written so far for this document directory
    for subPath in sorted(path.glob('*.xml')):
        docJson = extract_ch_json(str(subPath), "thesis", docId)
        if docJson == 0:
            continue  # parse failure was already reported by extract_ch_json
        # sometimes more than one xml file exists: the first goes to
        # <id>.json, later ones to <id>_1.json, <id>_2.json, ...
        # docId itself is never modified, so every file keeps the right id
        # and re-running the cell overwrites the same files.
        outName = docId if cnt == 0 else f'{docId}_{cnt}'
        with open(Path(dest_path) / f'{outName}.json', 'w') as outfile:
            json.dump(docJson, outfile)
        cnt += 1

In [1]:
# test produced result
# Read through a context manager so the file handle is closed even if the
# read fails; the raw text is kept in test_json_data as before.
with open('/mnt/6t/VT_ETDs/golden_standards/team16/processed/gorbid_fulltext/extracted/thesis/17336.json', encoding='utf-8') as test_json_file:
    test_json_data = test_json_file.read()
test_data = json.loads(test_json_data)

FileNotFoundError: [Errno 2] No such file or directory: '/mnt/6t/VT_ETDs/golden_standards/team16/processed/gorbid_fulltext/extracted/thesis/17336.json'