In [8]:
from bs4 import BeautifulSoup
import re
import json

In [9]:
def get_item_list(raw_text):
    """Split a page's plain text into (heading, info) pairs, one per item.

    A heading is a line fragment shaped like ``<digits>[letter]. <4 digits>.``
    followed by the rest of that line. Everything between one heading and
    the next is that item's body.

    The body is cut on the ``'\\n\\xa0\\n'`` separator. The first chunk is
    always kept; following chunks are appended (with no separator) for as
    long as each one contains an upper-case label followed by a colon. The
    first chunk without such a label ends the item's info.

    Parameters
    ----------
    raw_text : str
        Text to scan.

    Returns
    -------
    list of (str, str)
        ``(heading, info)`` tuples in document order; empty when no heading
        is found.
    """
    heading_re = re.compile(r'([0-9]+[a-z]?\.\s+[0-9]{4}\..*)')
    label_re = re.compile(r'[A-Z]+:')

    # Splitting on a pattern with one capturing group yields
    # [preamble, heading, body, heading, body, ...]; drop the preamble.
    pieces = heading_re.split(raw_text)[1:]

    items = []
    for heading, body in zip(pieces[0::2], pieces[1::2]):
        chunks = body.split('\n\xa0\n')
        kept = [chunks[0]]

        for chunk in chunks[1:]:
            if label_re.search(chunk) is None:
                break
            kept.append(chunk)

        items.append((heading, ''.join(kept)))

    return items

In [3]:
def parse_text(tup):
    """Turn one ``(heading, info)`` pair into a metadata dictionary.

    Parameters
    ----------
    tup : sequence
        ``tup[0]`` is the item heading, ``tup[1]`` the item body text.

    Returns
    -------
    dict
        Always contains ``NUMBER``, ``ID``, ``NAME``, ``PHONE_NUM``
        (from the heading) and ``ADDRESS``, ``PRELIM_REC``, ``CONT_FROM``,
        ``DESC`` (from the body). ``PRELIM_REC`` and ``CONT_FROM`` are the
        string ``'NA'`` when the body has no such text. Every
        ``UPPERCASE:`` label found in the body adds one more key holding the
        stripped text that follows it, except:

        * ``SPEAKERS`` holds a list of ``(marker, text)`` tuples, where
          marker is ``+``, ``-`` or ``=``; only the first ``SPEAKERS``
          section is used.
        * a label that surfaces as ``S`` stores a dict under ``SPEAKERS``
          mapping each line that starts with ``(`` to the non-empty lines
          that follow it (lines before the first such group go under
          ``'NA'``).

    Raises
    ------
    ValueError
        If the heading or the body does not have the expected layout.
    """
    heading, item = tup[0], tup[1]

    # Collapse every whitespace run in the heading to a single space.
    id_info = ' '.join(heading.split())

    meta_dict = {}

    # get info from ID (top of project dec)
    id_pattern = re.compile(r'([0-9]+[a-z]?)\.\s+?([0-9]+\.[0-9A-Z]+)\s+?\(([\s\S]+):\s+?(\([0-9]+\)\s?[0-9]+-[0-9]+)\)')
    id_match = id_pattern.search(id_info)
    if id_match is None:
        raise ValueError('unrecognised item heading: {!r}'.format(id_info))
    number, case_id, name, phone_num = id_match.groups()
    meta_dict['NUMBER'] = number
    meta_dict['ID'] = case_id
    meta_dict['NAME'] = name
    meta_dict['PHONE_NUM'] = phone_num

    # get address from project desc: group 1 is the text with no lower-case
    # letters before a '-', group 2 is everything from there up to the last
    # "Preliminary Recommendation". Searched once and reused for DESC below.
    address_pattern = re.compile(r'([^a-z]+)-([\s\S]+)Preliminary Recommendation')
    address_match = address_pattern.search(item)
    if address_match is None:
        raise ValueError(
            'item {!r} has no "<ADDRESS> - <description> Preliminary '
            'Recommendation" section'.format(id_info)
        )
    meta_dict['ADDRESS'] = address_match.group(1).strip()

    # get prelim desc from bottom of desc
    prelim_match = re.search(r'Preliminary Recommendation:\s+(.+)', item)
    meta_dict['PRELIM_REC'] = prelim_match.group(1).strip() if prelim_match else 'NA'

    # get continued from if exists at bottom of desc
    continued_match = re.search(r'\(Continued from (.+)\)', item)
    meta_dict['CONT_FROM'] = continued_match.group(1).strip() if continued_match else 'NA'

    # add desc
    meta_dict['DESC'] = address_match.group(2).strip()

    # regex to find speakers
    speakers_pattern = re.compile(r'(\+|-|=)(\s+.+)')

    # iterate through rest of metadata
    # The split yields [preamble, label, text, label, text, ...].
    # NOTE: `[\(S\)]?` is a character class (one optional '(', 'S' or ')'),
    # not the literal "(S)". A label written "SPEAKER(S):" therefore matches
    # only on "S):" and surfaces as the key 'S' -- presumably what the 'S'
    # branch below is for. The pattern is left as-is because that branch
    # depends on it.
    metadata = re.split(r'([A-Z]+)[\(S\)]?:', item)[1:]
    for key, value in zip(metadata[0::2], metadata[1::2]):
        if key == 'SPEAKERS':
            # Keep whichever SPEAKERS entry was stored first.
            if key not in meta_dict:
                meta_dict[key] = speakers_pattern.findall(value)
        elif key == 'S':
            grouped = {'NA': []}
            current = 'NA'
            for line in value.split('\n'):
                line = line.strip()
                if not line:
                    continue
                if line.startswith('('):
                    # A parenthesised line opens a new speaker group.
                    current = line
                    grouped[current] = []
                else:
                    grouped[current].append(line)
            meta_dict['SPEAKERS'] = grouped
        else:
            meta_dict[key] = value.strip()

    return meta_dict

In [12]:
import glob

# Parse every matching page under ../1998-2014/ into one flat list of
# item dictionaries, each tagged with the file it came from.
all_items = []

# glob.glob returns paths in arbitrary order; sort so the notebook
# produces the same all_items ordering on every run.
for fname in sorted(glob.glob("../1998-2014/*.htm*")):
    with open(fname, 'r') as f:
        html = f.read()

    soup = BeautifulSoup(html, 'html5lib')
    raw_text = soup.text

    for t in get_item_list(raw_text):
        # Guard each item on its own: one item that fails to parse must not
        # discard the remaining items of the same file. Report the error
        # itself so the failure can be diagnosed, not just the file name.
        try:
            t_dict = parse_text(t)
        except Exception as err:
            print('{}: failed to parse item {!r}: {!r}'.format(fname, t[0], err))
            continue

        t_dict['file_name'] = fname
        all_items.append(t_dict)


In [13]:
# Sanity check: how many item dictionaries were collected in all_items.
len(all_items)

0

In [5]:
# Serialise the collected items to test2.json. The context manager closes
# the file, which guarantees the JSON is flushed to disk; a bare open()
# passed straight to json.dump leaves the handle open.
with open('test2.json', 'w') as out_file:
    json.dump(all_items, out_file)