In [22]:
import os
import re

from bs4 import BeautifulSoup
import pyparsing as pp
import pandas as pd
import matplotlib.pyplot as plt

In [26]:
def scrape_tables(page_start,page_end,pages):
    """Extract outbreak table rows from a range of report pages.

    Each page's text (with newlines removed) is split on the table header.
    Every resulting piece is scanned with pyparsing for a run of
    ``word date date integer`` tokens, which line up with the County,
    Investigation start date, Most recent onset and Total cases columns named
    in the header.  The text lying between the previous match and the current
    one is taken as the workplace/address column.

    Args:
        page_start: 1-based number of the first page to scan.
        page_end: 1-based number of the last page to scan (inclusive).
        pages: Sequence of page objects exposing their text as ``.text``.

    Returns:
        A list of rows, each ``[workplace_text, *matched_tokens]``.
    """
    header = 'Workplace Address County Investigation start date Most recent onset Total cases '
    table_end_marker = 'Table 5.'

    page_numbers = range(page_start - 1, page_end)
    if not page_numbers:
        return []

    # The grammar never changes, so build it once instead of once per piece.
    date = pp.pyparsing_common.iso8601_date
    row_parser = pp.Group(
        pp.OneOrMore(pp.Word(pp.alphas) + date + date + pp.pyparsing_common.integer)
    )
    # scanString expands tabs by default; keep them so that the offsets it
    # reports index directly into the unmodified text sliced below.
    row_parser.parseWithTabs()

    final_table = []
    for page_num in page_numbers:
        raw = pages[page_num].text.replace('\n','')
        # str.split always yields at least one piece, so every page is scanned
        # even when the header text is absent.
        for outbreak in raw.split(header):
            # Drop everything from the 'Table 5.' marker onward.
            outbreak = outbreak.partition(table_end_marker)[0]
            workplace_start = 0
            for result, start, end in row_parser.scanString(outbreak):
                # Slice with the offsets of the match itself rather than
                # re-serialising the tokens and searching for them: that
                # search fails when spacing differs and can land on an
                # earlier row containing the same text.
                tokens = result[0].asList()
                final_table.append([outbreak[workplace_start:start], *tokens])
                workplace_start = end
    return final_table


def last_updated(page):
    """Pull the "up to date as of" date phrase out of a report page.

    Looks for 'Data for this week are up to date as of' in the page text and
    returns what follows it, up to the first period or space that comes at or
    after the first digit (for example ``'Sunday, May 16'``).  The phrase is
    printed as well as returned.

    Args:
        page: Object exposing the page text as ``.text``.

    Returns:
        The date phrase with surrounding whitespace stripped.

    Raises:
        ValueError: If the key phrase is not on the page, or no digit follows it.
    """
    content = page.text
    keyphrase = 'Data for this week are up to date as of'
    key_pos = content.find(keyphrase)
    if key_pos == -1:
        # Without this check a -1 from find() would silently shift the search
        # to an unrelated offset.
        raise ValueError(f"key phrase not found on page: {keyphrase!r}")

    tail = content[key_pos + len(keyphrase):]
    first_digit = re.search(r"\d", tail)
    if first_digit is None:
        raise ValueError(f"no digit found after key phrase: {keyphrase!r}")

    # The phrase ends at the first period or space after the number starts.
    # If neither occurs (the text stops right after the number) keep the rest,
    # instead of letting a -1 index cut the number off.
    terminator = re.compile(r"[. ]").search(tail, first_digit.start())
    phrase_end = terminator.start() if terminator else len(tail)
    phrase = tail[:phrase_end].strip()
    print(phrase)
    return phrase

In [39]:
# Scrape every weekly report in `folder` and stack the outbreak tables into
# `full_data`, tagging each row with the report date (taken from the file
# name) and the "up to date as of" phrase found on the introduction page.
folder = '../data/oregon-xml/'
# One frame per report, concatenated once after the loop: calling pd.concat
# inside the loop re-copies all previously accumulated rows on every pass.
report_frames = []
# Sorted so the row order does not depend on directory enumeration order.
for fn in sorted(f for f in os.listdir(folder) if f.endswith('.xml')):
    with open(os.path.join(folder, fn), 'r') as xmlfile:
        contents = xmlfile.read()
    # NOTE(review): no parser argument is passed; consider pinning one so the
    # result does not depend on what is installed - confirm which parser these
    # files were originally scraped with before changing it.
    soup = BeautifulSoup(contents)
    pages = soup.find_all('div',class_="page")

    # The first page is searched for paragraphs that end in a page number
    # (presumably a table of contents - verify against the source files).
    first_page = pages[0]
    grafs = first_page.find_all('p')

    indice = [ind for ind, graf in enumerate(grafs) if graf.text.startswith('Table 3')][0]

    # Page numbers are whatever follows the last '.' in the paragraph text.
    start_page = int(grafs[indice].text.split('.')[-1].strip())
    # The paragraph two entries after 'Table 3' supplies the last page to scan.
    end_page = int(grafs[indice + 2].text.split('.')[-1].strip())

    intro_indice = [ind for ind, graf in enumerate(grafs) if 'Introduction' in graf.text][0]
    # Listed page numbers are 1-based; `pages` is indexed from 0.
    intro_page = int(grafs[intro_indice].text.split('.')[-1].strip()) - 1
    as_of = last_updated(pages[intro_page])

    # The report date is the part of the file name from '2021' up to '-FINAL.xml'.
    report_date = fn[fn.find('2021'):fn.find('-FINAL.xml')]
    print(">>>>>> ",report_date, f"total pages in doc: {len(pages)}, start page: {start_page}, end page: {end_page}")
    tt = scrape_tables(start_page,end_page,pages)
    df = pd.DataFrame(tt,columns=['workplace_address','county','investigation_start_date','most_recent_onset','total_cases'])
    df['report_date'] = report_date
    df['last_updated'] = as_of
    report_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame.
full_data = pd.concat(report_frames) if report_frames else pd.DataFrame()

0
Sunday, May 16
>>>>>>  2021-5-19 total pages in doc: 120, start page: 44, end page: 55
0
Sunday, 
October 10
>>>>>>  2021-10-27 total pages in doc: 157, start page: 60, end page: 70
0
Sunday, August 15
>>>>>>  2021-08-18 total pages in doc: 72, start page: 51, end page: 61
0
Sunday, April 18
>>>>>>  2021-4-21 total pages in doc: 95, start page: 42, end page: 51
0
Sunday, June 27
>>>>>>  2021-6-30 total pages in doc: 128, start page: 46, end page: 53
0
Sunday, April 11
>>>>>>  2021-4-14 total pages in doc: 90, start page: 41, end page: 50
0
Sunday, August 29
>>>>>>  2021-09-01 total pages in doc: 82, start page: 55, end page: 68
0
Sunday, May 9
>>>>>>  2021-5-12 total pages in doc: 115, start page: 44, end page: 54
0
Sunday, September 12
>>>>>>  2021-09-15 total pages in doc: 109, start page: 59, end page: 75
0
Sunday, October 10
>>>>>>  2021-10-13 total pages in doc: 177, start page: 65, end page: 79
0
Sunday, May 23
>>>>>>  2021-05-26 total pages in doc: 124, start page: 44, end pag

In [40]:
# Name the export after the most recent report date among the scraped rows.
# Dates are parsed one value at a time because the strings are not uniformly
# zero-padded (e.g. '2021-5-19' and '2021-08-18').
report_dates = full_data['report_date'].apply(pd.to_datetime)
newest_report = report_dates.max()
latest_report = newest_report.strftime('%Y-%m-%d')
output_path = f'../output/oha-data-{latest_report}.csv'
full_data.to_csv(output_path, index=False)