In [1]:
import re, csv
import PyPDF2
import openpyxl

In [2]:
# --- notebook configuration ---
# path of the budget PDF that gets parsed below
pdf_path = 'data/FY23-Tentative-Budget-Expenditures-resave.pdf'
# every pdf page begins with a header; the relevant data starts on this line
data_start_line = 7
# desired file type for the exported result
output_file_type = 'csv'

In [3]:
# read the text of every pdf page into a list (one string per page)
raw_pages = []
with open(pdf_path, 'rb') as f:
    # PdfReader is the current name of the deprecated PdfFileReader class;
    # it is available in every PyPDF2 release that offers extract_text()
    pdf = PyPDF2.PdfReader(f)
    for page in pdf.pages:
        raw_pages.append(page.extract_text())

# exactly one entry was appended per page, so this is the page count
# (replaces the deprecated pdf.getNumPages() call)
n_pages = len(raw_pages)


Xref table not zero-indexed. ID numbers for objects will be corrected.


In [4]:
# preview the extracted text of the first two pages
# - a bare expression inside a for loop is never displayed by the notebook,
#   so the text has to be printed explicitly
# - raw_pages is used instead of `pdf`, whose underlying file was already
#   closed when the `with` block in the read cell finished
for page_text in raw_pages[:2]:
    print(page_text)

In [5]:
# display the number of pages found in the PDF
n_pages

122

In [79]:
# flatten all pages into a single list of cleaned text lines:
#   - non-breaking spaces (\xa0) are replaced by regular spaces
#   - leading/trailing whitespace (including line endings) is stripped
# splitlines() without keepends is sufficient because strip() would remove the
# line endings anyway; the comprehension also avoids leaving loop temporaries
# behind in the notebook namespace.
clean_lines = [
    line.replace('\xa0', ' ').strip()
    for page_text in raw_pages
    for line in page_text.splitlines()
]

In [97]:
# peek at the first 30 cleaned lines (page header + first budget entries)
clean_lines[:30]

['3frbud12.p 76-4 Urbana, IL 06/15/22 Page:1',
 '05.22.02.00.00 TENTATIVE BUDGET-EXPENSES  (Date: 6/2022)  9:52 AM',
 'Account Level 2022-23 2021-22 2021-22 2020-21 2020-21',
 'FDTLOC FUNC OBJ  SJ',
 'Description                           TENTATIVE   Original Budget     FYTD Activity   Original Budget     FYTD Activity',
 '10',
 'EDUCATION',
 '000000',
 'UNRESTRICTED',
 '10E000 0000 0000 00 000000',
 'SUMMARY ACCOUNT OFFSET',
 '',
 '',
 '-40.00',
 '',
 '4.34',
 '10E000 0000 1230 00 000000',
 '',
 '',
 '245.00',
 '',
 '754.50',
 '10E000 0000 1270 00 000000',
 '',
 '',
 '220.00',
 '',
 '',
 '10E000 0000 2160 00 000000',
 '']

In [117]:
# Walk through clean_lines and classify every line as page header, fund,
# account, budget entry or "other" (skipped). Fund and account names are
# collected in the `funds` / `accounts` dicts; every classified line is printed.

# PDFs may be given with different numbers of budget (dollar) columns
n_budget_cols = 5
# 5 account number columns + 1 account description + N budget columns
MAX_BUDGET_ENTRY_COLS = 5 + 1 + n_budget_cols

funds = {}
accounts = {}

# NOTE: all patterns are raw strings so that "\d" and "\." reach the regex
# engine unchanged instead of being (invalid) string escape sequences.

# first line of new page = any chars + Urbana,IL + date + Page:# + end of line
new_page_pattern = r'.*Urbana, IL \d\d/\d\d/\d\d Page:\d{1,3}$'
# the five lines that make up the header of every page
header_patterns = (
    new_page_pattern,
    # line 2 of header = start of string + 10 digits separated by decimals
    r'^\d\d\.\d\d\.\d\d\.\d\d\.\d\d',
    # line 3 of header = begins with "Account Level"
    r'^Account Level',
    # line 4 of header = begins with "FDTLOC"
    r'^FDTLOC',
    # line 5 of header = begins with "Description"
    r'^Description',
)
# budget line item - ##E### ####
budget_item_pattern = r'^\d{2}E\d{3} \d{4}'
# dollar amount as it appears in the PDF text, e.g. 40, -40.00, 515,000, 2,401.73
# (str.isnumeric() is False for anything containing ',', '.' or '-')
amount_pattern = r'^-?\d[\d,]*(\.\d+)?$'

n_lines = len(clean_lines)
i = 0
while i < n_lines:
    current_line = clean_lines[i]
    # look-ahead line; empty string when current_line is the very last line,
    # so the look-ahead can never raise an IndexError
    next_line = clean_lines[i + 1] if i + 1 < n_lines else ''

    if any(re.match(pattern, current_line) for pattern in header_patterns):
        i += 1
        print(f'HEADER: {current_line}')
    # fund number = start of string + two digit number + end of string
    elif re.match(r'^\d{2}$', current_line):
        # this line is the fund number, next line is the fund name
        funds[current_line] = next_line
        i += 2
        print(f'FUND number:{current_line} name:{funds[current_line]}')
    # account number = start of string + six digit number + end of string
    elif re.match(r'^\d{6}$', current_line):
        # this line is the account number, next line is the account name
        accounts[current_line] = next_line
        i += 2
        print(f'ACCOUNT number:{current_line} name:{accounts[current_line]}')
    elif re.match(budget_item_pattern, current_line):
        # account numbers
        budget_line = current_line.split()
        # if the next line is empty or a dollar amount, the account
        # description is blank in the PDF
        if not next_line or re.match(amount_pattern, next_line):
            budget_line.append('BLANK')
        # continue adding budget line entries until we reach either
        #    1) the next budget line item
        #    2) the next page
        #    3) the maximum number of columns (helps with edge cases with subtotals)
        #    4) the end of the input
        while i + 1 < n_lines and not (
            re.match(budget_item_pattern, clean_lines[i + 1])
            or re.match(new_page_pattern, clean_lines[i + 1])
        ):
            i += 1
            budget_line.append(clean_lines[i])
            # ">=" so the cap still applies if the count ever starts above it
            if len(budget_line) >= MAX_BUDGET_ENTRY_COLS:
                break
        i += 1
        print('ENTRY:', budget_line)
    else:
        i += 1
        print(f'SKIPPING: {current_line}')

SKIPPING: 
SKIPPING: 352.47
SKIPPING: 
SKIPPING: 
HEADER: 3frbud12.p 76-4 Urbana, IL 06/15/22 Page:116
HEADER: 05.22.02.00.00 TENTATIVE BUDGET-EXPENSES  (Date: 6/2022)  9:52 AM
HEADER: Account Level 2022-23 2021-22 2021-22 2020-21 2020-21
HEADER: FDTLOC FUNC OBJ  SJ
HEADER: Description                           TENTATIVE   Original Budget     FYTD Activity   Original Budget     FYTD Activity
FUND number:50 name:IMRF
ACCOUNT number:780000 name:CCLC-2013
ENTRY: ['50E000', '2620', '2130', '00', '784421', 'BLANK', '', '', '396.92', '', '']
ENTRY: ['50E000', '2620', '2140', '00', '784421', 'BLANK', '', '', '136.20', '', '']
ENTRY: ['50E003', '2900', '2140', '00', '784421', 'BLANK', '', '', '120.80', '', '']
ENTRY: ['50E010', '2900', '2140', '00', '784421', 'BLANK', '', '', '72.53', '', '']
SKIPPING: 
SKIPPING: 
SKIPPING: 
SKIPPING: 
SKIPPING: _________________
SKIPPING: 
SKIPPING: 
SKIPPING: 50 --- ---- ----    78----
SKIPPING: 
SKIPPING: 
SKIPPING: 2,401.73
SKIPPING: 
SKIPPING: 
SKIPPING: 

In [12]:
# total number of cleaned text lines collected from all pages
len(clean_lines)

28229

In [21]:
# inspect the lines near the end of the list
# NOTE: the indices are hard-coded for a list of 28229 lines; the slice stops
# before index 28228, so the final line itself is not included
clean_lines[28150:28228]

 '90',
 'L/S LEVY',
 '000000',
 'UNRESTRICTED',
 '90E008 2542 5202 00 000000',
 'UMS 2020 L/S AMEND #21',
 '',
 '',
 '',
 '112,000.00',
 '113,014.48',
 '90E004 2542 5203 00 000000',
 'WILEY 2020 L/S AMEND #23',
 '',
 '',
 '2,593.00',
 '35,000.00',
 '31,772.23',
 '90E008 2542 5203 00 000000',
 'UMS 2022 L/S AMEND #22 & #23',
 '',
 '',
 '9,552.97',
 '',
 '',
 '90E011 2542 5209 00 000000',
 'TP 2021 L/S AMEND #16',
 '515,000',
 '515,000.00',
 '1,980.00',
 '',
 '',
 '',
 '',
 '________________',
 '_________________',
 '_________________',
 '_________________',
 '_________________',
 '90 --- ---- ----    00----',
 '515,000',
 '515,000.00',
 '14,125.97',
 '147,000.00',
 '144,786.71',
 '',
 '',
 '',
 '',
 '________________',
 '_________________',
 '_________________',
 '_________________',
 '_________________',
 '90 --- ---- ----    ------',
 '515,000',
 '515,000.00',
 '14,125.97',
 '147,000.00',
 '144,786.71',
 '',
 '',
 '______________________________________________________________________