In [1]:
from tika import parser
import csv
import glob
import os
import re
import sys
import pandas as pd

# Path of the PDF statement; this is the file handed to Tika further down.
input_file = "./data/Arby/40721_Arby_Bottom_2017-12-31.pdf"

# First command-line argument; raises IndexError when the script is started
# without one.
# NOTE(review): not referenced in the visible code, which parses input_file
# instead -- confirm whether it is meant to replace the hard-coded path.
input_path = sys.argv[1]

def create_df(pdf_content, page_pattern, content_pattern, line_pattern, column_headings):
    """Create a Pandas DataFrame from lines of text in a PDF.

    Arguments:
    pdf_content -- all of the text Tika parses from the PDF
    page_pattern -- a pattern that identifies the page needed; its first
        capture group is the text that is searched for the table
    content_pattern -- a pattern that identifies the table; its first
        capture group is the table text, one row per line
    line_pattern -- a pattern that separates the category name (group 1)
        from the values (group 2); it is applied case-insensitively
    column_headings -- the list of column headings for the DataFrame

    Returns:
    A DataFrame with one row per table line that matches line_pattern.
    The category has its spaces removed and the values are kept as strings
    with '$' and ',' stripped and '(x)' rewritten as '-x'.

    Raises:
    ValueError -- if page_pattern or content_pattern does not match
    """
    # Both searches use DOTALL so '.' can run across line breaks.
    page_match = re.search(page_pattern, pdf_content, re.DOTALL)
    if page_match is None:
        raise ValueError(
            "page_pattern {!r} was not found in the PDF text".format(page_pattern))

    content_match = re.search(content_pattern, page_match.group(1), re.DOTALL)
    if content_match is None:
        raise ValueError(
            "content_pattern {!r} was not found on the selected page".format(content_pattern))

    list_of_line_items = []
    # One table row per line of the captured table text.
    for item in content_match.group(1).split('\n'):
        # Use line_pattern to separate the category and values to group 1 and group 2
        line_match = re.search(line_pattern, item, re.I)
        if line_match is None:
            # Blank lines (and anything else the pattern cannot split) carry
            # no row data; skip them instead of failing on None.
            continue

        # Grab the category name, strip surrounding whitespace, and remove inner spaces
        category = line_match.group(1).strip().replace(' ', '')

        # Grab the dollar values, strip whitespace, remove $s, ), and commas, and replace ( with -
        values_string = (
            line_match.group(2)
            .strip()
            .replace('$', '')
            .replace(',', '')
            .replace('(', '-')
            .replace(')', '')
        )

        list_of_line_items.append([category] + values_string.split())

    # Convert to dataframe with headings
    return pd.DataFrame(list_of_line_items, columns=column_headings)

# Regular expressions handed to create_df to locate the table.
# page_pattern: group 1 starts at "the Year" and runs up to the confidentiality footer.
page_pattern = r'(the Year.*)(Proprietary Confidential Business Information)'
# content_pattern: group 1 starts at " Opening Equity" and stops before the
# "Remaining Commitment" line.
content_pattern = r'( Opening Equity.*?)+\nRemaining Commitment'
# value_pattern: group 1 is the row label, group 2 the run of amount characters.
value_pattern = r'([a-z, ]+)([$,\(,\), \., 0-9 -]+)'

# Headings of the resulting DataFrame: the label plus the two amount columns.
columns = ['Category', 'Total Fund', 'Investor Allocation']

# Let Tika parse the PDF and keep only the extracted text.
parsedPDF = parser.from_file(input_file)
pdf = parsedPDF["content"]

# Clean the text up before matching, one substitution at a time.
pdf = pdf.replace('\n\n', '\n')  # collapse double newlines
# Presumably so '/' and the parentheses do not break the row label -- confirm.
pdf = pdf.replace('Gain/(Loss)', 'GainLoss')
pdf = pdf.replace('46_1274', '')  # drop the literal '46_1274'
print(pdf)

# Build the table from the cleaned text and show it.
expense_df = create_df(pdf, page_pattern, content_pattern, value_pattern, columns)
print(expense_df)


















Bottom Investor LLC
Capital Account Statement - GAAP Basis
For the Quarter ended December 31, 2017
Investor: The Northwestern Mutual Life Insurance Company
Total Fund Investor's Allocation
10/01/2017 Opening Equity $                  57,229,248.1 $        858,438.72
Total Contributions 68,513,344.47 1,027,700.17
Total Distributions (18,152.85) (272.29)
Net GainLoss Components:
Realized GainLoss 19,473,103.99 292,096.56
Professional Fees (131,812.29) (1,977.18)
Other Expenses (3,531.03) (52.97)
Change in Unrealized 6,671,162.63 100,067.44
Net GainLoss Total 26,008,923.3 390,133.85
Idle Funds Interest Income 4.5 0.07
Equity Transfer 8.43 0.13
12/31/2017 Closing Equity $                  151,733,375.95 $        2,276,000.64
Monitor Hamilton Lane
monitor@hamiltonlane.com
Proprietary Confidential Business Information

Bottom Investor LLC
Capital Account Statement - GAAP Basis
For the Year ended December 31, 2017
Investor: The Northwestern Mutual Life Insurance Company
Total F