In [1]:
import os
import pandas as pd
import tabula
from tabula.io import read_pdf

# Work from the folder that holds the PDF; the relative file names used in
# this notebook resolve against it.
os.chdir("/Users/ygemara/Downloads/West_Hempstead")

# Pull every table out of every page of the PDF (tabula's lattice-mode
# extraction); the result is a list with one DataFrame per detected table.
file_name = "West_Hempstead.pdf"
dfs = read_pdf(file_name, pages='all', lattice=True)

# Short, readable replacements for the 13 original headers
# (the extracted names were long and poorly formatted).
column_names = [
    "Property Info",
    "Name and Address",
    "Codes",
    "Full Market Value",
    "Land Assessed Value",
    "Total Assessed Value",
    "Exempt Code",
    "Exemption Amount",
    "Village Codes",
    "Rate Codes",
    "Tax District Percent",
    "Total Taxable Value Town",
    "Total Taxable Value County",
]

# Stack all tables into one DataFrame and keep only the first 13 columns
# (some tables came through with a ghost 14th column), then apply the names.
all_dfs = pd.concat(dfs).iloc[:, :13]
all_dfs.columns = column_names

# The first three columns pack several values into each cell and are expanded
# in later cells; everything from "Full Market Value" onward is kept as-is and
# set aside here so it can be joined back at the end.
all_other_columns_df = all_dfs[column_names[3:]]

In [2]:
# Columns that already hold a single value per cell: they need no splitting
# and are carried through unchanged to the final table.
passthrough_columns = [
    "Full Market Value",
    "Land Assessed Value",
    "Total Assessed Value",
    "Exempt Code",
    "Exemption Amount",
    "Village Codes",
    "Rate Codes",
    "Tax District Percent",
    "Total Taxable Value Town",
    "Total Taxable Value County",
]
all_other_columns_df = all_dfs[passthrough_columns]

## Splitting First column into separate columns

In [3]:
def _split_property_info(cell):
    r"""Split one "Property Info" cell into its five leading fields.

    The cell text carries one field per "\r"-separated line, in the order
    Section-Block-Lot, Lot Grouping, Address, Lot Size, Liber, followed by a
    long trailing code that is not needed. Many cells have no Lot Grouping
    line; they are recognised by the lot size (the line containing 'acres')
    sitting one position early, and a blank is inserted in the Lot Grouping
    slot so the remaining fields stay aligned with their columns.

    Returns a list of exactly five items. A cell that is not a string (for
    example the NaN pandas stores for an empty cell) yields five blank
    strings; a cell with fewer lines than expected is padded with None.
    """
    if not isinstance(cell, str):
        return [""] * 5
    parts = cell.split("\r")
    # Check the length first: a short cell has no third line to inspect.
    if len(parts) > 2 and 'acres' in parts[2]:
        parts.insert(1, "")
    # Keep the first five fields only and pad short rows to a fixed width.
    parts = parts[:5]
    return parts + [None] * (5 - len(parts))


# create multiple columns from the first column's rows (each cell is split on the "\r" character)
all_series = [_split_property_info(cell) for cell in all_dfs["Property Info"]]

# create a dataframe of 5 columns from the original first column
first_column_df = pd.DataFrame(all_series, columns=['Section-Block-Lot', 'Lot Grouping', 'Address', 'Lot Size', 'Liber'])

## Splitting Second column into separate columns

In [4]:
def _address_parts(cell):
    """Return the "\\r"-separated lines of one 'Name and Address' cell.

    A float cell cannot be split (this came up in one of the 10,000 pages),
    so it is replaced by four blank strings.
    """
    if isinstance(cell, float):
        return ["", "", "", ""]
    return cell.split("\r")


# one list of lines per row, plus a helper frame recording how many lines each cell had
address_series = all_dfs['Name and Address'].apply(_address_parts)
address_series_df = address_series.to_frame(name="Second_Column")
address_series_df["length"] = address_series_df["Second_Column"].apply(len)
all_address_series = address_series_df["Second_Column"].tolist()

# Same idea as for the first column: a blank is inserted so values stay under
# the intended column names. The position of the blank depends on how many
# lines the cell has: 4 lines -> blank at position 1, 5 lines -> blank at position 2.
# NOTE(review): for a five-line cell the blank lands in the third slot, so after
# truncation 'Street Address' is blank and the cell's third line ends up under
# 'City-State-Zip' - confirm this matches the PDF layout.
blank_position_by_length = {4: 1, 5: 2}
for parts in all_address_series:
    blank_position = blank_position_by_length.get(len(parts))
    if blank_position is not None:
        parts.insert(blank_position, "")

# only the first four entries of each row are kept
all_address_series = [parts[:4] for parts in all_address_series]

# create a dataframe of 4 columns from the original second column
second_column_df = pd.DataFrame(
    all_address_series,
    columns=['Name 1', 'Name 2', 'Street Address', 'City-State-Zip'],
)

In [5]:
# Split each "Codes" cell on the "\r" character. A cell that is not a string
# (e.g. the NaN pandas stores for an empty cell) has no .split method, so it is
# replaced by six blank strings - the same treatment the 'Name and Address'
# column gets - instead of raising AttributeError.
codes_series = all_dfs['Codes'].apply(lambda x: x.split("\r") if isinstance(x, str) else [""] * 6)
codes_series_df = pd.DataFrame(data=codes_series)
codes_series_df.columns = ["Third_Column"]
all_codes_series = codes_series_df["Third_Column"].tolist()

# create a dataframe of 6 columns from the original third column
third_column_df = pd.DataFrame(all_codes_series, columns=['Roll Section', 'SWIS Code', "Sch SWIS Code", 'School Code', 'PUC - Class', "Percent Value"])



In [6]:
# merge first, second, third dataframes with the rest of the columns.
# The three split frames are numbered 0..n-1, while all_other_columns_df still
# carries the index it inherited from all_dfs, so its index is reset to make
# the rows line up by position. drop=True discards that old index; without it
# the old index would be written into final_df as an extra "index" column.
final_df = pd.concat(
    [
        first_column_df,
        second_column_df,
        third_column_df,
        all_other_columns_df.reset_index(drop=True),
    ],
    axis=1,
)

In [7]:
# Save the combined table as a CSV (relative path, so it lands in the current
# working directory); index=False leaves the DataFrame's row index out of the file.
output_csv = "All_Properties.csv"
final_df.to_csv(output_csv, index=False)