In [1]:
import pickle

import pandas as pd
import numpy as np

from selenium import webdriver

from selenium.webdriver.common.keys import Keys

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions

import time
import random
import os


In [2]:
def set_ff_preferences():
    """Create a Firefox profile whose download preferences point at ./data/addresses.

    Returns:
        The ``webdriver.FirefoxProfile`` with the four download-related
        preferences below applied, in the order listed.
    """
    download_dir = os.getcwd() + '/data/addresses'
    preferences = (
        ('browser.download.folderList', 2),  # custom location
        ('browser.download.dir', download_dir),
        ('browser.download.manager.showWhenStarting', False),
        ('browser.helperApps.neverAsk.saveToDisk', "application/csv, text/csv"),
    )

    ff_profile = webdriver.FirefoxProfile()
    for pref_name, pref_value in preferences:
        ff_profile.set_preference(pref_name, pref_value)
    return ff_profile

def create_empty_ff_driver(executable_path='/Applications/geckodriver'):
    """Start a Firefox WebDriver session using the download-configured profile.

    Args:
        executable_path: Filesystem path of the geckodriver binary. The default
            is the location this notebook was written against; pass another
            path when geckodriver is installed elsewhere.

    Returns:
        The ``webdriver.Firefox`` instance. No page has been loaded yet.
    """
    profile = set_ff_preferences()
    # NOTE(review): `executable_path` / `firefox_profile` are believed to be
    # deprecated keyword arguments in Selenium 4 -- confirm against the
    # installed Selenium version before upgrading.
    driver = webdriver.Firefox(executable_path=executable_path, firefox_profile=profile)
    return driver

def create_driver_opening_url(a_url):
    """Launch a new Firefox driver and load ``a_url`` in it.

    Args:
        a_url: Address to open once the browser has started.

    Returns:
        The driver produced by ``create_empty_ff_driver`` after ``get(a_url)``.
    """
    browser = create_empty_ff_driver()
    browser.get(a_url)
    return browser

def clean_from_text(text):
    """Return ``text`` with leading and trailing whitespace removed.

    Commas and all other interior characters are left untouched.
    """
    return text.strip()

In [3]:
def navigate_search_page(a_driver, parcel_id):
    """Fill in the parcel-id search form and submit it.

    Selects the parcel-id radio button, replaces the contents of the parcel
    number input with ``parcel_id``, then clicks the search button.

    Args:
        a_driver: Selenium driver currently showing the search page.
        parcel_id: Text typed into the parcel number input.
    """
    def _await_element(xpath):
        # Fresh wait (timeout 10) per lookup; resolves once the node is present.
        locator = (By.XPATH, xpath)
        return WebDriverWait(a_driver, 10).until(
            expected_conditions.presence_of_element_located(locator))

    # Switch the form to parcel-id mode before touching the input.
    _await_element('//*[@id="search_radio_parcel_id"]').click()

    id_field = _await_element('//*[@id="parcel_number"]')
    id_field.clear()
    id_field.send_keys(parcel_id)

    # The search button is addressed by absolute XPath.
    _await_element('/html/body/div[1]/div[3]/div[2]/div[3]/form[3]/div[2]/button[1]').click()

    
def navigate_parcel_page(a_driver):
    """Click the link this notebook calls the tax-distribution button.

    Waits (timeout 10) for the eighth anchor at a fixed absolute XPath on the
    parcel page to be present, then clicks it.

    Args:
        a_driver: Selenium driver currently showing a parcel page. The wait is
            bound to this driver rather than to a notebook-level global, so
            the function works with whichever session the caller passes in.
    """
    tax_distribution_button = WebDriverWait(a_driver, 10).until(
        expected_conditions.presence_of_element_located(
            (By.XPATH, '/html/body/div/div[4]/div/div[2]/div[1]/a[8]')))

    tax_distribution_button.click()
    
def get_gross_RE_tax(a_driver):
    """Read the gross real-estate tax cell (table 1, row 1, column 2) as a float.

    Args:
        a_driver: Selenium driver currently showing the tax table.

    Returns:
        float: The cell text with commas removed and '$' stripped from both ends.
    """
    cell_xpath = '/html/body/div/div[3]/div[2]/div/div[2]/div[3]/table[1]/tbody/tr[1]/td[2]'
    cell_present = expected_conditions.presence_of_element_located((By.XPATH, cell_xpath))
    cell = WebDriverWait(a_driver, 10).until(cell_present)

    # e.g. "$1,234.56" -> "$1234.56" -> "1234.56"
    without_commas = cell.text.replace(',', '')
    return float(without_commas.strip('$'))

def get_delinquent_RE_tax(a_driver):
    """Read the delinquent real-estate tax cell (table 1, row 10, column 2) as a float.

    Args:
        a_driver: Selenium driver currently showing the tax table.

    Returns:
        float: The cell text with commas removed and '$' stripped from both ends.
    """
    cell_xpath = '/html/body/div/div[3]/div[2]/div/div[2]/div[3]/table[1]/tbody/tr[10]/td[2]'
    cell_present = expected_conditions.presence_of_element_located((By.XPATH, cell_xpath))
    cell = WebDriverWait(a_driver, 10).until(cell_present)

    # e.g. "$1,234.56" -> "$1234.56" -> "1234.56"
    without_commas = cell.text.replace(',', '')
    return float(without_commas.strip('$'))

def navigate_to_data(a_driver, parcel_id):
    """Go from the search page to the tax data for one parcel.

    Runs the search for ``parcel_id`` and then follows the parcel page's
    tax-distribution link, both on ``a_driver``.
    """
    navigate_search_page(a_driver, parcel_id)
    navigate_parcel_page(a_driver)
    
def navigate_back_to_search(a_driver):
    """Return the browser to the search page.

    Steps back twice through the browser history. If either step raises an
    ordinary exception, loads the site's main URL directly instead.

    Args:
        a_driver: Selenium driver to move back to the search page.
    """
    try:
        a_driver.back()
        a_driver.back()
    except Exception:
        # Deliberately not a bare `except:` so that KeyboardInterrupt and
        # SystemExit still propagate and a long scrape can be stopped.
        a_driver.get('https://wedge1.hcauditor.org/')

def scrape_pages(a_driver, addresses_df_section):
    """Scrape delinquent and gross RE tax for every row of an address table.

    For each row the parcel is looked up by its 'Parcel Number', the two tax
    figures are read, and the row's 'delinquent RE', 'gross RE tax',
    'periods delinquent' and 'processed?' cells are written in place. After
    every row, successful or not, the browser is sent back to the search page.

    Args:
        a_driver: Selenium driver showing the search page. All navigation uses
            this driver; no notebook-level global is consulted.
        addresses_df_section: DataFrame with a 'Parcel Number' column plus the
            four result columns listed above. It is modified in place.

    Returns:
        tuple: ``(addresses_df_section, fails)`` where ``fails`` is the list of
        parcel numbers whose navigation, scraping or write-back raised.
    """
    fails = []

    for index, row in addresses_df_section.iterrows():
        parcel_id = row['Parcel Number']

        try:
            navigate_to_data(a_driver, parcel_id)

            del_RE_tax = get_delinquent_RE_tax(a_driver)
            gross_RE_tax = get_gross_RE_tax(a_driver)

            addresses_df_section.at[index, 'delinquent RE'] = del_RE_tax
            addresses_df_section.at[index, 'gross RE tax'] = gross_RE_tax

            # Guard the division: a zero in either figure means zero periods.
            if gross_RE_tax == 0 or del_RE_tax == 0:
                periods = 0.0
            else:
                periods = np.round(del_RE_tax / gross_RE_tax, decimals=3)
            addresses_df_section.at[index, 'periods delinquent'] = periods

            addresses_df_section.at[index, 'processed?'] = True
        except Exception:
            # Best effort per row: record the parcel and move on. `Exception`
            # (not a bare except) keeps Ctrl-C able to stop a long run.
            fails.append(parcel_id)

        navigate_back_to_search(a_driver)

    return addresses_df_section, fails

def get_address_df():
    """Load the pickled address table and add zeroed result columns.

    Reads ``data/pickle_dumps/addresses.pickle`` under the current working
    directory, then sets 'delinquent RE', 'gross RE tax' and
    'periods delinquent' to 0.0 and 'processed?' to False on every row.

    Returns:
        The unpickled object with the four columns assigned.

    Raises:
        OSError: If the pickle file cannot be opened.
    """
    pickle_path = os.path.join(os.getcwd(), 'data', 'pickle_dumps', 'addresses.pickle')

    # NOTE(review): pickle.load can execute arbitrary code from the file; only
    # use this with a pickle this project produced itself.
    # `with` closes the handle even when unpickling raises.
    with open(pickle_path, 'rb') as pickle_in:
        addresses_df = pickle.load(pickle_in)

    addresses_df['delinquent RE'] = 0.0
    addresses_df['gross RE tax'] = 0.0
    addresses_df['periods delinquent'] = 0.0
    addresses_df['processed?'] = False

    return addresses_df

def initialize_driver():
    """Start a browser session on the site's main URL and return its driver."""
    return create_driver_opening_url('https://wedge1.hcauditor.org/')


In [4]:
# Load the pickled address table; the result columns start at 0.0 / False.
addresses_df = get_address_df()

In [5]:
# Launches a Firefox window on the site's main URL; `driver` is the session used by the cells below.
driver = initialize_driver()

In [6]:
# Trial run on the first 10 rows; `fails` holds the parcel numbers that could not be scraped.
# NOTE(review): scrape_pages writes into the frame it is given; whether those writes
# also reach `addresses_df` through `.head(10)` depends on pandas view/copy semantics -- confirm.
section, fails = scrape_pages(driver, addresses_df.head(10))

In [16]:
# (rows, columns) of the full address table.
# NOTE(review): this cell's execution count (16) is higher than the next cell's (13), so the cells were run out of order.
addresses_df.shape

(820653, 9)

In [13]:
# Preview the first five rows, including the result columns.
addresses_df.head()

Unnamed: 0,Parcel Number,Name,Address,Sale Date,Sale Price,delinquent RE,gross RE tax,periods delinquent,processed?
0,110-0001-0072-00,CASH BRET P,1 LENOX LN,1/27/2010,"$217,000",0.0,0.0,0.0,True
1,110-0001-0071-00,JACOBS JOHN & BRITTANI,2 LENOX LN,9/21/2017,"$220,000",0.0,0.0,0.0,True
2,110-0001-0069-00,KEARNEY ERIC H & JAN-MICHELE LEMON KEARNEY,3 LENOX LN,2/22/2017,,0.0,0.0,0.0,True
3,110-0001-0070-00,EVANS ALICE V,4 LENOX LN,4/23/1980,"$11,000",0.0,0.0,0.0,True
4,110-0001-0068-00,RAY ANN MARIE,5 LENOX LN,7/14/2011,"$120,000",0.0,0.0,0.0,True


In [23]:
# NOTE(review): `df` is not defined in any cell shown here, so this fails on a fresh
# kernel -- presumably it came from a deleted or unsaved cell; confirm which frame was meant.
df.shape

(28820, 9)