In [None]:
# TODO: combine this scraping with the steps that open and sort the pickled data
# (getting the last transfer date from transfers)

In [1]:
import pickle

import pandas as pd
import numpy as np

from selenium import webdriver

from selenium.webdriver.common.keys import Keys

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions

import time
import random
import os


In [2]:
def set_ff_preferences():
    """Build the Firefox profile used by the scraper.

    The profile sends downloads to ``<cwd>/data/addresses`` and saves CSV
    responses straight to disk without showing the download manager or a
    save prompt.

    Returns:
        The configured ``webdriver.FirefoxProfile``.
    """
    ff_profile = webdriver.FirefoxProfile()

    preferences = (
        # folderList == 2 selects a custom download location
        ('browser.download.folderList', 2),
        ('browser.download.dir', (os.getcwd()+'/data/addresses')),
        ('browser.download.manager.showWhenStarting', False),
        ('browser.helperApps.neverAsk.saveToDisk', "application/csv, text/csv"),
    )
    for pref_name, pref_value in preferences:
        ff_profile.set_preference(pref_name, pref_value)

    return ff_profile

def create_empty_ff_driver(executable_path='/Applications/geckodriver'):
    """Start a Firefox WebDriver session with the scraper's download profile.

    Args:
        executable_path: Filesystem path of the geckodriver binary. The
            default is the path this notebook originally hard-coded, so
            existing zero-argument calls behave exactly as before; pass a
            different path when geckodriver lives elsewhere.

    Returns:
        A ``webdriver.Firefox`` instance. No page has been loaded yet.
    """
    profile = set_ff_preferences()
    driver = webdriver.Firefox(executable_path=executable_path, firefox_profile=profile)
    return driver

def create_driver_opening_url(a_url):
    """Create a fresh Firefox driver and load ``a_url`` in it.

    Args:
        a_url: URL to open immediately after the browser starts.

    Returns:
        The driver, positioned on ``a_url``.
    """
    browser = create_empty_ff_driver()
    browser.get(a_url)
    return browser

def clean_from_text(text):
    """Return ``text`` with leading and trailing whitespace removed.

    Commas inside the text are left untouched.
    """
    return text.strip()

In [3]:
def initialize_driver():
    """Open a new browser on the auditor site's landing page.

    Returns:
        A driver that has already loaded ``https://wedge1.hcauditor.org/``.
    """
    return create_driver_opening_url('https://wedge1.hcauditor.org/')

def navigate_search_page(a_driver, parcel_id):
    """Run a parcel-ID search from the site's search page.

    Selects the parcel-ID radio button, types ``parcel_id`` into the parcel
    number box (after clearing it) and clicks the search button. Each element
    is waited for (up to 10 seconds) before it is used.

    Args:
        a_driver: Selenium driver currently showing the search page.
        parcel_id: Parcel number to type into the search box.
    """
    def wait_for(xpath):
        # Block until the element is present in the DOM, then hand it back.
        locator = (By.XPATH, xpath)
        return WebDriverWait(a_driver, 10).until(
            expected_conditions.presence_of_element_located(locator))

    wait_for('//*[@id="search_radio_parcel_id"]').click()

    parcel_box = wait_for('//*[@id="parcel_number"]')
    parcel_box.clear()
    parcel_box.send_keys(parcel_id)

    # Absolute path to the submit button of the third form on the page.
    wait_for('/html/body/div[1]/div[3]/div[2]/div[3]/form[3]/div[2]/button[1]').click()

def navigate_to_distributions(a_driver):
    """Click through from a parcel's result page to its tax-distribution view.

    Waits up to 10 seconds for the navigation link to be present, then
    clicks it.

    Args:
        a_driver: Selenium driver currently showing a parcel's result page.
    """
    # Wait on the driver that was passed in. The original waited on the
    # notebook-global ``driver``, which only worked when the caller happened
    # to have bound its browser to that exact name.
    tax_distribution_button = WebDriverWait(a_driver, 10).until(
        expected_conditions.presence_of_element_located(
            # Eighth link in the page's navigation block.
            # NOTE(review): positional XPath - confirm it still targets the
            # tax distribution link if the site layout changes.
            (By.XPATH, '/html/body/div/div[4]/div/div[2]/div[1]/a[8]')))

    tax_distribution_button.click()
    
def navigate_to_data(a_driver, parcel_id):
    """Go from the search page to the tax-distribution view for one parcel.

    Args:
        a_driver: Selenium driver currently showing the search page.
        parcel_id: Parcel number to look up.
    """
    # Step 1: run the parcel search.
    navigate_search_page(a_driver, parcel_id)
    # Step 2: follow the link to the distribution data.
    navigate_to_distributions(a_driver)
        
def get_address(a_driver):
    """Read the parcel address shown on the current page.

    Waits up to 10 seconds for the address element to be present.

    Args:
        a_driver: Selenium driver showing a parcel's data page.

    Returns:
        The element text with its first 8 characters removed.
    """
    locator = (By.XPATH, '/html/body/div/div[3]/div[1]/div[2]')
    address_element = WebDriverWait(a_driver, 10).until(
        expected_conditions.presence_of_element_located(locator))
    full_text = address_element.text

    # Presumably the first 8 characters are a fixed label in front of the
    # address - TODO confirm against the live page.
    return full_text[8:]

def get_land_value(a_driver):
    """Read the land value from the current parcel page as a float.

    Waits up to 10 seconds for the table cell to be present, then removes
    thousands separators and leading/trailing ``$`` characters before
    converting.

    Args:
        a_driver: Selenium driver showing a parcel's data page.

    Returns:
        The cell's amount as a ``float``.

    Raises:
        ValueError: if the cleaned cell text is not a valid number.
    """
    locator = (By.XPATH, '/html/body/div/div[3]/div[2]/div/div[2]/div[1]/table/tbody/tr[1]/td[2]')
    value_cell = WebDriverWait(a_driver, 10).until(
        expected_conditions.presence_of_element_located(locator))

    raw_amount = value_cell.text
    return float(raw_amount.replace(',','').strip('$'))

def get_building_value(a_driver):
    """Read the building value from the current parcel page as a float.

    Waits up to 10 seconds for the table cell to be present, then removes
    thousands separators and leading/trailing ``$`` characters before
    converting.

    Args:
        a_driver: Selenium driver showing a parcel's data page.

    Returns:
        The cell's amount as a ``float``.

    Raises:
        ValueError: if the cleaned cell text is not a valid number.
    """
    # NOTE(review): this XPath is identical to the one in get_land_value
    # (row 1, column 2), so both functions read the same cell. This looks
    # like a copy-paste slip; the building value is presumably in another
    # row - confirm against the live page and correct the XPath.
    locator = (By.XPATH, '/html/body/div/div[3]/div[2]/div/div[2]/div[1]/table/tbody/tr[1]/td[2]')
    value_cell = WebDriverWait(a_driver, 10).until(
        expected_conditions.presence_of_element_located(locator))

    raw_amount = value_cell.text
    return float(raw_amount.replace(',','').strip('$'))
    
def get_gross_RE_tax(a_driver):
    """Read the gross real-estate tax from the current parcel page as a float.

    Waits up to 10 seconds for the cell in row 1 of the first tax table to
    be present, then removes thousands separators and leading/trailing ``$``
    characters before converting.

    Args:
        a_driver: Selenium driver showing a parcel's data page.

    Returns:
        The cell's amount as a ``float``.

    Raises:
        ValueError: if the cleaned cell text is not a valid number.
    """
    locator = (By.XPATH, '/html/body/div/div[3]/div[2]/div/div[2]/div[3]/table[1]/tbody/tr[1]/td[2]')
    tax_cell = WebDriverWait(a_driver, 10).until(
        expected_conditions.presence_of_element_located(locator))

    raw_amount = tax_cell.text
    return float(raw_amount.replace(',','').strip('$'))

def get_delinquent_RE_tax(a_driver):
    """Read the delinquent real-estate tax from the current parcel page.

    Waits up to 10 seconds for the cell in row 10 of the first tax table to
    be present, then removes thousands separators and leading/trailing ``$``
    characters before converting.

    Args:
        a_driver: Selenium driver showing a parcel's data page.

    Returns:
        The cell's amount as a ``float``.

    Raises:
        ValueError: if the cleaned cell text is not a valid number.
    """
    locator = (By.XPATH, '/html/body/div/div[3]/div[2]/div/div[2]/div[3]/table[1]/tbody/tr[10]/td[2]')
    tax_cell = WebDriverWait(a_driver, 10).until(
        expected_conditions.presence_of_element_located(locator))

    raw_amount = tax_cell.text
    return float(raw_amount.replace(',','').strip('$'))

def navigate_back_to_search(a_driver):
    """Return the browser to the search page after visiting a parcel.

    Steps back twice through the browser history. If either step raises,
    the search page is loaded directly instead.

    Args:
        a_driver: Selenium driver to navigate.
    """
    try:
        a_driver.back()
        a_driver.back()
    except Exception:
        # Catch Exception rather than using a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit can still stop a long scrape.
        a_driver.get('https://wedge1.hcauditor.org/')

def scrape_pages(a_driver, parcel_df_section):
    """Scrape address, valuation and tax figures for every parcel in a batch.

    For each row, the parcel's data page is opened and the scraped values are
    written into ``parcel_df_section`` in place (columns ``address``,
    ``land_value``, ``building_value``, ``delinquent_re_tax``,
    ``gross_re_tax`` and ``periods_delinquent``). The browser is sent back to
    the search page after every parcel, whether or not scraping succeeded.

    Args:
        a_driver: Selenium driver positioned on the search page.
        parcel_df_section: DataFrame with a ``parcel_number`` column; it is
            modified in place.

    Returns:
        A tuple ``(parcel_df_section, fails)`` where ``fails`` lists the
        parcel numbers whose page could not be reached or whose tax figures
        could not be read.
    """
    fails = []

    for index, row in parcel_df_section.iterrows():
        parcel_number = row['parcel_number']

        try:
            navigate_to_data(a_driver, parcel_number)

            # Address and valuations are best-effort: if they cannot be read,
            # the row keeps whatever values it already had and the parcel is
            # NOT recorded as a failure.
            try:
                parcel_df_section.at[index, 'address'] = get_address(a_driver)
            except Exception:
                pass

            try:
                parcel_df_section.at[index, 'land_value'] = get_land_value(a_driver)
                parcel_df_section.at[index, 'building_value'] = get_building_value(a_driver)
            except Exception:
                pass

            # Tax figures are the essential data: a failure here is recorded.
            try:
                del_RE_tax = get_delinquent_RE_tax(a_driver)
                gross_RE_tax = get_gross_RE_tax(a_driver)

                parcel_df_section.at[index, 'delinquent_re_tax'] = del_RE_tax
                parcel_df_section.at[index, 'gross_re_tax'] = gross_RE_tax

                # Guard the ratio against division by zero (and skip the
                # division when nothing is delinquent).
                if gross_RE_tax == 0 or del_RE_tax == 0:
                    parcel_df_section.at[index, 'periods_delinquent'] = 0.0
                else:
                    parcel_df_section.at[index, 'periods_delinquent'] = np.round(del_RE_tax/gross_RE_tax, decimals = 3)

            except Exception:
                fails.append(parcel_number)
        except Exception:
            # Could not reach the parcel's data page at all.
            fails.append(parcel_number)

        # Use the driver passed in; the original referenced the
        # notebook-global ``driver`` here.
        navigate_back_to_search(a_driver)

    return parcel_df_section, fails




In [4]:
# Load the unpaid-parcel list and add the columns the scraper fills in,
# each initialised to an empty/zero placeholder.
unpaid_df = pd.read_excel('./data/unpaid.xlsx').assign(
    address='',
    delinquent_re_tax=0.0,
    gross_re_tax=0.0,
    periods_delinquent=0.0,
    land_value=0.0,
    building_value=0.0,
)


In [5]:
# Split the parcel list into 290 roughly equal batches so that each batch can
# be scraped with its own browser session.
batches = np.array_split(unpaid_df, 290)

# Snapshot the not-yet-scraped batches. The ``with`` block guarantees the
# file is flushed and closed even if pickling raises.
with open(os.getcwd()+'/data/pickle_dumps/batches.pickle', 'wb') as pickle_out:
    pickle.dump(batches, pickle_out)

In [6]:
batch_num_fail = []        # indices of batches that raised before finishing
batch_scrape_fails = []    # the corresponding batch DataFrames
ind_scrape_fails = []      # per-batch lists of parcel numbers that failed

scraped_parcels_batches = []

for index, batch in enumerate(batches):
    # Reset each iteration so the cleanup below never touches a browser left
    # over from a previous batch when initialize_driver() itself fails.
    driver = None
    try:
        driver = initialize_driver()
        scraped_parcels_df, fails = scrape_pages(driver, batch)
        ind_scrape_fails.append(fails)
        scraped_parcels_batches.append(scraped_parcels_df)

        # scrape_pages fills ``batch`` in place, so this stores the scraped values.
        with open(os.getcwd()+'/data/pickle_dumps/batch'+str(index)+'.pickle', 'wb') as pickle_out:
            pickle.dump(batch, pickle_out)

    except Exception:
        # ``Exception`` rather than a bare ``except:`` so Ctrl-C can still
        # interrupt the run instead of being logged as a failed batch.
        batch_num_fail.append(index)
        batch_scrape_fails.append(batch)
    finally:
        # Always shut the browser down, including when the batch failed;
        # otherwise every failed batch leaks a Firefox/geckodriver process.
        # quit() ends the whole session, not just the current window.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                # Best-effort cleanup: a browser that is already gone must
                # not abort the remaining batches.
                pass
    

In [7]:
# Persist the failure bookkeeping from the scrape loop. Each file is written
# inside its own ``with`` block so it is flushed and closed; previously only
# the last handle opened was closed when more than one branch ran.
if len(batch_num_fail) > 0:
    with open(os.getcwd()+'/data/pickle_dumps/batch_num_fail.pickle', 'wb') as pickle_out:
        pickle.dump(batch_num_fail, pickle_out)

if len(batch_scrape_fails) > 0:
    with open(os.getcwd()+'/data/pickle_dumps/batch_scrape_fails.pickle', 'wb') as pickle_out:
        pickle.dump(batch_scrape_fails, pickle_out)

# NOTE(review): this branch is gated on ``ind_scrape_fails`` (which gets one
# entry per completed batch, even when that entry is empty) but re-dumps
# ``batches`` over batches.pickle rather than saving the failure lists.
# Presumably this is meant to save the scraped batches - confirm the intent.
if len(ind_scrape_fails) > 0:
    with open(os.getcwd()+'/data/pickle_dumps/batches.pickle', 'wb') as pickle_out:
        pickle.dump(batches, pickle_out)

In [8]:
# pickle_out has already been closed by the time this cell runs, so this
# extra close() is a harmless no-op.
pickle_out.close()

In [9]:
# Indices of the batches whose scrape raised before completing.
batch_num_fail

[]

In [10]:
# One inner list per completed batch, holding the parcel numbers that
# scrape_pages reported as failed for that batch.
ind_scrape_fails

[[],
 [],
 [],
 ['0240001006300'],
 ['0270002003400'],
 [],
 [],
 [],
 [],
 [],
 ['0370001038700', '0370002037500', '0370003022100'],
 [],
 [],
 [],
 [],
 ['0460008000700', '0500002004200'],
 ['0500007033000'],
 ['0530001018500'],
 ['0540004003200'],
 ['0540005008100', '0540006003600', '0550001002700'],
 ['0550002007300', '0550004006000'],
 ['0560004001700', '0570003005100'],
 ['0570006010500', '0580003006200'],
 ['0580005007200', '0580006002900', '0590006014600'],
 ['0600003010200'],
 ['0620001011000', '0630001002900'],
 ['0640002004000', '0650001002100'],
 ['0650003008900', '0660002012300', '0660003005500'],
 ['0670001017000', '0670002014200', '0680001027900'],
 ['0680002030800', '0680003004600', '0690001017200'],
 ['0700001004500', '0700002015490', '0730002024100'],
 ['0750004027800', '0770002025100', '0790004000600'],
 ['0810002014200', '0810004032300'],
 ['0850003000200', '0870004008000'],
 ['0870006002800', '0890001001100'],
 ['0900002000300', '0900003009200'],
 ['0920004004600']

In [11]:
# Flatten the per-batch failure lists into one list of parcel numbers,
# keeping batch order and the order within each batch.
scrape_fails = [
    parcel_number
    for batch_fails in ind_scrape_fails
    for parcel_number in batch_fails
]

In [13]:
# Total number of parcels recorded as failed across all batches.
len(scrape_fails)

370