In [None]:
import pandas as pd
import numpy as np
import missingno
import logging
import time
import getpass
from tqdm import tqdm_notebook
from selenium import webdriver
from selenium.common.exceptions import NoSuchFrameException
from address_scraping import *

## Path to chromedriver.exe inside the current Windows user's profile
## (the user name is looked up at run time with getpass.getuser()).
cp = fr'C:\Users\{getpass.getuser()}\AppData\Local\Microsoft\WindowsApps\chromedriver.exe'
## Show matplotlib figures inline in the notebook output
%matplotlib inline

## Logging configuration
## Root handler: records carry a month-day hour:minute timestamp, the logger
## name, the level and the message.
logging.basicConfig(
    datefmt='%m-%d %H:%M',
    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
)

## Notebook-level logger. Its threshold is INFO, so the `_log.debug(...)`
## calls in the later cells stay silent unless this level is lowered.
_log = logging.getLogger(__name__)
_log.setLevel(logging.INFO)

In [None]:
## Load the spreadsheet, reading every column as a string
p = r'D:\LoveAkron\Orgs, Households, Addresses by Donation-2020-02-06-14-43-49.xlsx'
df = pd.read_excel(io=p, dtype=str)

## Preview the first five rows, then draw the missing-value matrix
display(df.head(n=5))
display(missingno.matrix(df))

In [None]:
## Keep a single row per full name. `counts` stores how many rows carried
## that name before the drop, so the removed rows can be related back later.
df['full_name'] = df['First Name'] + ' ' + df['Last Name']
name_frequencies = df['full_name'].value_counts()
df['counts'] = df['full_name'].map(name_frequencies)

_log.debug(f"Pre-drop we had {df.shape[0]} observations")
## Keeps the first occurrence of each name and modifies `df` in place
df.drop_duplicates(subset='full_name', keep='first', inplace=True)
_log.debug(f"Post-drop we had {df.shape[0]} observations")

In [None]:
## Scrape target and output location
## Property-tax search page; not referenced by name in the other cells shown here
fiscal_site = 'https://fiscaloffice.summitoh.net/index.php/property-tax-search'
## Sample first/last name pair; not used by the other cells shown here
test_fn = 'Turner'
test_ln = 'Anderson'
## Output folder; the result file name is appended directly to this string.
## The raw string ends in two literal backslashes (a raw string cannot end
## in a single one).
out_p = r'D:\LoveAkron\web_data\housing\\'

In [None]:
## Open a browser object
## Starts Chrome through Selenium using the chromedriver path stored in `cp`.
## NOTE(review): newer Selenium 4 releases no longer accept `executable_path`
## and expect a `Service` object instead -- confirm against the installed version.
browser = webdriver.Chrome(executable_path=cp)

In [None]:
## State shared with the scraping loop. It lives in its own cell, so
## re-running only the loop cell continues with the same row iterator,
## result frame and row counter instead of resetting them.
i = 0
final_data = pd.DataFrame(
    columns=[
        'first_name',
        'last_name',
        'lives_here',
        'address',
        'city',
        'zipcode',
    ]
)
iterator = df.iterrows()

In [None]:
## Scrape one address record per remaining row, pausing between requests.
##
## NOTE(review): 2279 is a hard-coded offset subtracted from the progress-bar
## total -- presumably the number of rows already consumed from `iterator`
## in an earlier run. Confirm, and set it to 0 when starting from scratch.
## It only affects the progress display, never which rows are processed.
rows_already_done = 2279
remaining_rows = max(df.shape[0] - rows_already_done, 0)

for _row_label, record in tqdm_notebook(iterator, desc='Scraping in Progress:', total=remaining_rows):
    fn = record['First Name']
    ln = record['Last Name']
    _log.debug(f'Current individual: {fn} {ln}')

    # Cells that were empty in the spreadsheet come back as NaN, and NaN is
    # truthy, so a plain `if fn and ln` would pass NaN on to the scraper.
    # Skip missing and empty names explicitly (no request, no pause).
    if pd.isna(fn) or pd.isna(ln) or not (fn and ln):
        continue

    row_data = scrape_addresses(fn, ln, browser)
    if row_data:
        # Append at the next free integer label; `i` is defined in the
        # preceding cell and persists across re-runs of this cell.
        final_data.loc[i] = row_data
        i += 1

    # Pause 10 s plus a normally distributed extra (mean 12 s, sd 3 s).
    # Clamped at zero because time.sleep() raises ValueError on a negative
    # argument and a normal sample is unbounded below.
    time.sleep(max(0.0, 10 + np.random.normal(12, 3)))

In [None]:
## Shut down the browser session.
## quit() is used instead of close(): close() only closes the current window,
## while quit() ends the whole WebDriver session, including the background
## chromedriver process that would otherwise be left running.
browser.quit()

In [None]:
## Write the scraped rows to a pipe-delimited file, leaving out the index
csv_path = f'{out_p}full_data.csv'
final_data.to_csv(csv_path, sep='|', index=False)