In [1]:
import os

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup, UnicodeDammit
from selenium import webdriver

import import_madison_health as mad

### Basic Selenium Interface:

In [264]:
# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to point at your own copy; when it is unset, the original
# hard-coded default below is used.
path_to_chromedriver = os.environ.get('CHROMEDRIVER_PATH',
                                      '/Users/tracy/Desktop/chromedriver')

# Start a Selenium-driven Chrome session; `browser` is reused by later cells.
browser = webdriver.Chrome(executable_path = path_to_chromedriver)

In [265]:
# Open the health-inspection search page.
url = 'https://elam.cityofmadison.com/HealthInspections/Default.aspx?AcceptsCookies=1'
browser.get(url)

# Replace whatever is in the search box with our search term.
search_term = 'a'
search_content = browser.find_element_by_id('MainContent_txtSearchEstablishment')
search_content.clear()
search_content.send_keys(search_term)

# Submit the search by clicking the search button.
search_button = browser.find_element_by_id('MainContent_btnSearch')
search_button.click()

## Design waterfall of search terms that will capture all Madison Restaurants:

In [187]:
# Run every candidate search term and record the restaurant-level info each
# one returns. Terms are tried in order: vowels (plus 'y'), then the remaining
# letters of the alphabet, then the numeric special case '608'.
search_list2 = list('aeiouy') + list('bcdfghjklmnpqrstvwxz') + ['608']
rest_list2 = []

# `seen2` holds the ids of restaurants already captured by earlier terms; it is
# handed to each search so those restaurants are ignored / not counted again.
seen2 = set()
for term in search_list2:
    restaurants = mad.search_restaurants(browser, term, seen2, False)
    rest_list2.append(restaurants)
    seen2.update(restaurants.keys())
    print('Term: %s    # New Restaurants: %d' % (term, len(restaurants)))

Term: a    # New Restaurants: 1823
Term: e    # New Restaurants: 316
Term: i    # New Restaurants: 113
Term: o    # New Restaurants: 61
Term: u    # New Restaurants: 6
Term: y    # New Restaurants: 0
Term: b    # New Restaurants: 0
Term: c    # New Restaurants: 0
Term: d    # New Restaurants: 3
Term: f    # New Restaurants: 0
Term: g    # New Restaurants: 0
Term: h    # New Restaurants: 0
Term: j    # New Restaurants: 0
Term: k    # New Restaurants: 0
Term: l    # New Restaurants: 0
Term: m    # New Restaurants: 0
Term: n    # New Restaurants: 0
Term: p    # New Restaurants: 0
Term: q    # New Restaurants: 0
Term: r    # New Restaurants: 0
Term: s    # New Restaurants: 0
Term: t    # New Restaurants: 0
Term: v    # New Restaurants: 0
Term: w    # New Restaurants: 0
Term: x    # New Restaurants: 0
Term: z    # New Restaurants: 0
Term: 608    # New Restaurants: 1


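#### As can be seen above, the vowel searches capture all but 4 Madison-area restaurants: 3 are picked up by the 'd' search and 1 by the '608' search.
"Jd's" (which covers the 3 restaurants found by 'd') and "608" are the only special cases that need to be added to the vowel search list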

In [245]:
rest_list = []
search_list = ['a','e','i','o','u','y',"Jd's",'608']
seen = set()

# Grab restaurant-level information for all search terms in search_list.
# `seen` accumulates the ids returned so far and is passed to each search.
for term in search_list:
    restaurants = mad.search_restaurants(browser, term, seen, False)
    rest_list.append(restaurants)
    seen.update(restaurants.keys())
    print('Term: %s    # New Restaurants: %d' % (term, len(restaurants)))

# NOTE(review): save_to_pickle is not defined or imported anywhere in the
# visible part of this notebook; presumably it comes from
# import_madison_health (like mad.open_pickle) -- confirm before re-running.

# Split the 'a' search into 5 chunks (for parallelization).
# This is done by making 5 'seen' files for the 'a' search, where each file
# lists every 'a' id EXCEPT those in one chunk (i.e. roughly 4/5 of the ids).
N_A_CHUNKS = 5
a_ids = list(rest_list[0].keys())
# Ceiling division keeps the number of chunks at N_A_CHUNKS regardless of how
# many restaurants the 'a' search returned (it gives 365 for the 1823 ids seen
# here). max(1, ...) avoids a zero step when the search returned nothing.
chunk_size = max(1, -(-len(a_ids) // N_A_CHUNKS))
a_list = [a_ids[i:i + chunk_size] for i in range(0, len(a_ids), chunk_size)]
for i, chunk in enumerate(a_list):
    # Set membership instead of scanning the chunk list once per id.
    chunk_ids = set(chunk)
    save_to_pickle([x for x in a_ids if x not in chunk_ids], 'madison_a_%d.pkl' % i)

# Write out cumulative 'seen' files, which describe restaurant id's that have
# already been downloaded by previous searches. This step allows for
# parallelization of the final download process.
#
# Each entry is (index into rest_list, label used in the file name). Index 5
# ('y') has no file of its own, matching the original cell.
#
# NOTE(review): the 'a' file used to be written to
# '../yelp-health/data/mad/madison_a_full.pkl' and contained the whole dict
# rather than just its ids; it now follows the same directory and id-list
# format as every other 'seen' file. Confirm no consumer relied on the old
# location/format.
seen_file_steps = [(0, 'a'), (1, 'e'), (2, 'i'), (3, 'o'), (4, 'u'),
                   (6, 'Jd'), (7, '608')]

# Work on a copy so rest_list[0] is not silently grown by the updates below
# (which would make this cell give different results when re-run).
r = dict(rest_list[0])
for idx, label in seen_file_steps:
    r.update(rest_list[idx])
    save_to_pickle(list(r.keys()), '../data/mad/madison_%s_full.pkl' % label)

Term: a    # New Restaurants: 1823
Term: e    # New Restaurants: 316
Term: i    # New Restaurants: 113
Term: o    # New Restaurants: 61
Term: u    # New Restaurants: 6
Term: y    # New Restaurants: 0
Term: Jd's    # New Restaurants: 3
Term: 608    # New Restaurants: 1


## Exploratory Analysis

In [4]:
# Load the pickled data that the exploratory analysis below works from.
mad_health_path = '../data/mad/mad_health_2.pkl'
R_2 = mad.open_pickle(mad_health_path)

In [5]:
# from_dict puts the top-level keys of R_2 on the columns; transpose so that
# each top-level key becomes a row instead, then preview the first rows.
df_2 = pd.DataFrame.from_dict(R_2).transpose()
df_2.head()

Unnamed: 0,address,inspections,name,type
MainContent_10143-2421,"2505 MONROE ST\nMADISON, WI 53711",{u'MainContent_2349540': {u'date': u'2/5/2013'...,Laurel Tavern,Primarily Restaurant
MainContent_1269-798,"4402 E WASHINGTON AVE\nMADISON, WI 53704",{u'MainContent_2390670': {u'date': u'6/27/2013...,Crowne Plaza Madison,Primarily Restaurant
MainContent_1580-1041,"22 S CARROLL ST\nMADISON, WI 53703",{u'MainContent_2587815': {u'date': u'6/12/2015...,Inn On The Park,Primarily Restaurant
MainContent_1811-1189,"554 W MAIN ST\nMADISON, WI 53703",{u'MainContent_2624458': {u'date': u'10/2/2015...,Echo Tap & Grill,Primarily Restaurant
MainContent_18705-4236,"523 STATE ST\nMADISON, WI 53703",{u'MainContent_2581271': {u'date': u'4/28/2015...,Mondays,Establisment Type not defined
