In [31]:
import pandas as pd
import numpy as np
import requests as requests
from IPython.display import display, HTML
from bs4 import BeautifulSoup, UnicodeDammit
import StringIO
import logging
import time
import pickle

import matplotlib.pyplot as plt
%matplotlib inline

In [29]:
def save_to_pickle(data, fname):
    with open(fname, 'wb') as handle:
        pickle.dump(data, handle)


def open_pickle(f_name):
    with open(f_name, 'rb') as f:
        data = pickle.load(f)
    return data

In [2]:
def single_query(link):
    response = requests.get(link)
    if response.status_code != 200:
        print 'WARNING', response.status_code
    else:
        return response.json()

http://www.maricopa.gov/EnvSvc/OnlineApplication/EnvironmentalHealth/BusinessSearchResults?page=2185
http://www.maricopa.gov/EnvSvc/OnlineApplication/EnvironmentalHealth/BusinessSearchResults?page=2186
http://www.maricopa.gov/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=MF-99-0014&i=0

In [164]:
### 'http://www.maricopa.gov/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-00001&i=0'
maricopa = 'http://www.maricopa.gov/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-00001&i=0'

In [166]:
r = requests.get(maricopa, params=None)

INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.maricopa.gov
DEBUG:requests.packages.urllib3.connectionpool:"GET /EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-00001&i=0 HTTP/1.1" 200 14484


In [None]:
payload = {'p': x, 'i': '0'}
r = requests.get(maricopa, params=payload)

In [10]:
## helper functions:
def setup_logger(logger_name, log_file, level=logging.INFO):
    l = logging.getLogger(logger_name)
    formatter = logging.Formatter('%(asctime)s : %(levelname)s : %(message)s')
    fileHandler = logging.FileHandler(log_file, mode='w')
    fileHandler.setFormatter(formatter)
    #streamHandler = logging.StreamHandler()
    #streamHandler.setFormatter(formatter)

    l.setLevel(level)
    l.addHandler(fileHandler)
    #l.addHandler(streamHandler)  

def single_query(link, payload, s=None):
    if s is None:
        response = requests.get('http://www.maricopa.gov'+link, params=payload)
    else:
        response = s.get('http://www.maricopa.gov'+link, params=payload)
    return BeautifulSoup(response.content, from_encoding='UTF-8')

## Functions that directly pull results from PHX website:
def access_results_page(page_no, s=None):
    # convert results page to beautiful soup content:
    link = '/EnvSvc/OnlineApplication/EnvironmentalHealth/BusinessSearchResults'
    payload = {'page':str(page_no)}
    soup = single_query(link, payload, s)

    # Get list of all rows entries (restaurants) on this page
    t = soup.findAll('div',attrs={'class':'col-xs-12 Row regularText'})

    # Extract table info for each restaurant
    # [permit_id, business name, address, cutting edge participant]
    rows = [[y.text for y in x.findAll('div')] for x in t]
    
    # Extract link to each restaurant's page
    permit_links = [[y['href'] for y in x.findAll('a')][0] for x in t]
    
    return permit_links, rows #permit_ids, names

def access_restaurant_page(restaurant_link, s=None):
    # convert restaurant page to beautiful soup content:
    soup = single_query(restaurant_link, None, s)
    
    # Get list of all rows entries (inspections) on this page
    t = soup.findAll('div',attrs={'class':'col-xs-12 Row regularText'})
    
    # Extract link to each inspection's detailed report
    inspection_links = [[y['href'] for y in x.findAll('a')][0] for x in t]
    
    # Extract summary details for each report: 
    # [Date, purpose, grade, priority violation, cutting edge participant]
    inspection_summary = [[y.text for y in x.findAll('div')] for x in t]
    
    return inspection_links, inspection_summary

def access_inspection_page(inspection_link, s=None):
    # convert inspection page to beautiful soup content:
    soup = single_query(inspection_link, None, s)
    
    # Get list of all rows entries (inspections) on this page
    t = soup.findAll('div',attrs={'class':'col-xs-12 Row regularText'})
    
    rows = [[y.text for y in x.findAll('div')] for x in t]
    
    # Extract inspection summary info
    # [grade, priority violation, cutting edge participant]
    inspection_summary = rows[0]
    
    # Extract violation details: 
    # [violation id, violation description, violation comments, correct by]
    violations = rows[1:] if len(rows) > 1 else []
    
    # Extract inspection comments:
    comments = soup.findAll('p',attrs={'class':'col-xs-12 Row regularText'})[0].text
    
    return inspection_summary, violations, comments

## Helper functions that try to access the PHX website
## they report if the connection failed and can be called again
## the ct argument allows you to specify a pause before trying again 
def get_single_result_page(pg, ct, s=None):
    logR = logging.getLogger('logR')
    try: 
        logR.info('Accessing page: %d', pg)
        # Extract link to inspections and general restaurant info:
        permit_links, rest_info = access_results_page(pg, s)
        rest_info = np.array(rest_info)
        
        return pd.DataFrame.from_dict({'link': permit_links,
                                       'permit_id': rest_info[:,0], 
                                       'name': rest_info[:,1],
                                       'address': rest_info[:,2],
                                       'cutting_edge': rest_info[:,3]}), None 
    except Exception as e:
        logR.warning('[%02d] Failed to load: %d --> exception: %s', ct, pg, e)
        time.sleep(ct)
        return None, str(e)

def get_single_restaurant(link, ct, s=None):
    logI = logging.getLogger('logI')
    try: 
        logI.info('Accessing link: %s', link)
        # Extract inspection info:
        i_links, i_summary = access_restaurant_page(link, s)
        i_summary = np.array(i_summary)
        
        return i_links, i_summary, None 
    except Exception as e:
        logI.warning('[%02d] Failed to load: %s --> exception: %s', ct, link, e)
        time.sleep(ct)
        return None, None, str(e)

def get_single_inspection(link, ct, s=None):
    logI = logging.getLogger('logI')
    try: 
        logI.debug('           %s', link)
        # Extract inspection info:
        inspection_summary, violations, comments = access_inspection_page(link, s)
        violations = np.array(violations)

        return inspection_summary, violations, comments, None
    except Exception as e:
        logI.warning('[%02d] Failed to load: %s --> exception: %s', ct, link, e)
        time.sleep(ct)
        return None, None, None, str(e)

## Functions that actually loop through a given subset of pages and store all results in a dataframe
def scrape_restaurant_data(pages, s=None, label=None):
    setup_logger('logR', 'code/logs/scrape_restaurant_%s.log' % label, level=logging.INFO)
    logR = logging.getLogger('logR')
    #logging.basicConfig(filename='logs/scrape_restaurant_%s.log' % label, filemode='w', level=logging.INFO)
    
    ct_max = 10
    ct_error = 0
    R = pd.DataFrame(columns=['permit_id', 'link', 'name', 'address', 'cutting_edge'])
    if s is None:
        s = requests.Session()
    
    # Loop through each results page:
    for pg in pages:
        ct = 0
        e = ''
        
        # Try up to 10 times to load each page. 
        # Adds a progressively longer wait period (proportional to ct) for each retry
        while (type(e) == str) and (ct < ct_max):
            R_single, e = get_single_result_page(pg, ct, s)
            ct += 1
        if type(e) == str:
            logR.error('Failed to load after %d tries. Error: %s', ct_max, e)
            ct_error += 1
        else:
            R = pd.concat([R, R_single], ignore_index=True)
    if ct_error > 0:
        logR.error('Failed to load %d pages.', ct_error)
    return R

def scrape_inspection_data(R, s=None, label=None):
    setup_logger('logI', 'code/logs/scrape_inspection_%04d.log' % label, level=logging.INFO)
    logI = logging.getLogger('logI')
    #logging.basicConfig(filename='logs/scrape_inspection_%04d.log' % label, filemode='w', level=logging.INFO)
    
    ct_max = 10
    ct_error_link = 0
    ct_error_report = 0
    
    I = pd.DataFrame(columns=['inspec_id', 'permit_id', 'link', 'date', 'grade', 'n_priority', 'cutting_edge', 'comments'])
    V = pd.DataFrame(columns=['inspec_id', 'permit_id', 'code', 'description', 'comments', 'correct_by'])
    if s is None:
        s = requests.Session()
    
    # Loop through each restaurant page:
    for link in R.link:
        i_comments = []

        ct = 0
        e = ''
        
        # Try up to 10 times to load each page.
        # Adds a progressively longer wait period (proportional to ct) for each retry
        while (type(e) == str) and (ct < ct_max):
            i_links, i_summary, e = get_single_restaurant(link, ct, s)
            ct += 1
            
            # [Date, purpose, grade, priority violation, cutting edge participant]
            info = np.array(i_summary)
        if type(e) == str:
            logI.error('Failed to load after %d tries. Error: %s', ct_max, e)
            ct_error_link += 1
        else:
            # Extract id's from url
            split_link = map(lambda x: x.replace('&','=').split('='), i_links)
            inspec_id = [x[3] for x in split_link]
            permit_id = [x[1] for x in split_link]
            
            # Loop through each individual inspection report
            for i in xrange(len(i_links)):
                ct_v = 0
                e = ''
                
                # Try up to 10 times to load each inspection report.
                # Adds a progressively longer wait period (proportional to ct_v) for each retry
                while (type(e) == str) and (ct_v < ct_max):
                    _, violations, comments, e = get_single_inspection(i_links[i], ct_v, s)
                    ct_v += 1
                
                if type(e) == str:
                    logI.error('Failed to load after %d tries. Error: %s', ct_max, e)
                    ct_error_report += 1
                else:
                    i_comments.append(comments)
                    
                    # some inspection reports do not include any violations.
                    if violations.size > 0:
                        # Append all violation information to the violation dataframe
                        V = pd.concat([V, pd.DataFrame.from_dict({'inspec_id': inspec_id[i],
                                                                  'permit_id': permit_id[i],
                                                                  'code': violations[:,0],
                                                                  'description': violations[:,1],
                                                                  'comments': violations[:,2],
                                                                  'correct_by': violations[:,3]})])

            # Append all inspection information to the inspection data frame:
            I = pd.concat([I, pd.DataFrame.from_dict({'link': i_links,
                                                      'inspec_id': inspec_id,
                                                      'permit_id': permit_id,
                                                      'date': i_summary[:,0],
                                                      'purpose': i_summary[:,1],
                                                      'grade': i_summary[:,2],
                                                      'n_priority': i_summary[:,3],
                                                      'cutting_edge': i_summary[:,4],
                                                      'comments': i_comments})], ignore_index=True)
    if ct_error_link > 0:
        logI.error('Failed to load %d restaurant pages.', ct_error_link)
    if ct_error_report > 0:
        logI.error('Failed to load %d inspection report pages.', ct_error_report)
    return I, V

In [11]:
start = '50'
end = '100'
file_R = 'data/phx/phoenix_R'
file_I = 'data/phx/phoenix_I'
file_V = 'data/phx/phoenix_V'

In [12]:
s = requests.Session()
R = pickle.load(open('%s_%04d.pkl' % (file_R, int(start))))
I, V = scrape_inspection_data(R, s, int(start))
print '\n Finished scraping inspection data.'
I.to_pickle('%s_%04d.pkl' % (file_I, int(start)))
V.to_pickle('%s_%04d.pkl' % (file_V, int(start)))
print 'I[%04d]: %s   V[%04d]: %s' % (i, I.shape, i, V.shape)

INFO:logI:Accessing link: /EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-00001&i=0
INFO:logI:Accessing link: /EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-00002&i=0
INFO:logI:Accessing link: /EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-00003&i=0
INFO:logI:Accessing link: /EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-00004&i=0
INFO:logI:Accessing link: /EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-00005&i=0
INFO:logI:Accessing link: /EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-00006&i=0
INFO:logI:Accessing link: /EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-00007&i=0
INFO:logI:Accessing link: /EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-00008&i=0
INFO:logI:Accessing link: /EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-00009&i=0
INFO:logI:Accessing link: /E

ValueError: arrays must all be same length


<div class="col-xs-12 Row regularText">
                <div class="col-xs-2 Cell" style="text-align: center"><a href="/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=MF-97-0016&amp;i=0">MF-97-0016</a></div>
                <div class="col-xs-4 Cell" style="text-align: left">Kevins Chuck Wagon</div>
                <div class="col-xs-5 Cell" style="text-align: left"><a href="http://maps.google.com/maps?q=  cold truck   @3 , AZ " target="_blank">  cold truck   @3  </a></div>
                <div class="col-xs-1 cell" style="text-align: left"><div class="Hidden"><img src="" alt=""></div></div>
            </div>

In [95]:
R = scrape_restaurant_data([1,2])

In [107]:
R

Unnamed: 0,_id,address,cutting_edge,link,name
0,FD-00001,120 N Valentine St Wickenburg 85358,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Wickenburg Community Ctr
1,FD-00002,11820 N 81st Ave Peoria 85345,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Peoria Boys & Girls Club
2,FD-00003,613 N 4th Ave Phoenix 85003,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Phoenix Silvercrest
3,FD-00004,8561 N 61st Ave Glendale 85302,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Ywca Senior Citizens
4,FD-00005,7410 E Sutton Pl Scottsdale 85260,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Thunderbird Academy
5,FD-00006,8335 W Jefferson St Peoria 85345,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Peoria Community Center
6,FD-00007,3535 N 27th Ave Phoenix 85017,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Bostrom High School
7,FD-00008,1030 N Hayden Rd Scottsdale 85257,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,St Daniel The Prophet
8,FD-00009,2038 W Van Buren St Phoenix 85009,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,St Matthew's School
9,FD-00010,10323 W Olive Ave Peoria 85345,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Good Samaritan Society/Peoria Good Shep


In [139]:
I, V = scrape_inspection_data(R)

In [140]:
I

Unnamed: 0,comments,cutting_edge,date,grade,inspec_id,link,n_priority,permit_id,purpose
0,This establishment received a(n) A Grade and h...,,04/22/2015,A,3712916,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,0,FD-00001,Routine Inspection
1,This establishment received a(n) A Grade and h...,,10/20/2014,A,3635921,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,,FD-00001,Routine Inspection
2,This establishment received a(n) A Grade and h...,,04/29/2014,A,3564035,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,,FD-00001,Routine Inspection
3,This establishment received a(n) A Grade and h...,,10/30/2013,A,3492563,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,,FD-00001,Routine Inspection
4,Establishment not accessible at time of inspec...,,07/30/2013,,3452234,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,,FD-00001,Ineffective Visit
5,This establishment received a(n) A Grade and h...,,02/27/2013,A,3389445,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,,FD-00001,Routine Inspection
6,This establishment received a(n) A Grade and h...,,11/19/2012,A,3348659,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,,FD-00001,Routine Inspection
7,Establishment not accessible at time of inspec...,,11/14/2012,,3346373,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,,FD-00001,Ineffective Visit
8,This establishment received a(n) A Grade and h...,,07/21/2015,A,3751656,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,0,FD-00002,Routine Inspection
9,This establishment received a(n) A Grade and h...,,03/17/2015,A,3695967,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,0,FD-00002,Routine Inspection


In [141]:
V

Unnamed: 0,code,comments,correct_by,description,inspec_id,permit_id
0,13,"PRIORITY VIOLATION-3-302.11 (A1-2), P: Package...",Corrected At Time Of Inspection,Food separated & protected,3759182,FD-00003
1,37,"Core-3-305.11, C: Food Storage-Preventing Cont...",Corrected At Time Of Inspection,Contamination prevented during food preparatio...,3759182,FD-00003
0,14,"Core-4-602.11 (E), C: Equipment Food-Contact S...",Corrected At Time Of Inspection,Food-contact surfaces: cleaned & sanitized,3730260,FD-00003
0,53,"Core-6-201.11, C: Floors, Walls and Ceilings-C...",Corrected At Time Of Inspection,"Physical facilities installed, maintained, & c...",3694526,FD-00003
0,13,"3-302.11 (A1-2) , P: Packaged and Unpackaged F...",Corrected At Time Of Inspection,Food separated & protected,3632840,FD-00003
1,14,"4-602.11 (E) , C: Equipment Food-Contact Surfa...",Correct Prior To Next Routine Inspection,Food-contact surfaces: cleaned & sanitized,3632840,FD-00003
0,20,"3-501.16(A)(2) and (B) , P: Potentially Hazard...",Corrected At Time Of Inspection,Proper cold holding temperatures,3604048,FD-00003
1,31,"4-301.11, Pf: Cooling, Heating, and Holding Ca...",Correct Prior To Reinspection,Proper cooling methods used; adequate equipmen...,3604048,FD-00003
0,13,"3-302.11 (A1-2) , P: Packaged and Unpackaged F...",Corrected At Time Of Inspection,Food separated & protected,3578769,FD-00003
0,14,"4-601.11(A) , Pf: Equipment, Food-Contact Surf...",Corrected At Time Of Inspection,Food-contact surfaces: cleaned & sanitized,3542655,FD-00003


In [59]:
access_results_page(2185)

(['/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=MF-15-0615&i=0',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=MF-15-0616&i=0',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=MF-15-0617&i=0',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=MF-15-0618&i=0',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=MF-15-0619&i=0',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=MF-15-0620&i=0',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=MF-94-0002&i=0',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=MF-94-0003&i=0',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=MF-94-0004&i=0',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=MF-94-0005&i=0'],
 [[u'MF-15-0615', u'Sopes El Esquisito', u' ', u'', u''],
  [u'MF-15-0616', u'Dogos El Yori', u' ', u'', u'']

In [147]:
logger = logging.getLogger()
logger.handlers

[<logging.StreamHandler at 0x103832fd0>]

In [149]:
# set root logger level
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)

# setup custom logger
logger = logging.getLogger(__name__)
handler = logging.FileHandler('model.log')
handler.setLevel(logging.INFO)
logger.addHandler(handler)

logger.info('test')
logger.debug('test the debug entries')
logger.warning('test the warning entries')

INFO:__main__:test
DEBUG:__main__:test the debug entries


In [169]:
access_results_page(1)

INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.maricopa.gov
DEBUG:requests.packages.urllib3.connectionpool:"GET /EnvSvc/OnlineApplication/EnvironmentalHealth/BusinessSearchResults?page=1 HTTP/1.1" 200 18096


(['/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-00001&i=0',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-00002&i=0',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-00003&i=0',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-00004&i=0',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-00005&i=0',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-00006&i=0',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-00007&i=0',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-00008&i=0',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-00009&i=0',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-00010&i=0'],
 [[u'FD-00001',
   u'Wickenburg Community Ctr',
   u'120 N Valentine St   Wickenburg 85358',
   u'',
   u''],
  [u'FD-00002',
   

In [47]:
access_restaurant_page('/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-00003&i=0')

(['/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodInspection?p=FD-00003&i=3759182',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodInspection?p=FD-00003&i=3738493',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodInspection?p=FD-00003&i=3730260',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodInspection?p=FD-00003&i=3694526',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodInspection?p=FD-00003&i=3632840',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodInspection?p=FD-00003&i=3604624',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodInspection?p=FD-00003&i=3604048',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodInspection?p=FD-00003&i=3578769',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodInspection?p=FD-00003&i=3542655',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodInspection?p=FD-00003&i=3506524',
  '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodInspection?p=FD-00003&i=3479340',
  '/EnvSvc/OnlineApplication/EnvironmentalH

In [69]:
access_inspection_page('/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodInspection?p=FD-00003&i=3759182')

([u'B', u'1', u'', u''],
 [[u'13 ',
   u'Food separated & protected',
   u'PRIORITY VIOLATION-3-302.11 (A1-2), P: Packaged and Unpackaged Food-Separation, Packaging, and Segregation; Protection From Cross Contamination: Observed raw eggs over apple pie filling and butter. Person in Charge rearranged items to be stored properly. Food items must be stored in a manner that prevents cross contamination.',
   u'Corrected At Time Of Inspection'],
  [u'37 ',
   u'Contamination prevented during food preparation, storage & display',
   u'Core-3-305.11, C: Food Storage-Preventing Contamination from the Premises: Observed frozen potatoes stored on ground in walk-in freezer. Person in Charge moved potatoes to shelf. Food must be stored at least 6 inches off ground at all times.',
   u'Corrected At Time Of Inspection']],
 u'This establishment received a(n) B Grade and had 1 Priority, 0 Priority Foundation and 1 Core violations on this inspection.\nNo County legal action will result from this inspec

In [60]:
z = requests.get('http://www.maricopa.gov/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodInspection?p=FD-00003&i=3759182') #, params=payload)

# Create beautiful soup using link's content:
soup = BeautifulSoup(z.content, from_encoding='UTF-8')



In [61]:
# Git list of all rows entries (restaurants) on this page
t = soup.findAll('div',attrs={'class':'col-xs-12 Row regularText'})
t

[<div class="col-xs-12 Row regularText">
 <div class="col-xs-4 Cell">B</div>
 <div class="col-xs-4 Cell">1</div>
 <div class="col-xs-4 Cell"><div class="Hidden"><img alt="" src=""/></div></div>
 </div>, <div class="col-xs-12 Row regularText">
 <div class="col-xs-1 Cell" style="text-align: center">13 </div>
 <div class="col-xs-3 Cell" style="text-align: justify">Food separated &amp; protected</div>
 <div class="col-xs-5 Cell" style="text-align: justify">PRIORITY VIOLATION-3-302.11 (A1-2), P: Packaged and Unpackaged Food-Separation, Packaging, and Segregation; Protection From Cross Contamination: Observed raw eggs over apple pie filling and butter. Person in Charge rearranged items to be stored properly. Food items must be stored in a manner that prevents cross contamination.</div>
 <div class="col-xs-3 Cell" style="text-align: left">Corrected At Time Of Inspection</div>
 </div>, <div class="col-xs-12 Row regularText">
 <div class="col-xs-1 Cell" style="text-align: center">37 </div>
 <di

In [62]:
[[y.text for y in x.findAll('div')] for x in t]

[[u'B', u'1', u'', u''],
 [u'13 ',
  u'Food separated & protected',
  u'PRIORITY VIOLATION-3-302.11 (A1-2), P: Packaged and Unpackaged Food-Separation, Packaging, and Segregation; Protection From Cross Contamination: Observed raw eggs over apple pie filling and butter. Person in Charge rearranged items to be stored properly. Food items must be stored in a manner that prevents cross contamination.',
  u'Corrected At Time Of Inspection'],
 [u'37 ',
  u'Contamination prevented during food preparation, storage & display',
  u'Core-3-305.11, C: Food Storage-Preventing Contamination from the Premises: Observed frozen potatoes stored on ground in walk-in freezer. Person in Charge moved potatoes to shelf. Food must be stored at least 6 inches off ground at all times.',
  u'Corrected At Time Of Inspection']]

In [65]:
soup.findAll('p',attrs={'class':'col-xs-12 Row regularText'})[0].text

u'This establishment received a(n) B Grade and had 1 Priority, 0 Priority Foundation and 1 Core violations on this inspection.\nNo County legal action will result from this inspection.'

In [96]:
test = pd.DataFrame({'a':[1,2,3], 'b':[4,5,6], 'c':[7,8,9],'d':5})

In [97]:
test

Unnamed: 0,a,b,c,d
0,1,4,7,5
1,2,5,8,5
2,3,6,9,5


In [98]:
test.loc[0,'d'] = 10

In [99]:
test

Unnamed: 0,a,b,c,d
0,1,4,7,10
1,2,5,8,5
2,3,6,9,5


In [90]:
pd.concat([test,pd.Data{'b':[7,8], 'c':[10,11], 'a':[4,5]}], axis=1, ignore_index=True)

TypeError: cannot concatenate a non-NDFrame object

In [150]:
test = pd.DataFrame(columns=['a','b','c'])

In [151]:
test

Unnamed: 0,a,b,c


In [104]:
test = '/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodInspection?p=FD-00003&i=3759182'
test.replace('&','=').split('=')

['/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodInspection?p',
 'FD-00003',
 'i',
 '3759182']

In [158]:
if None.empty:
    print 'Yes'
else:
    print 'No'

AttributeError: 'NoneType' object has no attribute 'empty'

In [163]:
print '%02d' % 10

10


In [167]:
type(None)

NoneType

In [180]:
test = [{'a':[1,2,3], 'b':[4,5,6], 'c':[7,8,9],'d':5},
        {'a':[11,12,13], 'b':[14,15,16], 'c':[17,18,19],'d':15},
        {'a':[21,22,23], 'b':[24,25,26], 'c':[27,28,29],'d':25},
        {'a':[31,32,33], 'b':[34,35,36], 'c':[37,38,39],'d':35}]

In [183]:
x = pd.DataFrame.from_dict(test[0])
for d in test[1:]:
    x = pd.concat([x,pd.DataFrame.from_dict(d)], ignore_index=True)
x

Unnamed: 0,a,b,c,d
0,1,4,7,5
1,2,5,8,5
2,3,6,9,5
3,11,14,17,15
4,12,15,18,15
5,13,16,19,15
6,21,24,27,25
7,22,25,28,25
8,23,26,29,25
9,31,34,37,35


In [178]:
test

Unnamed: 0,a,b,c,d
0,"[1, 2, 3]","[4, 5, 6]","[7, 8, 9]",5
1,"[11, 12, 13]","[14, 15, 16]","[17, 18, 19]",15


In [184]:
range(200, 200, 50)

[]

In [185]:
range(200, 201, 50)

[200]

# Combine Dataframe parts

In [22]:
f_list = ['0050','0100','0128','0149','0199','0200','0250','0300','0350','0400','0450','0500',
          '0550','0600','0650','0700','0750','0800','0850','0900','0950','1000','1050','1081',
          '1100','1150','1200','1250','1300','1350','1400','1450','1500','1550','1600','1650',
          '1700','1750','1800','1850','1900','1950','2000','2050']
R_full = pd.DataFrame(columns=['permit_id','link','name','address','cutting_edge'])

for f in f_list:
    R_loop = pickle.load(open('data/phx/phoenix_R_%s.pkl' % f))
    R_full = pd.concat([R_full,R_loop], ignore_index=True)

R_full = R_full[R_full.permit_id.apply(lambda x: x[:3]=='FD-')]
    
R_full.to_pickle('data/phx/phoenix_R_full.pkl')
R_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20316 entries, 0 to 20315
Data columns (total 5 columns):
address         20316 non-null object
cutting_edge    20316 non-null object
link            20316 non-null object
name            20316 non-null object
permit_id       20316 non-null object
dtypes: object(5)
memory usage: 952.3+ KB


In [21]:
R_full.tail()

Unnamed: 0,address,cutting_edge,link,name,permit_id
20311,7602 S Avondale Blvd Avondale 85377,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Americrown,FD-47186
20312,214 W Roosevelt St C-1 Phoenix 85003,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,SoSoBa,FD-47187
20313,7602 S Avondale Blvd Avondale 85377,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Americrown,FD-47188
20314,7602 S Avondale Blvd Avondale 85377,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Americrown,FD-47189
20315,10600 E Crescent Moon Dr Scottsdale 85262,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Nothing Bundt Cakes,FD-47190


In [23]:
R_full.describe()

Unnamed: 0,address,cutting_edge,link,name,permit_id
count,20316,20316.0,20316,20316,20316
unique,14306,1.0,20316,12996,20316
top,1 Cardinal Dr Glendale 85305,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,University Of Phoenix Stadium,FD-19813
freq,93,20316.0,1,82,1


In [29]:
R_full.iloc[1252,:]['link']

'/EnvSvc/OnlineApplication/EnvironmentalHealth/FoodSearchInspection?p=FD-01640&i=0'

In [None]:
badlist = [11055, 11056, 12097, 10834, 12080, 12096, 11301, 12251, 13701, 14239, 12603,
           14238, 15111, 11906, 12979, 14401, 12981, 12980, 15164, 14467, 15165, 11984, 
           15216, 14475, 11985, 14474, 15254, 15255, 15256, 15269, 14500, 15268, 15276,
           14499, 15287, 15304]

In [6]:
i_file = '../data/phx/ls_phoenix_I.txt'
i_cols = ['inspec_id', 'permit_id', 'link', 'date', 'grade', 'n_priority', 'cutting_edge', 'comments']

v_file = '../data/phx/ls_phoenix_V.txt'
v_cols = ['inspec_id', 'permit_id', 'code', 'description', 'comments', 'correct_by']

In [26]:
def concat_files(file_list, col_names, tag):
    f_list = pd.read_csv(file_list, header=None).values.flatten().tolist()
    R_full = pd.DataFrame(columns=col_names)

    for file_ in f_list:
        with open('../data/phx/%s' % file_, 'r') as f:
            R_loop = pickle.load(f)
        R_full = pd.concat([R_full,R_loop], ignore_index=True)

    R_full.to_pickle('../data/phx/phoenix_%s_full.pkl' % tag)
    print R_full.info()

In [27]:
concat_files(i_file, i_cols, 'I')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 171945 entries, 0 to 171944
Data columns (total 9 columns):
comments        171945 non-null object
cutting_edge    171945 non-null object
date            171945 non-null object
grade           171945 non-null object
inspec_id       171945 non-null object
link            171945 non-null object
n_priority      171945 non-null object
permit_id       171945 non-null object
purpose         171945 non-null object
dtypes: object(9)
memory usage: 13.1+ MB
None


In [28]:
concat_files(v_file, v_cols, 'V')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 160978 entries, 0 to 160977
Data columns (total 6 columns):
code           160978 non-null object
comments       160978 non-null object
correct_by     160978 non-null object
description    160978 non-null object
inspec_id      160978 non-null object
permit_id      160978 non-null object
dtypes: object(6)
memory usage: 8.6+ MB
None


# EDA

In [30]:
R = open_pickle('../data/phx/phoenix_R_full.pkl')
I = open_pickle('../data/phx/phoenix_I_full.pkl')
V = open_pickle('../data/phx/phoenix_V_full.pkl')

In [32]:
display(R.head())
display(I.head())
display(V.head())

Unnamed: 0,address,cutting_edge,link,name,permit_id
0,120 N Valentine St Wickenburg 85358,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Wickenburg Community Ctr,FD-00001
1,11820 N 81st Ave Peoria 85345,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Peoria Boys & Girls Club,FD-00002
2,613 N 4th Ave Phoenix 85003,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Phoenix Silvercrest,FD-00003
3,8561 N 61st Ave Glendale 85302,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Ywca Senior Citizens,FD-00004
4,7410 E Sutton Pl Scottsdale 85260,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Thunderbird Academy,FD-00005


Unnamed: 0,comments,cutting_edge,date,grade,inspec_id,link,n_priority,permit_id,purpose
0,This establishment received a(n) A Grade and h...,,04/22/2015,A,3712916,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,0.0,FD-00001,Routine Inspection
1,This establishment received a(n) A Grade and h...,,10/20/2014,A,3635921,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,,FD-00001,Routine Inspection
2,This establishment received a(n) A Grade and h...,,04/29/2014,A,3564035,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,,FD-00001,Routine Inspection
3,This establishment received a(n) A Grade and h...,,10/30/2013,A,3492563,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,,FD-00001,Routine Inspection
4,Establishment not accessible at time of inspec...,,07/30/2013,,3452234,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,,FD-00001,Ineffective Visit


Unnamed: 0,code,comments,correct_by,description,inspec_id,permit_id
0,13,"PRIORITY VIOLATION-3-302.11 (A1-2), P: Package...",Corrected At Time Of Inspection,Food separated & protected,3759182,FD-00003
1,37,"Core-3-305.11, C: Food Storage-Preventing Cont...",Corrected At Time Of Inspection,Contamination prevented during food preparatio...,3759182,FD-00003
2,14,"Core-4-602.11 (E), C: Equipment Food-Contact S...",Corrected At Time Of Inspection,Food-contact surfaces: cleaned & sanitized,3730260,FD-00003
3,53,"Core-6-201.11, C: Floors, Walls and Ceilings-C...",Corrected At Time Of Inspection,"Physical facilities installed, maintained, & c...",3694526,FD-00003
4,13,"3-302.11 (A1-2) , P: Packaged and Unpackaged F...",Corrected At Time Of Inspection,Food separated & protected,3632840,FD-00003


In [33]:
I.shape

(171945, 9)

In [34]:
V.shape

(160978, 6)

In [36]:
V.inspec_id.nunique()

75491

In [37]:
I.inspec_id.nunique()

171933

In [73]:
I['has_v'] = I.inspec_id.isin(V.inspec_id.unique())
save_to_pickle(I, '../data/phx/phoenix_I_full.pkl')

In [39]:
I.describe()

Unnamed: 0,has_v
count,171945
mean,0.439047
std,0.496272
min,False
25%,0
50%,0
75%,1
max,True


In [43]:
I.groupby('grade').describe().unstack(-1)

Unnamed: 0_level_0,has_v,has_v,has_v,has_v,has_v,has_v,has_v,has_v
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
grade,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
,31820,0.045726,0.208893,False,0,0,0.0,True
A,63980,0.262473,0.439982,False,0,0,1.0,True
B,16874,0.995733,0.0651841,False,1,1,1.0,True
C,1957,0.997445,0.0504946,False,1,1,1.0,True
D,529,0.990548,0.0968514,False,1,1,1.0,True
Not Participating,56655,0.669561,0.470375,False,0,1,1.0,True
Re-Inspection,128,0.25,0.434714,False,0,0,0.25,True
Training,2,0.0,0.0,False,0,0,0.0,False


In [44]:
I.permit_id.nunique()

19697

In [45]:
R.permit_id.nunique()

20316

In [72]:
R['has_i'] = R.permit_id.isin(I.permit_id.unique())
R['has_v'] = R.permit_id.isin(I.permit_id[I.has_v].unique())
save_to_pickle(R, '../data/phx/phoenix_R_full.pkl')

In [48]:
R.describe()

Unnamed: 0,has_i,has_v
count,20316,20316
mean,0.969531,0.786228
std,0.171877,0.409978
min,False,False
25%,1,1
50%,1,1
75%,1,1
max,True,True


In [49]:
R.groupby('has_i').describe().unstack(-1)

Unnamed: 0_level_0,has_v,has_v,has_v,has_v,has_v,has_v,has_v,has_v
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
has_i,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
False,619,0.0,0.0,False,0,0,0,False
True,19697,0.810936,0.39157,False,1,1,1,True


In [50]:
R[R.has_i==False].head(10)

Unnamed: 0,address,cutting_edge,link,name,permit_id,has_i,has_v
10835,2150 E Southern Ave Tempe 85282,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Tempe Education & Arts Center,FD-15930,False,False
11055,767 E Broadway Rd Mesa 85204,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Tortas La Presa,FD-16595,False,False
11056,11343 E Apache Tr Apache Junction 85220,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Lucky 7 Market,FD-16597,False,False
11301,767 E Broadway Rd Mesa 85204,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Tortas La Presa,FD-17215,False,False
11906,624 W Broadway Rd Suite 205 Mesa 85210,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Super Carniceria El Tarachi No 3,FD-18629,False,False
11984,975 E Elliot Rd Suite 103 Tempe 85284,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Pizza Chicago Corp,FD-18820,False,False
11985,3400 W Chandler Blvd Suite 5 Chandler 85226,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Tonic,FD-18821,False,False
12080,1615 W Camelback Rd Suite 108 Phoenix 85015,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Little Caesar's,FD-19000,False,False
12096,1910 S Gilbert Rd Mesa 85204,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Tilted Kilt,FD-19032,False,False
12097,7337 E Shea Blvd Suite 100 Scottsdale 85258,,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,Dickey's BBQ,FD-19033,False,False


In [58]:
I.query('has_v == False & grade == "B"').head(10)

Unnamed: 0,comments,cutting_edge,date,grade,inspec_id,link,n_priority,permit_id,purpose,has_v
2225,This establishment received a(n) A Grade and h...,,02/14/2014,B,3531780,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,,FD-00260,Routine Inspection,False
6470,,,08/28/2015,B,3768991,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,0.0,FD-00832,Routine Inspection,False
6984,Due to the violations noted on this inspection...,,12/12/2012,B,3359846,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,,FD-00891,Routine Inspection,False
8117,This establishment received a B Grade and had ...,,10/08/2015,B,3787119,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,0.0,FD-01042,Routine Inspection,False
8370,This establishment received a(n) A Grade and h...,,11/27/2012,B,3351469,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,,FD-01070,Routine Inspection,False
12248,This establishment is on the Cutting Edge of f...,,04/29/2015,B,3715818,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,0.0,FD-01537,Verification Visit,False
16709,This report is for the E & D permit. This insp...,,02/20/2014,B,3533633,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,,FD-02087,Routine Inspection,False
20487,,,05/14/2015,B,3722624,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,0.0,FD-02612,Routine Inspection,False
26084,This establishment received a(n) B Grade and h...,,11/07/2014,B,3645201,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,,FD-03391,Routine Inspection,False
28774,,,04/08/2015,B,3706741,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,0.0,FD-03740,Routine Inspection,False


In [60]:
I.query('has_v == False & grade == "B"').comments.tolist()

[u'This establishment received a(n) A Grade and had 0 Priority, 0 Priority Foundation and 0 Core violations on this inspection.\nNo County legal action will result from this inspection.',
 '',
 u'Due to the violations noted on this inspection report, County legal action may result from this inspection.\n*As of a month ago, owner started preparing burritos on the weekends. States that he has made menudo on the weekends for at least 10 years with only a bakery permit. Informed manager that an LAR will be completed and that he must get an E & D permit if he continues to make the burritos and menudo.\n\nCh. VII Reg. 4-5- department issued manager card requirement not met...certified food manager card expired in 2010. Has certificate and printed off sheet w/information and numbers to call to get his card renewed.',
 u'This establishment received a B Grade and had 1 Priority, 0 Priority Foundation and 0 Core  violations on this inspection. No County legal action will result from this inspect

In [68]:
I.query('has_v == True & grade == "B"').comments\
    .apply(lambda x: x.find("This establishment received a(n) A Grade") >= 0).sum()

72

In [69]:
I.query('has_v == True & grade == "B"')[I.query('has_v == True & grade == "B"').comments\
    .apply(lambda x: x.find("This establishment received a(n) A Grade") >= 0)].head()

Unnamed: 0,comments,cutting_edge,date,grade,inspec_id,link,n_priority,permit_id,purpose,has_v
3491,No County legal action will result from this i...,,05/12/2014,B,3569508,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,,FD-00434,Routine Inspection,True
12185,No County legal action will result from this i...,,06/12/2014,B,3582664,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,,FD-01529,Routine Inspection,True
14323,No County legal action will result from this i...,,06/17/2015,B,3737103,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,0.0,FD-01805,Complaint Inspection,True
16766,This establishment received a(n) A Grade and h...,,08/13/2013,B,3457839,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,,FD-02094,Routine Inspection,True
17518,This establishment received a(n) A Grade and h...,,08/14/2015,B,3762532,/EnvSvc/OnlineApplication/EnvironmentalHealth/...,0.0,FD-02200,Routine Inspection,True


In [70]:
I.date.min()

u'01/02/2013'

In [71]:
I.date.max()

u'12/31/2014'