## This is a scraper that collects business listing information (including ratings and review counts) from Yelp using YelpAPI

In [1]:
## Import packages
from yelpapi import YelpAPI
import pandas as pd

In [2]:
## A valid Yelp client ID and API key must be filled in before running.
client_id = api_key = ''

In [11]:
## First, let's define our "search" function according to Yelp API guidelines
def search(api_key, term, location, limit, offset):
    """Run one Yelp search query and return the businesses as a DataFrame.

    Args:
        api_key: API key handed to ``YelpAPI``.
        term: Forwarded to ``search_query`` as ``term``.
        location: Forwarded to ``search_query`` as ``location``.
        limit: Forwarded to ``search_query`` as ``limit``.
        offset: Forwarded to ``search_query`` as ``offset``.

    Returns:
        A DataFrame with one row per entry of ``response['businesses']`` and
        one column per key found in those entries. When the response holds
        no businesses, an empty DataFrame is returned.
    """
    yelp_api = YelpAPI(api_key)
    response = yelp_api.search_query(term=term,
                                     location=location,
                                     limit=limit,
                                     offset=offset)
    # Build the frame in a single step from the list of business records:
    # DataFrame.append was removed in pandas 2.0, and reading the column
    # names from businesses[0] raised IndexError on an empty result page.
    return pd.DataFrame(response['businesses'])

In [4]:
## Next, let's define a helper function to unpack the output from the API call
def unpack_location(loc):
    """Flatten one location mapping into a labelled pandas Series.

    The 'city', 'state', 'zip_code' and 'country' entries are copied as-is;
    the items of 'display_address' are joined with single spaces and stored
    under the label 'address'.
    """
    labels = ['city', 'state', 'zip_code', 'country']
    values = [loc[label] for label in labels]
    values.append(' '.join(loc['display_address']))
    return pd.Series(values, index=labels + ['address'])

In [5]:
## Now let's define the "collect" function, which combines the 2 functions defined above
def collect_info(api_key, term, location, limit, offset, max_offset=100):
    """Page through search results and stack the pages into one DataFrame.

    Calls ``search`` repeatedly, starting at ``offset`` and advancing by
    ``limit`` after each page, until ``offset`` reaches ``max_offset`` or a
    page comes back empty. Each page's 'location' column is expanded with
    ``unpack_location`` and the page is then reduced to a fixed set of
    columns.

    Args:
        api_key: Passed through to ``search``.
        term: Passed through to ``search``.
        location: Passed through to ``search``.
        limit: Page size passed to ``search``; also the offset step.
        offset: Offset of the first page to request.
        max_offset: Paging stops once the offset is no longer below this
            value. Defaults to 100, the bound that was previously hard-coded.

    Returns:
        A DataFrame with the columns id, name, is_closed, url, review_count,
        rating, city, state, zip_code and country, re-indexed from 0. It has
        those columns and no rows when no page produced any data.

    Raises:
        ValueError: If ``limit`` is less than 1, because the offset could
            then never advance.
    """
    if limit < 1:
        raise ValueError('limit must be at least 1')

    keep_cols = ['id', 'name', 'is_closed', 'url', 'review_count', 'rating',
                 'city', 'state', 'zip_code', 'country']
    pages = []
    while offset < max_offset:
        output = search(api_key, term, location, limit, offset)
        if output.empty:
            # No more results to page through.
            break
        output = pd.concat([output, output['location'].apply(unpack_location)], axis=1)
        pages.append(output[keep_cols])
        # Step by the page size actually requested; a fixed step of 50
        # skipped results whenever limit was smaller than 50.
        offset += limit

    if not pages:
        return pd.DataFrame(columns=keep_cols)
    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0.
    return pd.concat(pages, ignore_index=True)

In [7]:
## Finally, the "main" function wraps all the functions above
def main(api_key, term, location, limit, offset):
    """Return the DataFrame produced by ``collect_info`` for the given search."""
    return collect_info(api_key, term, location, limit, offset)

In [8]:
## Test run: restaurant searches in Harlem, Manhattan, NY
## NOTE(review): the recorded output below shows 500 rows, but collect_info
## stops paging once the offset reaches 100 -- confirm which is intended.
term, location = 'restaurant', 'Harlem, Manhattan, NY'
limit, offset = 50, 0

df = main(api_key, term, location, limit, offset)
# Store review counts as floats.
df = df.astype({'review_count': float})
df.shape

(500, 10)

In [9]:
## The first rows of the result look like this
df.head()

Unnamed: 0,id,name,is_closed,url,review_count,rating,city,state,zip_code,country
0,2iW5TU4W0BN8LYBzVmO9Qw,The Honey Well,False,https://www.yelp.com/biz/the-honey-well-new-yo...,245.0,4.5,New York,NY,10031,US
1,K_0tNvQd7gyabfJrodf-tw,Archer & Goat,False,https://www.yelp.com/biz/archer-and-goat-new-y...,122.0,4.5,New York,NY,10026,US
2,ej_pg-wc-ZtexQKPPiQ_5w,The Edge Harlem,False,https://www.yelp.com/biz/the-edge-harlem-new-y...,646.0,4.0,New York,NY,10030,US
3,i9OqZT0dLMPC1TbMtyaIWQ,The Noodle,False,https://www.yelp.com/biz/the-noodle-new-york?a...,112.0,4.5,New York,NY,10027,US
4,zWem2SAbFc7lIYR2y9DE0g,Harlem Public,False,https://www.yelp.com/biz/harlem-public-new-yor...,1016.0,4.0,New York,NY,10031,US


In [10]:
## Uncomment the line below to save the dataframe locally as a CSV file
#df.to_csv('yelp_listings.csv')