In [1]:
from yelp.client import Client
from yelp.oauth1_authenticator import Oauth1Authenticator
from bs4 import BeautifulSoup
import urllib
import re

### Define functions

In [2]:
def connect_API(consumer_key=None, consumer_secret=None, token=None, token_secret=None):
    """ Yelp API authorization
    :param consumer_key: Yelp API consumer_key
    :param consumer_secret: Yelp API consumer_secret
    :param token: Yelp API token
    :param token_secret: Yelp API token_secret
    :return: Yelp API client object
    """
    auth = Oauth1Authenticator(consumer_key, consumer_secret, token, token_secret)
    return Client(auth)

def get_businesses(client, offset, keywords, location):
    """
    get Yelp business objects
    :param client: Yelp API client object
    :param keywords: name of a dish
    :param location: city name
    :return: a list of Yelp business object from responses
    """
    params = {
        'term': keywords,
        'offset': offset
    }
    return client.search(location, **params).businesses


def get_data(business_id, keywords):
    """
    pull reviews and photos
    :param business_id: a Yelp business id
    :param keywords: name of a dish
    :return: a list of reviews and a list of photos relative to the keywords
    """

    # data in 1st page
    reviews = []
    photos = []
    SearchURL = 'https://www.yelp.com/biz_photos/%s?tab=food' % business_id
    try:
        yelppage = BeautifulSoup(urllib.urlopen(SearchURL).read(), "lxml")
#         print yelppage
        total_pages = yelppage.find_all('div', {'class': "page-of-pages arrange_unit arrange_unit--fill"})[0].get_text()
        total_pages = int(re.findall('of ([0-9]*)', total_pages)[0])
        reviews, photos = get_data_single_page(yelppage, keywords)
        # data in other pages (only get data from first 5 pages; otherwise, it's going to be super slow)
        if total_pages > 8:
            total_pages = 8
        for page in range(1, total_pages):
            num = 30*page
            SearchURL = 'https://www.yelp.com/biz_photos/%s?start=%s&tab=food' % (business_id, str(num))
            yelppage = BeautifulSoup(urllib.urlopen(SearchURL).read(), "lxml")
            r, p = get_data_single_page(yelppage, keywords)
            reviews += r
            photos += p
    except UnicodeError:
        pass
    return reviews, photos

def get_data_single_page(yelppage, keywords):
    """
    helper method for get_data
    :param yelppage: BeautifulSoup object
    :return: a list of reviews and a list of photos relative to the keywords
    """
    results = yelppage.find_all('img', {'alt': re.compile( keywords, re.IGNORECASE)})
    reviews = []
    photos = []
    for result in results:
        r = result.get("alt").split(".")
        if len(r) > 1:
            reviews.append(r[1])
            photos.append(result.get("src"))
    return reviews, photos

# def find_food(keywords, location):
#     """
#     get restaurant information from searching results
#     :param keywords: keywords for the dish
#     :param location: city name
#     :return: a dictionary of result (key - business_id, value - review list, photo list, business object)
#     """

#     client = connect_API(consumer_key, consumer_secret, token, token_secret)
#     businesses_list = get_businesses(client, keywords, location)

#     info_dict = {}
#     for business in businesses_list[:1]:
#         id = business.id
#         # print id
#         reviews, photos = get_data(id, keywords)
#         if len(reviews) > 0:
#             info_dict[id] = {"reviews": reviews}
#             info_dict[id]["photos"] = photos
#             info_dict[id]["business_obj"] = business
#     return info_dict

def getJson_location(business_list):
    SPACE = "  "

    headers = ["yelp_id", "address", "display_address", "city", "state_code", "postal_code",
               "country_code", "cross_streets", "neighborhoods", "latitude", "longitude"]
    json = "{\n" + SPACE + '"headers": ["' + '", "'.join(headers) + '"],\n' \
           + SPACE + '"data": [\n'\
           + SPACE * 2 
    ii = 0
    for business in businesses_list:
        print ii, business.id
        try:
            str(business.name)
            yelp_id = business.id
            address = business.location.address
            display_address = business.location.display_address
            city = business.location.city
            state_code = business.location.state_code
            postal_code = business.location.postal_code
            country_code = business.location.country_code
            cross_streets = business.location.cross_streets
            neighborhoods = business.location.neighborhoods
            latitude = business.location.coordinate.latitude
            longitude = business.location.coordinate.longitude

            json += '{\n'
            for i in headers:
                json += '"' + i + '":"' + str(eval(i)) + '", \n'
            if ii == len(businesses_list)-1:
                json +=  SPACE * 2 + '}\n'
            else:
                json +=  SPACE * 2 + '},\n'
            ii += 1
        except UnicodeError:
            pass
    json += SPACE + "\n" + SPACE + "]" + "\n}"
    return json

def getJson_business(businesses_list):
    SPACE = "  "

    headers = ["yelp_id", "name", "website", "phone", "rating", "review_count",
               "address", "yelp_deals", "reviews", "photos"]
    json = "{\n" + SPACE + '"headers": ["' + '", "'.join(headers) + '"],\n' \
           + SPACE + '"data": [\n'\
           + SPACE * 2 
    ii = 0
    for business in businesses_list:
        print ii, business.id
        try:
            str(business.name)
            yelp_id = business.id
            name = business.name
            website = business.url
            phone = business.phone
            rating = business.rating
            review_count = business.review_count
            address = None
            if len(business.location.address) > 0:
                address = business.location.address[0]
            yelp_deals = business.deals
            reviews, photos = get_data(yelp_id, name)
            for j in range(len(reviews)):
                json += '{\n'
                for i in headers:
                    if i == "photos":
                        json += '"' + i + '":' + '"' + photos[j] + '" \n' 
                    elif i == "reviews":
                        json += '"' + i + '":' + '"' + reviews[j].replace('"', '') + '", \n' 
                    else:
                        json += '"' + i + '":"' + str(eval(i)) + '", \n'
                if ii == len(businesses_list)-1:
                    json +=  SPACE * 2 + '}\n'
                else:
                    json +=  SPACE * 2 + '},\n'
            ii += 1
        except UnicodeError:
            pass
    json += SPACE + "\n" + SPACE + "]" + "\n}"
    return json

 ### Connect Yelp API and get search responses

In [3]:
# main
consumer_key="8WhklGO_09rGkQZy9xMBQQ"
consumer_secret="aSYhuXHB0U__JcYxWmBU-qm5a5k"
token="ud25l_lGjq4PJPqxX_NXhSQHCbXWdZjq"
token_secret="n4jTgD1PzwiWCSVg-hHac8WHx8o"

keywords = "food"
location = "San jose, CA"

In [4]:
client = connect_API(consumer_key, consumer_secret, token, token_secret)

### Get all response from Yelp API

In [5]:
n = 0
businesses_list = []
for i in range(2000):
    try:
        businesses_list += get_businesses(client, n, keywords, location)
        n += 20
    except InvalidParameter:
        break

NameError: name 'InvalidParameter' is not defined

In [6]:
len(businesses_list)

1000

In [27]:
# for i in businesses_list:
#     print i.location.coordinate

### Create JSON data

In [26]:
SPACE = "  "

headers = ["yelp_id", "name", "website", "phone", "rating", "review_count",
           "address", "yelp_deals", "reviews", "photos"]
json = "{\n" + SPACE + '"headers": ["' + '", "'.join(headers) + '"],\n' \
       + SPACE + '"data": [\n'\
       + SPACE * 2 
ii = 0
for business in businesses_list[650:]:
    print ii, business.id
    try:
        str(business.name)
        yelp_id = business.id
        name = business.name
        website = business.url
        phone = business.phone
        rating = business.rating
        review_count = business.review_count
        address = None
        if len(business.location.address) > 0:
            address = business.location.address[0]
        yelp_deals = business.deals
        reviews, photos = get_data(yelp_id, name)
        for j in range(len(reviews)):
            json += '{\n'
            for i in headers:
                if i == "photos":
                    json += '"' + i + '":' + '"' + photos[j] + '" \n' 
                elif i == "reviews":
                    json += '"' + i + '":' + '"' + reviews[j].replace('"', '') + '", \n' 
                else:
                    json += '"' + i + '":"' + str(eval(i)) + '", \n'
            if ii == len(businesses_list[650:])-1:
                json +=  SPACE * 2 + '}\n'
            else:
                json +=  SPACE * 2 + '},\n'
        ii += 1
    except UnicodeError:
        pass
json += SPACE + "\n" + SPACE + "]" + "\n}"
# print json

0 yakiniq-sj-san-jose


IndexError: list index out of range

In [53]:
# json = getJson_business(businesses_list[193:])
# print json

In [35]:
# json = getJson_location(businesses_list[:1])

In [23]:
# save JSON file
f = open("business_san_jose1.json", "w")
f.write(json.encode("utf-8"))
f.close()

In [None]:
321 billys-hotdogs-san-jose
322 ono-hawaiian-bbq-san-jose
323 izzo-restaurant-san-jose
324 bread-box-san-jose
325 fireside-caffe-deli-and-gelato-san-jose-2