In [1]:
# setup library imports
import io, time, json
import requests
import pandas as pd
import numpy as np

from pathlib import Path
from bs4 import BeautifulSoup

In [2]:
#Read the Yelp API key from file
def read_api_key(filepath="api_key.txt"):
    return Path(filepath).read_text().strip()

In [3]:
api_key = read_api_key()
api_key

'Y0vpAcCzpLY3l5VSChBzAcRpy-JrWmmaOenfUf-AGrC4lKtc79YDH503ZZSURFVGsAx_I1-Xo0T6YykBPmaOalvnGubVhpIH_K0kfIcWEh0FLftyNyUQ75MXaW0wYHYx'

In [4]:
#return the raw HTML for the specified URL
def retrieve_html(url):
    r = requests.get(url, auth=('user', 'pass'))   
    return (r.status_code, r.text)

In [5]:
#Make an authenticated request to the Yelp API.
def yelp_search(api_key, term, location):
    
    search_url = "https://api.yelp.com/v3/businesses/search"
    
    params = {"location": location, "term": term, "categories" : "restaurants"}
    
    headers = {
        'Authorization': 'Bearer %s' % api_key,
    }
    
    response = requests.get(search_url, params = params, headers = headers)
    
    data = json.loads(response.text)
    
    return (data["total"], data["businesses"])


In [6]:
(total, businesses) = yelp_search(api_key, "taco", "University District, Seattle")
total

156

In [7]:
#Retrieve ALL the restaurants on Yelp for a given query.
#returns the api response as a list of dictionaries
#max number of responses is 1000
def all_restaurants(api_key, term, location):
    search_url = "https://api.yelp.com/v3/businesses/search"
    
    params1 = {"location": location, "term": term, "categories" : "restaurants"}
    
    headers = {
        'Authorization': 'Bearer %s' % api_key,
    }
    
    response1 = requests.get(search_url, params = params1, headers = headers)   
    data1 = json.loads(response1.text)
    
    #Note that only 1000 records can be obtained at one time
    records_num = data1['total']    
    requests_num = records_num//20 + 1
    offset = 0
    result = []
    
    for i in range(requests_num):
        #20 restaurants each request
        curr_offset = offset + i*20
        params = {"location": location, "term": term, "offset": curr_offset, "categories" : "restaurants"}
        response = requests.get(search_url, params = params, headers = headers)
        data = json.loads(response.text)
        result += data["businesses"]
    #pause slightly between requests
        time.sleep(.300)
        
    return result

In [8]:
tacos = all_restaurants(api_key, "taco", "University District, Seattle")
type(tacos)

list

In [9]:
print(len(tacos))
tacos[6]

156


{'id': 'ZTFVwQaFVhp_4BU33xoclg',
 'alias': 'the-counter-at-old-ballard-catering-seattle',
 'name': 'The Counter At Old Ballard Catering',
 'image_url': 'https://s3-media3.fl.yelpcdn.com/bphoto/4pqkrTkLd8Fgn1FYieyB9g/o.jpg',
 'is_closed': False,
 'url': 'https://www.yelp.com/biz/the-counter-at-old-ballard-catering-seattle?adjust_creative=Yd84IPqpgzteXDQ2QE83uA&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=Yd84IPqpgzteXDQ2QE83uA',
 'review_count': 35,
 'categories': [{'alias': 'newamerican', 'title': 'American (New)'},
  {'alias': 'greek', 'title': 'Greek'},
  {'alias': 'tacos', 'title': 'Tacos'}],
 'rating': 5.0,
 'coordinates': {'latitude': 47.675679, 'longitude': -122.311566},
 'transactions': ['pickup', 'delivery'],
 'location': {'address1': '1505 NE 65th St',
  'address2': '',
  'address3': None,
  'city': 'Seattle',
  'zip_code': '98115',
  'country': 'US',
  'state': 'WA',
  'display_address': ['1505 NE 65th St', 'Seattle, WA 98115']},
 'phone': '+120625747

In [10]:
#parse the api response into a pandas dataframe
#api response is all the restaurants matched
def parse_api_response(api_response):
    
    df = pd.DataFrame(columns=('name','category','latitude','longitude','price','rating','url', 'review_count'))
    
    for i in range(len(api_response)): 
        
        restaurant = api_response[i]
        name = restaurant['name']
        category_list = []
        for category in restaurant['categories']:
            category_list.append(category['alias'])
        seperator = ','
        category_string = seperator.join(category_list)
        
        coordinates = restaurant['coordinates']
        latitude = coordinates['latitude']
        longitude = coordinates['longitude']
        
        if 'price' in restaurant:
            price = restaurant['price']
            price_indicator = price.count('$')
        else:
            price_indicator = 'NA'
        
        rating = restaurant['rating']
        url = restaurant['url']
        review_count = restaurant['review_count']
            
        df.loc[i] = [name, category_string, latitude, longitude, price_indicator, rating, url, review_count]
    
    return df

In [11]:
taco_restaurants_df = parse_api_response(tacos)
taco_restaurants_df

Unnamed: 0,name,category,latitude,longitude,price,rating,url,review_count
0,Guanaco's Tacos Pupuseria,"salvadoran,tacos,gluten_free",47.657141,-122.314029,2,4.0,https://www.yelp.com/biz/guanacos-tacos-pupuse...,337
1,Off the Rez,"foodtrucks,burgers,tacos",47.659920,-122.311825,1,4.0,https://www.yelp.com/biz/off-the-rez-seattle?a...,195
2,El Camion,"foodtrucks,mexican",47.661607,-122.287411,1,4.0,https://www.yelp.com/biz/el-camion-seattle-16?...,82
3,TNT Taqueria,mexican,47.661509,-122.332940,1,4.0,https://www.yelp.com/biz/tnt-taqueria-seattle?...,442
4,Agua Verde Cafe,"mexican,venues,breakfast_brunch",47.651610,-122.314410,2,3.5,https://www.yelp.com/biz/agua-verde-cafe-seatt...,995
...,...,...,...,...,...,...,...,...
151,Brouwer's Cafe,"modern_european,pubs,belgian",47.651630,-122.354230,2,4.0,https://www.yelp.com/biz/brouwers-cafe-seattle...,984
152,The Westy Roosevelt,"sportsbars,tradamerican",47.675610,-122.315020,2,4.0,https://www.yelp.com/biz/the-westy-roosevelt-s...,54
153,Revel,"korean,newamerican,cocktailbars",47.652033,-122.354123,2,3.5,https://www.yelp.com/biz/revel-seattle?adjust_...,1386
154,Ivar's Salmon House,"seafood,lounges,tradamerican",47.653620,-122.324040,2,3.5,https://www.yelp.com/biz/ivars-salmon-house-se...,1228


In [25]:
#Parse the reviews on a single page of a restaurant.
def parse_page(html):
    soup = BeautifulSoup(html, 'html.parser')
    #print(soup.prettify())
    review_soups = soup.find_all("script",type="application/ld+json")
    
    description_list = []
    for soup in review_soups:
        #bs4 uses .string, not .text
        text = soup.string
        #decode the json into python dict
        js_dict = json.loads(text)
        
        if 'review' in js_dict:
            review_list = js_dict['review']
        
            for i in range(len(review_list)):
                review_dict = review_list[i]
                description_list.append(review_dict['description'])

    return description_list

In [26]:
#retrieve ALL of the reviews for a single restaurant on Yelp.
#returns: reviews (list): list of dictionaries containing extracted review information
def extract_reviews(url, review_count):
    
    api_url = url + '%3Fstart%3D40'
    
    html_obj = retrieve_html(url)[1]
    
    review_list = parse_page(html_obj)
    
    result = review_list
    
    num_pages = review_count//20 + 1
    
    for i in range(1, num_pages):
        curr_offset = i*20
        curr_url = api_url + '&start=%d'%curr_offset
        
        curr_page_reviews = parse_page(retrieve_html(curr_url)[1])
        
        #print(curr_url)
        #print(curr_page_reviews)
        #print('\n\n\n')
        result += curr_page_reviews
    
    return result


In [27]:
test_url = taco_restaurants_df.loc[1]['url']
test_url

'https://www.yelp.com/biz/off-the-rez-seattle?adjust_creative=Yd84IPqpgzteXDQ2QE83uA&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=Yd84IPqpgzteXDQ2QE83uA'

In [28]:
test_reviews = extract_reviews(test_url, 82)

In [29]:
len(test_reviews)

100

In [30]:
test_reviews

['Exceptional service! It was fun watching him gently knead the dough and then dip it in the fryer. Husband had a bison taco and I had pulled pork and both were outstanding! He had the fry bread with their house made lemon curd and it was divine. 10 stars!',
 'Off the Rez is amazing!! 10/10 would always get the bison taco or something sweet!! The coffee is amazing also!',
 'This is interesting combo: Indian flatbread with contemporary Taco.  I ordered bison and chili choices and both are pretty good.  Indian Flatbread was fluffy, the meat and the topping mixed well, together was a great intriguing snack.  It was a small taco so you will need 2 for lunch/dinner. \nAlso the cafe is next to the museum so You can tell the entrance staff withiutbbiuyjng the ticket and park the lot in front of the museum and cafe.  Pay by mobile or the machine for parking fee (the day we visited, the machine was broken, we paid and extended hours by mobile app).  \nThe staffs were nice and the cafe was clean

In [31]:
len(list(set(test_reviews)))

20

In [32]:
taco_restaurants_df['url']

0      https://www.yelp.com/biz/guanacos-tacos-pupuse...
1      https://www.yelp.com/biz/off-the-rez-seattle?a...
2      https://www.yelp.com/biz/el-camion-seattle-16?...
3      https://www.yelp.com/biz/tnt-taqueria-seattle?...
4      https://www.yelp.com/biz/agua-verde-cafe-seatt...
                             ...                        
151    https://www.yelp.com/biz/brouwers-cafe-seattle...
152    https://www.yelp.com/biz/the-westy-roosevelt-s...
153    https://www.yelp.com/biz/revel-seattle?adjust_...
154    https://www.yelp.com/biz/ivars-salmon-house-se...
155    https://www.yelp.com/biz/royal-grinders-seattl...
Name: url, Length: 156, dtype: object

In [None]:
test_url = taco_restaurants_df.loc[155]['url']
test_url
test_reviews = extract_reviews(test_url, 629)

In [None]:
len(test_reviews)
len(list(set(test_reviews)))