In [1]:
# setup library imports
import io, time, json
import requests
import pandas as pd
import numpy as np

from pathlib import Path
from bs4 import BeautifulSoup

In [2]:
def read_api_key(filepath="api_key.txt"):
    """
    Read the Yelp API Key from file.
    
    Args:
        filepath (string): File containing API Key
    Returns:
        api_key (string): The API Key
    """
    
    return Path(filepath).read_text().strip()

In [3]:
api_key = read_api_key()
api_key

'Y0vpAcCzpLY3l5VSChBzAcRpy-JrWmmaOenfUf-AGrC4lKtc79YDH503ZZSURFVGsAx_I1-Xo0T6YykBPmaOalvnGubVhpIH_K0kfIcWEh0FLftyNyUQ75MXaW0wYHYx'

In [4]:
def retrieve_html(url):
    """
    Return the raw HTML at the specified URL.

    Args:
        url (string): 

    Returns:
        status_code (integer):
        raw_html (string): the raw HTML content of the response, properly encoded according to the HTTP headers.
    """
    r = requests.get(url, auth=('user', 'pass'))   
    return (r.status_code, r.text)

In [5]:
#retrieve_html('https://www.yelp.com/biz/man-vs-fries-seattle-2?osq=Tacos')

In [7]:
def yelp_search(api_key, term, location):
    """
    Make an authenticated request to the Yelp API.

    Args:
        query (string): Search term

    Returns:
        total (integer): total number of businesses on Yelp corresponding to the query
        businesses (list): list of dicts representing each business
    """
    
    search_url = "https://api.yelp.com/v3/businesses/search"
    
    params = {"location": location, "term": term, "categories" : "restaurants"}
    
    headers = {
        'Authorization': 'Bearer %s' % api_key,
    }
    
    response = requests.get(search_url, params = params, headers = headers)
    
    data = json.loads(response.text)
    
    return (data["total"], data["businesses"])


In [8]:
(total, businesses) = yelp_search(api_key, "taco", "University District, Seattle")
total

156

In [9]:
#returns the api response as a list of dictionaries
#max number of responses is 1000
def all_restaurants(api_key, term, location):
    """
    Retrieve ALL the restaurants on Yelp for a given query.

    Args:
        term: keyword
        location: location to search

    Returns:
        results (list): list of dicts representing each business
    """
 
    search_url = "https://api.yelp.com/v3/businesses/search"
    
    params1 = {"location": location, "term": term, "categories" : "restaurants"}
    
    headers = {
        'Authorization': 'Bearer %s' % api_key,
    }
    
    response1 = requests.get(search_url, params = params1, headers = headers)   
    data1 = json.loads(response1.text)
    
    #Note that only 1000 records can be obtained at one time
    records_num = data1['total']
    
    requests_num = records_num//20 + 1
    offset = 0
    result = []
    
    for i in range(requests_num):
        
        #20 restaurants each request
        curr_offset = offset + i*20
        params = {"location": location, "term": term, "offset": curr_offset, "categories" : "restaurants"}
        response = requests.get(search_url, params = params, headers = headers)
        
        data = json.loads(response.text)
        
        #print(len(data['businesses']))
        
        #for x in data['businesses']:
            #print(x['name'])
        result += data["businesses"]
    #pause slightly between requests
        time.sleep(.300)
        
    return result

#data = all_restaurants(read_api_key(), 'Polish Hill, Pittsburgh')
#print(len(data))
#print([x['name'] for x in data])

In [10]:
tacos = all_restaurants(api_key, "taco", "University District, Seattle")
type(tacos)

list

In [11]:
print(len(tacos))
tacos[6]

156


{'id': 'Dz2iqMDZyGA50g9VXT89QA',
 'alias': 'mas-cafe-seattle',
 'name': 'Mas Cafe',
 'image_url': 'https://s3-media4.fl.yelpcdn.com/bphoto/QYhm9DjgQZijEWA5fCwOKQ/o.jpg',
 'is_closed': False,
 'url': 'https://www.yelp.com/biz/mas-cafe-seattle?adjust_creative=Yd84IPqpgzteXDQ2QE83uA&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=Yd84IPqpgzteXDQ2QE83uA',
 'review_count': 16,
 'categories': [{'alias': 'coffee', 'title': 'Coffee & Tea'},
  {'alias': 'mexican', 'title': 'Mexican'},
  {'alias': 'breakfast_brunch', 'title': 'Breakfast & Brunch'}],
 'rating': 4.5,
 'coordinates': {'latitude': 47.648195, 'longitude': -122.334589},
 'transactions': ['delivery', 'pickup'],
 'location': {'address1': '1906 N 34th St',
  'address2': '',
  'address3': None,
  'city': 'Seattle',
  'zip_code': '98103',
  'country': 'US',
  'state': 'WA',
  'display_address': ['1906 N 34th St', 'Seattle, WA 98103']},
 'phone': '+12064204623',
 'display_phone': '(206) 420-4623',
 'distance': 2633.712

In [12]:
#parse the api response into a pandas dataframe
#api response is all the restaurants matched
def parse_api_response(api_response):
    
    df = pd.DataFrame(columns=('name','category','latitude','longitude','price','rating','url', 'review_count'))
    
    for i in range(len(api_response)): 
        
        restaurant = api_response[i]
        name = restaurant['name']
        category_list = []
        for category in restaurant['categories']:
            category_list.append(category['alias'])
        seperator = ','
        category_string = seperator.join(category_list)
        
        coordinates = restaurant['coordinates']
        latitude = coordinates['latitude']
        longitude = coordinates['longitude']
        
        if 'price' in restaurant:
            price = restaurant['price']
            price_indicator = price.count('$')
        else:
            price_indicator = 'NA'
        
        rating = restaurant['rating']
        url = restaurant['url']
        review_count = restaurant['review_count']
            
        df.loc[i] = [name, category_string, latitude, longitude, price_indicator, rating, url, review_count]
    
    return df

In [13]:
taco_restaurants_df = parse_api_response(tacos)
taco_restaurants_df

Unnamed: 0,name,category,latitude,longitude,price,rating,url,review_count
0,Off the Rez,"foodtrucks,burgers,tacos",47.659920,-122.311825,1,4.0,https://www.yelp.com/biz/off-the-rez-seattle?a...,195
1,El Camion,"foodtrucks,mexican",47.661607,-122.287411,1,4.0,https://www.yelp.com/biz/el-camion-seattle-16?...,82
2,Guanaco's Tacos Pupuseria,"salvadoran,tacos,gluten_free",47.657141,-122.314029,2,4.0,https://www.yelp.com/biz/guanacos-tacos-pupuse...,337
3,Rancho Bravo Tacos,"mexican,foodtrucks",47.661190,-122.326510,1,4.0,https://www.yelp.com/biz/rancho-bravo-tacos-se...,465
4,TNT Taqueria,mexican,47.661509,-122.332940,1,4.0,https://www.yelp.com/biz/tnt-taqueria-seattle?...,442
...,...,...,...,...,...,...,...,...
151,The Lodge Sports Grille,"sportsbars,tradamerican",47.690728,-122.355599,2,2.5,https://www.yelp.com/biz/the-lodge-sports-gril...,198
152,The Westy Roosevelt,"sportsbars,tradamerican",47.675610,-122.315020,2,4.0,https://www.yelp.com/biz/the-westy-roosevelt-s...,54
153,Revel,"korean,newamerican,cocktailbars",47.652033,-122.354123,2,3.5,https://www.yelp.com/biz/revel-seattle?adjust_...,1385
154,Ivar's Salmon House,"seafood,lounges,tradamerican",47.653620,-122.324040,2,3.5,https://www.yelp.com/biz/ivars-salmon-house-se...,1228


In [14]:
def parse_page(html):
    """
    Parse the reviews on a single page of a restaurant.
    
    Args:
        html (string): String of HTML corresponding to a Yelp restaurant

    Returns:
        tuple(list, string): a tuple of two elements
            first element: list of dictionaries corresponding to the extracted review information
            second element: URL for the next page of reviews (or None if it is the last page)
    """
    
    soup = BeautifulSoup(html, 'html.parser')
    
    review_soups = soup.find_all("script",type="application/ld+json")
    
    description_list = []
    for soup in review_soups:
        #bs4 uses .string, not .text
        text = soup.string
        #decode the json into python dict
        js_dict = json.loads(text)
        
        #print(js_dict)
        
        if 'review' in js_dict:
            review_list = js_dict['review']
        
            for i in range(len(review_list)):
                review_dict = review_list[i]
                description_list.append(review_dict['description'])

    return description_list

In [15]:
def extract_reviews(url, review_count):
    """
    Retrieve ALL of the reviews for a single restaurant on Yelp.

    Parameters:
        url (string): Yelp URL corresponding to the restaurant of interest.

    Returns:
        reviews (list): list of dictionaries containing extracted review information
    """
    
    html_obj = retrieve_html(url)[1]
    
    review_list = parse_page(html_obj)
    
    result = review_list
    
    num_pages = review_count//20 + 1
    
    for i in range(1, num_pages):
        curr_offset = i*20
        curr_url = url + '?start=%d'%curr_offset
        curr_page_reviews = parse_page(retrieve_html(curr_url)[1])
        
        result += curr_page_reviews
    
    return result


In [32]:
test_url = taco_restaurants_df.loc[1]['url']
test_url

'https://www.yelp.com/biz/el-camion-seattle-16?adjust_creative=Yd84IPqpgzteXDQ2QE83uA&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=Yd84IPqpgzteXDQ2QE83uA'

In [37]:
#test_reviews = extract_reviews(test_url, 82)
#Note that the api data is probabily outdated so we have more reviews than review_count!
test_reviews = extract_reviews(test_url, 150)

In [38]:
len(test_reviews)

140

In [39]:
test_reviews

['Note that El Camion IS OPEN during COVID for TAKEOUT, DELIVERY, &amp; DINE OUTSIDE!\n\nWow wow wow idk why I never came here in college, but El Camion is definitely my new favorite spot for Mexican food in U-District!\n\nMy boyfriend and I came here yesterday for dinner and were pleased by the quality of food and quality of service here. They&apos;re menu is expansive and includes everything from tacos to mole to burritos to camarones a la diabla (spicy shrimp). I ordered a chicken burrito ($9.42) and my boyfriend ordered chicken enchiladas ($12.75) to go. \n\nThe chicken burrito weighed at least a pound and was stuffed with moist chicken, cheese, cilantro, rice, black beans, sour cream, and tomatoes. I poured some salsa verde and salsa rojo on top, and it was BOMB. Every bite was just bursting with flavor! I was a happy camper with lots of yummy leftovers for breakfast the next day. Next time, I hope to order the same burrito smothered in verde sauce (+$1.50). My boyfriend also enjo