# DATA515 Yelp Reviews

## Data Collection Demo
Example of data collection using both the Yelp Fusion API and a web scraper.

Note that all the functions in this notebook are modularized into Python files in the `data_collection` folder!

This is just some of our early work!

In [1]:
# setup library imports
import io, time, json
import requests
import pandas as pd
import numpy as np

from pathlib import Path
from bs4 import BeautifulSoup

In [2]:
def read_api_key(filepath="api_key.txt"):
    """Read the Yelp API key from a text file.

    Args:
        filepath: Path of the file holding the key (defaults to
            ``api_key.txt`` in the current working directory).

    Returns:
        The file's text with leading and trailing whitespace removed.
    """
    key_file = Path(filepath)
    raw_text = key_file.read_text()
    return raw_text.strip()

In [3]:
api_key = read_api_key()
# Deliberately do NOT echo `api_key` here: cell outputs are saved inside the
# notebook file, so displaying the key would leak the credential to anyone who
# can read the notebook. Only confirm that a non-empty key was loaded.
bool(api_key)

'<REDACTED: API key removed from saved output; the exposed key should be rotated>'

In [4]:
def retrieve_html(url, timeout=10):
    """Fetch a URL and return its HTTP status code and body text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A ``(status_code, text)`` tuple taken from the response.
    """
    # No `auth=` argument: the pages fetched here are public, and sending
    # placeholder Basic-Auth credentials to arbitrary sites is both pointless
    # and a bad habit.
    response = requests.get(url, timeout=timeout)
    return (response.status_code, response.text)

In [5]:
def yelp_search(api_key, term, location):
    """Make an authenticated restaurant search request to the Yelp API.

    Args:
        api_key: Yelp API key, sent as a Bearer token.
        term: Search term (e.g. ``"taco"``).
        location: Free-text location to search around.

    Returns:
        A ``(total, businesses)`` tuple: the ``"total"`` field of the response
        and the list of business dicts from its ``"businesses"`` field (only
        the first page of results).

    Raises:
        requests.HTTPError: If the API answers with an error status (for
            example an invalid key), instead of failing later with an
            unrelated ``KeyError`` on the error payload.
    """
    search_url = "https://api.yelp.com/v3/businesses/search"
    params = {"location": location, "term": term, "categories": "restaurants"}
    headers = {"Authorization": "Bearer %s" % api_key}

    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(search_url, params=params, headers=headers, timeout=10)
    response.raise_for_status()

    data = response.json()
    return (data["total"], data["businesses"])


In [6]:
# Quick check of the search helper: how many restaurants match "taco" nearby?
total, businesses = yelp_search(
    api_key,
    term="taco",
    location="University District, Seattle",
)
total

156

In [7]:
def all_restaurants(api_key, term, location):
    """Retrieve all the restaurants Yelp will return for a given query.

    Pages through the business-search endpoint 20 results at a time. The first
    response is used both to learn the total number of matches and as the
    first page of results, so no page is requested twice.

    Args:
        api_key: Yelp API key, sent as a Bearer token.
        term: Search term (e.g. ``"taco"``).
        location: Free-text location to search around.

    Returns:
        A list of business dicts exactly as returned by the API, capped at the
        first 1000 matches (the maximum number of records the API serves for
        one search).

    Raises:
        requests.HTTPError: If any request is answered with an error status.
    """
    search_url = "https://api.yelp.com/v3/businesses/search"
    headers = {"Authorization": "Bearer %s" % api_key}
    page_size = 20       # businesses per request
    max_results = 1000   # records beyond this cannot be paged to

    def fetch_page(offset):
        # Fetch one page of results starting at `offset` and decode its JSON.
        params = {
            "location": location,
            "term": term,
            "categories": "restaurants",
            "limit": page_size,
            "offset": offset,
        }
        response = requests.get(search_url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        return response.json()

    first_page = fetch_page(0)
    reachable = min(first_page["total"], max_results)
    result = list(first_page["businesses"])

    # Remaining pages: offsets 20, 40, ... strictly below the reachable total,
    # so exact multiples of the page size do not trigger an empty extra request.
    for offset in range(page_size, reachable, page_size):
        # pause slightly between requests
        time.sleep(0.3)
        businesses = fetch_page(offset)["businesses"]
        if not businesses:
            # The API ran out of results earlier than `total` suggested.
            break
        result += businesses

    return result

In [8]:
# Collect every page of results for the same query used above.
tacos = all_restaurants(
    api_key,
    term="taco",
    location="University District, Seattle",
)
type(tacos)

list

In [9]:
# Number of business records collected across all pages.
print(len(tacos))
# Display one raw record to show the fields the API returns per business.
tacos[6]

156


{'id': 'nwrKWcWG1_g8nyUaaUwqlA',
 'alias': 'tacos-chukis-seattle-5',
 'name': 'Tacos Chukis',
 'image_url': 'https://s3-media4.fl.yelpcdn.com/bphoto/huHHXRboy1-VPKK376t8zw/o.jpg',
 'is_closed': False,
 'url': 'https://www.yelp.com/biz/tacos-chukis-seattle-5?adjust_creative=Yd84IPqpgzteXDQ2QE83uA&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=Yd84IPqpgzteXDQ2QE83uA',
 'review_count': 410,
 'categories': [{'alias': 'tacos', 'title': 'Tacos'},
  {'alias': 'sandwiches', 'title': 'Sandwiches'}],
 'rating': 4.5,
 'coordinates': {'latitude': 47.6270336126469, 'longitude': -122.342103158844},
 'transactions': ['delivery'],
 'price': '$',
 'location': {'address1': '832 Dexter Ave N',
  'address2': '',
  'address3': None,
  'city': 'Seattle',
  'zip_code': '98109',
  'country': 'US',
  'state': 'WA',
  'display_address': ['832 Dexter Ave N', 'Seattle, WA 98109']},
 'phone': '+12065189025',
 'display_phone': '(206) 518-9025',
 'distance': 4693.45484203039}

In [10]:
def parse_api_response(api_response):
    """Parse the API response (all matched restaurants) into a DataFrame.

    Args:
        api_response: List of business dicts. Each must provide ``name``,
            ``categories`` (a list of dicts with an ``alias`` key),
            ``coordinates`` (``latitude``/``longitude``), ``rating``, ``url``
            and ``review_count``; ``price`` is optional.

    Returns:
        A DataFrame with one row per business, indexed 0..n-1, with columns
        ``name``, ``category`` (comma-joined category aliases), ``latitude``,
        ``longitude``, ``price`` (number of ``$`` signs, or the string
        ``'NA'`` when the business has no price), ``rating``, ``url`` and
        ``review_count``. An empty input yields an empty frame with the same
        columns.
    """
    columns = ['name', 'category', 'latitude', 'longitude',
               'price', 'rating', 'url', 'review_count']

    rows = []
    for restaurant in api_response:
        category_string = ','.join(
            category['alias'] for category in restaurant['categories']
        )
        coordinates = restaurant['coordinates']

        # Not every business reports a price tier; keep the 'NA' marker that
        # downstream code already expects for those.
        if 'price' in restaurant:
            price_indicator = restaurant['price'].count('$')
        else:
            price_indicator = 'NA'

        rows.append([
            restaurant['name'],
            category_string,
            coordinates['latitude'],
            coordinates['longitude'],
            price_indicator,
            restaurant['rating'],
            restaurant['url'],
            restaurant['review_count'],
        ])

    # Build the frame once instead of enlarging it row by row with
    # `df.loc[i] = ...`, which reallocates on every insertion.
    return pd.DataFrame(rows, columns=columns)

In [11]:
# Flatten the raw API records into a table with one row per restaurant.
taco_restaurants_df = parse_api_response(api_response=tacos)
taco_restaurants_df

Unnamed: 0,name,category,latitude,longitude,price,rating,url,review_count
0,El Camion,"foodtrucks,mexican",47.661607,-122.287411,1,4.0,https://www.yelp.com/biz/el-camion-seattle-16?...,83
1,Off the Rez,"foodtrucks,burgers,tacos",47.659920,-122.311825,1,4.0,https://www.yelp.com/biz/off-the-rez-seattle?a...,195
2,Guanaco's Tacos Pupuseria,"salvadoran,tacos",47.657141,-122.314029,2,4.0,https://www.yelp.com/biz/guanacos-tacos-pupuse...,338
3,Rancho Bravo Tacos,"mexican,foodtrucks",47.661190,-122.326510,1,4.0,https://www.yelp.com/biz/rancho-bravo-tacos-se...,465
4,TNT Taqueria,mexican,47.661509,-122.332940,1,4.0,https://www.yelp.com/biz/tnt-taqueria-seattle?...,446
...,...,...,...,...,...,...,...,...
151,Portage Bay Cafe - Roosevelt,"newamerican,breakfast_brunch,cafes",47.657570,-122.317600,2,4.0,https://www.yelp.com/biz/portage-bay-cafe-roos...,2426
152,The Lodge Sports Grille,"sportsbars,tradamerican",47.690728,-122.355599,2,2.5,https://www.yelp.com/biz/the-lodge-sports-gril...,198
153,Ivar's Salmon House,"seafood,lounges,tradamerican",47.653620,-122.324040,2,3.5,https://www.yelp.com/biz/ivars-salmon-house-se...,1228
154,Revel,"korean,newamerican,cocktailbars",47.652033,-122.354123,2,3.5,https://www.yelp.com/biz/revel-seattle?adjust_...,1385


In [12]:
def parse_page(html):
    """Parse the reviews on a single page of a restaurant.

    Looks at every ``<script type="application/ld+json">`` element, decodes
    its JSON, and collects the ``description`` of each entry found under a
    top-level ``review`` key.

    Args:
        html: Raw HTML of the page.

    Returns:
        A list of review description values in document order; empty when the
        page carries no review data.
    """
    page = BeautifulSoup(html, 'html.parser')

    description_list = []
    for script in page.find_all("script", type="application/ld+json"):
        # bs4 uses .string, not .text; it is None for an empty tag or a tag
        # with several children, which json.loads cannot accept.
        raw_json = script.string
        if not raw_json:
            continue

        try:
            payload = json.loads(raw_json)
        except json.JSONDecodeError:
            # Best-effort scraping: one malformed metadata block should not
            # discard the reviews found in the others.
            continue

        # A JSON-LD block may also be a list of objects; only a dict can carry
        # the top-level 'review' key this parser reads.
        if not isinstance(payload, dict):
            continue

        reviews = payload.get('review') or []
        if isinstance(reviews, dict):
            # A single review may be given as one object instead of a list.
            reviews = [reviews]

        for review in reviews:
            if isinstance(review, dict) and 'description' in review:
                description_list.append(review['description'])

    return description_list

In [13]:
def extract_reviews(url, max_pages=1):
    """Retrieve review texts for a single restaurant page.

    By default only the page at ``url`` is fetched, which is what this
    function has always done. Pass a larger ``max_pages`` (or ``None`` for no
    limit) to follow further review pages by adding a ``start`` query
    parameter equal to the number of reviews collected so far.

    Args:
        url: Address of the restaurant page.
        max_pages: Maximum number of pages to request; ``None`` keeps going
            until a page yields no reviews.

    Returns:
        A list of review descriptions (the values produced by ``parse_page``),
        in the order they were encountered.
    """
    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    def page_url(start):
        # The first page is requested exactly as given; later pages get a
        # `start` parameter merged into any query string the URL already has.
        if start == 0:
            return url
        parts = urlsplit(url)
        query = [(key, value)
                 for key, value in parse_qsl(parts.query, keep_blank_values=True)
                 if key != "start"]
        query.append(("start", str(start)))
        return urlunsplit(parts._replace(query=urlencode(query)))

    reviews = []
    previous_page = None
    pages_fetched = 0
    while max_pages is None or pages_fetched < max_pages:
        if pages_fetched:
            # pause slightly between requests
            time.sleep(0.3)

        status_code, html = retrieve_html(page_url(len(reviews)))
        pages_fetched += 1
        if status_code != 200:
            # Do not try to scrape reviews out of an error page.
            break

        page_reviews = parse_page(html)
        # Stop on an empty page, or when the site ignores `start` and serves
        # the same page again (which would otherwise loop forever).
        if not page_reviews or page_reviews == previous_page:
            break

        reviews.extend(page_reviews)
        previous_page = page_reviews

    return reviews
