# Yelp Hotels

**[Work in progress]**

This notebook queries the Yelp API for hotels on a grid of coordinates and writes two .csv files (businesses and their reviews) for ingestion into the Knowledge Graph.

In [1]:
import json
import os
import random as rnd
from pathlib import Path
from pprint import pprint

import folium
import pandas as pd
from yelpapi import YelpAPI

In [2]:
# Read the Yelp API key from a local file that is kept out of the notebook.
# The key is stripped because a trailing newline in the file would otherwise
# become part of the credential handed to the client.
with open('key.data', 'r') as key:
    api_key = key.read().strip()

yelp_api = YelpAPI(api_key)

In [3]:
def float_range(start, stop, step):
    """Yield floats from ``start`` (inclusive) up to ``stop`` (exclusive).

    Values are rounded to 6 decimal places, both before the comparison with
    ``stop`` and before being yielded.

    Args:
        start: First value of the sequence.
        stop: Exclusive upper bound.
        step: Positive increment between consecutive values.

    Yields:
        float: ``start + k * step`` rounded to 6 decimals, for k = 0, 1, 2, ...

    Raises:
        ValueError: If ``step`` is not positive (the sequence would never
            reach ``stop``). Raised when the generator is first advanced.
    """
    if step <= 0:
        raise ValueError('step must be positive, got {}'.format(step))

    index = 0
    while True:
        # Derive each value from the index instead of repeatedly adding
        # `step`, so floating point error does not accumulate.
        value = round(start + index * step, 6)
        if not value < stop:
            return
        yield float(value)
        index += 1

In [4]:
# Search term passed to the Yelp business search
term = 'hotel'

# Latitude sweep. 1 degree is approximately 111 km, so the 0.090 degree
# step is roughly 10 km.
# Smaller ranges tried earlier: 32.670 to 32.770 and 32.710 to 32.750.
start_latitude, stop_latitude, step_latitude = 32.550, 33.100, 0.090

# Longitude sweep. 1 degree is approximately 93 km here, so the 0.100 degree
# step is roughly 9.3 km.
# Smaller ranges tried earlier: -117.260 to -117.100 and -117.175 to -117.140.
start_longitude, stop_longitude, step_longitude = -117.260, -116.900, 0.100

# Search radius around every grid point: 5000 m
radius = 5000

# Yelp API paging: start offset and page size
offset, limit = 0, 50

In [5]:
# Rows of the grid are 1.3 latitude steps apart in both passes.
row_spacing = step_latitude * 1.3

# First pass: grid anchored at the configured start corner.
base_points = [
    (lat, long)
    for lat in float_range(start_latitude, stop_latitude, row_spacing)
    for long in float_range(start_longitude, stop_longitude, step_longitude)
]

# Second pass: same spacing, shifted by step_latitude/1.45 in latitude and
# half a step in longitude (presumably so the search circles of the two
# passes interleave and close the gaps between them).
shifted_points = [
    (lat, long)
    for lat in float_range(start_latitude + step_latitude/1.45, stop_latitude, row_spacing)
    for long in float_range(start_longitude + step_longitude/2, stop_longitude, step_longitude)
]

zone = base_points + shifted_points

print('Number of coordinates: {}'.format(len(zone)))

Number of coordinates: 40


In [6]:
# Approximate centre of the search area, used as the initial map position.
zone_center = [
    round((start_latitude + stop_latitude - step_latitude) / 2, 3),
    round((start_longitude + stop_longitude - step_longitude) / 2, 3)
]
print(zone_center)

# Build map
map_zone = folium.Map(location=zone_center, zoom_start=10, tiles='cartodbpositron', width=800, height=600)

# Outline the area spanned by the grid. folium.Rectangle is documented as
# taking two opposite corners, so pass the explicit bounding box of the grid
# points rather than the full list of points.
zone_latitudes = [lat for lat, _ in zone]
zone_longitudes = [lon for _, lon in zone]
zone_bounds = [
    (min(zone_latitudes), min(zone_longitudes)),
    (max(zone_latitudes), max(zone_longitudes)),
]
folium.Rectangle(zone_bounds, color='#0080bb', fill_color='#0080bb').add_to(map_zone)

# One dot per search centre plus a circle showing the area each query covers.
for point in zone:
    folium.CircleMarker(point, radius=1, color='#FF0000', fill_color='#0080bb').add_to(map_zone)
    folium.Circle(point, radius=radius, color='#FF0000', fill_color='#0080bb').add_to(map_zone)

map_zone

[32.78, -117.13]


In [7]:
def scrap_yelp(df_combined, latitude, longitude, radius):
    """Download all Yelp search results around one coordinate.

    Pages through ``yelp_api.search_query`` using the notebook-level ``term``
    and ``limit`` settings and appends the results to ``df_combined``.

    Args:
        df_combined: DataFrame of businesses collected so far, or ``None``
            when nothing has been collected yet. It is not modified.
        latitude: Latitude of the search centre.
        longitude: Longitude of the search centre.
        radius: Search radius passed to the API.

    Returns:
        pandas.DataFrame: A new frame holding the rows of ``df_combined``
        followed by the flattened businesses found here, with a fresh
        0..n-1 index. Empty when there was no input and nothing was found.
    """
    frames = [] if df_combined is None else [df_combined]
    scrapped = 0

    # Maximum 1000 businesses can be pulled: at most 20 pages are requested.
    for _ in range(20):
        # Log the offset that is actually sent, not a nominal page offset.
        print('\tScrapping offset: {}'.format(scrapped))
        response = yelp_api.search_query(term=term, latitude=latitude, longitude=longitude,
                                         limit=limit, radius=radius, offset=scrapped)
        businesses = response['businesses']
        if businesses:
            scrapped += len(businesses)
            frames.append(pd.json_normalize(businesses))

        # Stop once everything reported is fetched, and also when a page comes
        # back empty: repeating the same offset would only waste API calls.
        if not businesses or response['total'] <= scrapped:
            print('Completed scrapping. Total: {}'.format(response['total']))
            break

    if not frames:
        return pd.DataFrame()
    # Concatenate once at the end instead of growing the frame page by page.
    return pd.concat(frames, sort=False, ignore_index=True)

In [8]:
# Query every grid point in turn; scrap_yelp returns the accumulated frame,
# which is fed back in as the starting point for the next coordinate.
df_business = None
for coordinate in zone:
    latitude, longitude = coordinate
    print('Scrapping coordinate: ({}, {}). Radius: {}'.format(latitude, longitude, radius))
    df_business = scrap_yelp(df_business, latitude, longitude, radius)

Scrapping coordinate: (32.55, -117.26). Radius: 5000
	Scrapping offset: 0
Completed scrapping. Total: 0
Scrapping coordinate: (32.55, -117.16). Radius: 5000
	Scrapping offset: 0
Completed scrapping. Total: 10
Scrapping coordinate: (32.55, -117.06). Radius: 5000
	Scrapping offset: 0
	Scrapping offset: 50
Completed scrapping. Total: 68
Scrapping coordinate: (32.55, -116.96). Radius: 5000
	Scrapping offset: 0
Completed scrapping. Total: 29
Scrapping coordinate: (32.667, -117.26). Radius: 5000
	Scrapping offset: 0
Completed scrapping. Total: 2
Scrapping coordinate: (32.667, -117.16). Radius: 5000
	Scrapping offset: 0
	Scrapping offset: 50
Completed scrapping. Total: 73
Scrapping coordinate: (32.667, -117.06). Radius: 5000
	Scrapping offset: 0
Completed scrapping. Total: 38
Scrapping coordinate: (32.667, -116.96). Radius: 5000
	Scrapping offset: 0
Completed scrapping. Total: 3
Scrapping coordinate: (32.784, -117.26). Radius: 5000
	Scrapping offset: 0
	Scrapping offset: 50
Completed scrappin

In [9]:
# Number of rows collected across all grid points
len(df_business)

1065

In [12]:
# Keep the first occurrence of every Yelp business id.
df_business = df_business.drop_duplicates(subset='id', keep='first')

In [13]:
# Number of rows in df_business at this point in the run
len(df_business)

641

In [14]:
# Flattened (dotted) column names to rename: every '.' becomes '_'.
dotted_business_columns = [
    'coordinates.latitude',
    'coordinates.longitude',
    'location.address1',
    'location.address2',
    'location.address3',
    'location.city',
    'location.zip_code',
    'location.country',
    'location.state',
    'location.display_address',
]
columns = {name: name.replace('.', '_') for name in dotted_business_columns}
df_business_final = df_business.rename(columns=columns)

In [15]:
# Preview the first rows of the renamed business table
df_business_final.head()

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,transactions,...,coordinates_latitude,coordinates_longitude,location_address1,location_address2,location_address3,location_city,location_zip_code,location_country,location_state,location_display_address
0,QDL0yIR_Bo3q4gAEklGwqw,pier-south-resort-autograph-collection-san-diego,"Pier South Resort, Autograph Collection",https://s3-media2.fl.yelpcdn.com/bphoto/1qvrGs...,False,https://www.yelp.com/biz/pier-south-resort-aut...,207,"[{'alias': 'hotels', 'title': 'Hotels'}, {'ali...",3.5,[],...,32.581374,-117.13228,800 Seacoast Drive,,,San Diego,91932,US,CA,"[800 Seacoast Drive, San Diego, CA 91932]"
1,CKwcE4SjL8AiCyPGSHz6jg,hampton-inn-and-suites-imperial-beach-san-dieg...,Hampton Inn & Suites Imperial Beach San Diego,https://s3-media1.fl.yelpcdn.com/bphoto/p_Hgf1...,False,https://www.yelp.com/biz/hampton-inn-and-suite...,6,"[{'alias': 'hotels', 'title': 'Hotels'}]",4.5,[],...,32.583865,-117.116161,771 Palm Ave,,,Imperial Beach,91932,US,CA,"[771 Palm Ave, Imperial Beach, CA 91932]"
2,ZgwVcuvUSh-BXpQeNFU0wg,sand-castle-inn-and-suites-imperial-beach-2,Sand Castle Inn & Suites,https://s3-media1.fl.yelpcdn.com/bphoto/mFpNf6...,False,https://www.yelp.com/biz/sand-castle-inn-and-s...,9,"[{'alias': 'hotels', 'title': 'Hotels'}]",3.5,[],...,32.58192,-117.13146,785 Seacoast Dr,,,Imperial Beach,91932,US,CA,"[785 Seacoast Dr, Imperial Beach, CA 91932]"
3,wn_SsQj1PTMEipeRuPkhCA,hotel-jatay-tijuana,Hotel Jatay,https://s3-media2.fl.yelpcdn.com/bphoto/2KeYBK...,False,https://www.yelp.com/biz/hotel-jatay-tijuana?a...,7,"[{'alias': 'hotels', 'title': 'Hotels'}]",2.5,[],...,32.531237,-117.122986,Av Del Pacifico 570,,,Tijuana,22504,MX,BCN,"[Av Del Pacifico 570, 22504 Tijuana, Baja Cali..."
4,bkp166SyrH5PgGvxmHctkQ,hotel-martín-tijuana,Hotel Martín,https://s3-media1.fl.yelpcdn.com/bphoto/WQOLi8...,False,https://www.yelp.com/biz/hotel-mart%C3%ADn-tij...,1,"[{'alias': 'hotels', 'title': 'Hotels'}]",5.0,[],...,32.533617,-117.123087,"Av. Del Pacífico 4, Sección Monumental, Playas...",,,Tijuana,22504,MX,BCN,"[Av. Del Pacífico 4, Sección Monumental, Playa..."


In [17]:
# Let pandas open the file itself: a text handle opened without newline=''
# can produce blank lines between rows on Windows, and the platform default
# encoding may not be able to represent the non-ASCII business names.
df_business_final.to_csv('./../../data/yelp_hotel.csv', index=False, encoding='utf-8')

In [18]:
# Reload the business table from the file written above; the cells that
# follow work from this persisted copy.
df_csv = pd.read_csv(filepath_or_buffer='./../../data/yelp_hotel.csv')

In [19]:
# Latitude/longitude pairs only, without rows where either value is missing.
coordinate_columns = ['coordinates_latitude', 'coordinates_longitude']
df_coordinates = df_csv.loc[:, coordinate_columns].dropna()

In [20]:
# Build map: one small marker per business that has coordinates.
map_zone = folium.Map(location=zone_center, zoom_start=10, tiles='cartodbpositron', width=800, height=600)
# Iterate over plain (latitude, longitude) tuples; this avoids indexing a row
# with integer positions via [], which pandas has deprecated for Series with
# a non-integer index.
for business_latitude, business_longitude in df_coordinates.itertuples(index=False, name=None):
    folium.CircleMarker((business_latitude, business_longitude),
                        radius=1, color='#FF0000', fill_color='#0080bb').add_to(map_zone)
map_zone

In [21]:
# Number of businesses read back from the .csv file
len(df_csv)

641

In [32]:
# Download the reviews of every business.
#
# The cell is resumable: reviews already held in `df_review` (for example
# from a run that was interrupted) are kept, and businesses that already have
# rows there are not queried again. On a fresh kernel it starts from scratch.
# NOTE: a business that returned no reviews leaves no rows behind, so it is
# queried again on a resumed run.
df_review = globals().get('df_review')
fetched_business_ids = set() if df_review is None else set(df_review['business_id'])
review_frames = [] if df_review is None else [df_review]

try:
    for i, business_id in enumerate(df_csv['id']):
        if business_id not in fetched_business_ids:
            response = yelp_api.reviews_query(id=business_id)
            df = pd.json_normalize(response['reviews'])
            # Tag every review with the business it belongs to.
            df.insert(0, 'business_id', business_id)
            review_frames.append(df)

        if i % 100 == 0:
            print('Processing business #{}'.format(i + 1))
finally:
    # Keep what was downloaded even if a request fails part-way through, so
    # that re-running the cell continues instead of starting over.
    if review_frames:
        df_review = pd.concat(review_frames, sort=False, ignore_index=True)

Processing business #1
Processing business #101
Processing business #201
Processing business #301
Processing business #401
Processing business #501
Processing business #601


In [38]:
# Number of review rows.
# NOTE(review): the execution count of this cell (38) is higher than that of
# the de-duplication cell further down (37), so the value shown was probably
# computed after duplicates were dropped - confirm with a top-to-bottom run.
len(df_review)

1763

In [34]:
# Preview the last five review rows
df_review.tail(5)

Unnamed: 0,business_id,id,url,text,rating,time_created,user.id,user.profile_url,user.image_url,user.name
1758,2LsSOLbxHNgCnwbffv_LGQ,98oF1FUbtgbMPuNNjhvXmw,https://www.yelp.com/biz/the-ranch-at-bandy-ca...,I found the only perfect place to go during Co...,5.0,2020-09-13 07:29:20,ulQXGLfDXNuqd0yMavQ-Aw,https://www.yelp.com/user_details?userid=ulQXG...,,Elyse M.
1759,bUk83zR-0jTXfGVtSfjVMw,4VM2Zhw_8bXxQp3pFgF9nQ,https://www.yelp.com/biz/ramona-valley-inn-ram...,This is your basic sleep over room environment...,3.0,2019-02-13 18:58:11,KgvIuqrFDOUgJqQxI9wDuQ,https://www.yelp.com/user_details?userid=KgvIu...,https://s3-media1.fl.yelpcdn.com/photo/nM6B_tK...,Steve H.
1760,bUk83zR-0jTXfGVtSfjVMw,0VirC5TGJ_zd0JZy53MwUw,https://www.yelp.com/biz/ramona-valley-inn-ram...,I discussed each issue with the hotel. Parking...,2.0,2020-09-23 15:13:10,9XCGBvNM-RuvqPV6Yf34cA,https://www.yelp.com/user_details?userid=9XCGB...,https://s3-media2.fl.yelpcdn.com/photo/c7lzFMM...,Mark H.
1761,bUk83zR-0jTXfGVtSfjVMw,p5aXqmWDQyxplv3nlQuYWw,https://www.yelp.com/biz/ramona-valley-inn-ram...,"If I was able to give ""zero stars"" I would. W...",1.0,2020-10-24 11:31:46,Lstey-hLLBRpgqqnHOq2Xw,https://www.yelp.com/user_details?userid=Lstey...,,Eric S.
1762,okcun7M5291OfhEl8Y9_PQ,jM2EzALwawhjgpaiftCMnw,https://www.yelp.com/biz/old-ramona-hotel-gall...,"First off, this place is the best kept secret ...",5.0,2019-08-29 14:20:19,ZuQQUZuqO9xrhMDOMKxsgw,https://www.yelp.com/user_details?userid=ZuQQU...,https://s3-media4.fl.yelpcdn.com/photo/k_Vt7Jj...,Need T.


In [37]:
# Keep the first occurrence of every review id.
df_review = df_review.drop_duplicates(subset='id', keep='first')

In [39]:
# Flattened (dotted) user columns to rename: every '.' becomes '_'.
dotted_review_columns = [
    'user.id',
    'user.profile_url',
    'user.image_url',
    'user.name',
]
columns = {name: name.replace('.', '_') for name in dotted_review_columns}
df_review_final = df_review.rename(columns=columns)

In [41]:
# Let pandas open the file itself: a text handle opened without newline=''
# can produce blank lines between rows on Windows, and the platform default
# encoding may not be able to represent non-ASCII review text.
df_review_final.to_csv('./../../data/yelp_hotel_review.csv', index=False, encoding='utf-8')