In [1]:
import numpy as np 
import pandas as pd 

import requests
import urllib
import random
import time
from bs4 import BeautifulSoup as bs
import re
import json

### Using Yelp API for scraping coffee-related shops

In [2]:
api_key = '' # your yelp api key

In [3]:
# API constants, you shouldn't have to change these.
API_HOST = 'https://api.yelp.com'
SEARCH_PATH = '/v3/businesses/search'
BUSINESS_PATH = '/v3/businesses/'  # Business ID will come after slash.

In [4]:
end_point = 'https://api.yelp.com/v3/businesses/search'
headers = {'Authorization': 'bearer %s' % api_key}

In [5]:
# coffee_places_sg = []
# for i in range(50):
#    parameters_sg = {'term':'boba',
#                  'limit':50,
#                  'radius':10000,
#                  'offset':i*50,
#                  'location':'Singapore'}

#    response_sg = requests.get(url=end_point,params=parameters_sg,headers=headers)
#    business_search_sg = response_sg.json()
    
#    try:
#        [biz for biz in business_search_sg['businesses']] 
#    except:
#        print("no more businesses to be scraped!")
#        break
#    coffee_places_sg.extend([biz for biz in business_search_sg['businesses']])
#    print("{} sg coffee places scraped!".format(len(coffee_places_sg)))
#    sleep_duration = random.randint(120,240)
#    print('Break for {} seconds'.format(sleep_duration))
#    time.sleep(sleep_duration)
    
# df = pd.DataFrame(coffee_places_sg)

In [6]:
# df.to_csv('sg_boba_shops.csv', index=False)

### Using beautiful soup to scrape the userid, ratings, text reviews, date etc

In [7]:
# select idx of coffee shops to scrape
START_IDX = 0
END_IDX = 100

In [8]:
df = pd.read_csv('../input/coffeeshops/sg_coffee_shops.csv')
aliases_sg = df['alias']
aliases_sg = aliases_sg[START_IDX:END_IDX]

In [9]:
aliases_sg = pd.read_csv('../input/coffeeshops/shops_norating2.csv')['shop'].values[START_IDX:END_IDX]

In [10]:
all_data = pd.DataFrame()
outlet_count = 0
count_increment = 10 # each page can have max 10 ratings/reviews
for alias in aliases_sg:
    count = 0
    
    while True:
        sg_url = "https://www.yelp.com/biz/{}?osq=coffee".format(alias) + "&start=" + str(count)
        print(sg_url)
        sg_store = requests.get(sg_url)
        sg_store_soup = bs(sg_store.content, 'lxml')
        
        # get users from tags in page; tag is in this format (<a class="css-1m051bw" href="/user_details?userid=9YLROKnskYk3OVZosiwC7A" role="link">John D.</a>)
        users = [(tag.attrs.get('href').replace('/user_details?userid=', ''), tag.contents[0]) \
                for tag in sg_store_soup.find_all('a',{'class':'css-1m051bw'}) \
                if tag.attrs.get('role') and 'user_details' in tag.attrs.get('href')]

        if not users:
            print(alias, "no reviews")
            outlet_count += 1
            print("{} stores out of {} stores done!".format(outlet_count,len(aliases_sg)))
            sleep_duration = random.randint(30,50)
            print("Resting {} seconds...".format(sleep_duration))
            print(" ")
            time.sleep(sleep_duration)
            break
            
        else:
            users = pd.DataFrame(users)
            users.columns = ['userid', 'author']
            
            # Create dictionary from script
            script = sg_store_soup.find_all('script',{'type':'application/json'})[0].contents[0]
            jsonStr = re.search(r'\{.*\}', str(script)).group()
            dct = json.loads(jsonStr)
            
            # extract review (userid, ratings, text etc) info from dict
            reviews = pd.DataFrame(dct['legacyProps']['bizDetailsProps']['bizDetailsPageProps']['reviewFeedQueryProps']['reviews'])
            reviews = reviews[['business', 'userId', 'user', 'rating', 'localizedDate', 'comment', 'photos']]
            reviews['business'] = reviews['business'].apply(lambda x: x['alias'])
            reviews['username'] = reviews['user'].apply(lambda x: x['markupDisplayName'])
            reviews['language'] = reviews['comment'].apply(lambda x: x['language'])
            reviews['text'] = reviews['comment'].apply(lambda x: x['text'])
            reviews.drop('comment', inplace=True, axis=1)
            reviews.columns = ['shop', 'userid', 'userinfo', 'rating', 'date', 'photos', 'username', 'language', 'text']
            
            all_data = pd.concat([all_data, reviews], axis=0)
            all_data.to_csv('sg_UIRRD.csv', index=False)
            
            count += count_increment

https://www.yelp.com/biz/tcc-the-connoisseur-concerto-singapore-13?osq=coffee&start=0
tcc-the-connoisseur-concerto-singapore-13 no reviews
1 stores out of 15 stores done!
Resting 31 seconds...
 
https://www.yelp.com/biz/cafe-nido-singapore?osq=coffee&start=0
cafe-nido-singapore no reviews
2 stores out of 15 stores done!
Resting 34 seconds...
 
https://www.yelp.com/biz/killiney-kopitiam-singapore-11?osq=coffee&start=0
https://www.yelp.com/biz/killiney-kopitiam-singapore-11?osq=coffee&start=10
killiney-kopitiam-singapore-11 no reviews
3 stores out of 15 stores done!
Resting 44 seconds...
 
https://www.yelp.com/biz/chocolate-mark-singapore?osq=coffee&start=0
chocolate-mark-singapore no reviews
4 stores out of 15 stores done!
Resting 30 seconds...
 
https://www.yelp.com/biz/starbucks-singapore-129?osq=coffee&start=0
https://www.yelp.com/biz/starbucks-singapore-129?osq=coffee&start=10
starbucks-singapore-129 no reviews
5 stores out of 15 stores done!
Resting 46 seconds...
 
https://www.yelp

In [11]:
all_data

Unnamed: 0,shop,userid,userinfo,rating,date,photos,username,language,text
0,killiney-kopitiam-singapore-11,EIwNqeBDMQ4Q6k0DQ30SyA,{'link': '/user_details?userid=EIwNqeBDMQ4Q6k0...,5,4/6/2018,[{'src': 'https://s3-media0.fl.yelpcdn.com/bph...,Hanh T.,en,We came here to try a traditional Singaporean ...
1,killiney-kopitiam-singapore-11,_I4S9ZWAoJ2PDippwT605g,{'link': '/user_details?userid=_I4S9ZWAoJ2PDip...,4,10/14/2017,[],Ityng H.,en,Delayed post. I visited this location back in ...
2,killiney-kopitiam-singapore-11,VxwCgpZGEPNQ7z9ZxFYntw,{'link': '/user_details?userid=VxwCgpZGEPNQ7z9...,4,12/2/2016,[],Nailesh J.,en,Excellent food and great value for money. They...
3,killiney-kopitiam-singapore-11,cb-E5IvWjrWEo-Qm2zxhwg,{'link': '/user_details?userid=cb-E5IvWjrWEo-Q...,4,9/18/2016,[{'src': 'https://s3-media0.fl.yelpcdn.com/bph...,Rob K.,en,This is a great locals place. Great for breakf...
4,killiney-kopitiam-singapore-11,UK2DsID5q3MCqYX23S6xXw,{'link': '/user_details?userid=UK2DsID5q3MCqYX...,4,3/20/2015,[],Patricia M.,en,I have to admit we didn&amp;#39;t really know ...
0,starbucks-singapore-129,Q1oMmm7tKPOzA7_gqcV4zg,{'link': '/user_details?userid=Q1oMmm7tKPOzA7_...,4,8/13/2014,[],Carolyn L.,en,I have heard this is the first Starbucks outle...
1,starbucks-singapore-129,41MtZiv6L0dB4S9NdnO6_g,{'link': '/user_details?userid=41MtZiv6L0dB4S9...,3,8/22/2013,[],Alexander L.,en,"Hey do you like university students? If not, w..."
2,starbucks-singapore-129,37veRneM-BBpXzSNw-jCGw,{'link': '/user_details?userid=37veRneM-BBpXzS...,4,8/19/2012,[],Jeremy H.,en,The Starbucks at Liat Towers is one of my favo...
3,starbucks-singapore-129,Xij-B5G19fXDZ9uYGPZAqg,{'link': '/user_details?userid=Xij-B5G19fXDZ9u...,4,11/28/2013,[],Pearly C.,en,Because it&amp;#39;s thanksgiving i decided to...
4,starbucks-singapore-129,Cd8OdDB6wJxurtAgU-IgHA,{'link': '/user_details?userid=Cd8OdDB6wJxurtA...,4,8/12/2012,[{'src': 'https://s3-media0.fl.yelpcdn.com/bph...,K R.,en,This is probably one of my favorite Starbucks ...


In [12]:
# all_data.to_csv('sg_UIRRD_{}-{}.csv'.format(START_IDX, END_IDX), index=False)
all_data.to_csv('sg_UIRRD_rescrape2_{}-{}.csv'.format(START_IDX, END_IDX), index=False)