# KE5204 New Media & Sentiment Mining - Web Scraping of Yelp Restaurant Reviews

## Load libraries

In [1]:
import requests
from bs4 import BeautifulSoup as soup
import re
import json
import os

## Explore the standard training set

In [10]:
# Load the reference review file. JSON text is UTF-8 by specification, so the
# encoding is passed explicitly instead of relying on the platform default.
with open(os.path.join('data', 'yelp_2.json'), encoding='utf-8') as read_file:
    std_reviews = json.load(read_file)

In [12]:
std_reviews.keys()

dict_keys(['0', '20'])

In [14]:
std_reviews['0'].keys()

dict_keys(['aggregateRating', 'review', 'servesCuisine', 'priceRange', 'name', 'address', '@context', 'image', '@type', 'telephone'])

In [15]:
std_reviews['0']['aggregateRating']

{'@type': 'AggregateRating', 'ratingValue': 4.5, 'reviewCount': 36}

In [17]:
std_reviews['0']['review'][0]

{'author': 'Tanmay A.',
 'datePublished': '2017-05-19',
 'description': 'The service at Sushi Bar was great with an extremely polite and conscientious staff. Decor of the location was very intimate but also great for larger groups.\n\nCame here on my birthday and ordered tuna sashimi (comes with three large pieces of raw fish), crab stick roll, and the salmon aburi. All of the sushi tasted fresh and was of high quality for a fair price. They even put a little candle on the sushi, so I kinda had to give it 5 stars.\n\nWould definitely recommend!',
 'reviewRating': {'ratingValue': 5}}

In [19]:
std_reviews['0']['servesCuisine']

'Sushi Bars'

In [20]:
std_reviews['0']['priceRange']

'SGD16-30'

In [21]:
std_reviews['0']['name']

'The Sushi Bar'

In [22]:
std_reviews['0']['address']

{'addressCountry': 'SG',
 'addressLocality': 'Singapore',
 'addressRegion': None,
 'postalCode': '228213',
 'streetAddress': '14 Scotts Road\n#04-28, Far East Plaza'}

In [23]:
std_reviews['0']['@context']

'http://schema.org/'

In [24]:
std_reviews['0']['image']

'https://s3-media1.fl.yelpcdn.com/bphoto/Yxmi8g683MnWaOv82YWaDw/ls.jpg'

In [27]:
std_reviews['0']['@type']

'Restaurant'

In [26]:
std_reviews['0']['telephone']

'+6596536464'

## Scrape the Reviews for 1 Restaurant

In [2]:
def removeIndent(phrase):
    """Return ``phrase`` with each newline, carriage return and tab replaced by a space.

    Every matched character is swapped one-for-one, so runs of these
    characters are not collapsed into a single space.
    """
    return re.sub(r"[\n\r\t]", ' ', phrase)

In [83]:
yelp_url = "https://www.yelp.com/biz/din-tai-fung-singapore-4"

In [84]:
# Download the restaurant page. A timeout keeps the cell from hanging forever
# on a stalled connection, and raise_for_status() stops an HTTP error page
# from being parsed as if it were the restaurant page.
url_content = requests.get(yelp_url, timeout=30)
url_content.raise_for_status()
page = url_content.content.decode('utf-8','ignore')
soup_page = soup(page, 'html.parser')
# Collect every JSON-LD <script> tag on the page.
data = soup_page.find_all("script", type="application/ld+json")

In [87]:
len(data)

4

In [88]:
data[0].text

'            {"@context": "http://schema.org", "itemListElement": [{"position": 1, "@type": "ListItem", "item": {"url": "/c/singapore/restaurants", "name": "Restaurants"}}, {"position": 2, "@type": "ListItem", "item": {"url": "/c/singapore/chinese", "name": "Chinese"}}, {"position": 3, "@type": "ListItem", "item": {"url": "/c/singapore/shanghainese", "name": "Shanghainese"}}], "@type": "BreadcrumbList"}\n'

There are multiple `<script>` tags, of which only one contains the reviews.

In [89]:
# Trim surrounding whitespace from each script's text and replace embedded
# newlines, carriage returns and tabs with spaces.
reviews = [removeIndent(script_tag.text.strip()) for script_tag in data]

In [90]:
# Search for the contents of the script tag which has the review data. The review data has the key aggregateRating.
for raw_json in reviews:
    jsondata = json.loads(raw_json)
    if 'aggregateRating' in jsondata:
        break
else:
    # No payload matched (or there were no scripts at all). Fail loudly rather
    # than leaving jsondata bound to whichever script was parsed last.
    raise ValueError("No JSON-LD script containing 'aggregateRating' was found.")

In [91]:
jsondata

{'@context': 'http://schema.org/',
 '@type': 'Restaurant',
 'address': {'addressCountry': 'SG',
  'addressLocality': 'Singapore',
  'addressRegion': None,
  'postalCode': '179103',
  'streetAddress': 'Raffles City Shopping Centre\n252 North Bridge Road, #B1-08'},
 'aggregateRating': {'@type': 'AggregateRating',
  'ratingValue': 4.0,
  'reviewCount': 45},
 'image': 'https://s3-media4.fl.yelpcdn.com/bphoto/Gyo3iSEL9XCTPy2XcTNSGg/ls.jpg',
 'name': 'Din Tai Fung',
 'priceRange': 'SGD16-30',
 'review': [{'author': 'Margaux L.',
   'datePublished': '2015-10-22',
   'description': 'Before traveling to Singapore for work, I checked out some Yelp reviews to see what the restaurants close to my hotel offered. I\'m glad I read reviews of Din Tai Fung because I made the effort to find it and ended up going three times during a two week trip! My hotel (the Swissotel) was connected to the Raffles City mall, so that made it very convenient, but I still would have made the effort to go back. \n\nThere

## Try scraping the reviews on the next page

In [92]:
# Download the second page of reviews (offset 20). A timeout keeps the cell
# from hanging on a stalled connection, and raise_for_status() stops an HTTP
# error page from being parsed as if it were a review page.
url_content = requests.get(yelp_url + '?start=20', timeout=30)
url_content.raise_for_status()
page = url_content.content.decode('utf-8','ignore')
soup_page = soup(page, 'html.parser')
# Collect every JSON-LD <script> tag on the page.
data = soup_page.find_all("script", type="application/ld+json")

In [93]:
# Trim each script's text and replace embedded newlines, carriage returns and
# tabs with spaces before parsing.
reviews2 = [removeIndent(script_tag.text.strip()) for script_tag in data]
# Pick the payload that carries the review data (the one with aggregateRating).
for raw_json in reviews2:
    jsondata2 = json.loads(raw_json)
    if 'aggregateRating' in jsondata2:
        break
else:
    # No payload matched (or there were no scripts at all). Fail loudly rather
    # than leaving jsondata2 bound to whichever script was parsed last.
    raise ValueError("No JSON-LD script containing 'aggregateRating' was found.")

In [94]:
jsondata2

{'@context': 'http://schema.org/',
 '@type': 'Restaurant',
 'address': {'addressCountry': 'SG',
  'addressLocality': 'Singapore',
  'addressRegion': None,
  'postalCode': '179103',
  'streetAddress': 'Raffles City Shopping Centre\n252 North Bridge Road, #B1-08'},
 'aggregateRating': {'@type': 'AggregateRating',
  'ratingValue': 4.0,
  'reviewCount': 45},
 'image': 'https://s3-media4.fl.yelpcdn.com/bphoto/Gyo3iSEL9XCTPy2XcTNSGg/ls.jpg',
 'name': 'Din Tai Fung',
 'priceRange': 'SGD16-30',
 'review': [{'author': 'Dru C.',
   'datePublished': '2012-10-23',
   'description': "I've eaten at the DTF location in Arcadia, CA in the states plenty of times and I'm a big fan for only their trademark pork XLB's and nothing else. Naturally, once I found out that there was a location minutes away from my hotel in Singapore, I just had to check it out for comparison purposes. \n\nPrices are definitely higher, which is no shocking surprise. In the US, it's around $7-8 for a 10 pc of the juicy pork dump

We will need to consolidate the reviews from multiple pages into a single JSON file.

## Scrape Reviews for a List of Restaurants

In [3]:
# Each entry is a restaurant page URL that ends in "?start=", so a review
# offset can be appended directly to request a later page.
yelp_urls = [
    "https://www.yelp.com/biz/" + slug + "?start="
    for slug in (
        "tim-ho-wan-singapore-7",
        "din-tai-fung-singapore-4",
        "lei-garden-singapore-2",
        "jai-thai-singapore",
        "yhingthai-palace-singapore-2",
        "first-thai-food-singapore",
        "standing-sushi-bar-singapore",
        "menya-musashi-singapore",
        "osaka-ohsho-singapore",
        "garibaldi-italian-restaurant-and-bar-singapore-2",
        "trattoria-lafiandra-singapore-2",
        "prego-singapore",
    )
]

In [4]:
def _fetch_restaurant_json(url, timeout=30):
    """Download one Yelp page and return its restaurant JSON-LD payload.

    The page's ``application/ld+json`` script tags are parsed in document
    order and the first payload containing the key ``aggregateRating`` is
    returned.

    Parameters:
        url: page URL to request.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        The parsed JSON payload, or None when no script on the page contains
        ``aggregateRating``.

    Raises:
        requests.exceptions.HTTPError: the server answered with an error status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    page = response.content.decode('utf-8', 'ignore')
    soup_page = soup(page, 'html.parser')

    for script_tag in soup_page.find_all("script", type="application/ld+json"):
        payload = json.loads(removeIndent(script_tag.text.strip()))
        if 'aggregateRating' in payload:
            return payload
    return None


for yelp_url in yelp_urls:

    # Get the reviews on the first page. A restaurant whose first page cannot
    # be scraped is reported and skipped, so a stale payload from the previous
    # restaurant is never written under this restaurant's file name.
    try:
        jsondata = _fetch_restaurant_json(yelp_url)
    except requests.exceptions.HTTPError as err:
        print('Skipping {}: {}'.format(yelp_url, err))
        continue
    if jsondata is None:
        print('Skipping {}: no restaurant JSON-LD payload found.'.format(yelp_url))
        continue

    # Get the reviews from the next pages. The offset grows by 20 per page.
    page_counter = 20

    while True:
        try:
            jsondata_nextpage = _fetch_restaurant_json(yelp_url + str(page_counter))
        except requests.exceptions.HTTPError:
            # An HTTP error on a later page ends pagination; keep what we have.
            break

        # If no more reviews are available, the review list will be empty
        # (or the payload / its 'review' key will be missing altogether).
        next_reviews = jsondata_nextpage.get('review') if jsondata_nextpage else None
        if not next_reviews:
            break

        # Consolidate the new reviews with those already scraped.
        jsondata.setdefault('review', []).extend(next_reviews)
        page_counter += 20

    # Save all the reviews to file. The file is opened only once the data is
    # complete, so a failed scrape cannot leave a truncated file or an open handle.
    # NOTE(review): the name comes from the last URL path segment, which still
    # contains the '?start=' suffix; '?' is not allowed in Windows file names.
    # Kept unchanged so existing output paths stay the same.
    out_path = os.path.join('data', 'yelp_' + yelp_url.split('/')[-1] + '.json')
    with open(out_path, "w", encoding='utf-8') as ffile:
        json.dump(jsondata, ffile)