# Web Scraper

Web scraping is one of the most important data collection skills. Instead of worrying about whether a suitable dataset already exists, we can collect the data ourselves.

This notebook defines a `YelpScraper` class that takes a user's API key, which authenticates access to the Yelp API so that the class's methods can work.

In [1]:
import time, json, re, requests
from bs4 import BeautifulSoup

In [2]:
# Read the Yelp API key from a local text file (this keeps the secret out of
# the notebook itself) and remove every newline character from it.
with open('utility/API_key.txt', 'r') as f:
    api_key = ''.join(f.read().split('\n'))

In [3]:
class YelpScraper(object):
    '''
    Collect Yelp restaurant listings through the Yelp API (v3 business
    search endpoint) and scrape reviews from restaurant web pages.

    api_key is sent as a Bearer token by all_restaurants. The review
    methods (parse_page, extract_reviews) fetch and parse web pages and do
    not use the key.
    '''

    # Number of businesses requested per search call; also the offset step.
    _PAGE_SIZE = 50
    # Pause, in seconds, between two consecutive search requests.
    _PAGE_DELAY = 0.00002

    def __init__(self, api_key):
        # all_restaurants reads self.api_key. self.api is kept as an alias
        # because it is the attribute name this constructor used to set.
        self.api_key = api_key
        self.api = api_key

    def all_restaurants(self, query):
        '''
        Return all restaurants in the given query location.

        The search endpoint is queried page by page (50 businesses per
        request) and the business dicts of every page are returned as one
        list. Paging stops once as many businesses as the reported 'total'
        have been collected, or as soon as a page comes back empty.

        Raises requests.HTTPError if a request returns an error status.
        '''
        url = 'https://api.yelp.com/v3/businesses/search'
        headers = {'authorization': 'Bearer %s' % self.api_key}
        params = {'categories': 'restaurants', 'location': query,
                  'limit': self._PAGE_SIZE, 'offset': 0}
        restaurants = []
        while True:
            response = requests.get(url, headers=headers, params=params)
            response.raise_for_status()
            payload = response.json()  # decode the body once per page
            page = payload['businesses']
            restaurants.extend(page)
            # An empty page means nothing more can be fetched; stopping here
            # prevents an endless loop when 'total' is never reached.
            if not page or len(restaurants) >= payload['total']:
                break
            params['offset'] += self._PAGE_SIZE
            time.sleep(self._PAGE_DELAY)
        return restaurants

    def parse_page(self, html):
        '''
        Parse reviews in a page such that each review is formatted as:
        {
            'text': review content
            'date': date of review, 
            'user_id': user's id, 
            'review_id': review's id, 
            'rating': type float
        }

        Returns a tuple (reviews, next_page): the list of review dicts and
        the href of the page's <link rel="next"> element, or None when the
        page has no such link.

        The fields are extracted independently with regular expressions and
        combined by position, so the result is as long as the shortest of
        the extracted lists. Review text is returned exactly as it appears
        in the page source; escape sequences are not decoded.
        '''
        pretty = BeautifulSoup(html, "html.parser").prettify()

        # The text pattern steps over backslash escapes, so an escaped quote
        # inside a review does not end the match early.
        texts = re.findall(r'"description": "((?:[^"\\\n]|\\.)*)"', pretty)
        ratings = [round(float(value), 1) for value in
                   re.findall(r'"reviewRating": {"ratingValue": ([0-9])}', pretty)]
        dates = re.findall(r'datePublished": "([^"]+)"', pretty)
        id_pairs = re.findall(
            r'data-review-id="([^"]+)" data-signup-object="user_id:([^"]+)"',
            pretty)

        result = []
        for date, rating, (review_id, user_id), text in zip(dates, ratings, id_pairs, texts):
            result.append({
                'review_id': review_id,
                'user_id': user_id,
                'rating': rating,
                # Replace the separators at positions 4 and 7 with '/'.
                'date': date[:4] + '/' + date[5:7] + '/' + date[8:],
                'text': text,
            })

        next_match = re.search(r'<link href="([^"]+)" rel="next"', pretty)
        next_page = next_match.group(1) if next_match else None
        return result, next_page

    def extract_reviews(self, url):
        '''
        Return all the reviews in a single restaurant specified by the url.

        Fetches the page at url, parses it with parse_page, and keeps
        following the "next" link that parse_page returns until a page has
        none. The reviews of all visited pages are returned as one list.

        Raises requests.HTTPError if a page request returns an error status.
        '''
        data = []
        page_url = url
        while True:
            response = requests.get(page_url)
            response.raise_for_status()
            result, page_url = self.parse_page(response.text)
            data += result
            if not page_url:
                return data

## Parse Yelp Restaurant Page

In [4]:
# Create the scraper with the API key loaded from the key file above.
yelp = YelpScraper(api_key)

In [7]:
# Collect the reviews of a single restaurant, identified by its Yelp page URL.
reviews = yelp.extract_reviews('https://www.yelp.com/biz/piazza-talarico-and-papa-joes-wine-cellar-pittsburgh')

In [9]:
# Show the first scraped review to check the shape of a parsed record.
reviews[0]

{'review_id': 'DyQXngg3Hs2W6gYg6Yhm6A',
 'user_id': 'aGxySE4K3XEFMQUpIFpScg',
 'rating': 5.0,
 'date': '2018/02/23',
 'text': "We have been wanting to go for some time now and hey extended their hours to 9 so we went! First they have homemade wine and it was amazing! Try the tattan wine (spelling?!) It's fantastic! \\n\\nWe got the pizza ( crust is perfect), greens and beans - highly recommend and grandmas rigatoni. The sauce is super good - just the right amount of sweet and the meatballs are delicious! \\n\\nThe service is on par, and they pay their servers a living wage so no tipping - overall really love the concept and the food. We will be back!"}