In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import requests
import os

In [2]:
def build_url():
    """Prompt for a location and build the matching Yelp restaurant-search URL.

    The user is asked for a place in ``City, State`` form (e.g. "Fremont, CA").
    The text is split on commas; every non-empty part has its whitespace
    collapsed and is URL-encoded (spaces become ``+``), and the parts are then
    joined with ``,+`` and appended to the Yelp search base URL.

    A state is optional: entering just "Fremont" searches for ``Fremont``
    instead of failing on a missing second part. Any additional
    comma-separated parts are kept in the order they were typed.

    Returns:
        str: The search URL, e.g.
        ``https://www.yelp.com/search?find_desc=Restaurants&find_loc=San+Jose,+CA``.

    Raises:
        ValueError: If the entered text contains no location at all.
    """
    # Local import, in keeping with the function-scope imports used elsewhere
    # in this notebook.
    from urllib.parse import quote_plus

    place = input("Please enter the name of the place (city, State) you want to search restaurants in (e.g. \"Fremont, CA\"): ")

    # ' '.join(part.split()) trims the part and collapses runs of whitespace;
    # quote_plus then turns the remaining single spaces into '+' and escapes
    # any character that is not safe inside a query string.
    parts = [quote_plus(' '.join(part.split())) for part in place.split(',')]
    parts = [part for part in parts if part]
    if not parts:
        raise ValueError("No location entered; expected something like \"Fremont, CA\".")

    baseurl = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc='
    return baseurl + ',+'.join(parts)

In [3]:
def query_restaurant(num_restaurant=10):
    """Search Yelp for restaurants in a user-supplied location.

    The location is obtained interactively through ``build_url()``. Result
    pages are then fetched one after another with ``read_soup_HTML`` and the
    restaurant names and links found on each page are accumulated.

    Args:
        num_restaurant (int): Maximum number of restaurants to return.

    Returns:
        pandas.DataFrame: Columns ``Link`` and ``Name``, truncated to the first
        ``num_restaurant`` rows. The same frame is also printed.

    Raises:
        ValueError: Propagated from the ``DataFrame`` constructor if the
            number of names and links scraped differ.
    """
    # Offset step between consecutive result pages (the '&start=' increment).
    results_per_page = 10

    # Ceiling division so that e.g. 21 restaurants request 3 pages; always
    # fetch at least the first page.
    num_pages = max(1, (num_restaurant + results_per_page - 1) // results_per_page)

    base_url = build_url()

    restaurant_names = []
    restaurant_links = []
    for page in range(num_pages):
        # Derive every page URL from the untouched base URL; appending to the
        # previous page's URL would stack several '&start=' parameters.
        if page == 0:
            page_url = base_url
        else:
            page_url = f"{base_url}&start={page * results_per_page}"
        soup = read_soup_HTML(page_url)
        restaurant_names.extend(build_restaurant_names(soup))
        restaurant_links.extend(build_restaurant_links(soup))

    df = pd.DataFrame(data={'Link': restaurant_links, 'Name': restaurant_names})
    top_restaurants = df.iloc[:num_restaurant]
    print(top_restaurants)

    return top_restaurants

In [4]:
def read_soup_HTML(url):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Args:
        url (str): Address of the page to fetch.

    Returns:
        bs4.BeautifulSoup: The page parsed with the built-in ``html.parser``.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for HTTP
            error status codes).
    """
    import urllib.request
    from bs4 import BeautifulSoup
    import ssl

    # Ignore SSL certificate errors.
    # NOTE(review): this disables hostname and certificate verification, so the
    # connection is open to man-in-the-middle tampering. Kept because the
    # original code does it on purpose; drop these two overrides if the
    # environment has working CA certificates.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    # Read the HTML inside a context manager so the underlying connection is
    # closed as soon as the body has been read, even if reading raises.
    with urllib.request.urlopen(url, context=ctx) as response:
        html = response.read()

    return BeautifulSoup(html, 'html.parser')

In [5]:
def build_restaurant_names (soup):
    """Collect restaurant names from a parsed search-results page.

    Every ``<span>`` whose ``class`` attribute is exactly
    ``['indexed-biz-name']`` contributes the text of its second child node
    (``contents[1]``).

    Args:
        soup: Parsed page exposing ``find_all``.

    Returns:
        list: Restaurant names in document order.
    """
    wanted_class = ['indexed-biz-name']
    return [
        tag.contents[1].get_text()
        for tag in soup.find_all('span')
        # .get() yields None when the span has no class attribute, which can
        # never equal the wanted list.
        if tag.attrs.get('class') == wanted_class
    ]

In [6]:
def build_restaurant_links (soup):
    """Collect absolute restaurant links from a parsed search-results page.

    Every ``<a>`` whose ``class`` attribute is exactly
    ``['js-analytics-click']`` contributes its ``href``. The first match is
    discarded and the remaining hrefs are prefixed with ``https://yelp.com``.

    Args:
        soup: Parsed page exposing ``find_all``.

    Returns:
        list: Absolute URLs in document order; empty when the page has no (or
        only one) matching anchor.
    """
    wanted_class = ['js-analytics-click']
    hrefs = [
        anchor.attrs['href']
        for anchor in soup.find_all('a')
        if anchor.attrs.get('class') == wanted_class
    ]

    # The first matching anchor is always skipped.
    # NOTE(review): presumably it is not a regular search result (e.g. a
    # promoted entry) - confirm against the live page markup.
    # Slicing instead of pop(0) keeps a page without matches from raising
    # IndexError.
    return ['https://yelp.com' + href for href in hrefs[1:]]

In [7]:
# Try the search step on its own: prompts for a location, prints the first 15
# matches, and the returned DataFrame is rendered as this cell's output.
query_restaurant(num_restaurant=15)

Please enter the name of the place (city, State) you want to search restaurants in (e.g. "Fremont, CA"): San Jose, CA
                                                 Link  \
0   https://yelp.com/biz/firehouse-no-1-gastropub-...   
1   https://yelp.com/biz/the-table-san-jose?frvs=T...   
2   https://yelp.com/biz/braise-san-jose?frvs=True...   
3   https://yelp.com/biz/the-grandview-restaurant-...   
4   https://yelp.com/biz/ludwigs-german-table-san-...   
5   https://yelp.com/biz/black-sheep-brasserie-san...   
6   https://yelp.com/biz/pot-belly-ramen-san-jose?...   
7   https://yelp.com/biz/taurinus-brazilian-steak-...   
8   https://yelp.com/biz/our-house-san-jose-2?frvs...   
9   https://yelp.com/biz/orchard-city-kitchen-camp...   
10  https://yelp.com/biz/back-a-yard-san-jose?frvs...   
11  https://yelp.com/biz/gen-korean-bbq-house-san-...   
12  https://yelp.com/biz/vietnoms-san-jose?frvs=Tr...   
13  https://yelp.com/biz/smoking-pig-bbq-san-jose?...   
14  https://yelp.com/biz/ou

Unnamed: 0,Link,Name
0,https://yelp.com/biz/firehouse-no-1-gastropub-...,Firehouse No.1 Gastropub
1,https://yelp.com/biz/the-table-san-jose?frvs=T...,The Table
2,https://yelp.com/biz/braise-san-jose?frvs=True...,Braise
3,https://yelp.com/biz/the-grandview-restaurant-...,The Grandview Restaurant
4,https://yelp.com/biz/ludwigs-german-table-san-...,Ludwigs German Table
5,https://yelp.com/biz/black-sheep-brasserie-san...,Black Sheep Brasserie
6,https://yelp.com/biz/pot-belly-ramen-san-jose?...,Pot Belly Ramen
7,https://yelp.com/biz/taurinus-brazilian-steak-...,Taurinus Brazilian Steak House
8,https://yelp.com/biz/our-house-san-jose-2?frvs...,Our House
9,https://yelp.com/biz/orchard-city-kitchen-camp...,Orchard City Kitchen


In [15]:
def gather_reviews(df,num_reviews):
    """Scrape review texts for every restaurant listed in ``df``.

    For each row the restaurant page is fetched with ``read_soup_HTML``;
    further pages are requested with a ``start`` offset until enough pages
    have been read to cover ``num_reviews``. A progress line is printed per
    restaurant.

    Args:
        df (pandas.DataFrame): Must provide the columns ``Link`` (page URL) and
            ``Name`` (restaurant name).
        num_reviews (int): Maximum number of reviews to keep per restaurant.

    Returns:
        dict: Maps ``str(name)`` to a list of at most ``num_reviews`` review
        strings. Rows sharing the same name overwrite each other.
    """
    # Offset step between consecutive review pages (the 'start=' increment).
    reviews_per_page = 20

    # Ceiling division so that e.g. 41 reviews request 3 pages; always fetch
    # at least the first page.
    num_pages = max(1, (num_reviews + reviews_per_page - 1) // reviews_per_page)

    reviews = {}
    for link, name in zip(df['Link'], df['Name']):
        print(f"Gathering top reviews on {name} now...")

        review_text = []
        # A dedicated 'page' variable keeps pagination from clobbering the
        # restaurant being processed, so every page and the final dictionary
        # key belong to the same row.
        for page in range(num_pages):
            if page == 0:
                url = link
            else:
                # Links may already carry a query string (e.g. '?frvs=True');
                # in that case the offset must be added with '&', not '?'.
                separator = '&' if '?' in link else '?'
                url = f"{link}{separator}start={reviews_per_page * page}"
            review_text.extend(_extract_review_texts(read_soup_HTML(url)))

        reviews[str(name)] = review_text[:num_reviews]

    return reviews


def _extract_review_texts(soup):
    """Return the stripped text of every ``<p itemprop="description">`` in ``soup``."""
    return [
        p.get_text().strip()
        for p in soup.find_all('p')
        if p.attrs.get('itemprop') == 'description'
    ]

In [16]:
def get_reviews(num_restaurant=10,num_reviews=20):
    """Run the full pipeline: search for restaurants, then scrape their reviews.

    Args:
        num_restaurant (int): Passed to ``query_restaurant`` as the number of
            restaurants to look up.
        num_reviews (int): Passed to ``gather_reviews`` as the per-restaurant
            review limit.

    Returns:
        The value returned by ``gather_reviews`` for the restaurants found.
    """
    return gather_reviews(
        query_restaurant(num_restaurant=num_restaurant),
        num_reviews=num_reviews,
    )

### Test cases

In [17]:
# End-to-end run: 5 restaurants, up to 5 reviews each (prompts for a location).
rev = get_reviews(num_restaurant=5, num_reviews=5)

Please enter the name of the place (city, State) you want to search restaurants in (e.g. "Fremont, CA"): San Jose, CA
                                                Link                      Name
0  https://yelp.com/biz/firehouse-no-1-gastropub-...  Firehouse No.1 Gastropub
1  https://yelp.com/biz/braise-san-jose?frvs=True...                    Braise
2  https://yelp.com/biz/the-table-san-jose?frvs=T...                 The Table
3  https://yelp.com/biz/cajun-bistro-7-san-jose-2...            Cajun Bistro 7
4  https://yelp.com/biz/the-grandview-restaurant-...  The Grandview Restaurant
Gathering top reviews on Firehouse No.1 Gastropub now...
Gathering top reviews on Braise now...
Gathering top reviews on The Table now...
Gathering top reviews on Cajun Bistro 7 now...
Gathering top reviews on The Grandview Restaurant now...


In [18]:
# Show every review gathered for 'The Table', each followed by a separator
# line, then the total. The key must be one of the names printed by the
# get_reviews run that produced `rev`.
table_reviews = rev['The Table']
for r in table_reviews:
    print(r, "="*100, sep="\n")
count = len(table_reviews)
print("\n Review count:", count)

Service (5/5): Our waitress made sure our bottomless mimosas were constantly full lol. It was pretty busy but the servers were on it! Every server was willing to cater to any of our needs. 

Ambiance (5/5): Dog friendly, open windows, cute decor, and an outdoor area. 

Food (5/5): B--M-B. We ordered the chilaquiles, pork fried steak, salmon Benedict, and the loco moco. I didn't try the first two dishes I mentioned but out of the Benedict and loco moco my favorite was definitely the latter. If you like island food I highly recommend. 

Price (4/5): My advice is to eat slow. The price was definitely worth its quality but the quantity isn't very big. My group was pretty full but it's because of the mimosas and we ordered monkey bread and beignets.
Our experience at The Table was OK, but I had higher expectations for a popular brunch place with 2,000+ reviews.  

I knew the wait was going to be a nightmare for a weekend brunch, and they don't take reservations.  However, luckily I picked u