Below are the key features we scrape from the Yelp website:
1. Restaurant name
2. Restaurant address
3. Restaurant rating (yelp rating 0-5 with 0.5 increment)
4. Hygiene (official scores: A, B, C)
5. Restaurant neighborhood (Morningside Heights, East Village, Chelsea, etc.)
6. Category (i.e. cuisine type: Chinese, Japanese, French, American, etc.)
7. Noise Level (Quiet, Noisy, Average)
8. Ambience (Romantic, trendy, etc.)
9. Price range (under \$10, \$11-30, \$31-60, ...)
10. Parking Options (Street, Private Lot, Garage, etc)
11. Reservable? (Yes, No)
12. Has Gluten-free Options (Yes, No)
13. Alcohol (Beer & Wine Only, Full Bar, None)

     $ \vdots$
     
We ran the scraping code for the following cuisine types: Chinese, Korean, American, Indian, Japanese, Spanish, French, Italian, Greek, Thai, Mexico, Vietnamese. (Change the first argument of the `get_urls_from_search(term, location, num)` function to collect the URLs for each cuisine type.)

For each cuisine type, we generate "{cuisine_type}_Restaurant.csv"

**Use the following code to scrape**

In [5]:
from bs4 import BeautifulSoup
import re
from threading import Thread
import urllib
import pandas as pd
import urllib.request
import time
from random import randint
from urllib.request import urlopen, Request

In [6]:
# Install a global opener so that every later urllib.request.urlopen call
# sends this User-agent header (IE 9 proved to be the most successful).
opener = urllib.request.build_opener()
opener.addheaders = [
    (
        'User-agent',
        'IE 9/Windows: Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    ),
]
urllib.request.install_opener(opener)

In [7]:
# Function that will do the scraping job from yelp
def _extract_business_info(soup):
    """Collect the business features found in a parsed Yelp business page.

    Fields whose markup is absent from the page are left out of the result,
    except 'Category', which is always present (possibly as '').
    """
    business_info = {}

    def put_text(key, name, attrs=None):
        # Look the tag up once and store its trimmed text when it exists.
        tag = soup.find(name, attrs=attrs or {})
        if tag is not None:
            business_info[key] = tag.text.strip()

    def put_content(key, attrs):
        # Store the "content" attribute of the first tag matching attrs.
        tag = soup.find(attrs=attrs)
        if tag is not None:
            business_info[key] = tag.get("content")

    # NOTE: the misspelled keys ('retaurant_address',
    # 'restaurant_neighobrhood') are kept on purpose: the header lists used
    # when building the CSV look the values up under exactly these names.
    put_text('restaurant_name', 'h1')
    put_text('retaurant_address', 'span', {'itemprop': "streetAddress"})
    put_text('restaurant_zipcode', 'span', {'itemprop': "postalCode"})
    put_text('restaurant_reviewcount', 'span', {'itemprop': "reviewCount"})
    put_content('restaurant_rating', {'itemprop': "ratingValue"})
    put_text('restaurant_neighobrhood', 'span', {'class': 'neighborhood-str-list'})
    put_text('Hygiene_score', 'dd', {'class': "nowrap health-score-description"})
    put_text('price_range', 'dd', {'class': "nowrap price-description"})

    # Every <dl> of the attribute list becomes one "<dt> text" -> "<dd> text"
    # entry; the keys are whatever labels the page shows.
    attribute_list = soup.find('div', {'class': 'short-def-list'})
    if attribute_list is not None:
        for entry in attribute_list.find_all('dl'):
            label = entry.find('dt')
            answer = entry.find('dd')
            if label is None or answer is None:
                # Skip malformed entries instead of raising AttributeError.
                continue
            business_info[label.text.strip()] = answer.text.strip()

    put_content('latitude', {'property': "place:location:latitude"})
    put_content('longitude', {'property': "place:location:longitude"})

    # Each category name is followed by '; ' (so one category gives 'X; ').
    category_list = soup.find('span', {'class': 'category-str-list'})
    category_links = category_list.find_all('a') if category_list is not None else []
    business_info['Category'] = ''.join(
        link.text.strip() + '; ' for link in category_links
    )

    return business_info


def scrape(ur):
    """Download one Yelp business page and return its features as a dict.

    Args:
        ur: URL of the business page to fetch.

    Returns:
        A dict mapping feature names to the scraped values. Features that
        are missing from the page are omitted (a page without an <h1> simply
        has no 'restaurant_name' instead of raising AttributeError);
        'Category' is always present.

    Raises:
        Whatever urllib.request.urlopen raises when the request fails.
    """
    with urllib.request.urlopen(ur) as url:
        html = url.read()
    soup = BeautifulSoup(html, "lxml")
    return _extract_business_info(soup)

In [8]:
# Example: scrape a single business page and print the returned feature dict
d = scrape('https://www.yelp.com/biz/mount-everest-indias-cuisine-las-vegas?osq=indian+food')
print(d)

{'restaurant_name': 'Mount Everest India’s Cuisine', 'retaurant_address': '3641 W Sahara Ave', 'restaurant_zipcode': '89102', 'restaurant_reviewcount': '1442', 'restaurant_rating': '4.5', 'restaurant_neighobrhood': 'Westside', 'price_range': '$11-30', 'Has Soy-free Options': 'Yes', 'Has Gluten-free Options': 'Yes', 'Liked by Vegetarians': 'Yes', 'Has Dairy-free Options': 'Yes', 'Has Halal Options': 'Yes', 'Liked by Vegans': 'Yes', 'Takes Reservations': 'Yes', 'Delivery': 'Yes', 'Take-out': 'Yes', 'Accepts Credit Cards': 'Yes', 'Accepts Apple Pay': 'No', 'Accepts Google Pay': 'No', 'Good For': 'Lunch, Dinner', 'Parking': 'Private Lot', 'Bike Parking': 'Yes', 'Wheelchair Accessible': 'Yes', 'Good for Kids': 'Yes', 'Good for Groups': 'Yes', 'Attire': 'Casual', 'Ambience': 'Casual', 'Noise Level': 'Average', 'Alcohol': 'Beer & Wine Only', 'Outdoor Seating': 'No', 'Wi-Fi': 'No', 'Has TV': 'Yes', 'Dogs Allowed': 'No', 'Waiter Service': 'Yes', 'Caters': 'Yes', 'Category': 'Indian; '}


In [None]:
# List of yelp urls to scrape
# change term to change the keyword we want to search
def get_urls_from_search(term, location, num):
    """Return the business page URLs found on one Yelp search results page.

    Args:
        term: Search keyword(s), e.g. "Japanese restaurants".
        location: Location to search in, e.g. "East_Village".
        num: Zero-based results page; the query starts at result num * 10.

    Returns:
        A list of "http://www.yelp.com" + href strings, one per result link
        found on the page (empty when the page has no such links).

    Raises:
        Whatever urllib.request.urlopen raises when the request fails.
    """
    from urllib.parse import quote_plus

    # Spaces become '+', as before, and an existing '+' is left alone, but
    # characters such as "'" (Hell's_Kitchen), '&' or '#' are now
    # percent-encoded so they cannot corrupt the query string.
    term = quote_plus(term, safe='+')
    location = quote_plus(location, safe='+')
    query = ('https://www.yelp.com/search?find_desc=' + term
             + '&find_loc=' + location + '&start=' + str(num * 10))
    with urllib.request.urlopen(query) as url:
        contents = url.read()
    soup = BeautifulSoup(contents, "html.parser")
    return [
        "http://www.yelp.com" + result['href']
        for result in soup.find_all('a', {'class': 'biz-name js-analytics-click'})
    ]

In [None]:
# Manhattan neighborhoods used as the search locations
searchLocations = [
    'Alphabet_City',
    'Battery_Park',
    'Chelsea',
    'Chinatown',
    'Civic_Center',
    'East_Harlem',
    'East_Village',
    'Financial_District',
    'Flatiron',
    'Gramercy',
    'Greenwich_Village',
    'Harlem',
    "Hell's_Kitchen",
    'Inwood',
    'Kips_Bay',
    'Koreatown',
    'Little_Italy',
    'Lower_East_Side',
    'Manhattan_Valley',
    'Marble_Hill',
    'Meatpacking_District',
    'Midtown_East',
    'Midtown_West',
    'Morningside_Heights',
    'Murray_Hill',
    'NoHo',
    'Nolita',
    'Roosevelt_Island',
    'SoHo',
    'South_Street_Seaport',
    'South_Village',
    'Stuyvesant_Town',
    'TriBeCa',
    'Two_Bridges',
    'Union_Square',
    'Upper_East_Side',
    'Upper_West_Side',
    'Washington_Heights',
    'West_Village',
]

**The following code takes about 20 minutes to run**

In [None]:
# Get NYC Japanese restaurant info
max_num = 5  # maximum number of result pages requested per location
urls_set = []
for loc in searchLocations:
    # For a fixed location and food type, walk through the result pages and
    # collect their urls; page ranking is based on relevance ranked by yelp.
    for num in range(max_num):
        urls = get_urls_from_search("Japanese restaurants", loc, num)
        urls = urls[1:]  # 0th link is irrelevant
        # urls is empty if the starting page number exceeds the maximum possible
        if not urls:
            break
        # Keep every remaining link; the previous index loop stopped at
        # len(urls) - 1 and so silently dropped the last result of each page.
        urls_set.extend(urls)

    # Delays to help reduce queries and reduce the possibility of IP Ban
    time.sleep(5)

# check urls_set length
print(len(urls_set))
urls_set

In [None]:
# Drop duplicate urls (keeping the first occurrence) so that no page is
# scraped twice.
pd_urls_set = pd.DataFrame(urls_set)
unique_rows = pd_urls_set.drop_duplicates()
url = unique_rows.values.tolist()
# each row is a one-element list; unwrap it back into a flat list of urls
urls = [row[0] for row in url]
# number of restaurant pages we need to scrape
print(len(urls))

**It takes the scrape function about 20-30 seconds to scrape each webpage**

**So, the following code takes about (20$\times$ len(urls)) seconds to run**

e.g. if we have 1000 urls to run then the following code takes about 6 hours to run.

In [None]:
# header lists the features we keep from each scraped page
header = [
    'restaurant_name',
    'retaurant_address',
    'restaurant_zipcode',
    'restaurant_reviewcount',
    'restaurant_rating',
    'restaurant_neighobrhood',
    'Hygiene_score',
    'price_range',
    'Liked by Vegetarians',
    'Takes Reservations',
    'Delivery',
    'Take-out',
    'Accepts Credit Cards',
    'Accepts Bitcoin',
    'Parking',
    'Bike Parking',
    'Wheelchair Accessible',
    'Good for Kids',
    'Good for Groups',
    'Attire',
    'Noise Level',
    'Alcohol',
    'Happy Hour',
    'Outdoor Seating',
    'Wi-Fi',
    'Has TV',
    'Dogs Allowed',
    'Waiter Service',
    'Caters',
    'Category',
    'Has Soy-free Options',
    'Has Dairy-free Options',
    'Liked by Vegans',
    'Has Gluten-free Options',
    'Good For',
    'Ambience',
    'Gender Neutral Restrooms',
]
# info maps each header field to the list of values scraped so far,
# one entry per url ('NA' when the page does not provide the field)
info = {}
for u in urls:
    url_dict = scrape(u)
    for i in header:
        info.setdefault(i, []).append(url_dict.get(i, 'NA'))
    # pause between pages
    time.sleep(2)

In [None]:
# Get NYC American restaurant info
max_num = 5  # maximum number of result pages requested per location
urls_set = []
for loc in searchLocations:
    # For a fixed location and food type, walk through the result pages and
    # collect their urls; page ranking is based on relevance ranked by yelp.
    for num in range(max_num):
        urls = get_urls_from_search("American restaurants", loc, num)
        urls = urls[1:]  # 0th link is irrelevant
        # urls is empty if the starting page number exceeds the maximum possible
        if not urls:
            break
        # Keep every remaining link; the previous index loop stopped at
        # len(urls) - 1 and so silently dropped the last result of each page.
        urls_set.extend(urls)

    # Delays to help reduce queries and reduce the possibility of IP Ban
    time.sleep(5)
print(len(urls_set))
urls_set

In [None]:
# Drop duplicate urls (keeping the first occurrence) so that no page is
# scraped twice.
pd_urls_set = pd.DataFrame(urls_set)
unique_rows = pd_urls_set.drop_duplicates()
url = unique_rows.values.tolist()
# each row is a one-element list; unwrap it back into a flat list of urls
urls = [row[0] for row in url]

In [None]:
# header lists the features we keep from each scraped page
header = [
    'restaurant_name',
    'retaurant_address',
    'restaurant_zipcode',
    'restaurant_reviewcount',
    'restaurant_rating',
    'restaurant_neighobrhood',
    'Hygiene_score',
    'price_range',
    'Liked by Vegetarians',
    'Takes Reservations',
    'Delivery',
    'Take-out',
    'Accepts Credit Cards',
    'Accepts Bitcoin',
    'Parking',
    'Bike Parking',
    'Wheelchair Accessible',
    'Good for Kids',
    'Good for Groups',
    'Attire',
    'Noise Level',
    'Alcohol',
    'Happy Hour',
    'Outdoor Seating',
    'Wi-Fi',
    'Has TV',
    'Dogs Allowed',
    'Waiter Service',
    'Caters',
    'Category',
    'Has Soy-free Options',
    'Has Dairy-free Options',
    'Liked by Vegans',
    'Has Gluten-free Options',
    'Good For',
    'Ambience',
    'Gender Neutral Restrooms',
]
# info maps each header field to the list of values scraped so far,
# one entry per url ('NA' when the page does not provide the field)
info = {}
# Iterate the de-duplicated `urls`, not the raw `urls_set`: the raw list can
# contain the same page several times, which wastes requests and writes
# duplicate rows.
for u in urls:
    url_dict = scrape(u)
    for i in header:
        info.setdefault(i, []).append(url_dict.get(i, 'NA'))
    # Delays to help reduce queries and reduce the possibility of IP Ban
    time.sleep(2)

In [None]:
# Turn the per-field value lists collected in `info` into a table and save it
df = pd.DataFrame(info)
df.to_csv('American_Restaurant.csv') 

On our local machines, we generated one CSV file per cuisine type by replacing the search term with each of the following:
Chinese, Korean, American, Indian, Japanese, Spanish, French, Italian, Greek, Thai, Mexico, Vietnamese.

Both "get_urls_from_search" and "scrape" can easily trigger an IP ban from Yelp, so we needed to change our IP address in order to collect all of the above cuisine types, which can be very time-consuming.