## A Restaurant Recommendation System based on Yelp Information ##

**Project Goal**: Create an NYC restaurant information database and use the database to recommend users restaurants which match with their preferences.

Users can input features that indicate their preferences and our machine can output top 5 restaurants from as our recommendation for the user.

**Steps**:


In [207]:
from bs4 import BeautifulSoup
import re
from threading import Thread
import urllib
import pandas as pd
import urllib.request
import time
from random import randint

In [134]:
opener = urllib.request.build_opener()
# IE 9 proved to be the most successful
opener.addheaders = [('User-agent', 'IE 9/Windows: Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)')]
urllib.request.install_opener(opener)

In [200]:
# Function that will do the scraping job from yelp
def scrape(ur):

    with urllib.request.urlopen(ur) as url:
        html = url.read()
    soup = BeautifulSoup(html,"lxml")
    retaurant_name = soup.find('h1')
    
    # create a dictionary business info for storing key business features 
    business_info = {}
    business_info['restaurant_name']= str(retaurant_name.text.strip().rstrip())
    
    if soup.find('span',itemprop="streetAddress") != None:
        retaurant_address = soup.find('span',itemprop="streetAddress")
        business_info['retaurant_address'] = str(retaurant_address.text.strip().rstrip())
    
    if soup.find('span',itemprop="postalCode") != None:
        restaurant_zipcode = soup.find('span',itemprop="postalCode")
        business_info['restaurant_zipcode'] = str(restaurant_zipcode.text.strip().rstrip())
    
    if soup.find('span',itemprop="reviewCount") != None:
        restaurant_reviewcount = soup.find('span',itemprop="reviewCount")
        business_info['restaurant_reviewcount'] = str(restaurant_reviewcount.text.strip().rstrip())
   
    if soup.find(itemprop="ratingValue") != None:
        business_info['restaurant_rating'] = soup.find(itemprop="ratingValue").get("content")

    if soup.find('span', {'class': 'neighborhood-str-list'}) != None:
        neighborhood = soup.find('span', {'class': 'neighborhood-str-list'})
        business_info['restaurant_neighobrhood'] = str(neighborhood.text.strip().rstrip())
   
    if soup.find('dd',{'class':"nowrap health-score-description"}) != None:
        hygiene_score = soup.find('dd',{'class':"nowrap health-score-description"})
        business_info['Hygiene_score'] = str(hygiene_score.text.strip().rstrip())
        
    if soup.find('dd', {'class':"nowrap price-description"}) != None:
        price_range = soup.find('dd', {'class':"nowrap price-description"})
        business_info['price_range'] = str(price_range.text.strip().rstrip())
   
    if soup.find('div',{'class':'short-def-list'}) != None:
        for i in soup.find('div',{'class':'short-def-list'}).findAll('dl'):
            key = i.find('dt').text.strip().rstrip()
            value = i.find('dd').text.strip().rstrip()
            business_info[str(key)]=str(value)
    
    if soup.find(property="place:location:latitude") != None:
        business_info['latitude'] = soup.find(property="place:location:latitude").get("content")

    if soup.find(property="place:location:longitude") != None:
        business_info['longitude'] = soup.find(property="place:location:longitude").get("content")  
    
    business_info['Category']= ''
    if soup.find('span',{'class':'category-str-list'}) != None:
        for i in soup.find('span',{'class':'category-str-list'}).findAll('a'):
            business_info['Category'] += (str(i.text.strip().rstrip())+'; ')
                
    return business_info

In [201]:
# Example for the scrape function
d = scrape('https://www.yelp.com/biz/blue-ribbon-sushi-new-york?osq=blue+ribbon')
print(d)

{'restaurant_name': 'Blue Ribbon Sushi', 'retaurant_address': '119 Sullivan St', 'restaurant_zipcode': '10012', 'restaurant_reviewcount': '1023', 'restaurant_rating': '4.0', 'restaurant_neighobrhood': 'South Village', 'Hygiene_score': 'A', 'price_range': '$31-60', 'Takes Reservations': 'No', 'Delivery': 'No', 'Take-out': 'Yes', 'Accepts Credit Cards': 'Yes', 'Accepts Apple Pay': 'No', 'Good For': 'Dinner', 'Parking': 'Street', 'Bike Parking': 'Yes', 'Wheelchair Accessible': 'No', 'Good for Kids': 'No', 'Good for Groups': 'Yes', 'Attire': 'Casual', 'Noise Level': 'Average', 'Alcohol': 'Full Bar', 'Outdoor Seating': 'No', 'Wi-Fi': 'No', 'Has TV': 'No', 'Waiter Service': 'Yes', 'Caters': 'No', 'Gender Neutral Restrooms': 'Yes', 'Category': 'Sushi Bars; Japanese; '}


In [137]:
# List of yelp urls to scrape
def get_urls_from_search(term, location, num):
    
    term = term.replace(' ','+')
    location = location.replace(' ','+')
    query = 'https://www.yelp.com/search?find_desc='+term+'&find_loc='+location+'&start='+str(num*10)
    with urllib.request.urlopen(query) as url:
        contents = url.read()
    #contents = urllib.urlopen(query).read()
    soup = BeautifulSoup(contents, "html.parser")
    #print(soup)
    business_url = []
    for result in soup.findAll('a',{'class':'biz-name js-analytics-click'}):
        business_url.append("http://www.yelp.com" + result['href'])
    return business_url


In [164]:
# List of all locations
# 'Alphabet_City','Battery_Park','Chelsea','Chinatown','Civic_Center','East_Harlem','East_Village','Financial_District','Flatiron','Gramercy','Greenwich_Village','Harlem','Hell\'s_Kitchen','Inwood','Kips_Bay','Koreatown','Little_Italy','Lower_East_Side','Manhattan_Valley','Marble_Hill','Meatpacking_District','Midtown_East','Midtown_West','Morningside_Heights','Murray_Hill','NoHo','Nolita','Roosevelt_Island','SoHo','South_Street_Seaport','South_Village','Stuyvesant_Town','Theater_District','TriBeCa','Two_Bridges','Union_Square','Upper_East_Side','Upper_West_Side','Washington_Heights','West_Village', 'Yorkville'

# Yorkville entirely encompassed by Upper East Side
# Theater District entirely encompassed by Midtown West

searchLocations = ['Alphabet_City','Battery_Park','Chelsea','Chinatown','Civic_Center','East_Harlem','East_Village','Financial_District','Flatiron','Gramercy','Greenwich_Village','Harlem','Hell\'s_Kitchen','Inwood','Kips_Bay','Koreatown','Little_Italy','Lower_East_Side','Manhattan_Valley','Marble_Hill','Meatpacking_District','Midtown_East','Midtown_West','Morningside_Heights','Murray_Hill','NoHo','Nolita','Roosevelt_Island','SoHo','South_Street_Seaport','South_Village','Stuyvesant_Town','TriBeCa','Two_Bridges','Union_Square','Upper_East_Side','Upper_West_Side','Washington_Heights','West_Village']

#test
#searchLocations = ['East_Village', 'Upper_West_Side', 'Chelsea'] 

In [159]:
max_num = 5
urls_set = []
for i, loc in enumerate(searchLocations):
    # now run for loop with fix location and food type and append urls 
    # page ranking is based on relevance ranked by yelp
    for num in range(0,max_num):
        urls = get_urls_from_search("Japanese restaurants",loc, num)
        urls = urls[1:] # 0th link is irrelavant
        # len(urls)=0 if the starting page number exceed the maximum possible
        if (len(urls) ==0):
            break
        else:
            for i in range(0,len(urls)-1):
                urls_set.append(urls[i])
                
    # Delays to help reduce queries and reduce the possibility of IP Ban            
    time.sleep(5)
    #convert the urls from list to csv file for each location
    #pd_urls_set = pd.DataFrame(urls_set)
    #pd_urls_set.to_csv("urls_set_{0}.csv".format(loc))

In [160]:
print(len(urls_set))
urls_set

0


[]

In [22]:
info = {} # create a dictionary for srestaurant info
for u in urls_set:
    url_dict=scrape(u)
    for key,value in url_dict.items():
        info.setdefault(key,[]).append(value)

In [121]:
#header

dict_keys(['restaurant_name', 'retaurant_address', 'restaurant_zipcode', 'restaurant_reviewcount', 'restaurant_rating', 'Hygiene_score', 'price_range', 'Takes Reservations', 'Delivery', 'Take-out', 'Accepts Credit Cards', 'Accepts Apple Pay', 'Accepts Google Pay', 'Accepts Bitcoin', 'Good For', 'Parking', 'Bike Parking', 'Wheelchair Accessible', 'Good for Kids', 'Good for Groups', 'Attire', 'Ambience', 'Noise Level', 'Alcohol', 'Happy Hour', 'Outdoor Seating', 'Wi-Fi', 'Has TV', 'Dogs Allowed', 'Waiter Service', 'Caters', 'Gender Neutral Restrooms', 'Has Dairy-free Options', 'Liked by Vegetarians', 'Liked by Vegans', 'Good For Dancing', 'Best Nights', 'Good for Working', 'Has Pool Table', 'Open to All', 'Coat Check', 'Has Soy-free Options', 'Has Gluten-free Options', 'Smoking', 'Has Kosher Options', 'Offers Military Discount'])

In [122]:
header=['restaurant_name', 'retaurant_address', 'restaurant_zipcode', 'restaurant_reviewcount', 'restaurant_rating', 'Hygiene_score', 'price_range', 'Takes Reservations', 'Delivery', 'Take-out', 'Accepts Credit Cards', 'Accepts Apple Pay', 'Accepts Google Pay', 'Accepts Bitcoin', 'Good For', 'Parking', 'Bike Parking', 'Wheelchair Accessible', 'Good for Kids', 'Good for Groups', 'Attire', 'Ambience', 'Noise Level', 'Alcohol', 'Happy Hour', 'Outdoor Seating', 'Wi-Fi', 'Has TV', 'Dogs Allowed', 'Waiter Service', 'Caters', 'Gender Neutral Restrooms', 'Has Dairy-free Options', 'Liked by Vegetarians', 'Liked by Vegans', 'Good For Dancing', 'Best Nights', 'Good for Working', 'Has Pool Table', 'Open to All', 'Coat Check', 'Has Soy-free Options', 'Has Gluten-free Options', 'Smoking', 'Has Kosher Options', 'Offers Military Discount']
info={}
for u in urls_set:
    url_dict=scrape(u)
    for i in header:
        if i in url_dict.keys():
            info.setdefault(i,[]).append(url_dict[i])
        else:
            info.setdefault(i,[]).append('NA')
    # Delays to help reduce queries and reduce the possibility of IP Ban
    time.sleep(5)

In [170]:
final_df_sample=pd.DataFrame(info)
#final_df_sample

In [171]:
final_df_sample.to_csv('sample japnese.csv') # save to csv

In [155]:
len(final_df)
final_df.head()

Unnamed: 0,Accepts Apple Pay,Accepts Bitcoin,Accepts Credit Cards,Accepts Google Pay,Alcohol,Ambience,Attire,Best Nights,Bike Parking,Caters,...,Takes Reservations,Waiter Service,Wheelchair Accessible,Wi-Fi,price_range,restaurant_name,restaurant_rating,restaurant_reviewcount,restaurant_zipcode,retaurant_address
0,No,No,Yes,No,Beer & Wine Only,"Trendy, Intimate",Casual,,No,No,...,No,Yes,No,No,$11-30,Benemon,4.5,432,10003,108 E 4th St
1,,,Yes,,,,Casual,,,,...,Yes,Yes,,,,Pado,5.0,14,10003,199 2nd Ave
2,No,,No,No,Beer & Wine Only,Casual,Casual,,Yes,No,...,No,Yes,No,No,$11-30,Raku,4.5,776,10003,342 E 6th St
3,No,,Yes,,Beer & Wine Only,Casual,Casual,,No,No,...,Yes,Yes,No,No,$11-30,Izakaya,4.0,230,10003,326 E 6th St
4,,,Yes,,Beer & Wine Only,Casual,Casual,,Yes,,...,No,Yes,,No,$11-30,Jintan,4.0,46,10009,10th St & 1st Ave


In [214]:
max_num = 5
urls_set = []
for i, loc in enumerate(searchLocations):
    # now run for loop with fix location and food type and append urls 
    # page ranking is based on relevance ranked by yelp
    for num in range(0,max_num):
        urls = get_urls_from_search("Chinese restaurants",loc, num)
        urls = urls[1:] # 0th link is irrelavant
        # len(urls)=0 if the starting page number exceed the maximum possible
        if (len(urls) ==0):
            break
        else:
            for i in range(0,len(urls)-1):
                urls_set.append(urls[i])
                
    time.sleep(5)
    #convert the urls from list to csv file for each location
    #pd_urls_set = pd.DataFrame(urls_set)
    #pd_urls_set.to_csv("urls_set_{0}.csv".format(loc))

In [215]:
len(urls_set)

1311

In [183]:
header=['restaurant_name', 'retaurant_address', 'restaurant_zipcode', 'restaurant_reviewcount', 'restaurant_rating', 'restaurant_neighobrhood', 'Hygiene_score', 'price_range', 'Liked by Vegetarians', 'Takes Reservations', 'Delivery', 'Take-out', 'Accepts Credit Cards', 'Accepts Bitcoin', 'Parking', 'Bike Parking', 'Wheelchair Accessible', 'Good for Kids', 'Good for Groups', 'Attire', 'Noise Level', 'Alcohol', 'Happy Hour', 'Outdoor Seating', 'Wi-Fi', 'Has TV', 'Dogs Allowed', 'Waiter Service', 'Caters', 'Category', 'Has Soy-free Options', 'Has Dairy-free Options', 'Liked by Vegans', 'Has Gluten-free Options', 'Good For', 'Ambience', 'Gender Neutral Restrooms']
info={}
for u in urls_set[:10]:
    url_dict=scrape(u)
    for i in header:
        if i in url_dict.keys():
            info.setdefault(i,[]).append(url_dict[i])
        else:
            info.setdefault(i,[]).append('NA')
     # Delays to help reduce queries and reduce the possibility of IP Ban
    time.sleep(2)

In [209]:
pd.DataFrame(info)

Unnamed: 0,Accepts Bitcoin,Accepts Credit Cards,Alcohol,Ambience,Attire,Bike Parking,Category,Caters,Delivery,Dogs Allowed,...,Waiter Service,Wheelchair Accessible,Wi-Fi,price_range,restaurant_name,restaurant_neighobrhood,restaurant_rating,restaurant_reviewcount,restaurant_zipcode,retaurant_address
0,No,Yes,No,,Casual,Yes,Chinese; Noodles;,No,Yes,No,...,No,No,Free,$11-30,Dian Kitchen,"East Village, Alphabet City",4.5,83,10009,435 E 9th St
1,,Yes,No,"Casual, Trendy",Casual,No,Chinese;,No,Yes,No,...,Yes,No,Free,$11-30,Clay Pot NYC,East Village,4.5,175,10003,58 St Mark's Pl
2,,Yes,Beer & Wine Only,Intimate,Casual,Yes,American (New); Chinese;,No,No,,...,Yes,,No,$31-60,Tuome,"East Village, Alphabet City",4.5,358,10009,536 E 5th St
3,No,Yes,,,,No,Tea Rooms; Dim Sum;,,,No,...,,Yes,Free,,Uluh,East Village,5.0,11,10003,152 2nd AveSte A
4,,Yes,Full Bar,Casual,Casual,Yes,Szechuan; Bars;,No,Yes,No,...,Yes,,No,$11-30,Han Dynasty,East Village,4.0,1067,10003,90 3rd Ave
5,No,Yes,Beer & Wine Only,Casual,Casual,Yes,Chinese; Noodles; Burgers;,No,No,No,...,No,No,No,Under $10,Xi’an Famous Foods,East Village,4.0,1532,10003,81 St Marks Pl
6,,Yes,No,Casual,,,Noodles; Chinese;,,No,,...,Yes,Yes,No,$31-60,Hunan Slurp,East Village,4.5,120,10009,112 1st Ave
7,,Yes,Beer & Wine Only,"Casual, Trendy",Casual,Yes,Chinese;,Yes,Yes,,...,Yes,Yes,No,$11-30,MáLà Project,East Village,4.0,473,10009,122 1st Ave
8,,No,No,Casual,Casual,Yes,Chinese;,No,No,,...,,,No,Under $10,Shu Jiao Fu Zhou Cuisine Restaurant,Lower East Side,4.5,580,10002,118 Eldridge St
9,No,Yes,No,,Casual,No,Chinese; Asian Fusion;,Yes,No,No,...,No,No,No,,The Dumpling Shop,East Village,4.5,52,10003,124 2nd Ave


In [None]:
print('Please enter a restaurant name below, if you have no idea about it, just click return')
name=input("restaurant name: ")

if name=='':
    #return the original csv
else:
    #return the selceted csv

In [None]:
print('Please enter a restaurant style below, if you have no idea about it, just click return')
style = input("style: ")

In [40]:
print('Are you going to eat Lunch, Dinner or Breakfast?, please input Lunch or Dinner, if you have no idea about it, just click return')
mealkind = input("Lunch, Dinner or Breakfast :")

Are you going to eat Lunch, Dinner or Breakfast?, please input Lunch or Dinner, if you have no idea about it, just click return
Lunch, Dinner or Breakfast :Lunch


In [41]:
print('Want to find a place to drink alcohol? Please enter: Full Bar, Beer&Wine Only or No below')
alcohol = input("alcohol:")

Want to find a place to drink alcohol? Please enter: Full Bar, Beer&Wine Only or No below
alcohol:Full Bar


In [210]:
print("enter your current location: ")
zipcode = input("location:")

enter your current location: 
location:Upper West Side


In [None]:
# We want to output the restaurants with highest Yelp ratings and good hygiene for the users