## A Restaurant Recommendation System based on Yelp Information ##

**Project Goal**: Create an NYC restaurant information database and use it to recommend restaurants that match users' preferences.

Users can input features that indicate their preferences, and our system outputs the top 5 restaurants from the database as our recommendations for the user.

**Steps**:
Here are the key features we scrape from the Yelp website:
1. Restaurant name
2. Restaurant rating (Yelp rating, 0-5 in 0.5 increments)
3. Hygiene (official scores: A, B, C)
4. Restaurant neighborhood (Morningside Heights, East Village, Chelsea, etc.)
5. Category (i.e. cuisine type: Chinese, Japanese, French, American, etc.)
6. Ambience (Quiet, Noisy)
7. Price range

We use "Yelp Restaurant Info Scraper.ipynb" to get search urls and scrape restaurant information for restaurants in different categories in all locations in NYC. We ran the script on our local machines.

After getting the CSV for each type of restaurant, we use "Dataframe Cleaner.ipynb" to clean and merge all the CSV files into a final dataframe. We use this final dataframe as our database.

In [1]:
from bs4 import BeautifulSoup
import re
from threading import Thread
import urllib
import pandas as pd
import urllib.request
import time
from random import randint

In [None]:
# Install a process-wide opener so that every later urllib.request.urlopen
# call sends this User-Agent header.
# IE 9 proved to be the most successful
opener = urllib.request.build_opener()
opener.addheaders = [
    (
        'User-agent',
        'IE 9/Windows: Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    ),
]
urllib.request.install_opener(opener)

In [None]:
# Function that will do the scraping job from yelp
def scrape(ur):
    """Download one Yelp business page and pull out its key attributes.

    Args:
        ur: URL of the business page.

    Returns:
        A dict of strings. Fixed keys (spelling kept as-is because the
        header lists elsewhere in the notebook rely on it) are
        'restaurant_name', 'retaurant_address', 'restaurant_zipcode',
        'restaurant_reviewcount', 'restaurant_rating',
        'restaurant_neighobrhood', 'Hygiene_score', 'price_range',
        'latitude', 'longitude' and 'Category'. Every <dt>/<dd> pair of the
        'short-def-list' block is added under its own <dt> text. A key is
        simply absent when the page lacks the element, except 'Category',
        which is always present (possibly '').
    """
    with urllib.request.urlopen(ur) as url:
        html = url.read()
    soup = BeautifulSoup(html, "lxml")

    # create a dictionary business info for storing key business features
    business_info = {}

    def store_text(key, *find_args, **find_kwargs):
        # Look the tag up once and store its stripped text if it exists.
        node = soup.find(*find_args, **find_kwargs)
        if node is not None:
            business_info[key] = str(node.text.strip())

    def store_content(key, **find_kwargs):
        # Same as store_text, but the value lives in the "content" attribute.
        node = soup.find(**find_kwargs)
        if node is not None:
            business_info[key] = node.get("content")

    # A page without an <h1> (e.g. an error or challenge page) used to raise
    # AttributeError here; now the name is just left out like any other field.
    store_text('restaurant_name', 'h1')
    store_text('retaurant_address', 'span', itemprop="streetAddress")
    store_text('restaurant_zipcode', 'span', itemprop="postalCode")
    store_text('restaurant_reviewcount', 'span', itemprop="reviewCount")
    store_content('restaurant_rating', itemprop="ratingValue")
    store_text('restaurant_neighobrhood', 'span', {'class': 'neighborhood-str-list'})
    store_text('Hygiene_score', 'dd', {'class': "nowrap health-score-description"})
    store_text('price_range', 'dd', {'class': "nowrap price-description"})

    # Free-form attribute list ("Takes Reservations: Yes", ...).
    attribute_list = soup.find('div', {'class': 'short-def-list'})
    if attribute_list is not None:
        for entry in attribute_list.findAll('dl'):
            term = entry.find('dt')
            definition = entry.find('dd')
            if term is None or definition is None:
                # Skip malformed entries instead of crashing the whole scrape.
                continue
            business_info[str(term.text.strip())] = str(definition.text.strip())

    store_content('latitude', property="place:location:latitude")
    store_content('longitude', property="place:location:longitude")

    # Categories are joined as "A; B; " (each name followed by "; ").
    category_list = soup.find('span', {'class': 'category-str-list'})
    category_links = category_list.findAll('a') if category_list is not None else []
    business_info['Category'] = ''.join(
        str(link.text.strip()) + '; ' for link in category_links
    )

    return business_info

In [None]:
# Example for the scrape function: fetch a single business page and print the
# attribute dict that scrape() returns for it.
d = scrape('https://www.yelp.com/biz/blue-ribbon-sushi-new-york?osq=blue+ribbon')
print(d)

In [None]:
# List of yelp urls to scrape
def get_urls_from_search(term, location, num):
    """Return the business-page URLs listed on one Yelp search-result page.

    Args:
        term: Search text, e.g. "Japanese restaurants".
        location: Location text, e.g. "East_Village".
        num: Zero-based page index; the request uses start = num * 10.

    Returns:
        A list of absolute URLs, one per anchor with the CSS class
        'biz-name js-analytics-click', in page order. Empty when the page
        has no such anchors.
    """
    # Local import keeps the cell self-contained; the file only imports
    # urllib and urllib.request at the top.
    from urllib.parse import urlencode

    # urlencode turns spaces into '+' (as before) and additionally escapes
    # characters such as "'", '&' and '#', which previously went into the
    # query string verbatim and could corrupt it.
    query = 'https://www.yelp.com/search?' + urlencode({
        'find_desc': term,
        'find_loc': location,
        'start': num * 10,
    })
    with urllib.request.urlopen(query) as url:
        contents = url.read()
    soup = BeautifulSoup(contents, "html.parser")

    return [
        "http://www.yelp.com" + result['href']
        for result in soup.findAll('a', {'class': 'biz-name js-analytics-click'})
    ]


In [None]:
# List of all locations
# 'Alphabet_City','Battery_Park','Chelsea','Chinatown','Civic_Center','East_Harlem','East_Village','Financial_District','Flatiron','Gramercy','Greenwich_Village','Harlem','Hell\'s_Kitchen','Inwood','Kips_Bay','Koreatown','Little_Italy','Lower_East_Side','Manhattan_Valley','Marble_Hill','Meatpacking_District','Midtown_East','Midtown_West','Morningside_Heights','Murray_Hill','NoHo','Nolita','Roosevelt_Island','SoHo','South_Street_Seaport','South_Village','Stuyvesant_Town','Theater_District','TriBeCa','Two_Bridges','Union_Square','Upper_East_Side','Upper_West_Side','Washington_Heights','West_Village', 'Yorkville'

# Yorkville entirely encompassed by Upper East Side
# Theater District entirely encompassed by Midtown West

# Neighborhoods that are actually searched (Yorkville and Theater_District are
# left out because they are covered by the neighborhoods noted above).
searchLocations = [
    'Alphabet_City',
    'Battery_Park',
    'Chelsea',
    'Chinatown',
    'Civic_Center',
    'East_Harlem',
    'East_Village',
    'Financial_District',
    'Flatiron',
    'Gramercy',
    'Greenwich_Village',
    'Harlem',
    'Hell\'s_Kitchen',
    'Inwood',
    'Kips_Bay',
    'Koreatown',
    'Little_Italy',
    'Lower_East_Side',
    'Manhattan_Valley',
    'Marble_Hill',
    'Meatpacking_District',
    'Midtown_East',
    'Midtown_West',
    'Morningside_Heights',
    'Murray_Hill',
    'NoHo',
    'Nolita',
    'Roosevelt_Island',
    'SoHo',
    'South_Street_Seaport',
    'South_Village',
    'Stuyvesant_Town',
    'TriBeCa',
    'Two_Bridges',
    'Union_Square',
    'Upper_East_Side',
    'Upper_West_Side',
    'Washington_Heights',
    'West_Village',
]

#test
#searchLocations = ['East_Village', 'Upper_West_Side', 'Chelsea'] 

In [None]:
# Collect business-page URLs for "Japanese restaurants" in every neighborhood.
max_num = 5  # maximum number of result pages requested per neighborhood
urls_set = []
for loc in searchLocations:
    # now run for loop with fix location and food type and append urls
    # page ranking is based on relevance ranked by yelp
    for num in range(max_num):
        urls = get_urls_from_search("Japanese restaurants", loc, num)
        urls = urls[1:]  # 0th link is irrelevant
        # len(urls)=0 if the starting page number exceed the maximum possible
        if not urls:
            break
        # The original index loop reused `i` (clobbering the outer loop
        # index) and ran to len(urls)-1, i.e. it also skipped the last link.
        # NOTE(review): the skipped last link is preserved here; confirm it
        # is intentional and not an off-by-one.
        urls_set.extend(urls[:-1])

    # Delays to help reduce queries and reduce the possibility of IP Ban
    time.sleep(5)
    #convert the urls from list to csv file for each location
    #pd_urls_set = pd.DataFrame(urls_set)
    #pd_urls_set.to_csv("urls_set_{0}.csv".format(loc))

In [None]:
# Print how many business URLs are in urls_set, then display the list itself.
print(len(urls_set))
urls_set

In [None]:
# Scrape every collected page and pivot the per-restaurant dicts into
# column lists (`info`), keeping every key that appears on any page.
# Pages expose different attribute sets, so appending only the keys a page has
# (as the first draft did) produced columns of different lengths whose rows no
# longer lined up. Each column is now padded with 'NA' for pages lacking it.
scraped_rows = []
for u in urls_set:
    url_dict = scrape(u)
    scraped_rows.append(url_dict)
    # Delays to help reduce queries and reduce the possibility of IP Ban
    time.sleep(5)

# Union of all keys, in first-seen order.
all_keys = list(dict.fromkeys(key for row in scraped_rows for key in row))
info = {key: [row.get(key, 'NA') for row in scraped_rows] for key in all_keys}

In [None]:
#header

In [None]:
# Columns kept for this sample; any column a page does not provide is
# filled with 'NA' so all column lists stay the same length.
header=['restaurant_name', 'retaurant_address', 'restaurant_zipcode', 'restaurant_reviewcount', 'restaurant_rating', 'Hygiene_score', 'price_range', 'Takes Reservations', 'Delivery', 'Take-out', 'Accepts Credit Cards', 'Accepts Apple Pay', 'Accepts Google Pay', 'Accepts Bitcoin', 'Good For', 'Parking', 'Bike Parking', 'Wheelchair Accessible', 'Good for Kids', 'Good for Groups', 'Attire', 'Ambience', 'Noise Level', 'Alcohol', 'Happy Hour', 'Outdoor Seating', 'Wi-Fi', 'Has TV', 'Dogs Allowed', 'Waiter Service', 'Caters', 'Gender Neutral Restrooms', 'Has Dairy-free Options', 'Liked by Vegetarians', 'Liked by Vegans', 'Good For Dancing', 'Best Nights', 'Good for Working', 'Has Pool Table', 'Open to All', 'Coat Check', 'Has Soy-free Options', 'Has Gluten-free Options', 'Smoking', 'Has Kosher Options', 'Offers Military Discount']
# NOTE(review): unlike the header used for the Chinese sample, this list has
# no 'restaurant_neighobrhood' or 'Category' column; confirm that is intended.
info = {}
for u in urls_set:
    url_dict = scrape(u)
    for i in header:
        info.setdefault(i, []).append(url_dict.get(i, 'NA'))
    # Pause between pages to lower the request rate and the risk of an IP ban.
    time.sleep(5)

In [None]:
# Build a DataFrame from the column-oriented `info` dict.
final_df_sample=pd.DataFrame(info)
#final_df_sample

In [None]:
# NOTE(review): the file name is spelled 'japnese'; check whether anything
# else reads this file before renaming it.
final_df_sample.to_csv('sample japnese.csv') # save to csv

In [None]:
# Sanity-check the scraped sample: row count, then the first rows.
# The original cell referenced `final_df`, which is never defined above; the
# frame built from `info` is `final_df_sample`. The length is printed because
# a notebook only displays the last expression of a cell.
print(len(final_df_sample))
final_df_sample.head()

In [None]:
# Collect business-page URLs for "Chinese restaurants" in every neighborhood.
max_num = 5  # maximum number of result pages requested per neighborhood
urls_set = []
for loc in searchLocations:
    # now run for loop with fix location and food type and append urls
    # page ranking is based on relevance ranked by yelp
    for num in range(max_num):
        urls = get_urls_from_search("Chinese restaurants", loc, num)
        urls = urls[1:]  # 0th link is irrelevant
        # len(urls)=0 if the starting page number exceed the maximum possible
        if not urls:
            break
        # The original index loop reused `i` (clobbering the outer loop
        # index) and ran to len(urls)-1, i.e. it also skipped the last link.
        # NOTE(review): the skipped last link is preserved here; confirm it
        # is intentional and not an off-by-one.
        urls_set.extend(urls[:-1])

    # Pause between neighborhoods to lower the request rate.
    time.sleep(5)
    #convert the urls from list to csv file for each location
    #pd_urls_set = pd.DataFrame(urls_set)
    #pd_urls_set.to_csv("urls_set_{0}.csv".format(loc))

In [None]:
# Display how many URLs urls_set currently holds.
len(urls_set)

In [None]:
# Columns kept for this sample; any column a page does not provide is
# filled with 'NA' so all column lists stay the same length.
header=['restaurant_name', 'retaurant_address', 'restaurant_zipcode', 'restaurant_reviewcount', 'restaurant_rating', 'restaurant_neighobrhood', 'Hygiene_score', 'price_range', 'Liked by Vegetarians', 'Takes Reservations', 'Delivery', 'Take-out', 'Accepts Credit Cards', 'Accepts Bitcoin', 'Parking', 'Bike Parking', 'Wheelchair Accessible', 'Good for Kids', 'Good for Groups', 'Attire', 'Noise Level', 'Alcohol', 'Happy Hour', 'Outdoor Seating', 'Wi-Fi', 'Has TV', 'Dogs Allowed', 'Waiter Service', 'Caters', 'Category', 'Has Soy-free Options', 'Has Dairy-free Options', 'Liked by Vegans', 'Has Gluten-free Options', 'Good For', 'Ambience', 'Gender Neutral Restrooms']
info = {}
# Only the first 30 collected URLs are scraped in this cell.
for u in urls_set[:30]:
    url_dict = scrape(u)
    for i in header:
        info.setdefault(i, []).append(url_dict.get(i, 'NA'))
    # Pause between pages to lower the request rate and the risk of an IP ban.
    time.sleep(2)

In [None]:
# Build a DataFrame from the column-oriented `info` dict and write it to a
# CSV file in the current working directory.
df = pd.DataFrame(info)
df.to_csv('sample chinese.csv') 

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Draft of the restaurant-name prompt. Both branches are still placeholders;
# the `pass` statements make the cell valid Python (an if/else whose bodies
# contain only comments is a syntax error).
print('Please enter a restaurant name below, if you have no idea about it, just click return')
name=input("restaurant name: ")

if name=='':
    # TODO: return the original csv
    pass
else:
    # TODO: return the selected csv
    pass

In [None]:
# Ask for a restaurant style; an empty answer is allowed per the prompt.
print('Please enter a restaurant style below, if you have no idea about it, just click return')
style = input("style: ")

In [None]:
# Ask which meal the user is planning. The original instruction listed only
# "Lunch or Dinner" although the question and the input label offer three
# options; the instruction now names all three.
print('Are you going to eat Lunch, Dinner or Breakfast? Please input Lunch, Dinner or Breakfast, if you have no idea about it, just click return')
mealkind = input("Lunch, Dinner or Breakfast :")

In [None]:
# Ask for the alcohol preference. The option text is spelled the way the
# value appears in the scraped data ("Beer & Wine Only"); the original prompt
# advertised "Beer&Wine Only", which does not match any stored value.
print('Want to find a place to drink alcohol? Please enter: Full Bar, Beer & Wine Only or No below')
alcohol = input("alcohol:")

In [None]:
# Ask for the user's current location; the answer is stored in `zipcode`.
print("enter your current location: ")
zipcode = input("location:")

In [None]:
# We want to output the restaurants with highest Yelp ratings and good hygiene for the users

In [2]:
# Load final_df.csv from an absolute, machine-specific path (latin1-encoded).
hehe = pd.read_csv('/Users/haodi_liu/Desktop/final_df.csv', encoding='latin1')


# Display the loaded frame.
hehe


Unnamed: 0.1,Unnamed: 0,Accepts Bitcoin,Accepts Credit Cards,Alcohol,Ambience,Attire,Bike Parking,Category,Caters,Delivery,...,Wheelchair Accessible,Wi-Fi,price_range,restaurant_name,restaurant_neighobrhood,restaurant_rating,restaurant_reviewcount,restaurant_zipcode,retaurant_address,Source
0,0,,No,Beer & Wine Only,Casual,Casual,No,Shanghainese; Seafood; Venues & Event Spaces;,No,No,...,,No,$11-30,Joe?s Shanghai,"Chinatown, Civic Center",4.0,5508.0,10013,9 Pell St,Chinese
1,1,,Yes,No,Casual,Casual,Yes,Shanghainese;,No,Yes,...,,No,$11-30,Shanghai 21,"Chinatown, Civic Center",4.0,1437.0,10013,21 Mott St,Chinese
2,2,,Yes,No,Casual,Casual,No,Chinese; Noodles;,No,Yes,...,,No,Under $10,Noodle Village,"Chinatown, Civic Center",4.0,877.0,10013,13 Mott St,Chinese
3,3,,Yes,Full Bar,,,Yes,Dim Sum; Seafood; Sandwiches;,Yes,Yes,...,Yes,Free,,Brooklyn Chop House,"Civic Center, Financial District",4.0,78.0,10038,150 Nassau St,Chinese
4,4,,Yes,Beer & Wine Only,Casual,Casual,Yes,Chinese; Seafood; Noodles;,No,Yes,...,,No,$11-30,Hop Lee Restaurant,"Chinatown, Civic Center",4.0,256.0,10013,16 Mott St,Chinese
5,5,,No,Beer & Wine Only,Casual,Casual,Yes,Seafood; Cantonese;,No,No,...,,No,$11-30,Hop Kee,"Chinatown, Civic Center",3.5,741.0,10013,21 Mott St,Chinese
6,6,,No,No,Casual,Casual,Yes,Chinese; Noodles;,No,Yes,...,,No,Under $10,Tasty Hand-Pulled Noodles,"Chinatown, Civic Center",4.0,1450.0,10013,1 Doyers St,Chinese
7,7,,No,Beer & Wine Only,Casual,Casual,Yes,Chinese;,No,No,...,No,No,$11-30,Wo Hop,"Chinatown, Civic Center",3.5,1259.0,10013,17 Mott St,Chinese
8,8,No,Yes,No,Casual,Casual,No,Chinese; Kosher; Vegan;,Yes,Yes,...,Yes,No,$11-30,Buddha Bodai Kosher Vegetarian Restaurant,"Chinatown, Civic Center",4.0,881.0,10013,5 Mott St,Chinese
9,9,,No,No,Casual,Casual,Yes,Chinese;,No,No,...,,No,Under $10,Shu Jiao Fu Zhou Cuisine Restaurant,Lower East Side,4.5,580.0,10002,118 Eldridge St,Chinese


In [5]:
'''
def ask_for_name(df):
    
    while True:
        print('Please enter a restaurant name below, if you have no idea about it, just click return')
        name=input("Restaurant name: ")
        #df = pd.read_csv(ab_path, encoding = 'gbk')
    
    
        if name == '':
            return df
        elif name not in df['restaurant_name'].tolist():
            print('Please enter a valid name.')
            continue
        else:
            
            return df.loc[df['restaurant_name'].isin([name])] 
'''

def ask_for_name(df):
    """Interactively narrow *df* to restaurants whose name contains the input.

    The user is prompted until they either press return (no filtering) or
    type text that occurs, case-insensitively, in at least one value of the
    'restaurant_name' column.

    Args:
        df: DataFrame with a 'restaurant_name' column.

    Returns:
        *df* itself when the user enters nothing, otherwise the rows of *df*
        whose name contains the entered text, in their original order.
    """
    names = df['restaurant_name'].tolist()
    while True:
        print('Please enter a restaurant name below, if you have no idea about it, just click return')
        name = input("Restaurant name: ")

        if name == '':
            return df

        # Plain substring test instead of re.search: user text such as "("
        # or "C++" is not a valid regex and used to raise re.error. The
        # isinstance guard skips missing (NaN) names, which have no .lower().
        needle = name.lower()
        hits = [isinstance(item, str) and needle in item.lower() for item in names]

        if any(hits):
            return df.loc[hits]
        print('Please enter a valid name.')

def ask_for_neighbor(df):
    """Interactively narrow *df* to one or more neighborhoods.

    A 'restaurant_neighobrhood' cell may hold several comma-separated names
    (e.g. "Chinatown, Civic Center"). An entry is accepted when it equals,
    ignoring case and surrounding spaces, either the whole cell or one of its
    comma-separated parts. The original exact-match test rejected
    "Chinatown" for such cells.

    Args:
        df: DataFrame with a 'restaurant_neighobrhood' column.

    Returns:
        *df* itself when the user presses return before selecting anything;
        otherwise the rows matching any selected neighborhood, in their
        original order.
    """
    def names_in(cell):
        # Lower-cased names a cell stands for; missing values match nothing.
        if not isinstance(cell, str):
            return set()
        names = {part.strip().lower() for part in cell.split(',')}
        names.add(cell.strip().lower())
        return names

    cell_names = [names_in(cell) for cell in df['restaurant_neighobrhood'].tolist()]
    selected = [False] * len(cell_names)
    nl = []
    while True:
        print('Please enter a restaurant neighborhood below, if you have no idea about it, just click return')
        neighbor = input("Restaurant neighborhood: ")

        if neighbor == '':
            if not nl:
                return df
            # Earlier selections used to be thrown away here (the whole frame
            # was returned); now a blank entry just ends the selection.
            break

        wanted = neighbor.strip().lower()
        hits = [bool(wanted) and wanted in names for names in cell_names]
        if not any(hits):
            print('Please enter a valid neighbor name.')
            continue

        nl.append(neighbor)
        selected = [old or new for old, new in zip(selected, hits)]
        if_con = input("Do you want to keep adding? Enter 1 if you do. ")
        if if_con != '1':
            break
    print(nl)
    return df.loc[selected]

def ask_for_rating(df):
    """Interactively narrow *df* to a rating range.

    Both bounds are read on every round. Leaving both empty skips the
    filter; leaving one empty defaults it to 0 (lower) or 5 (upper). The
    user is asked again when a bound is not a number, when the lower bound
    exceeds the upper bound, or when the range leaves [0, 5].

    Args:
        df: DataFrame with a numeric 'restaurant_rating' column.

    Returns:
        *df* itself when both bounds are left empty, otherwise the rows whose
        rating lies within the inclusive range.
    """
    while True:
        print("Please enter a range of your expected rating below, if you have no idea about it, just click return")
        lb = input("Please enter a lower bound of rating range you are looking for :")
        ub = input("Please enter an upper bound of rating range you are looking for :")

        if lb == '' and ub == '':
            return df

        try:
            low = float(lb) if lb != '' else 0.0
            high = float(ub) if ub != '' else 5.0
            # float() accepts "nan"; treat it like any other non-number.
            if low != low or high != high:
                raise ValueError('not a number')
        except ValueError:
            # Non-numeric text used to abort the whole session with ValueError.
            print('Please enter numbers for the rating bounds.')
            continue

        if low > high:
            print('Lower bound has to be smaller than the upper bound.')
            continue

        if low < 0 or high > 5:
            print('The range of rating has to be between 0 and 5.')
            continue

        ratings = df['restaurant_rating']
        return df.loc[(ratings >= low) & (ratings <= high)]
        

In [None]:
# Try the name prompt on a freshly loaded copy of final_df.csv.
ask_for_name(pd.read_csv('/Users/haodi_liu/Desktop/final_df.csv', encoding='latin1'))

In [None]:
# Interactive search: apply the name, neighborhood and rating prompts in turn,
# letting the user stop after any step by entering 0.
global_df = pd.read_csv('/Users/haodi_liu/Desktop/final_df.csv', encoding='latin1')

# `done` ends up True only when all three steps were run without the user
# stopping early. The original `while done == False` loop could never repeat
# (every path either broke out or set done = True), so one pass over the
# steps is equivalent.
done = False
for apply_filter in (ask_for_name, ask_for_neighbor, ask_for_rating):
    global_df = apply_filter(global_df)
    length = len(global_df['restaurant_name'].tolist())
    # The count is the number of rows still in the result; the original
    # wording ("filtered out") suggested the opposite.
    print(f"There are {length} restaurants matching your search so far")
    # Trailing ": " keeps the typed answer from running into the prompt text.
    con = input('Do you want to keep searching? Enter 0 if you are done with searching: ')
    if con == '0':
        break
else:
    done = True

global_df

Please enter a restaurant name below, if you have no idea about it, just click return
Restaurant name: 
There are 1042 restaurants filtered out so far
Do you want to keep searching? Enter 0 if you are done with searchingasasa
Please enter a restaurant neighborhood below, if you have no idea about it, just click return
Restaurant neighborhood: Lower East Side


In [None]:
import pandas as pd

In [None]:
# Load chinese.csv from an absolute, machine-specific path (gbk-encoded) and
# display it.
hehe = pd.read_csv('/Users/houfei/Desktop/pro/chinese.csv', encoding='gbk')
hehe

In [None]:
# Shared state for the ask_for_* prompts below: they branch on this value,
# ask_for_name sets it to 2 on an exact name hit and ask_for_Category sets
# it to 3 after a category filter.
flag=1

In [None]:
# Working frame that the ask_for_* prompts below overwrite with their filtered
# result; it starts as the full table.
b=hehe

In [None]:
def ask_for_name(a):
    """Prompt for an exact restaurant name and return the matching rows.

    The user is asked again until they press return or type a name that
    equals a value of the 'restaurant_name' column exactly.

    Args:
        a: DataFrame with a 'restaurant_name' column.

    Returns:
        *a* itself when the user enters nothing, otherwise the rows whose
        name equals the input.

    Side effects:
        Sets the module-level `flag` to 2 when a name was matched.
    """
    global flag
    # Bound once up front; the original rebound `df` and rebuilt the name
    # list on every pass and carried unreachable `break`s after each return.
    df = a
    known_names = df['restaurant_name'].tolist()
    while True:
        print('Please enter a restaurant name below, if you have no idea about it, just click return')
        name = input("restaurant name: ")

        if name == '':
            return df
        if name not in known_names:
            print('Please enter a valid name.')
            continue

        flag = 2
        return df.loc[df['restaurant_name'].isin([name])]

In [None]:
# Run the name prompt on the full table.
ask_for_name(hehe)#

In [None]:
def ask_for_Category(a):
    """Prompt for a cuisine category and return the matching rows.

    Only runs when the module-level `flag` is 1. When `flag` is 2 a message is
    printed instead, and for any other value nothing happens; both cases
    return None.

    'Category' cells look like "Chinese; Noodles;", so the entry is matched
    as a case-insensitive substring. The original exact comparison rejected
    "Chinese" even though the prompt suggests typing exactly that; typing a
    full cell value still works.

    Args:
        a: DataFrame with a 'Category' column.

    Returns:
        *a* itself when the user enters nothing, the filtered rows when a
        category matched, or None when the prompt was skipped.

    Side effects:
        On a match sets `flag` to 3 and stores the filtered rows in the
        module-level `b`.
    """
    global flag
    global b
    if flag == 2:
        print('You\' ve got what you want')
        return None
    if flag != 1:
        return None

    print("Let's keep going.")
    df = a
    categories = df['Category'].tolist()
    while True:
        print('Please enter a restaurant category below, like Chinese, American etc')
        print('if you have no idea about it, just click return')
        category = input("restaurant category: ")

        if category == '':
            return df

        needle = category.strip().lower()
        # Missing (non-string) categories never match; blank input is invalid.
        hits = [
            bool(needle) and isinstance(value, str) and needle in value.lower()
            for value in categories
        ]
        if not any(hits):
            print('Please enter a valid category.')
            continue

        flag = 3
        # Filter once and reuse the result for both `b` and the return value.
        b = df.loc[hits]
        return b

In [None]:
# Run the category prompt on the full table.
ask_for_Category(hehe)

In [None]:
def ask_for_Acohol(a):
    """Prompt for an alcohol option and narrow the working frame `b`.

    Only runs when the module-level `flag` is 3. When `flag` is 2 a message is
    printed instead, and for any other value nothing happens; both cases
    return None.
    NOTE(review): with flag == 1 (category step skipped) this prompt is never
    shown; confirm that is intended.

    The entry is compared with the 'Alcohol' values ignoring case and
    whitespace, so both "Beer & Wine Only" (as stored in the data) and
    "Beer&Wine Only" (as the original prompt spelled it) are accepted.

    Args:
        a: Unused; the function reads and updates the module-level `b`.

    Returns:
        `b` unchanged when the user enters nothing, the filtered rows when an
        option matched, or None when the prompt was skipped.
    """
    global flag
    global b
    if flag == 2:
        print("Thank you for working with us")
        return None
    if flag != 3:
        return None

    def normalise(text):
        # Drop all whitespace and case so spelling variants compare equal.
        return ''.join(text.split()).lower()

    while True:
        print('Want to find a place to drink alcohol? Please enter: Full Bar, Beer & Wine Only or No below')
        # The hint is now shown before the input is read, not after it.
        print('if you have no idea about it, just click return')
        alcohol = input("alcohol:")
        df = b

        if alcohol == '':
            return b

        wanted = normalise(alcohol)
        hits = [
            isinstance(value, str) and normalise(value) == wanted
            for value in df['Alcohol'].tolist()
        ]
        if not any(hits):
            # The original message said "category" here.
            print('Please enter a valid alcohol option.')
            continue

        flag = 3
        b = df.loc[hits]
        return b

In [None]:
# Run the alcohol prompt on the current working frame.
ask_for_Acohol(b)

In [None]:
# Load chinese.csv (gbk-encoded, absolute machine-specific path) and start the
# working frame `b` as the full table.
import pandas as pd
hehe = pd.read_csv('/Users/houfei/Desktop/pro/chinese.csv', encoding='gbk')
b=hehe
def ask_for_name(a):
    """Prompt for part of a restaurant name and return the matching rows.

    The entry is matched as a case-insensitive substring of the
    'restaurant_name' column of *a*. The user is asked again when nothing
    matches.

    Changes from the first draft:
      * rows are selected with one boolean mask, so a name that occurs on
        several rows no longer yields duplicated rows;
      * the entry is treated as plain text, not as a regular expression, so
        input such as "(" cannot raise re.error;
      * an entry without any match now re-prompts (the original validation
        branch was unreachable and an empty frame was returned instead);
      * the frame passed in as *a* is searched rather than the global table.

    Args:
        a: DataFrame with a 'restaurant_name' column.

    Returns:
        None when the user enters nothing, otherwise the matching rows in
        their original order.

    Side effects:
        Resets the module-level `flag` to 1 on entry. On a match sets `flag`
        to 2 and stores the result in the module-level `a1` and `b`.
    """
    global flag
    global b
    global a1
    flag = 1
    names = a['restaurant_name'].tolist()
    while True:
        print('Please enter a restaurant name below, if you have no idea about it, just click return')
        name = input("restaurant name: ")

        if name == '':
            print("Let's do next step")
            return None

        needle = name.lower()
        # Missing (non-string) names never match.
        hits = [isinstance(item, str) and needle in item.lower() for item in names]
        if not any(hits):
            print('Please enter a valid name.')
            continue

        flag = 2
        print("you've got what you want")
        a1 = a.loc[hits]
        b = a1
        return a1

# Run the name prompt on the full table.
ask_for_name(hehe)  # example input: Wo Hop


def ask_for_Category(a):
    """Prompt for a cuisine category and return the matching rows.

    Only runs when the module-level `flag` is 1; otherwise returns None
    without prompting. The entry is matched as a case-insensitive substring
    of the 'Category' column of *a*.

    Changes from the first draft:
      * rows are selected with one boolean mask. Many rows share the same
        category text, and the original re-added all of them once per
        occurrence, duplicating rows;
      * the entry is plain text, not a regular expression;
      * missing (NaN) categories are skipped instead of crashing on .lower();
      * an entry without any match re-prompts (that branch was unreachable);
      * the frame passed in as *a* is searched rather than the global table.

    Args:
        a: DataFrame with a 'Category' column.

    Returns:
        None when the prompt is skipped or the user enters nothing, otherwise
        the matching rows in their original order.

    Side effects:
        On a match stores the result in the module-level `b`.
    """
    global flag
    global b
    if flag != 1:
        return None

    categories = a['Category'].tolist()
    while True:
        print('Please enter a restaurant category below, like Chinese, American etc')
        print('if you have no idea about it, just click return')
        category = input("restaurant category: ")

        if category == '':
            print("Let's do next step")
            return None

        needle = category.lower()
        hits = [isinstance(item, str) and needle in item.lower() for item in categories]
        if not any(hits):
            print('Please enter a valid category.')
            continue

        b = a.loc[hits]
        return b
            
# Run the category prompt on the full table.
ask_for_Category(hehe)  # example input: Chinese;

def ask_for_Acohol(a):
    """Ask whether the user wants alcohol and narrow the working frame `b`.

    Behaviour by module-level `flag`:
      * 2: return the module-level `a1` without prompting;
      * 1: prompt as described below;
      * anything else: return None.

    "No" keeps rows whose 'Alcohol' value is 'No' or unknown; "Yes" keeps
    every row whose value is not 'No' (unknown values are kept as well, as in
    the first draft). Unknown means a missing value or the text 'NA'.

    Changes from the first draft:
      * the "Yes" branch filters the working frame `b` like the "No" branch
        does; it used to restart from the full table and drop earlier filters;
      * rows are selected with one mask, so they are no longer duplicated;
      * missing (NaN) values no longer crash the regular-expression search;
      * the hint is printed before the input is read, dead code after
        `return` is gone, and any answer other than Yes/No (any case) gets an
        alcohol-specific error message.

    Args:
        a: Unused; the function reads and updates the module-level `b`.

    Returns:
        `b` unchanged when the user enters nothing, otherwise the filtered
        rows in their original order (or `a1` / None as listed above).
    """
    global flag
    global b
    if flag == 2:
        return a1
    if flag != 1:
        return None

    while True:
        print('Want to find a place to drink alcohol? Please enter: Yes or No below')
        print('if you have no idea about it, just click return')
        alcohol = input("alcohol:")
        df = b

        if alcohol == '':
            return b

        answer = alcohol.strip().lower()
        if answer not in ('yes', 'no'):
            print('Please enter Yes or No.')
            continue

        values = df['Alcohol'].tolist()
        serves_none = [isinstance(v, str) and v.strip() == 'No' for v in values]
        unknown = [not isinstance(v, str) or v.strip() == 'NA' for v in values]

        if answer == 'no':
            keep = [none or unk for none, unk in zip(serves_none, unknown)]
        else:
            flag = 3
            keep = [not none for none in serves_none]

        b = df.loc[keep]
        return b
        
# Run the alcohol prompt on the current working frame.
ask_for_Acohol(b)