## A Restaurant Recommendation System based on Yelp Information ##

In [1]:
import re
from threading import Thread
import pandas as pd

**The following cell contains the code (specifically, the function definitions) for the recommendation engine.**
**Note: The functions in the following cell are designed to be robust to invalid input.
(The program should not crash on nonsensical input. Instead, it keeps asking until a valid value is given.)**

In [2]:
# A function filtering out the rows of the input dataframe df whose column values of the restaurant_name's column
# match the user's input. 
# i.e A refined dataframe will be returned based on user's input about restaurant names.
def ask_for_name(df):
    
    # create a list to store the restaurant names that have match with user's input
    nl = []
    
    while True:
        
        # Ask for an input and store as a string
        print('Please enter a restaurant name below, if you have no idea about it, just click return\n')
        name=input("Restaurant name: \n")
        
        # If the name is empty, return the input dataframe as a whole. Nothing changed.
        if name == '':
            return df
        
        # iterate over all restaurant names 
        for item in df['restaurant_name'].tolist():
            # We only consider the names that are strings, ignore Nan
            if isinstance(item, str):
                # Add the names having match with user's input into the list
                if re.search(name.lower(), item.lower()):
                    nl.append(item)
        
        # If the list is empty at last (nothing is obtained for the input), 
        # ask user for the new input and run the whole thing again
        if len(nl) == 0:
            print('Please enter a valid name.\n')
            continue
        # Otherwise (some restaurants are filtered out based on the user's input), jump out of the loop
        else:
            break
    
    # Return the rows of the input data frame filtered out based on the user's input
    return df.loc[df['restaurant_name'].isin(nl)]

# A function filtering out the rows of the input dataframe df whose column values of the Source column
# match the user's input. 
# i.e A refined dataframe will be returned based on user's input about restaurant source(Chinese, Japanese).
# Same idea as the function ask_for_name
def ask_for_source(df):
    
    cl = []
    while True:
        print('Please enter a restaurant source below, if you have no idea about it, just click return\n')
        cat = input("Restaurant source: \n")
        
        if cat == '':
            return df
        
        for item in df['Source'].tolist():
            if isinstance(item, str):
                if re.search(cat.lower(), item.lower()):
                    cl.append(item)
        
        if len(cl) == 0:
            print('Please enter a valid source.\n')
            continue
        else:
            break
    return df.loc[df['Source'].isin(cl)]

# A function filtering out the rows of the input dataframe df whose column values of the restaurant_category's column
# match the user's input. 
# i.e A refined dataframe will be returned based on user's input about restaurant category.
# Same idea as the function ask_for_name
def ask_for_category(df):
    
    cl = []
    while True:
        print('Please enter a restaurant category below, if you have no idea about it, just click return\n')
        cat = input("Restaurant category: \n")
        
        if cat == '':
            return df
        
        for item in df['Category'].tolist():
            if isinstance(item, str):
                if re.search(cat.lower(), item.lower()):
                    cl.append(item)
        
        if len(cl) == 0:
            print('Please enter a valid category.\n')
            continue
        else:
            break
    return df.loc[df['Category'].isin(cl)]

# A function filtering out the rows of the input dataframe df whose column values of the restaurant_neighborhood's column
# match the user's input. 
# i.e A refined dataframe will be returned based on user's input about restaurant neighborhood.
# Same idea as the function ask_for_name
def ask_for_neighbor(df):
    
    nl = []
    while True:
        print('Please enter a restaurant neighborhood below, if you have no idea about it, just click return\n')
        neighbor = input("Restaurant neighborhood: \n")
        
        if neighbor == '':
            return df
        
        for item in df['restaurant_neighborhood'].tolist():
            if isinstance(item, str):
                if re.search(neighbor.lower(), item.lower()):
                    nl.append(item)
                
        if len(nl) == 0:
            print("Please enter a valid neighborhood. \n")
            continue
        else:
            break
    
    return df.loc[df['restaurant_neighborhood'].isin(nl)]

# A function filtering out the rows of the input dataframe df whose column values of the restaurant_rating's column
# fall into the interval [lb, ub] where lb(lower bound) and ub(upper bound) are given by the user's input.
# i.e A refined dataframe will be returned based on user's input about restaurant ratings.
def ask_for_rating(df):
    
    
    while True:
        try:
            print("Please enter a range of your expected rating below, if you have no idea about it, just click return\n")
            lb = input("Please enter a lower bound of rating range you are looking for :\n")
            ub = input("Please enter an upper bound of rating range you are looking for :\n")
        
            if lb == '' and ub == '':
                return df
        
            if lb == '':
                lb = '0'
        
            if ub == '':
                ub = '5'
            
            if float(lb) > float(ub):
                print('Lower bound has to be smaller than the upper bound.\n')
                continue
        
            if float(lb) < 0 or float(ub) > 5:
                print('The range of rating has to be between 0 and 5.\n')
                continue
            
            return df.loc[(df['restaurant_rating'] >= float(lb)) & (df['restaurant_rating'] <= float(ub))]
        
        except:
        
            print('Please provide a valid input\n')
            continue

# A function filtering out the rows of the input dataframe df based on whether or not the restaurants
# are good for parties/group activities.
def ask_for_party(df):
    
    while True:
        print("Good for groups / parties?\n")
        PARTY = input("Please enter Y/N\n")
        
        if PARTY == 'Y':
            A = 'Yes'
        
        elif PARTY =='N':
            A='No'
            
        else: 
            continue
        
        return df.loc[df['Good for Groups']==A]
    
# A function filtering out the rows of the input dataframe df whose column values(intervals more precisely) 
# of the price_range column have intersection the interval [lb, ub] where lb(lower bound) and ub(upper bound) 
# are given by the user's input.
# i.e A refined dataframe will be returned based on user's input about restaurant ratings.
def ask_for_price(df):
    
    pl = []
    while True:
        try:
            print("Please enter a range of your expected price below, if you have no idea about it, just click return\n")
            lb = input("Please enter a lower bound of price range you are looking for :\n")
            ub = input("Please enter an upper bound of price range you are looking for :\n")
            if lb == '' and ub == '':
                return df
        
            for item in df['price_range'].tolist():
                if isinstance(item, str):
                    if re.search('^\$\d+-\d+$', item):
                        i1 = int(item[1:item.find('-')])
                        i2 = int(item[item.find('-')+1:])
                    
                        if float(lb) <= i1 or float(ub) >= i2:
                            pl.append(item)
                    elif 'Under' in item:
                    
                        if int(item[item.find('$')+1:]) >= float(lb):
                            pl.append(item)
                        
                    elif 'Above' in item:
                    
                        if int(item[item.find('$')+1:]) <= float(ub):
                            pl.append(item)
                        
            if len(pl) == 0:
                print("Unfortunately, there are no restaurants falling into your desired price range. Please try some other price range\n")
                continue
            else:
                
                return df.loc[df['price_range'].isin(pl)]
    
        except:
        
            print('Please provide a valid input\n')
            continue

**Below is a demo of how to use the recommendation engine:**
We import the whole dataframe (you will need the correct absolute path to the dataframe on your machine when you run the demo) and store it in the global variable global_df. The dataframe stored in global_df then goes through 7 rounds of filtering (based on name, source, category, neighborhood, rating, price range, and whether the restaurant is good for parties, respectively). After each round of filtering, users can see how many restaurants remain and decide whether or not to filter further. global_df is updated to the filtered dataframe, which is displayed at the end.

In [12]:
# rating, price range and good for party

# Note: The absolute path should be replaced by the correct absolute path 
# to the dataframe on your machine when you run the demo.
global_df = pd.read_csv('/Users/haodi_liu/Documents/GitHub/Are-you-hungry-team-13/final_df.csv', encoding='latin1')
done = False

while done == False:
    
    global_df = ask_for_name(global_df)
    length = len(global_df['restaurant_name'].tolist())
    print(f"There are {length} restaurants filtered out so far\n")
    con = input('Do you want to keep searching? Enter 0 if you are done with searching. Otherwise, enter anything else.\n')
    if con == '0':
        break
    
    global_df = ask_for_source(global_df)
    length = len(global_df['restaurant_name'].tolist())
    print(f"There are {length} restaurants filtered out so far\n")
    con = input('Do you want to keep searching? Enter 0 if you are done with searching. Otherwise, enter anything else.\n')
    if con == '0':
        break
        
    while True:
        global_df = ask_for_category(global_df)
        length = len(global_df['restaurant_name'].tolist())
        print(f"There are {length} restaurants filtered out so far\n")
        con = input('Do you want to keep filtering based on Category? Enter 0 if you are done with searching. Otherwise, enter anything else.\n')
        if con == '0':
            break
        
    length = len(global_df['restaurant_name'].tolist())
    print(f"There are {length} restaurants filtered out so far\n")
    con = input('Do you want to keep searching? Enter 0 if you are done with searching. Otherwise, enter anything else.\n')
    if con == '0':
        break
        
    while True:
        global_df = ask_for_neighbor(global_df)
        length = len(global_df['restaurant_name'].tolist())
        print(f"There are {length} restaurants filtered out so far\n")
        con = input('Do you want to keep filtering based on Neighborhood? Enter 0 if you are done with searching. Otherwise, enter anything else.\n')
        if con == '0':
            break
    
    length = len(global_df['restaurant_name'].tolist())
    print(f"There are {length} restaurants filtered out so far\n")
    con = input('Do you want to keep searching? Enter 0 if you are done with searching. Otherwise, enter anything else.\n')
    if con == '0':
        break
    
    global_df = ask_for_rating(global_df)
    length = len(global_df['restaurant_name'].tolist())
    print(f"There are {length} restaurants filtered out so far\n")
    con = input('Do you want to keep searching? Enter 0 if you are done with searching. Otherwise, enter anything else.\n')
    if con == '0':
        break
    
    global_df = ask_for_price(global_df)
    length = len(global_df['restaurant_name'].tolist())
    print(f"There are {length} restaurants filtered out so far\n")
    con = input('Do you want to keep searching? Enter 0 if you are done with searching. Otherwise, enter anything else.\n')
    if con == '0':
        break
        
    global_df = ask_for_party(global_df)
    length = len(global_df['restaurant_name'].tolist())
    print(f"There are {length} restaurants filtered out so far\n")
    con = input('Do you want to keep searching? Enter 0 if you are done with searching. Otherwise, enter anything else.\n')
    if con == '0':
        break
        
    done = True
    
a4 = global_df
global_df

Please enter a restaurant name below, if you have no idea about it, just click return

Restaurant name: 

There are 2500 restaurants filtered out so far

Do you want to keep searching? Enter 0 if you are done with searching. Otherwise, enter anything else.
a
Please enter a restaurant source below, if you have no idea about it, just click return

Restaurant source: 
American
There are 138 restaurants filtered out so far

Do you want to keep searching? Enter 0 if you are done with searching. Otherwise, enter anything else.
a
Please enter a restaurant category below, if you have no idea about it, just click return

Restaurant category: 

There are 138 restaurants filtered out so far

Do you want to keep filtering based on Category? Enter 0 if you are done with searching. Otherwise, enter anything else.
0
There are 138 restaurants filtered out so far

Do you want to keep searching? Enter 0 if you are done with searching. Otherwise, enter anything else.
a
Please enter a restaurant neighborh

Unnamed: 0.1,Unnamed: 0,Accepts Bitcoin,Accepts Credit Cards,Alcohol,Ambience,Attire,Bike Parking,Category,Caters,Delivery,...,Waiter Service,Wheelchair Accessible,Wi-Fi,price_range,restaurant_name,restaurant_neighborhood,restaurant_rating,restaurant_reviewcount,restaurant_zipcode,retaurant_address
864,116,No,Yes,Beer & Wine Only,"Casual, Intimate",Casual,Yes,American (New); Bars; Breakfast & Brunch;,Yes,Yes,...,,,Free,$11-30,Brindle Room,"East Village, Alphabet City",4.0,938.0,10009.0,277 E 10th St
865,117,No,Yes,Beer & Wine Only,"Casual, Trendy",Casual,No,Korean; American (New); Tapas/Small Plates;,No,No,...,Yes,No,No,$11-30,Thursday Kitchen,"East Village, Alphabet City",4.5,1049.0,10009.0,424 E 9th St
866,118,,Yes,Full Bar,,,,Bars; American (Traditional); Breakfast & Brun...,,No,...,Yes,Yes,No,$11-30,FERNS,East Village,4.5,48.0,10009.0,166 1st Ave
867,119,No,Yes,Full Bar,Intimate,Casual,Yes,American (New);,Yes,Yes,...,Yes,No,Free,$31-60,Virginiaâ??s,"Alphabet City, East Village",4.0,151.0,10009.0,647 E 11th St
868,120,,Yes,Full Bar,Classy,Casual,No,American (New);,Yes,No,...,Yes,Yes,Free,$31-60,Bowery Road,"Union Square, Greenwich Village, East Village",4.5,137.0,10003.0,132 4th Ave
870,122,,Yes,Full Bar,Casual,Casual,Yes,Gastropubs; American (Traditional);,No,Yes,...,,,Free,$11-30,Cooperâ??s Craft & Kitchen,East Village,4.0,401.0,10003.0,87 2nd Ave
872,124,No,Yes,Full Bar,,Casual,Yes,American (New); Italian; Breakfast & Brunch;,No,Yes,...,,,No,$31-60,Hearth,East Village,4.0,662.0,10009.0,403 E 12th St
873,125,,Yes,Full Bar,Casual,Casual,Yes,American (Traditional); Cocktail Bars;,No,No,...,Yes,Yes,Free,$11-30,Sister Jane NYC,East Village,4.5,36.0,10003.0,349 E 13th St
877,129,,Yes,Full Bar,Trendy,Casual,Yes,American (New);,No,No,...,Yes,Yes,No,$31-60,Narcissa,East Village,4.0,466.0,10003.0,25 Cooper Sq
878,130,,Yes,Beer & Wine Only,Casual,Casual,Yes,Ethiopian; Bars;,Yes,Yes,...,Yes,,No,$11-30,Haile Bistro,"East Village, Alphabet City",4.0,172.0,10009.0,182 Ave B


In [10]:
#this function output the recommendation when the amount of restaurant given above is smaller or equal to 5.
def get_out_put_1(d,f):
    if len(d.iloc[i,37])>5:
        er1=d.iloc[i,37]
        er2=er1[0:5]
    else:
        er2=d.iloc[i,37]
    print(f'''\033[1;31m ###Recommendation{i+1}###  \033[0m''')
    print(f'''restaurant name:                {d.iloc[i,33]}''')
    print(f'''restaurant address:             {d.iloc[0,-1]} New York, NY {er2}''')
    print(f'''price range:                    {d.iloc[i,32]}''') 
    print(f'''restaurant rating:              {d.iloc[i,35]}''')
    print(f'''restaurant hygiene rate:        {d.iloc[i,20]}''')
    print(f'''restaurant attire:              {d.iloc[i,5]} ''' )
    print(f'''restaurant parking:             {d.iloc[i,25]}''')
    print(f'''restaurant reservable:          {d.iloc[i,27]}''')
    print(f'''restaurant ambience:            {d.iloc[i,4]}''')

In [11]:
#this function output the recommendation when the amount of restaurant given above is larger than 5.
def get_out_put_2(d,c,g):
    if len(d.iloc[a[i],37])>5:
        er1=d.iloc[a[i],37]
        er2=er1[0:5]
    else:
        er2=d.iloc[a[i],37]
    print(f'''\033[1;31m ###Recommendation{i+1}###  \033[0m''')
    print(f'''restaurant name:                {d.iloc[a[i],33]}''')
    print(f'''restaurant address:             {d.iloc[a[i],-1]} New York, NY {er2} ''')
    print(f'''price range:                    {d.iloc[a[i],32]}''')
    print(f'''restaurant rating:              {d.iloc[a[i],35]}''')
    print(f'''restaurant hygiene rate:        {d.iloc[a[i],20]}''')
    print(f'''restaurant attire:              {d.iloc[a[i],5]}''' )
    print(f'''restaurant ambience:            {d.iloc[a[i],4]}''')
    print(f'''restaurant parking:             {d.iloc[a[i],25]}''')
    print(f'''restaurant reservable:          {d.iloc[a[i],27]}''')

Note: The following cell, which offers the top recommendations, works well in the vast majority of cases. However, there are a few extreme cases. Only numerical values can be compared when we use the sorted() function to sort the filtered restaurants and provide the top-rated restaurants (at most 5) as recommendations. In fact, a tiny amount of the data in the "restaurant_rating" column is not numerical because of possible perturbations during web scraping, which is quite normal. Thus, sorted() cannot be applied to the "restaurant_rating" column when non-numerical values coexist with numerical values. As a result, recommendations cannot be offered when we run into this rare issue. In that case, you can still refer to the raw pandas dataframe returned earlier, which remains reliable.

In [12]:
if a4.shape[0]<=5: #when the amount of the restaurant given above is smaller or equal to 5
    for i in range(a4.shape[0]): # run a loop to give the recommendation
        get_out_put_1(a4,i)
        print('\n')
elif a4.shape[0]>5: #when the amount of the restaurant given above is smaller or equal to 5
    a=[]
    list_1=[]
    for q in range(a4.shape[0]): # put the restaurant rating items in a list
        list_1.append(a4.iloc[q,34])
    list_1s=sorted(list_1, reverse=True)# sort the list from largest to smallest
    for i in range(len(list_1)):# run five loops to record the position of the rating
        if list_1s[0]==list_1[i]:# find the position of the largest rating and keep it in list a
            a.append(i)
            break
    list_1[a[0]]=-1
    for i in range(len(list_1)):# find the position of the second largest rating and keep it in list a
        if list_1s[1]==list_1[i]:
            a.append(i)
            break
    list_1[a[1]]=-1
    for i in range(len(list_1)):
        if list_1s[2]==list_1[i]:
            a.append(i)
            break
    list_1[a[2]]=-1
    for i in range(len(list_1)):
        if list_1s[3]==list_1[i]:
            a.append(i)
            break
    list_1[a[3]]=-1
    for i in range(len(list_1)):
        if list_1s[4]==list_1[i]:
            a.append(i)
            break
    list_1[a[4]]=-1
    for i in range(len(a)):# run a loop to give the recommendation
        get_out_put_2(a4,i,a)
        print('\n')
elif a4.shape[0]<=0:
    print("Please search again, this time we do not have a restaurant satisfy your need. ")


[1;31m ###Recommendation1###  [0m
restaurant name:                Noodle Village
restaurant address:             13 Mott St New York, NY 10013
price range:                    Under $10
restaurant rating:              4.0
restaurant hygiene rate:        A
restaurant attire:              Casual 
restaurant parking:             Street
restaurant reservable:          Yes
restaurant ambience:            Casual


[1;31m ###Recommendation2###  [0m
restaurant name:                Hop Lee Restaurant
restaurant address:             13 Mott St New York, NY 10013
price range:                    $11-30
restaurant rating:              4.0
restaurant hygiene rate:        B
restaurant attire:              Casual 
restaurant parking:             Street
restaurant reservable:          Yes
restaurant ambience:            Casual


[1;31m ###Recommendation3###  [0m
restaurant name:                88 Lan Zhou Handmade Noodles
restaurant address:             13 Mott St New York, NY 10013
price range:  