In [1]:
import pandas as pd
import numpy as np
import random


In [2]:
dish_df = pd.read_csv("Dish.csv")
dish_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422039 entries, 0 to 422038
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              422039 non-null  int64  
 1   name            422039 non-null  object 
 2   description     0 non-null       float64
 3   menus_appeared  422039 non-null  int64  
 4   times_appeared  422039 non-null  int64  
 5   first_appeared  422039 non-null  int64  
 6   last_appeared   422039 non-null  int64  
 7   lowest_price    392939 non-null  float64
 8   highest_price   392939 non-null  float64
dtypes: float64(3), int64(5), object(1)
memory usage: 29.0+ MB


In [6]:
dishes_list = dish_df["name"].tolist()

In [8]:
dishes_list[10]

'Striped bass saute, meuniere'

In [16]:
data = []

In [17]:

for x in range(200):
    rand_dishes_length = random.randint(50, 75) 
    random_dish_list = random.choices(dishes_list, k = rand_dishes_length)
    data.append([x,' '.join(map(str, random_dish_list))])

In [18]:
resto_df = pd.DataFrame(data, columns = ['resto_id', 'dishes']) 

In [21]:
resto_df.to_csv("resto_df.csv")

In [201]:
resto_df = pd.read_csv("resto_df.csv")

In [202]:
resto_df.shape

(200, 3)

In [203]:
resto_df.head()

Unnamed: 0.1,Unnamed: 0,resto_id,dishes
0,0,0,Mirabelle Leon Beyer Masthuhnbrust Singapur in...
1,1,1,Piel Bro's. Lager Scrambled Eggs and Onions FL...
2,2,2,"3 fried eggs and ham or bacon, bread With Pota..."
3,3,3,Braised free range chicken fresh polenta Beef ...
4,4,4,Spring duckling Cream or Country Gravy Oyster ...


## Normalizing the dishes

In [204]:
resto_df["dishes"] = resto_df["dishes"].str.lower()

In [205]:
resto_df.head()

Unnamed: 0.1,Unnamed: 0,resto_id,dishes
0,0,0,mirabelle leon beyer masthuhnbrust singapur in...
1,1,1,piel bro's. lager scrambled eggs and onions fl...
2,2,2,"3 fried eggs and ham or bacon, bread with pota..."
3,3,3,braised free range chicken fresh polenta beef ...
4,4,4,spring duckling cream or country gravy oyster ...


In [206]:
import nltk

In [207]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/apple/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/apple/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [208]:
resto_df["dishes"] = resto_df["dishes"].map(lambda x: nltk.word_tokenize(x))

In [209]:
porter = nltk.PorterStemmer()

In [210]:
lancaster = nltk.LancasterStemmer()

In [211]:
wnl = nltk.WordNetLemmatizer()

In [212]:
resto_df["dishes"] = resto_df["dishes"].map(lambda x: [porter.stem(t) for t in x])

In [213]:
resto_df.head()

Unnamed: 0.1,Unnamed: 0,resto_id,dishes
0,0,0,"[mirabel, leon, beyer, masthuhnbrust, singapur..."
1,1,1,"[piel, bro, 's, ., lager, scrambl, egg, and, o..."
2,2,2,"[3, fri, egg, and, ham, or, bacon, ,, bread, w..."
3,3,3,"[brais, free, rang, chicken, fresh, polenta, b..."
4,4,4,"[spring, duckl, cream, or, countri, gravi, oys..."


In [214]:
resto_df["dishes"] = resto_df["dishes"].map(lambda x: ' '.join(map(str, x)))

In [215]:
resto_df.head()

Unnamed: 0.1,Unnamed: 0,resto_id,dishes
0,0,0,mirabel leon beyer masthuhnbrust singapur in c...
1,1,1,piel bro 's . lager scrambl egg and onion flam...
2,2,2,"3 fri egg and ham or bacon , bread with potato..."
3,3,3,brais free rang chicken fresh polenta beef gou...
4,4,4,spring duckl cream or countri gravi oyster or ...


In [216]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 

In [217]:
tf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tf.fit_transform(resto_df['dishes'])

In [222]:
tfidf_matrix.shape

(200, 8770)

In [218]:
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix) 
results = {}
for idx, row in resto_df.iterrows():
   similar_indices = cosine_similarities[idx].argsort()[:-100:-1] 
   similar_items = [(cosine_similarities[idx][i], resto_df['resto_id'][i]) for i in similar_indices] 
   results[row['resto_id']] = similar_items[1:]

In [219]:
def item(id):  
  return resto_df.loc[resto_df['resto_id'] == id]['resto_id'].tolist()[0]


In [220]:
def recommend(item_id, num):
    print("Recommending " + str(num) + " restaurants similar to " + str(item(item_id)) + "...")
    print("-------")
    recs = results[item_id][:num]
    for rec in recs:
        print("Recommended: " + str(item(rec[1])) + " (score:" + str(rec[0]) + ")")

In [221]:
recommend(item_id=61, num=5)

Recommending 5 restaurants similar to 61...
-------
Recommended: 30 (score:0.20605073511349403)
Recommended: 13 (score:0.1780949470546816)
Recommended: 103 (score:0.1576493403959465)
Recommended: 98 (score:0.15302255047819316)
Recommended: 169 (score:0.14960471234632747)
