In [62]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [63]:
restos_df = pd.read_csv("resto_df.csv")

In [89]:
restos_df.shape

(200, 2)

In [64]:
restos_df.head()

Unnamed: 0,Unnamed,resto_id,dishes
0,0,0,Mirabelle Leon Beyer Masthuhnbrust Singapur in...
1,1,1,Piel Bro's. Lager Scrambled Eggs and Onions FL...
2,2,2,"3 fried eggs and ham or bacon, bread With Pota..."
3,3,3,Braised free range chicken fresh polenta Beef ...
4,4,4,Spring duckling Cream or Country Gravy Oyster ...


In [65]:
restos_df = restos_df.drop(["Unnamed"], axis=1)

In [66]:
restos_df.head()

Unnamed: 0,resto_id,dishes
0,0,Mirabelle Leon Beyer Masthuhnbrust Singapur in...
1,1,Piel Bro's. Lager Scrambled Eggs and Onions FL...
2,2,"3 fried eggs and ham or bacon, bread With Pota..."
3,3,Braised free range chicken fresh polenta Beef ...
4,4,Spring duckling Cream or Country Gravy Oyster ...


In [67]:
restos_df["dishes"] = restos_df["dishes"].str.lower()

In [68]:
restos_df.head()

Unnamed: 0,resto_id,dishes
0,0,mirabelle leon beyer masthuhnbrust singapur in...
1,1,piel bro's. lager scrambled eggs and onions fl...
2,2,"3 fried eggs and ham or bacon, bread with pota..."
3,3,braised free range chicken fresh polenta beef ...
4,4,spring duckling cream or country gravy oyster ...


In [69]:
import nltk

In [70]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/apple/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/apple/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [71]:
restos_df["dishes"] = restos_df["dishes"].map(lambda x: nltk.word_tokenize(x))

In [72]:
restos_df.head()

Unnamed: 0,resto_id,dishes
0,0,"[mirabelle, leon, beyer, masthuhnbrust, singap..."
1,1,"[piel, bro, 's, ., lager, scrambled, eggs, and..."
2,2,"[3, fried, eggs, and, ham, or, bacon, ,, bread..."
3,3,"[braised, free, range, chicken, fresh, polenta..."
4,4,"[spring, duckling, cream, or, country, gravy, ..."


### Removing punctuation marks

In [73]:
restos_df["dishes"] = restos_df["dishes"].map(lambda x: [word.lower() for word in x if word.isalpha()])

In [74]:
restos_df.head()

Unnamed: 0,resto_id,dishes
0,0,"[mirabelle, leon, beyer, masthuhnbrust, singap..."
1,1,"[piel, bro, lager, scrambled, eggs, and, onion..."
2,2,"[fried, eggs, and, ham, or, bacon, bread, with..."
3,3,"[braised, free, range, chicken, fresh, polenta..."
4,4,"[spring, duckling, cream, or, country, gravy, ..."


### Stemming the words

In [75]:
porter = nltk.PorterStemmer()

In [76]:
restos_df["dishes"] = restos_df["dishes"].map(lambda x: [porter.stem(t) for t in x])

In [77]:
restos_df.head()

Unnamed: 0,resto_id,dishes
0,0,"[mirabel, leon, beyer, masthuhnbrust, singapur..."
1,1,"[piel, bro, lager, scrambl, egg, and, onion, f..."
2,2,"[fri, egg, and, ham, or, bacon, bread, with, p..."
3,3,"[brais, free, rang, chicken, fresh, polenta, b..."
4,4,"[spring, duckl, cream, or, countri, gravi, oys..."


### Removing stop words

In [78]:
from nltk.corpus import stopwords

In [79]:
stop_words = set(stopwords.words('english')) 

In [80]:
restos_df["dishes"] = restos_df["dishes"].map(lambda x: [w for w in x if not w in stop_words])

In [81]:
restos_df.head()

Unnamed: 0,resto_id,dishes
0,0,"[mirabel, leon, beyer, masthuhnbrust, singapur..."
1,1,"[piel, bro, lager, scrambl, egg, onion, flambe..."
2,2,"[fri, egg, ham, bacon, bread, potato, salad, e..."
3,3,"[brais, free, rang, chicken, fresh, polenta, b..."
4,4,"[spring, duckl, cream, countri, gravi, oyster,..."


In [96]:
restos_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   resto_id  200 non-null    int64 
 1   dishes    200 non-null    object
dtypes: int64(1), object(1)
memory usage: 3.2+ KB


In [90]:
restos_df["dishes"][199]

'ockfen,bockstein,gebert,fuder,cinzano,dri,mit,kirschwass,chateau,reynon,bordeaux,carr,biscuit,cup,whole,sea,bass,steam,spring,onion,ginger,per,lb,fri,ham,egg,sandwich,assort,relish,salmon,fresco,frito,la,parrilla,schweinskeul,gebraten,rotkohl,essenc,leek,soup,heringfilet,nach,art,de,haus,mariniert,mit,gewurzgurken,zwiebeln,und,sauerrahm,serviert,al,beilag,emerald,mussel,steam,whole,broil,fri,spring,chicken,pud,princess,sauc,mader,galantin,capon,florida,salad,extra,dri,red,cap,littl,neck,clam,catsup,buzzard,bay,half,shell,oyster,chateau,gazin,consomm,royal,tureen,scholl,gebk,majonais,salat,trommer,beer,cold,main,lobster,half,caper,mayonnais,potato,salad,egg,tomato,wedg,celeri,heart,rosbif,aux,cornichon,heringssalat,mit,ei,garniert,bake,bluefish,la,creol,en,bordur,bake,stuf,tomato,vanderbilt,fllet,sole,fri,sauc,tartar,cold,beef,la,mode,jelli,slice,tomato,dill,pickl,consommé,impérial,eier,duval,fri,filet,sole,au,gratin,duc,de,montebello,brut,ostarettir,meo,kexi,rugbrauoi,smjori,beefsteak

In [82]:
restos_df["dishes"] = restos_df["dishes"].map(lambda x: ','.join(map(str, x)))

In [83]:
restos_df.head()

Unnamed: 0,resto_id,dishes
0,0,"mirabel,leon,beyer,masthuhnbrust,singapur,curr..."
1,1,"piel,bro,lager,scrambl,egg,onion,flambe,person..."
2,2,"fri,egg,ham,bacon,bread,potato,salad,extra,chi..."
3,3,"brais,free,rang,chicken,fresh,polenta,beef,gou..."
4,4,"spring,duckl,cream,countri,gravi,oyster,clam,c..."


In [84]:
all_dishes_string = restos_df["dishes"].sum()

In [85]:
all_dishes_string

ern,wine,went,bro,filet,de,sole,anglais,walewska,curacoa,white,corn,beef,cabbag,bonn,mare,domain,drum,roll,apricot,strwberri,parfait,fresh,fruit,tongu,sandwich,coup,montreuil,old,fashion,indian,pud,whip,cream,fresh,shrimp,salad,hard,boil,egg,bowl,garni,schildkrotensupp,lacroix,holland,auf,toast,hollandisch,old,musti,ale,titl,product,origin,boston,poulett,porterhous,steak,two,three,ri,de,veau,la,toulous,stegt,salad,à,la,crème,bratwurst,sauerkraut,cigarett,egyptian,small,packag,hesit,ice,cream,breast,chicken,saut,sou,cloch,alexand,cocktail,brandi,creme,de,cacao,cream,bisquit,dubouch,vsop,grill,dinner,steak,maitr,filet,mignon,la,rose,philadelphia,pepper,pot,patisseri,la,bonn,femm,epinard,en,branch,la,creme,lamb,chop,potato,broccoli,tomato,string,beanboneless,sardin,per,box,johnni,walker,black,label,scotch,lime,lemon,sherbet,feuillet,aux,ecreviss,great,caribbean,cocktail,name,origin,siboney,indian,kirsch,punch,spaghetti,italian,style,new,asparagu,hollandais,butter,sauc,minc,ham,egg,med,sir

In [91]:
def Convert(string): 
    li = list(string.split(",")) 
    return li

In [92]:
all_dishes_list = Convert(all_dishes_string)

In [93]:
len(all_dishes_list)

50223

In [94]:
unique_dishes = set(all_dishes_list)
print(len(unique_dishes))

8292


In [95]:
unique_dishes

{'velout',
 'ger',
 'type',
 'like',
 'schlosskartoffel',
 'wild',
 'matj',
 'saumone',
 'ausgelostem',
 'kaa',
 'dauphin',
 'tin',
 'smor',
 'harvey',
 'medail',
 'modèn',
 'ostarettir',
 'plumle',
 'schweinefleisch',
 'linsensupp',
 'diamand',
 'kitzing',
 'mcarthur',
 'weinbergschnecken',
 'greno',
 'schinkenplatt',
 'stetten',
 'banana',
 'sahnensteak',
 'kraftig',
 'hamonisch',
 'escarol',
 'gazin',
 'spejlaeg',
 'lapin',
 'rape',
 'rpr',
 'baden',
 'lubeck',
 'symphoni',
 'erlesenen',
 'dutch',
 'marzen',
 'sau',
 'coquil',
 'tavel',
 'tsai',
 'champenois',
 'everett',
 'ward',
 'manhattannew',
 'alcazar',
 'cellar',
 'fine',
 'centuri',
 'steinhag',
 'arrivag',
 'jurancon',
 'clicquet',
 'schelpen',
 'saut',
 'strasbourgeois',
 'paprikaschnitzel',
 'gravillon',
 'pollastra',
 'fungulaubenheim',
 'entrecôt',
 'weston',
 'maisocaf',
 'layer',
 'dunkel',
 'garnier',
 'cervelatwurst',
 'prawn',
 'shave',
 'crest',
 'circumst',
 'eventail',
 'midinett',
 'tokay',
 'pfirsichnektar',
 

### Creating resto dishes dataframe

In [97]:
resto_dishes_df = pd.DataFrame([], columns=list(unique_dishes), index = restos_df["resto_id"].tolist())

In [99]:
resto_dishes_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 0 to 199
Columns: 8292 entries, velout to gefuelt
dtypes: object(8292)
memory usage: 12.7+ MB


In [98]:
resto_dishes_df.head()

Unnamed: 0,velout,ger,type,like,schlosskartoffel,wild,matj,saumone,ausgelostem,kaa,...,feket,servic,commun,earli,blanchail,rahmschnitzel,beerenausles,oriental,poorboy,gefuelt
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [100]:
resto_ids = restos_df["resto_id"].tolist()

In [112]:
unique_dishes_columns = resto_dishes_df.columns.tolist()

In [145]:
for resto_id in resto_ids:
    resto_dishes = Convert(restos_df.loc[resto_id,"dishes"])
    for dish_column in unique_dishes_columns:
        if dish_column in resto_dishes:
            resto_dishes_df.loc[resto_id,dish_column] = 1
        else:
            resto_dishes_df.loc[resto_id,dish_column] = 0


In [146]:
resto_dishes_df.head()

Unnamed: 0,velout,ger,type,like,schlosskartoffel,wild,matj,saumone,ausgelostem,kaa,...,feket,servic,commun,earli,blanchail,rahmschnitzel,beerenausles,oriental,poorboy,gefuelt
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Finding similar restos

![alt text](cosine.png "Cosine Similarity")

In [153]:
# import cosine_similarity from sklearn
from sklearn.metrics.pairwise import cosine_similarity

def similar_restos(resto_id, interactions_matrix):
    
    # compute similarity of each user to the provided user
    similarity = []
    for resto in range(0, interactions_matrix.shape[0]):
        sim = cosine_similarity([interactions_matrix.loc[resto_id]], [interactions_matrix.loc[resto]])
        similarity.append((resto,sim))
        
    # sort by similarity
    similarity.sort(key=lambda x: x[1], reverse= True)
    
    # create list of just the user ids
    most_similar_restos = [tup[0] for tup in similarity]
    
    # create list of similarity score
    
    similarity_score = [tup[1] for tup in similarity]
    
    # remove the resto's own id
    most_similar_restos.remove(resto_id)
    
    
    # remove the resto's own similarity score
    similarity_score.remove(similarity_score[0])
    
       
    return most_similar_restos, similarity_score

In [160]:
def test_resto(resto_id, num_of_restos):
    similar_restos_ids = similar_restos(resto_id, resto_dishes_df)[0][:num_of_restos]
    print(similar_restos_ids)
    print("resto dishes : " + restos_df.loc[resto_id,"dishes"])
    for sim_resto_id in similar_restos_ids:
        print("similar resto dishes : "+ str(sim_resto_id)+ " : " + restos_df.loc[sim_resto_id,"dishes"])
    print(similar_restos(resto_id, resto_dishes_df)[1][:num_of_restos])

In [161]:
test_resto(56,5)

[168, 11, 144, 197, 98]
resto dishes : half,broil,spring,chicken,au,beurr,bread,beef,tongu,czarin,château,léovil,la,case,mc,lalla,rook,pan,de,maiz,white,zweiback,import,wuerzburg,small,heart,lettuc,salad,kalamaria,toursi,appetis,young,fowl,smoke,bacon,new,england,pud,lemon,sauc,canap,russ,budweis,beer,fricasse,chicken,toast,broil,fresh,pike,mustard,butter,parsley,potato,glace,pistach,à,la,français,fri,sausag,sauerkraut,fri,shad,roe,bacon,gedenstet,aprikosen,shellfish,cold,platter,half,main,lobster,florida,shrimp,fresh,crabmeat,cherryston,clam,lettuc,celeri,mayonnais,cocktail,sauc,brown,palac,waffl,spring,lamb,chop,broil,california,salad,cottag,chees,assort,fruit,melon,gross,gedeck,kannchen,kaffe,oder,tee,brot,und,butter,dazu,einen,kleinen,aufschnitt,und,ei,nectaros,brown,vintner,cold,salmon,major,handi,sauc,preserv,satsuma,plum,mix,drink,le,carr,roti,persil,pomm,nouvel,per,new,asparagu,au,beurr,parmesan,claret,julien,fri,flounder,anchovi,sauc,berri,fruit,season,sour,cream,russian,raisi