

# Inicjalizacja
Do zrealizowania poniższego kodu potrzebny jest utworzony nowy, pusty projekt w darmowym środowisku zapenianym przez Neo4j (https://neo4j.com/sandbox/). po utworzeniu takiego pojektu niezbędne jest w późniejszej części podanie danych dostępowych do utworzonego projektu. Środowisko Neo4j udostępnia również dobrą wizualizację stowrzonej bazy danych za pomocą narzedzia Bloom. Więcej szczegółów na temat realizacji tego projketu znajduje się poniżej lub w dodatkowych źródłach:
- link do pracy inżynierskiej Overleaf - https://www.overleaf.com/read/bydgzbrmwwtr
- link do repozytorium z aplikacją do reprezentowania wyszukań - https://github.com/szuzanna/Entity-Set-Expander-with-Graph-Embeddings
- https://towardsdatascience.com/node-embeddings-node2vec-with-neo4j-5152d3472d8e
- https://neo4j.com/developer/kb/import-csv-locations/


##Instalacja i import potrzebnych bibliotek 

In [1]:
pip install neo4j 

Collecting neo4j
[?25l  Downloading https://files.pythonhosted.org/packages/36/f7/3c0b20ad7cdeac89d44e5380b0b4507995b1aec843692e3e76dd6cd1c638/neo4j-4.2.1.tar.gz (69kB)
[K     |████▊                           | 10kB 23.5MB/s eta 0:00:01[K     |█████████▍                      | 20kB 20.1MB/s eta 0:00:01[K     |██████████████▏                 | 30kB 10.9MB/s eta 0:00:01[K     |██████████████████▉             | 40kB 9.1MB/s eta 0:00:01[K     |███████████████████████▋        | 51kB 4.4MB/s eta 0:00:01[K     |████████████████████████████▎   | 61kB 4.9MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 3.7MB/s 
Building wheels for collected packages: neo4j
  Building wheel for neo4j (setup.py) ... [?25l[?25hdone
  Created wheel for neo4j: filename=neo4j-4.2.1-cp36-none-any.whl size=95190 sha256=f3a399c482d964cd1fded26e626c1113f1e434ba5a3964f5a5ca8c5324f73fcf
  Stored in directory: /root/.cache/pip/wheels/00/a8/69/083e04c2ffc0a7e026c9356cd5bb54c65128f3e08ef8371be0


In [2]:
#Import biblioteki do tworzenia i obsługi modelu word2vec
from gensim.models import Word2Vec
#Import bibliteki niezbędnej do komunikacji z bazą danych w Neo4j
from neo4j import GraphDatabase
import pandas as pd
import time

## Podanie namiarów na pustą baze danych na Neo4j i połączenie się z nią 

In [17]:
host = 'bolt://52.91.118.4:34331'
user = 'neo4j'
password = 'contact-relay-journey'
driver = GraphDatabase.driver(host,auth=(user, password))

#Tworzenie bazy danych i wczytaywanie danych

## Podanie schamtu bazy i załadowanie do niej danych z .csv na githubie

In [4]:
#Zaptenie do bazy wtorzące jej schemat
graph_schema_query = """

CALL apoc.schema.assert( 
    null, 
    {Ingredient:['name'], Dish:['id'], DishType:['name']})

"""

#Zapytanie do bazy pobierajce dane z pliku .csv i wczytanie tych danych do bazy
graph_import_query = """

LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/szuzanna/neo4j/main/newfood.csv" as row 
CREATE (d:Dish{id:row.id, name:row.title}) 
SET d += apoc.map.clean(row, ['id','title','dishTypes','ingredients'],[]) 
FOREACH (i in split(row.ingredients,',') | MERGE (in:Ingredient{name:toLower(replace(i,'-',' '))}) 
                                           MERGE (in)<-[:HAS_INGREDIENT]-(d)) 
FOREACH (dt in split(row.dishTypes,',')  | MERGE (dts:DishType{name:dt}) 
                                           MERGE (dts)<-[:DISH_TYPE]-(d))

"""

with driver.session() as session:
    session.run(graph_schema_query)
    session.run(graph_import_query)

## Ilość węzłów i relacji w bazie

In [5]:
#Zapytanie zwracające ilość wezłów oraz ilość połączeń jaka występuje w bazie
tmp = pd.DataFrame()
with driver.session() as session:
    results = session.run("""
    CALL apoc.meta.stats() YIELD nodeCount 
    CALL apoc.meta.stats() YIELD relCount 
    RETURN nodeCount, relCount 
    """)
    tmp = pd.DataFrame([dict(result) for result in results])

print(tmp)

   nodeCount  relCount
0       3682     23783


## Ilość węzłów poszecgólnego typu

In [6]:
#Zapytanie zwracające ilości węzłów każdego z typów
tmp = pd.DataFrame()
with driver.session() as session:
    results = session.run("""
    CALL apoc.meta.stats() YIELD labels
    RETURN labels
    """)
    tmp = results.value('labels')

print(tmp)

[{'Ingredient': 1937, 'DishType': 23, 'Dish': 1722}]


#Przydatne, przykładowe statyski z bazy pozwalające zapoznać się z jej zawartością

##Najpopularniejsze składniki

In [7]:
#Zapytanie zwracające 20 węzłów typu składnik z najwiekszą ilością relacji wchodzacych do danego węzła
tmp = pd.DataFrame()
with driver.session() as session:
    results = session.run("""
    MATCH (n:Ingredient)
    RETURN n.name as ingredient, size((n)<--()) as mentions 
    ORDER BY mentions DESC
    LIMIT 20
    """)
    tmp = pd.DataFrame([dict(result) for result in results])


print(tmp)

                ingredient  mentions
0                olive oil       954
1                   garlic       425
2                     salt       405
3                   butter       397
4              lemon juice       373
5          salt and pepper       347
6                  anchovy       341
7          unsalted butter       305
8            garlic cloves       293
9                   capers       267
10             kosher salt       247
11                 parsley       177
12             bell pepper       177
13                   onion       171
14            black pepper       171
15                   water       158
16            garlic clove       154
17               anchovies       154
18              lemon zest       147
19  extra virgin olive oil       145


##Najmniej popularne składniki

In [8]:
#Zapytanie zwracające 20 węzłów typu składnik z najmniejszą ilością relacji wchodzacych do danego węzła
tmp = pd.DataFrame()
with driver.session() as session:
    results = session.run("""
    MATCH (n:Ingredient)
    RETURN n.name as ingredient, size((n)<--()) as mentions 
    ORDER BY mentions 
    LIMIT 20
    """)
    tmp = pd.DataFrame([dict(result) for result in results])


print(tmp)

                           ingredient  mentions
0                         beet greens         1
1                    dandelion greens         1
2                            pinenuts         1
3                          cane sugar         1
4                     extra firm tofu         1
5                         pinto beans         1
6                            sprinkle         1
7                        green chilli         1
8                          loin roast         1
9                     strained yogurt         1
10                        top sirloin         1
11                      acini di pepe         1
12                      seltzer water         1
13                   remove from oven         1
14                         rice syrup         1
15                         frying oil         1
16  pomodori al forno con le acciughe         1
17              fire roasted tomatoes         1
18                       caperberries         1
19                    hard boiled egg   

##Potrawy z największa ilością składników

In [9]:
tmp = pd.DataFrame()
with driver.session() as session:
    results = session.run("""
    MATCH (d:Dish)
    RETURN d.name as dish, size((d)-->()) as mentions 
    ORDER BY mentions DESC
    LIMIT 20
    """)
    tmp = pd.DataFrame([dict(result) for result in results])


print(tmp)

                                                 dish  mentions
0                       Poached Beef With Green Sauce       165
1              Sea Bass With Brown Shrimps & Couscous        50
2                                         Salsa Verde        44
3                                       Bouillabaisse        40
4                        Ceviche de Pescado: Two Ways        39
5                           Easy to Make Dinner Combo        37
6   Salad with Parmigiano-Reggiano and anchovy dre...        34
7                            New England Fish Chowder        32
8   Yagihashi's Black Sea Bass with Somen and Vege...        29
9                         Easy Baked Parmesan Tilapia        29
10                 McDevitt's Miso-Marinated Sea Bass        28
11  Kitchen Play: Salmon with Herbed Butter and Bu...        28
12  Baketard.com - Blog - Giorgio Locatelli's Bran...        28
13                                  Fish Taco Platter        27
14  Chef Race: Cornmeal-Crusted Catfish 

##Wypisanie wszystkich potraw zawierających wymienione składniki

In [10]:
#Zapytanie zwracające max 10 potraw zawierających składniki - "feta cheese", "zucchini"
with driver.session() as session:
    results = session.run("""
    WITH ["feta cheese", "zucchini"] as ingredients 
    MATCH (d:Dish) 
    WHERE all(i in ingredients WHERE exists( 
        (d)-[:HAS_INGREDIENT]->(:Ingredient {name: i}))) 
    RETURN d.name AS dish, 
           [(d)-[:HAS_INGREDIENT]->(i) | i.name] AS ingredients 
    ORDER BY size(ingredients)
    LIMIT 10
    """)
    tmp = pd.DataFrame([dict(result) for result in results])

print(tmp)


                          dish                                        ingredients
0    Striped Bass en Papillote  [lemon juice, striped bass, salt, black pepper...
1  Mediterranean Salad Platter  [feta cheese, red onions, zucchini, salt, pepp...


#Implementacja metody node2vec

## Tworzenie nowego grafu nazywamy go 'all' zawiera on wszytskie wezły a połaczenie miedzy nimi są nieskierowane, bez żadnej nazwy ani wagi.

In [11]:
with driver.session() as session:
    session.run("""CALL gds.graph.create('all', 
    '*', 
    {ALL_UNDIRECTED: {type:'*', orientation:'UNDIRECTED'}})""")

##Przykładowy model metody node2vec 

In [15]:
start = time.clock()
# Define random walk query
random_walks_query = """
MATCH (node)
CALL gds.alpha.randomWalk.stream('all', {
  start: id(node),
  steps: 15,
  walks: 5,
  mode:'node2vec',
  inOut:0.6,
  return:1.0
})
YIELD nodeIds
RETURN [id in nodeIds | gds.util.asNode(id).name] as walks
"""
# Fetch data from neo4j
with driver.session() as session:
    walks = session.run(random_walks_query)
    # Train the word2vec model
    clean_walks = [row['walks'] for row in walks]
model = Word2Vec(clean_walks, sg=1, window=5, size=100)
end = time.clock()
total = end - start
print("{0:02f}s".format(total))

model.save("foodEmbedding_default.model")



13.968493s


In [14]:
model.most_similar('olive oil')

  """Entry point for launching an IPython kernel.


[('aged balsamic vinegar', 0.6569322347640991),
 ('dried figs', 0.6526914834976196),
 ('lamb loin chops', 0.6453710794448853),
 ('dried chilli flakes', 0.6416180729866028),
 ('dried tomatoes', 0.6402533054351807),
 ('manchego', 0.6400256752967834),
 ('dried great northern beans', 0.6392632722854614),
 ('celery ribs', 0.6385805606842041),
 ('fusilli', 0.6376804113388062),
 ('red skinned potatoes', 0.6371135711669922)]

#Testowanie hiperparametrów

## Steps

In [16]:
start = time.clock()
# Define random walk query
random_walks_query = """
MATCH (node)
CALL gds.alpha.randomWalk.stream('all', {
  start: id(node),
  steps: 10,
  walks: 5,
  mode:'node2vec',
  inOut:0.6,
  return:1.0
})
YIELD nodeIds
RETURN [id in nodeIds | gds.util.asNode(id).name] as walks
"""
# Fetch data from neo4j
with driver.session() as session:
    walks = session.run(random_walks_query)
    # Train the word2vec model
    clean_walks = [row['walks'] for row in walks]
model = Word2Vec(clean_walks, sg=1, window=5, size=100)
end = time.clock()
total = end - start
print("{0:02f}s".format(total))

model.save("foodEmbedding_steps10.model")



9.391337s


In [17]:
model.most_similar('olive oil')

  """Entry point for launching an IPython kernel.


[('spaghetti', 0.7909681797027588),
 ('broccoli', 0.7891433835029602),
 ('cauliflower', 0.7649942636489868),
 ('Orecchiette (almost) Alla Pugliese', 0.7625887393951416),
 ('rosemary leaves', 0.7583848237991333),
 ('Montbrun Melange: Olive Bread With Basil, Marinated Tomatoes, A',
  0.7505795359611511),
 ('aged balsamic vinegar', 0.7482173442840576),
 ('Mediterranean Pasta with Broccoli', 0.7481319904327393),
 ('pecorino', 0.7459369897842407),
 ('corn bread mix', 0.7447943687438965)]

In [18]:
start = time.clock()
# Define random walk query
random_walks_query = """
MATCH (node)
CALL gds.alpha.randomWalk.stream('all', {
  start: id(node),
  steps: 30,
  walks: 5,
  mode:'node2vec',
  inOut:0.6,
  return:1.0
})
YIELD nodeIds
RETURN [id in nodeIds | gds.util.asNode(id).name] as walks
"""
# Fetch data from neo4j
with driver.session() as session:
    walks = session.run(random_walks_query)
    # Train the word2vec model
    clean_walks = [row['walks'] for row in walks]
model = Word2Vec(clean_walks, sg=1, window=5, size=100)
end = time.clock()
total = end - start
print("{0:02f}s".format(total))

model.save("foodEmbedding_steps30.model")




26.930963s


In [19]:
model.most_similar('olive oil')

  """Entry point for launching an IPython kernel.


[('corn bread mix', 0.5591591000556946),
 ('variety of crudité', 0.5486303567886353),
 ('dried great northern beans', 0.5480088591575623),
 ('red mullet', 0.541245698928833),
 ('penne', 0.540414035320282),
 ('unpitted', 0.5378836393356323),
 ('parlsley', 0.534924328327179),
 ('broccoli raab', 0.5311227440834045),
 ('gem squash', 0.5290196537971497),
 ('from donna deane', 0.5285034775733948)]

In [20]:
start = time.clock()
# Define random walk query
random_walks_query = """
MATCH (node)
CALL gds.alpha.randomWalk.stream('all', {
  start: id(node),
  steps: 50,
  walks: 5,
  mode:'node2vec',
  inOut:0.6,
  return:1.0
})
YIELD nodeIds
RETURN [id in nodeIds | gds.util.asNode(id).name] as walks
"""
# Fetch data from neo4j
with driver.session() as session:
    walks = session.run(random_walks_query)
    # Train the word2vec model
    clean_walks = [row['walks'] for row in walks]
model = Word2Vec(clean_walks, sg=1, window=5, size=100)
end = time.clock()
total = end - start
print("{0:02f}s".format(total))

model.save("foodEmbedding_steps50.model")




44.245715s


In [21]:
model.most_similar('olive oil')

  """Entry point for launching an IPython kernel.


[('parlsley', 0.5458523035049438),
 ('green onion tops', 0.5254658460617065),
 ('corn bread mix', 0.5242621898651123),
 ('dried figs', 0.5238118171691895),
 ('walnut halves', 0.5157029032707214),
 ('unpitted', 0.5103681087493896),
 ('rigatoni', 0.5016729235649109),
 ('flank steak', 0.5009630918502808),
 ('pork chops', 0.5009398460388184),
 ('cooked quinoa', 0.5005707740783691)]

##Walks

In [22]:
start = time.clock()
# Define random walk query
random_walks_query = """
MATCH (node)
CALL gds.alpha.randomWalk.stream('all', {
  start: id(node),
  steps: 15,
  walks: 3,
  mode:'node2vec',
  inOut:0.6,
  return:1.0
})
YIELD nodeIds
RETURN [id in nodeIds | gds.util.asNode(id).name] as walks
"""
# Fetch data from neo4j
with driver.session() as session:
    walks = session.run(random_walks_query)
    # Train the word2vec model
    clean_walks = [row['walks'] for row in walks]
model = Word2Vec(clean_walks, sg=1, window=5, size=100)
end = time.clock()
total = end - start
print("{0:02f}s".format(total))

model.save("foodEmbedding_walks3.model")



8.507426s


In [23]:
model.most_similar('olive oil')

  """Entry point for launching an IPython kernel.


[('basil leaves', 0.8340641260147095),
 ('fresh thyme', 0.8106650114059448),
 ('cherry tomatoes', 0.7979010343551636),
 ('rosemary', 0.7879591584205627),
 ('anchovy', 0.7878119945526123),
 ('Mediterranean Pasta with Broccoli', 0.7856109142303467),
 ('chicken livers', 0.7810992002487183),
 ('broccoli', 0.7801734805107117),
 ('roasted red peppers', 0.7781273126602173),
 ('flat leaf parsley leaves', 0.7774746417999268)]

In [24]:
start = time.clock()
# Define random walk query
random_walks_query = """
MATCH (node)
CALL gds.alpha.randomWalk.stream('all', {
  start: id(node),
  steps: 15,
  walks: 10,
  mode:'node2vec',
  inOut:0.6,
  return:1.0
})
YIELD nodeIds
RETURN [id in nodeIds | gds.util.asNode(id).name] as walks
"""
# Fetch data from neo4j
with driver.session() as session:
    walks = session.run(random_walks_query)
    # Train the word2vec model
    clean_walks = [row['walks'] for row in walks]
model = Word2Vec(clean_walks, sg=1, window=5, size=100)
end = time.clock()
total = end - start
print("{0:02f}s".format(total))

model.save("foodEmbedding_walks10.model")



27.230416s


In [25]:
model.most_similar('olive oil')

  """Entry point for launching an IPython kernel.


[('parlsley', 0.5299296975135803),
 ('dried chilli flakes', 0.5297034978866577),
 ('rigatoni', 0.5014475584030151),
 ('gem squash', 0.498526394367218),
 ('corn bread mix', 0.4982989728450775),
 ('Roasted Red Peppers With Anchovies And Olive Oil', 0.49650639295578003),
 ('octopus', 0.4945990741252899),
 ('aged balsamic vinegar', 0.4936733543872833),
 ('unpitted', 0.49271953105926514),
 ('serrano ham', 0.48888519406318665)]

In [26]:
start = time.clock()
# Define random walk query
random_walks_query = """
MATCH (node)
CALL gds.alpha.randomWalk.stream('all', {
  start: id(node),
  steps: 15,
  walks: 15,
  mode:'node2vec',
  inOut:0.6,
  return:1.0
})
YIELD nodeIds
RETURN [id in nodeIds | gds.util.asNode(id).name] as walks
"""
# Fetch data from neo4j
with driver.session() as session:
    walks = session.run(random_walks_query)
    # Train the word2vec model
    clean_walks = [row['walks'] for row in walks]
model = Word2Vec(clean_walks, sg=1, window=5, size=100)
end = time.clock()
total = end - start
print("{0:02f}s".format(total))

model.save("foodEmbedding_walks15.model")



40.566639s


In [27]:
model.most_similar('olive oil')

  """Entry point for launching an IPython kernel.


[('corn bread mix', 0.5659034252166748),
 ('butterfish', 0.5439046621322632),
 ('Grissini With Anchovy Dip', 0.5052345991134644),
 ('chilli flakes', 0.4945030212402344),
 ('Penne Puttanesca', 0.492698609828949),
 ('gem squash', 0.4865618944168091),
 ('green onion tops', 0.47895368933677673),
 ('octopus', 0.47438377141952515),
 ('basic bruschetta', 0.47076982259750366),
 ('Pan-fried Butterfish', 0.46971628069877625)]

##Pramaterty node2vec

In [9]:
start = time.clock()
# Define random walk query
random_walks_query = """
MATCH (node)
CALL gds.alpha.randomWalk.stream('all', {
  start: id(node),
  steps: 15,
  walks: 5,
  mode:'node2vec',
  inOut:1.5,
  return:0.5
})
YIELD nodeIds
RETURN [id in nodeIds | gds.util.asNode(id).name] as walks
"""
# Fetch data from neo4j
with driver.session() as session:
    walks = session.run(random_walks_query)
    # Train the word2vec model
    clean_walks = [row['walks'] for row in walks]
model = Word2Vec(clean_walks, sg=1, window=5, size=100)
end = time.clock()
total = end - start
print("{0:02f}s".format(total))

model.save("foodEmbedding_sg_r05_io15.model")



13.430950s


In [10]:
model.most_similar('olive oil')

  """Entry point for launching an IPython kernel.


[('octopus', 0.610521137714386),
 ('basic bruschetta', 0.5958776473999023),
 ('tuna in olive oil', 0.5926709771156311),
 ('anchovy fillet', 0.5907862186431885),
 ('penne', 0.5906845331192017),
 ('rib eye steaks', 0.5900151133537292),
 ('chickpea flour', 0.5889759063720703),
 ('dried chilli flakes', 0.5827676057815552),
 ('short bread cookie', 0.5815433263778687),
 ('fusilli', 0.5812956094741821)]

In [17]:
start = time.clock()
# Define random walk query
random_walks_query = """
MATCH (node)
CALL gds.alpha.randomWalk.stream('all', {
  start: id(node),
  steps: 15,
  walks: 5,
  mode:'node2vec',
  inOut:0.5,
  return:1.5
})
YIELD nodeIds
RETURN [id in nodeIds | gds.util.asNode(id).name] as walks
"""
# Fetch data from neo4j
with driver.session() as session:
    walks = session.run(random_walks_query)
    # Train the word2vec model
    clean_walks = [row['walks'] for row in walks]
model = Word2Vec(clean_walks, sg=1, window=5, size=100)
end = time.clock()
total = end - start
print("{0:02f}s".format(total))

model.save("foodEmbedding_sg_r15_io05.model")



13.571535s


In [18]:
model.most_similar('olive oil')

  """Entry point for launching an IPython kernel.


[('chilli flakes', 0.6721237897872925),
 ('fusilli', 0.6694563627243042),
 ('corn bread mix', 0.6618563532829285),
 ('baby artichokes', 0.6561764478683472),
 ('extra lean ground beef', 0.6536005735397339),
 ('basic bruschetta', 0.6535323858261108),
 ('penne', 0.6534193754196167),
 ('unpitted', 0.6489623785018921),
 ('dried tomatoes', 0.6466060876846313),
 ('dried chilli flakes', 0.6464773416519165)]

In [13]:
start = time.clock()
# Define random walk query
random_walks_query = """
MATCH (node)
CALL gds.alpha.randomWalk.stream('all', {
  start: id(node),
  steps: 15,
  walks: 5,
  mode:'node2vec',
  inOut:1.5,
  return:0.5
})
YIELD nodeIds
RETURN [id in nodeIds | gds.util.asNode(id).name] as walks
"""
# Fetch data from neo4j
with driver.session() as session:
    walks = session.run(random_walks_query)
    # Train the word2vec model
    clean_walks = [row['walks'] for row in walks]
model = Word2Vec(clean_walks, sg=0, window=5, size=100)
end = time.clock()
total = end - start
print("{0:02f}s".format(total))

model.save("foodEmbedding_cbow_r05_io15.model")



6.575958s


In [14]:
model.most_similar('olive oil')

  """Entry point for launching an IPython kernel.


[('spaghetti', 0.7681190371513367),
 ('parmigiano reggiano cheese', 0.7625564336776733),
 ('kalamata olives', 0.7486042976379395),
 ('Croutons with Tapenade', 0.7325603365898132),
 ('baguette', 0.7295055985450745),
 ('fresh basil', 0.7254635095596313),
 ('crushed red pepper', 0.7177973985671997),
 ('Quinoa Puttanesca', 0.717643141746521),
 ('Vermicelli with Sauce alla Sofia', 0.7164179086685181),
 ('anchovy', 0.7139915227890015)]

In [19]:
start = time.clock()
# Define random walk query
random_walks_query = """
MATCH (node)
CALL gds.alpha.randomWalk.stream('all', {
  start: id(node),
  steps: 15,
  walks: 5,
  mode:'node2vec',
  inOut:0.5,
  return:1.5
})
YIELD nodeIds
RETURN [id in nodeIds | gds.util.asNode(id).name] as walks
"""
# Fetch data from neo4j
with driver.session() as session:
    walks = session.run(random_walks_query)
    # Train the word2vec model
    clean_walks = [row['walks'] for row in walks]
model = Word2Vec(clean_walks, sg=0, window=5, size=100)
end = time.clock()
total = end - start
print("{0:02f}s".format(total))

model.save("foodEmbedding_cbow_r15_io05.model")



6.702697s


In [20]:
model.most_similar('olive oil')

  """Entry point for launching an IPython kernel.


[('garlic cloves', 0.9598323106765747),
 ('kalamata olives', 0.9462919235229492),
 ('baguette', 0.90604168176651),
 ('oil cured black olives', 0.8997437953948975),
 ('cherry tomatoes', 0.8943390250205994),
 ('parmigiano reggiano cheese', 0.8922959566116333),
 ('black olives', 0.891531765460968),
 ('Muffuletta', 0.890328049659729),
 ('flat leaf parsley leaves', 0.8749189376831055),
 ('garlic', 0.8733569383621216)]

##Size

In [28]:
start = time.clock()
# Define random walk query
random_walks_query = """
MATCH (node)
CALL gds.alpha.randomWalk.stream('all', {
  start: id(node),
  steps: 15,
  walks: 5,
  mode:'node2vec',
  inOut:0.6,
  return:1.0
})
YIELD nodeIds
RETURN [id in nodeIds | gds.util.asNode(id).name] as walks
"""
# Fetch data from neo4j
with driver.session() as session:
    walks = session.run(random_walks_query)
    # Train the word2vec model
    clean_walks = [row['walks'] for row in walks]
model = Word2Vec(clean_walks, sg=1, window=5, size=50)
end = time.clock()
total = end - start
print("{0:02f}s".format(total))

model.save("foodEmbedding_size50.model")





15.832314s


In [29]:
model.most_similar('olive oil')

  """Entry point for launching an IPython kernel.


[('corn bread mix', 0.7627268433570862),
 ('calamata olives', 0.7560825943946838),
 ('aleppo pepper', 0.7558659911155701),
 ('dried beans', 0.7459368109703064),
 ('octopus', 0.7452048659324646),
 ('serrano ham', 0.7305186986923218),
 ('dried chilli flakes', 0.7280954122543335),
 ('unpitted', 0.721984326839447),
 ('rigatoni', 0.72080397605896),
 ('baby arugula leaves', 0.7201208472251892)]

In [30]:
start = time.clock()
# Define random walk query
random_walks_query = """
MATCH (node)
CALL gds.alpha.randomWalk.stream('all', {
  start: id(node),
  steps: 15,
  walks: 5,
  mode:'node2vec',
  inOut:0.6,
  return:1.0
})
YIELD nodeIds
RETURN [id in nodeIds | gds.util.asNode(id).name] as walks
"""
# Fetch data from neo4j
with driver.session() as session:
    walks = session.run(random_walks_query)
    # Train the word2vec model
    clean_walks = [row['walks'] for row in walks]
model = Word2Vec(clean_walks, sg=1, window=5, size=200)
end = time.clock()
total = end - start
print("{0:02f}s".format(total))

model.save("foodEmbedding_size200.model")



16.788485s


In [31]:
model.most_similar('olive oil')

  """Entry point for launching an IPython kernel.


[('penne', 0.6590503454208374),
 ('lamb loin chops', 0.654988169670105),
 ('celery ribs', 0.6534550189971924),
 ('dried tomatoes', 0.6508963108062744),
 ('chianti', 0.6507312059402466),
 ('dried great northern beans', 0.6504547595977783),
 ('anchovy fillet', 0.6448917388916016),
 ('parlsley', 0.6442325115203857),
 ('dried chilli flakes', 0.6440320014953613),
 ('tuna in olive oil', 0.6439108848571777)]

In [32]:
start = time.clock()
# Define random walk query
random_walks_query = """
MATCH (node)
CALL gds.alpha.randomWalk.stream('all', {
  start: id(node),
  steps: 15,
  walks: 5,
  mode:'node2vec',
  inOut:0.6,
  return:1.0
})
YIELD nodeIds
RETURN [id in nodeIds | gds.util.asNode(id).name] as walks
"""
# Fetch data from neo4j
with driver.session() as session:
    walks = session.run(random_walks_query)
    # Train the word2vec model
    clean_walks = [row['walks'] for row in walks]
model = Word2Vec(clean_walks, sg=1, window=5, size=300)
end = time.clock()
total = end - start
print("{0:02f}s".format(total))

model.save("foodEmbedding_size300.model")



20.384664s


In [33]:
model.most_similar('olive oil')

  """Entry point for launching an IPython kernel.


[('penne', 0.6768109798431396),
 ('can whole peeled tomatoes', 0.6656123399734497),
 ('lamb loin chops', 0.6655267477035522),
 ('fusilli', 0.661930501461029),
 ('parlsley', 0.6586669683456421),
 ('anchovy fillet', 0.658545970916748),
 ('collard greens', 0.6583807468414307),
 ('elbow macaroni', 0.6572747230529785),
 ('celery ribs', 0.657006025314331),
 ('red onions', 0.6553921699523926)]

##Window

In [34]:
start = time.clock()
# Define random walk query
random_walks_query = """
MATCH (node)
CALL gds.alpha.randomWalk.stream('all', {
  start: id(node),
  steps: 15,
  walks: 5,
  mode:'node2vec',
  inOut:0.6,
  return:1.0
})
YIELD nodeIds
RETURN [id in nodeIds | gds.util.asNode(id).name] as walks
"""
# Fetch data from neo4j
with driver.session() as session:
    walks = session.run(random_walks_query)
    # Train the word2vec model
    clean_walks = [row['walks'] for row in walks]
model = Word2Vec(clean_walks, sg=1, window=3, size=100)
end = time.clock()
total = end - start
print("{0:02f}s".format(total))

model.save("foodEmbedding_window3.model")



10.971559s


In [35]:
model.most_similar('olive oil')

  """Entry point for launching an IPython kernel.


[('baby arugula leaves', 0.6600899696350098),
 ('hazelnuts', 0.651681661605835),
 ('dried great northern beans', 0.650213360786438),
 ('anise seeds', 0.6459257006645203),
 ('lamb loin chops', 0.6458263993263245),
 ('red chiles', 0.6451155543327332),
 ('dried chilli flakes', 0.6434712409973145),
 ('basic bruschetta', 0.6367380023002625),
 ('kalamata olives', 0.635761022567749),
 ('soak', 0.6334322690963745)]

In [36]:
start = time.clock()
# Define random walk query
random_walks_query = """
MATCH (node)
CALL gds.alpha.randomWalk.stream('all', {
  start: id(node),
  steps: 15,
  walks: 5,
  mode:'node2vec',
  inOut:0.6,
  return:1.0
})
YIELD nodeIds
RETURN [id in nodeIds | gds.util.asNode(id).name] as walks
"""
# Fetch data from neo4j
with driver.session() as session:
    walks = session.run(random_walks_query)
    # Train the word2vec model
    clean_walks = [row['walks'] for row in walks]
model = Word2Vec(clean_walks, sg=1, window=10, size=100)
end = time.clock()
total = end - start
print("{0:02f}s".format(total))

model.save("foodEmbedding_window10.model")



19.372517s


In [37]:
model.most_similar('olive oil')

  """Entry point for launching an IPython kernel.


[('dandelion greens', 0.6831833124160767),
 ('pork shoulder roast', 0.6806778311729431),
 ('branzini', 0.6769661903381348),
 ('green onion tops', 0.6761825084686279),
 ('bucatini pasta', 0.6747177839279175),
 ('skinless boneless chicken breasts', 0.6744940280914307),
 ('prunes', 0.6734331846237183),
 ('dried tomatoes', 0.672508716583252),
 ('sundried tomatoes', 0.6707227230072021),
 ('tomatillo', 0.668376088142395)]

In [38]:
start = time.clock()
# Define random walk query
random_walks_query = """
MATCH (node)
CALL gds.alpha.randomWalk.stream('all', {
  start: id(node),
  steps: 15,
  walks: 5,
  mode:'node2vec',
  inOut:0.6,
  return:1.0
})
YIELD nodeIds
RETURN [id in nodeIds | gds.util.asNode(id).name] as walks
"""
# Fetch data from neo4j
with driver.session() as session:
    walks = session.run(random_walks_query)
    # Train the word2vec model
    clean_walks = [row['walks'] for row in walks]
model = Word2Vec(clean_walks, sg=1, window=15, size=100)
end = time.clock()
total = end - start
print("{0:02f}s".format(total))

model.save("foodEmbedding_window15.model")



22.804850s


In [39]:
model.most_similar('olive oil')

  """Entry point for launching an IPython kernel.


[('chanterelles', 0.6593757271766663),
 ('broccoli raab', 0.656437873840332),
 ('walnut halves', 0.6509329080581665),
 ('red chilis', 0.6469266414642334),
 ('dried chilli flakes', 0.646693229675293),
 ('pork chops', 0.6450908184051514),
 ('ditalini', 0.6407184600830078),
 ('yellow wax beans', 0.640357494354248),
 ('wahoo', 0.6397199034690857),
 ('strained yogurt', 0.6387580633163452)]

##Typ

In [18]:
start = time.clock()
# Define random walk query
random_walks_query = """
MATCH (node)
CALL gds.alpha.randomWalk.stream('all', {
  start: id(node),
  steps: 15,
  walks: 5,
  mode:'node2vec',
  inOut:0.6,
  return:1.0
})
YIELD nodeIds
RETURN [id in nodeIds | gds.util.asNode(id).name] as walks
"""
# Fetch data from neo4j
with driver.session() as session:
    walks = session.run(random_walks_query)
    # Train the word2vec model
    clean_walks = [row['walks'] for row in walks]
model = Word2Vec(clean_walks, sg=0, window=5, size=100)
end = time.clock()
total = end - start
print("{0:02f}s".format(total))

model.save("foodEmbedding_default0.model")



5.694632s


In [19]:
model.most_similar('olive oil')

  """Entry point for launching an IPython kernel.


[('garlic cloves', 0.9590425491333008),
 ('kalamata olives', 0.9212204217910767),
 ('basil leaves', 0.858863890171051),
 ('cherry tomatoes', 0.8536038398742676),
 ('parmigiano reggiano cheese', 0.8515790700912476),
 ('Mediterranean Salad Platter', 0.8429174423217773),
 ('Greek Salad With Cherry Tomatoes', 0.840489387512207),
 ('Muffuletta', 0.837519109249115),
 ('baguette', 0.8374773263931274),
 ('bucatini', 0.834338903427124)]

#Niedeterminizm

In [64]:
start = time.clock()
# Define random walk query
random_walks_query = """
MATCH (node)
CALL gds.alpha.randomWalk.stream('all', {
  start: id(node),
  steps: 15,
  walks: 5,
  mode:'node2vec',
  inOut:0.6,
  return:1.0
})
YIELD nodeIds
RETURN [id in nodeIds | gds.util.asNode(id).name] as walks
"""
# Fetch data from neo4j
with driver.session() as session:
    walks = session.run(random_walks_query)
    # Train the word2vec model
    clean_walks = [row['walks'] for row in walks]
model = Word2Vec(clean_walks, sg=1, window=5, size=100)
end = time.clock()
total = end - start
print("{0:02f}s".format(total))

model.save("foodEmbedding_niedet1.model")




13.886087s


In [65]:
model.most_similar('olive oil')

  """Entry point for launching an IPython kernel.


[('roasted red peppers', 0.6548368334770203),
 ('canned cannellini beans', 0.6545486450195312),
 ('octopus', 0.6504326462745667),
 ('parlsley', 0.6487253904342651),
 ('chianti', 0.6480021476745605),
 ('globe eggplant', 0.6448535323143005),
 ('fusilli', 0.644520103931427),
 ('lamb loin chops', 0.6434299945831299),
 ('dried tomatoes', 0.6428088545799255),
 ('rigatoni', 0.6423267126083374)]

In [66]:
start = time.clock()
# Define random walk query
random_walks_query = """
MATCH (node)
CALL gds.alpha.randomWalk.stream('all', {
  start: id(node),
  steps: 15,
  walks: 5,
  mode:'node2vec',
  inOut:0.6,
  return:1.0
})
YIELD nodeIds
RETURN [id in nodeIds | gds.util.asNode(id).name] as walks
"""
# Fetch data from neo4j
with driver.session() as session:
    walks = session.run(random_walks_query)
    # Train the word2vec model
    clean_walks = [row['walks'] for row in walks]
model = Word2Vec(clean_walks, sg=1, window=5, size=100)
end = time.clock()
total = end - start
print("{0:02f}s".format(total))

model.save("foodEmbedding_niedet2.model")




13.987671s


In [67]:
model.most_similar('olive oil')

  """Entry point for launching an IPython kernel.


[('whole garlic cloves', 0.6647781133651733),
 ('lamb loin chops', 0.6602407097816467),
 ('tri tip roast', 0.6596821546554565),
 ('chickpea flour', 0.6538629531860352),
 ('wahoo', 0.6509428024291992),
 ('chilli flakes', 0.6505325436592102),
 ('red chiles', 0.6501350402832031),
 ('anise seeds', 0.6495939493179321),
 ('celery ribs', 0.6484204530715942),
 ('penne', 0.6468736529350281)]

#Inne

In [None]:
model.wv['olive oil']

array([-0.15738748, -0.1419574 , -0.11409396,  0.01048606, -0.24764684,
        0.01759799, -0.1062655 , -0.26186362, -0.09348107,  0.4380587 ,
        0.09442838,  0.17305371,  0.4261563 , -0.31597754, -0.05814086,
       -0.4174211 ,  0.07376011,  0.14177975, -0.02243035,  0.15933947,
        0.22278358, -0.0129896 ,  0.10233842,  0.10350949,  0.25057516,
       -0.14087789, -0.05341324,  0.01133251, -0.1369431 , -0.41712046,
        0.1322847 , -0.08088497,  0.2566648 ,  0.02181527,  0.0047138 ,
        0.01825834,  0.0513213 , -0.28244835, -0.03589903, -0.01623186,
        0.02182375, -0.17915241, -0.04042555,  0.2693285 ,  0.4246729 ,
        0.08029356, -0.00816643, -0.19569679, -0.22700413, -0.2910635 ,
        0.4195272 ,  0.1343985 ,  0.05375957,  0.05433673, -0.13591966,
        0.10230035,  0.4201857 , -0.13684915, -0.08646296,  0.12055176,
        0.22700074, -0.08864314, -0.34963968,  0.19826956,  0.16771397,
       -0.18007088,  0.16804825,  0.1470974 , -0.08083645, -0.08

In [73]:
print(clean_walks[:1000])

[['Anchovies Appetizer With Breadcrumbs & Scallions', 'anchovies', 'Bagna Cauda', 'side dish', 'Fresh Bloody Mary', 'flat leaf parsley leaves', 'Grilled Calamari on a Bed of White Beans', 'yellow onion', 'Saffron Sea Bass Bowl', 'chicken broth', 'Mixed Seafood Risotto', 'onion', 'Baked Cod With Mustard Crumbs', 'dijon mustard', 'Salsa Verde Recipe', 'side dish'], ['Anchovies Appetizer With Breadcrumbs & Scallions', 'antipasto', 'Salmon Club with Avocado Butter', 'salmon fillet', 'Chili-Rubbed Salmon', 'oregano', 'Kitchen Play: Salmon with Herbed Butter and Burst Tomato Sauce', 'shallot', 'Foil-Baked Sea Bass with Spinach', 'salt and pepper', 'Dinner Tonight: Kale Caesar Salad with Anchovies', 'main dish', 'Sake-Steamed Halibut with Dilled Carrots', 'dinner', 'Spring (?) Herb Cooking', 'shallots'], ['Anchovies Appetizer With Breadcrumbs & Scallions', 'snack', 'Stuffed Cherry Peppers', 'capers', 'Caponata-stuffed Zucchini', 'lemon wedges', 'Cod with Artichokes and Chickpeas', 'canned chi

In [None]:
model.wv.most_similar(['tomatoes','penne'])

  if np.issubdtype(vec.dtype, np.int):


[('turkey breast', 0.8138086795806885),
 ('cooked couscous', 0.8111531138420105),
 ('pissaladiere   a piece of france', 0.8110102415084839),
 ('macaroni', 0.8081371784210205),
 ('bucatini pasta', 0.8071438670158386),
 ('yellow peppers', 0.8017556071281433),
 ('skim milk mozzarella', 0.7999981641769409),
 ('servings: to 6 as a main course', 0.7998377084732056),
 ('dried chilli flakes', 0.7973442077636719),
 ('red chilis', 0.7963932752609253)]

In [None]:
type(model.most_similar(['tomatoes','fettucine'])[0][0])

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


str

In [None]:
model.similarity("Lemon-Pepper Salmon","Lemon-Pepper Salmon")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


0.9999999