In [1]:
import nest_asyncio
nest_asyncio.apply()
import numpy as np
from scipy.spatial import distance
import pandas as pd
from gremlin_python import statics
from gremlin_python.structure.graph import Graph
from gremlin_python.process.graph_traversal import __
from gremlin_python.process.strategies import *
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
%load_ext graph_notebook.magics

In [2]:
# Read the articles table from its CSV file into a DataFrame
articles_csv = '/home/jovyan/product-retail-graph/suramya/articles.csv'
articles = pd.read_csv(articles_csv)

In [3]:
# Open a remote Gremlin connection to the Neptune endpoint and bind a
# traversal source (`g`) to it; every later query goes through `g`.
neptune_endpoint = 'wss://database-prg-instance-1.cbtcet4rvkih.eu-central-1.neptune.amazonaws.com:8182/gremlin'
graph = Graph()
remoteConn = DriverRemoteConnection(neptune_endpoint, 'g')
g = graph.traversal().withRemote(remoteConn)

In [4]:
# Fetch every 'category' and 'subcategory' vertex, then keep just their ids
cat_id = g.V().hasLabel('category').toList()
subcat_id = g.V().hasLabel('subcategory').toList()
cat_list = [vertex.id for vertex in cat_id]
subcat_list = [vertex.id for vertex in subcat_id]

In [6]:
# Map each category id to a random 20-element embedding vector,
# then print the first two entries as a quick preview.
Dict_cat = {category_id: np.random.random(20) for category_id in cat_list}
preview_ids = list(Dict_cat)[:2]
cat_pairs = {category_id: Dict_cat[category_id] for category_id in preview_ids}
print(cat_pairs)

{'cat1': array([0.1330669 , 0.99575961, 0.65673303, 0.27218961, 0.77724964,
       0.79518356, 0.84628412, 0.91077527, 0.31798432, 0.1766595 ,
       0.65495237, 0.83477525, 0.55336384, 0.40627217, 0.65716944,
       0.68265778, 0.95817116, 0.74677859, 0.75779996, 0.70453839]), 'cat2': array([0.4610979 , 0.45772236, 0.09755177, 0.03107083, 0.73065556,
       0.81498093, 0.13096442, 0.91974547, 0.6688657 , 0.44055261,
       0.28880865, 0.37109123, 0.2840588 , 0.16019996, 0.54273717,
       0.51956736, 0.59637886, 0.55203922, 0.67267253, 0.20905768])}


In [7]:
# Draw a random 20-element vector to act as the user query embedding
user_query = np.random.random(size=20)
user_query

array([0.39511006, 0.85708991, 0.84189759, 0.16099554, 0.96171555,
       0.76698592, 0.16690833, 0.54296502, 0.46206652, 0.25051012,
       0.18762228, 0.79479718, 0.08349266, 0.04977463, 0.97056768,
       0.43827672, 0.40709177, 0.64935633, 0.82383644, 0.66952236])

In [8]:
# Reverse lookup: find the dictionary key whose stored vector equals a given vector
def get_key(dictionary, v):
    """Return the first key in ``dictionary`` whose value equals ``v``.

    Values are compared with ``np.array_equal``, so a stored value whose
    shape differs from ``v`` simply counts as "not equal" instead of raising
    (or producing a plain bool that has no ``.all()``), and plain sequences
    such as lists are accepted as well as arrays.

    Parameters
    ----------
    dictionary : dict
        Mapping from keys to array-like values.
    v : array-like
        The value to search for.

    Returns
    -------
    The first matching key in iteration order, or ``0`` when no value
    matches. The ``0`` sentinel is kept for backward compatibility; note it
    is indistinguishable from a genuine key equal to ``0``.
    """
    for key, value in dictionary.items():
        # array_equal requires identical shape AND identical elements
        if np.array_equal(value, v):
            return key
    return 0

In [9]:
# Find the category whose embedding has the highest cosine similarity to the user query.
# The winning key is tracked directly while scanning, so no reverse lookup by value is needed.
if not Dict_cat:
    # Without this guard an empty dictionary would leave most_sim_val undefined
    # (or silently reuse a stale value from an earlier run of the notebook).
    raise ValueError("Dict_cat is empty: no category embeddings to compare against")
sim_with_cat = float("-inf")  # lower than any similarity score, so the first candidate always wins
most_sim_cat = None
most_sim_val = None
for cat_key, vec in Dict_cat.items():
    val = 1 - distance.cosine(user_query, vec)  # cosine() is a distance; 1 - distance gives similarity
    if val > sim_with_cat:  # strict '>' keeps the first entry on ties
        sim_with_cat = val
        most_sim_cat = cat_key
        most_sim_val = vec
print(sim_with_cat)
print(most_sim_cat)

0.8980704313810749
cat1


In [12]:
# Follow every outgoing edge from the best-matching category vertex
# and collect the vertices on the other end.
outgoing_from_best_cat = g.V(most_sim_cat).out()
all_subcat = outgoing_from_best_cat.toList()
all_subcat

[v[sub40], v[sub45], v[sub79]]

In [17]:
# Map each element of all_subcat to a random 20-element embedding vector.
# Note: the keys are the vertex objects themselves (e.g. v[sub40]), not their ids.
Dict_subcat = {subcat_vertex: np.random.random(20) for subcat_vertex in all_subcat}
subcat_pairs = dict(Dict_subcat)  # shallow copy of the full mapping, printed for inspection
print(subcat_pairs)

{v[sub40]: array([0.54945051, 0.11516799, 0.24789766, 0.42546653, 0.25536694,
       0.48522053, 0.78996063, 0.94410319, 0.53611626, 0.61873909,
       0.57983316, 0.55474408, 0.35766001, 0.51155505, 0.75041942,
       0.91276249, 0.35073216, 0.5462668 , 0.89927453, 0.07045241]), v[sub45]: array([0.33932378, 0.88846157, 0.54219295, 0.86837778, 0.24314231,
       0.25965387, 0.13106706, 0.59963717, 0.79898762, 0.56920886,
       0.47863273, 0.51214468, 0.84369693, 0.8570339 , 0.6410158 ,
       0.87057016, 0.63362836, 0.67953522, 0.22157012, 0.14757547]), v[sub79]: array([0.21917476, 0.88576714, 0.75678732, 0.03235444, 0.35575454,
       0.68178606, 0.92495478, 0.57744242, 0.34696274, 0.68726833,
       0.19395556, 0.15326289, 0.68838257, 0.33489249, 0.25909113,
       0.66761439, 0.88418912, 0.26469156, 0.81463479, 0.0350547 ])}


In [21]:
# Find the subcategory whose embedding has the highest cosine similarity to the user query.
# The winning key is tracked directly while scanning, so no reverse lookup by value is needed.
if not Dict_subcat:
    # Without this guard an empty dictionary would silently reuse most_sim_val
    # from an earlier cell and report a wrong (or missing) subcategory.
    raise ValueError("Dict_subcat is empty: no subcategory embeddings to compare against")
sim_with_subcat = float("-inf")  # lower than any similarity score, so the first candidate always wins
most_sim_subcat = None
most_sim_val = None
for subcat_key, vec in Dict_subcat.items():
    val = 1 - distance.cosine(user_query, vec)  # cosine() is a distance; 1 - distance gives similarity
    if val > sim_with_subcat:  # strict '>' keeps the first entry on ties
        sim_with_subcat = val
        most_sim_subcat = subcat_key
        most_sim_val = vec
print(sim_with_subcat)
print(most_sim_subcat)

0.755989285206581
v[sub40]


In [None]:
%%gremlin
g.V()has('~id','most_s).out()