In [None]:
# Download and install Python packages needed for this Jupyter Notebook

!pip install openai tiktoken neo4j

In [None]:
import os
import openai
from neo4j import GraphDatabase
import tiktoken

# Take the OpenAI API key from the OPENAI_API_KEY environment variable
# (None is assigned when that variable is not set)
openai.api_key = os.environ.get("OPENAI_API_KEY")

# To check that the key was picked up, uncomment the line below
#openai.Model.list()

# The key can instead be typed in by hand by uncommenting the line below
#openai.api_key = "Your Open AI key between the quotation marks"


In [None]:
# This block connects the Jupyter Notebook to your Neo4j Database
# Note: Your Neo4j Database must be running and accepting connections
# Note: The defaults below are for connecting to a local instance of Neo4j.
#       Set the NEO4J_URI, NEO4J_USERNAME and NEO4J_PASSWORD environment
#       variables to connect elsewhere without writing credentials into
#       the notebook.
# More information on interfacing with Neo4j from Python can be found at
# https://neo4j.com/docs/python-manual/current/connect/

uri = os.getenv('NEO4J_URI', 'bolt://localhost:7687')
username = os.getenv('NEO4J_USERNAME', 'neo4j')
password = os.getenv('NEO4J_PASSWORD', 'password')
driver = GraphDatabase.driver(uri, auth=(username, password))

# Fail in this cell, with a connection/authentication error, instead of in
# whichever later cell happens to run the first query
driver.verify_connectivity()

In [None]:
#This block calculates the dollar cost of obtaining Vector Embeddings for
#abstracts within the database that do not currently have an embedding

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens `string` encodes to under the named tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name handed to ``tiktoken.get_encoding``.

    Returns:
        Length of the sequence produced by the encoding's ``encode`` method.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

# Pricing assumption used for the estimate, in US dollars per 1000 tokens
EMBEDDING_COST_PER_1K_TOKENS = 0.0001
# Encoding name handed to tiktoken when counting tokens
TOKEN_ENCODING_NAME = "cl100k_base"

# Fetch the abstracts that still need an embedding. The filter matches the one
# used by the embedding cell further down: `n.abstract <> ''` drops empty
# abstracts and, because a comparison with NULL is never true, missing ones.
records, summary, keys = driver.execute_query(
    "MATCH (n:Work) WHERE n.embedding IS NULL AND n.abstract <> '' "
    "RETURN n.abstract")

# Number of abstracts that would be sent for embedding
abstract_count = len(records)

# Total number of tokens across those abstracts
token_count = sum(
    num_tokens_from_string(record["n.abstract"], TOKEN_ENCODING_NAME)
    for record in records)

estimated_cost = round((token_count / 1000) * EMBEDDING_COST_PER_1K_TOKENS, 3)

print("Abstracts without Vector Embeddings - " + str(abstract_count)
      + " (" + str(token_count) + " tokens)")
print("It will cost approximately - $" + str(estimated_cost)
      + " Dollars to process abstracts into Vector Embeddings, assuming a cost "
      + "of $" + str(EMBEDDING_COST_PER_1K_TOKENS) + " per 1000 tokens")

In [None]:
# This block identifies Work nodes with abstracts that lack vector embeddings
# The abstract and id for these Work nodes are returned in batches
# Each abstract is sent to OpenAI to return the vector embedding, which is added
# as an 'embedding' property to the respective Work node

# Number of Work nodes fetched from Neo4j per batch. The LIMIT, the progress
# counter and the progress messages all derive from this one value.
BATCH_SIZE = 10
# OpenAI model used to embed each abstract
EMBEDDING_MODEL = "text-embedding-ada-002"

# Shared filter for the count query and the batch query.
# `n.abstract <> ''` also excludes nodes that have no abstract at all,
# because a comparison with NULL is never true.
ELIGIBLE_WORKS = \
    "MATCH (n:Work) WHERE n.embedding IS NULL AND n.abstract <> '' "

records_for_node_embedding_start, summary_for_node_embedding_start, \
    keys_for_node_embedding_start = driver.execute_query(
        ELIGIBLE_WORKS + "RETURN COUNT(n)")
eligible_node_without_embedding = records_for_node_embedding_start[0][0]

print("Nodes with abstracts that do not have vector embeddings - " +
      str(eligible_node_without_embedding))
print("These will be processed in batches of " + str(BATCH_SIZE))

# Each pass fetches the next batch of Work nodes that still lack an embedding.
# Nodes processed in earlier passes no longer match the filter, so the same
# query naturally moves on to the remaining nodes.
while eligible_node_without_embedding > 0:
    print("Processing embedding for node number - " +
          str(eligible_node_without_embedding))
    records, summary, keys = driver.execute_query(
        ELIGIBLE_WORKS + "RETURN n.abstract, n.id LIMIT $limit",
        limit=BATCH_SIZE)

    for record in records:

        node_id = record["n.id"]
        abstract = record["n.abstract"]

        # Ask OpenAI for the vector embedding of this abstract
        embedding = openai.Embedding.create(
            input=abstract, model=EMBEDDING_MODEL)["data"][0]["embedding"]

        # Store the embedding on the Work node with the matching id
        # NOTE(review): newer Neo4j 5.x releases appear to deprecate
        # db.create.setVectorProperty in favour of
        # db.create.setNodeVectorProperty - confirm against your server version
        neo4j_record, summary, keys = driver.execute_query(
            "MATCH (w:Work {id: $id}) WITH w "
            "CALL db.create.setVectorProperty(w, 'embedding', $embedding) "
            "YIELD node RETURN node",
            id=node_id,
            embedding=embedding)

        # Note: If you have a new OpenAI account, the number of queries per minute
        # will be throttled. This can be addressed by including a sleep timer
        # The sleep function will need to be imported - from time import sleep
        #sleep(41.2)

    # The counter only drives progress reporting and loop termination; it is
    # reduced by the batch size even when the final batch is smaller
    eligible_node_without_embedding -= BATCH_SIZE

    if eligible_node_without_embedding > 0:
        print("Remaining nodes without embeddings - " +
              str(eligible_node_without_embedding))
    else:
        print ("Vector embeddings have been added for all Work Nodes")

In [None]:
# This block creates the Vector Search Index 'abstract-embeddings' over the
# 'embedding' property of Work nodes (1536 dimensions, cosine similarity)
# More information is available at https://neo4j.com/docs/cypher-manual/current/indexes-for-vector-search/
# Note: The index is only created when no index with that name exists yet, so
# this block can be re-run without raising an "index already exists" error
# NOTE(review): the dimension (1536) presumably has to match the length of the
# stored embedding vectors - confirm if the embedding model is changed

VECTOR_INDEX_NAME = 'abstract-embeddings'

# Collect the names of the indexes currently defined in the database
index_records, summary, keys = driver.execute_query("SHOW INDEXES YIELD name")
existing_index_names = {record["name"] for record in index_records}

if VECTOR_INDEX_NAME in existing_index_names:
    print("Vector index '" + VECTOR_INDEX_NAME +
          "' already exists - skipping creation")
else:
    neo4j_record, summary, keys = driver.execute_query(
        "CALL db.index.vector.createNodeIndex("
        "$index_name, 'Work', 'embedding', 1536, 'cosine')",
        index_name=VECTOR_INDEX_NAME)
    print("Vector index '" + VECTOR_INDEX_NAME + "' created")

In [None]:
# This block will find the closest matches (up to TOP_K) to the Work selected,
# based on the cosine similarities of the vector embeddings for each abstract

# This will search across all Works, regardless of original source if
# metadata was imported using the data schema model provided by the
# Jupyter Notebooks available at
# https://github.com/vtmike2015/Graph-Based-Literature-Review-Tool

# Users need to input the Id of the Work of interest to find the closest
# match. This is the 'id' property for a Work node, regardless of source

# Number of similar Works to report
TOP_K = 25

work_id = input(
    f'Please enter the Id of the Work you want to the closest {TOP_K} matches for'
).strip()

# The Id is passed as a query parameter instead of being concatenated into the
# Cypher text, so characters in the user's input cannot alter the query.
# TOP_K + 1 neighbours are requested because the selected Work is normally its
# own nearest neighbour and is filtered out by `w.id <> similarAbstract.id`.
# Results are ordered by score so that "Match Number 1" is the closest match.
# Note: the MATCH on WROTE means Works with no linked Author are not returned.
records, summary, keys = driver.execute_query(
    "MATCH (w:Work) WHERE w.id = $work_id AND w.embedding IS NOT NULL "
    "CALL db.index.vector.queryNodes('abstract-embeddings', $neighbours, w.embedding) "
    "YIELD node AS similarAbstract, score "
    "MATCH (n:Work)<-[:WROTE]-(a:Author) "
    "WHERE n.id = similarAbstract.id AND w.id <> similarAbstract.id "
    "RETURN w.id, w.display_name, w.abstract, score, n.id, n.display_name, "
    "COLLECT(a.display_name) AS authors, n.publication_year, n.abstract, n.source "
    "ORDER BY score DESC LIMIT $top_k",
    work_id=work_id,
    neighbours=TOP_K + 1,
    top_k=TOP_K)

if not records:
    # Either no Work has this Id, the Work has no embedding yet, or none of
    # its neighbours satisfied the query
    print("No matches were found for Id '" + work_id + "'. Check that the Id "
          "exists and that the Work has a vector embedding.")
else:
    # Every row repeats the selected Work's own properties; read them once
    original_work_id = records[0]['w.id']
    original_work_title = records[0]['w.display_name']
    original_work_abstract = records[0]['w.abstract']

    # f-strings are used so that a missing (null) property prints as None
    # instead of raising a TypeError during string concatenation
    print(f"You selected '{original_work_title}' with Id - {original_work_id}")
    print(f"This is the original work's abstract - {original_work_abstract}")
    print("")

    for counter, record in enumerate(records, start=1):

        node_id = record['n.id']
        score = record['score']
        title = record['n.display_name']
        abstract = record['n.abstract']
        authors = record['authors']
        source = record['n.source']

        print(f"Article Match Number {counter} is - {title}")
        print(f"The Similarity score is {round(score, 5)}")
        print(f"The Source is {source}")
        print(f"The ID is - {node_id}")
        print(f"The authors are - {authors}")
        print(f"This is the abstract - {abstract}")
        print("")