In [28]:
%pip install pandas openai langchain langchain-openai langchain-community neo4j jupyter tiktoken

Collecting langchain-community
  Downloading langchain_community-0.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading SQLAlchemy-2.0.35-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Using cached dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain
  Downloading langchain-0.3.7-py3-none-any.whl.metadata (7.1 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.23.1-py3-none-any.whl.metadata (7.5 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchai

In [2]:
from neo4j import GraphDatabase
from openai import OpenAI
import tiktoken
import pandas as pd

In [None]:
username = 'neo4j'              # Neo4j username
password = ''                   # Neo4j password
uri = ''                        # Neo4j URI
openai_api_key = ''             # OpenAI API key.
encoding_name = 'cl100k_base'
embeddings_model = 'text-embedding-3-small'

auth = (username, password)

Get all text from graph

In [4]:
result = None

with GraphDatabase.driver(uri, auth=auth) as driver:
    result = driver.execute_query(
        '''
        MATCH (n) WHERE n.text IS NOT NULL
        RETURN elementId(n) AS id, n.text AS text
        '''
    )

In [5]:
text_data = []
for res in result.records:
    text_data.append({'id': res['id'], 'text': res['text']})

In [6]:
len(text_data)

80

In [7]:
text_len = []
for text in text_data:
    text_len.append(len(text['text']))

In [8]:
max(text_len)

6915

measure token lengths

In [9]:
def num_tokens_from_string(text, enc_name):
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(enc_name)
    num_tokens = len(encoding.encode(text))
    return num_tokens

In [10]:
token_len = []
for item in text_data:
    token_len.append(num_tokens_from_string(item['text'], encoding_name))

In [11]:
max(token_len)

1336

Maximum token size is less than maximum input allowed for `text-embedding-3-small` model (https://platform.openai.com/docs/guides/embeddings#embedding-models), so there is no need for chunking.

next, convert the list to dataframe

In [12]:
text_df = pd.DataFrame.from_dict(text_data)

In [13]:
text_df.head(5)

Unnamed: 0,id,text
0,4:25f52c68-6701-4c88-b184-1749f4be899a:25,Title: XYZ IoT Manufactr\nAuthor: Muhammad Ari...
1,4:25f52c68-6701-4c88-b184-1749f4be899a:26,What is Manufactr?\nXYZ IoT Manufactr provides...
2,4:25f52c68-6701-4c88-b184-1749f4be899a:27,Smart Manufacturing OEE\nOEE (Overall Equipmen...
3,4:25f52c68-6701-4c88-b184-1749f4be899a:28,"The image depicts an industrial scene, likely ..."
4,4:25f52c68-6701-4c88-b184-1749f4be899a:29,Smart Manufacturing CMMS\nA \ncomputerized mai...


Get vector embedding from OpenAI

In [14]:
client = OpenAI(api_key=openai_api_key)

In [15]:
def get_embedding(text, model=embeddings_model):
    text = text.replace('\n', " ")
    return client.embeddings.create(
        input=[text],
        model=model
    ).data[0].embedding

text_df['embedding'] = text_df.text.apply(lambda x: get_embedding(x, model=embeddings_model))


In [16]:
text_df.head(5)

Unnamed: 0,id,text,embedding
0,4:25f52c68-6701-4c88-b184-1749f4be899a:25,Title: XYZ IoT Manufactr\nAuthor: Muhammad Ari...,"[6.934024713700637e-05, 0.029017282649874687, ..."
1,4:25f52c68-6701-4c88-b184-1749f4be899a:26,What is Manufactr?\nXYZ IoT Manufactr provides...,"[-0.0008101155981421471, 0.053481556475162506,..."
2,4:25f52c68-6701-4c88-b184-1749f4be899a:27,Smart Manufacturing OEE\nOEE (Overall Equipmen...,"[0.03748495504260063, 0.04553172364830971, 0.0..."
3,4:25f52c68-6701-4c88-b184-1749f4be899a:28,"The image depicts an industrial scene, likely ...","[-0.018227674067020416, 0.009691787883639336, ..."
4,4:25f52c68-6701-4c88-b184-1749f4be899a:29,Smart Manufacturing CMMS\nA \ncomputerized mai...,"[-0.016483249142766, 0.052513863891363144, 0.0..."


## Load vector embeddings to Neo4j

### Create vector index

In [19]:
with GraphDatabase.driver(uri, auth=auth) as driver:
    driver.execute_query(
        '''
        CREATE VECTOR INDEX ChunkVectorIndex IF NOT EXISTS
        FOR (n:Chunk)
        ON (n.embedding)
        OPTIONS {
            indexConfig: {
                `vector.dimensions`: 1536,
                `vector.similarity_function`: 'cosine'
            }
        }
        '''
    )

### Load embeddings to the graph. New nodes labeled `Chunk` with the text and embedding information are created.

In [35]:
with GraphDatabase.driver(uri, auth=auth) as driver:
    res = driver.execute_query(
        '''
        UNWIND $row AS row
        MATCH (n) WHERE elementId(n)=row.id
        MERGE (m:Chunk)-[:PARTS_OF]->(n)
        SET m.embedding=row.embedding,m.text=row.text
        RETURN count(n) AS total_nodes_updated
        ''',row=text_df.to_dict(orient='records')
    )

    print(res.records)

[<Record total_nodes_updated=40>]


# Semantic Search

## Simple search

In [17]:
from langchain.vectorstores import Neo4jVector
from langchain.vectorstores.neo4j_vector import SearchType
from langchain_openai import OpenAIEmbeddings
from langchain.docstore.document import Document
from typing import List, Tuple

def to_df(results: List[Tuple[Document, float]]):
    return pd.DataFrame({
        "score": [r[1] for r in results],
        "text": [r[0].page_content for r in results]
    })

Reference: https://python.langchain.com/api_reference/community/vectorstores/langchain_community.vectorstores.neo4j_vector.Neo4jVector.html#langchain_community.vectorstores.neo4j_vector.Neo4jVector

In [18]:
db = Neo4jVector.from_existing_index(
    url=uri,
    username=username,
    password=password,
    embedding=OpenAIEmbeddings(model=embeddings_model, api_key=openai_api_key),
    index_name='ChunkVectorIndex',
    node_label='Chunk',
    embedding_node_property='embedding',
    text_node_property='text',
    search_type=SearchType.VECTOR
)

In [19]:
query = 'what is IOT Manufactr?'
results = db.similarity_search_with_score(query, k=3)

In [20]:
results_df = to_df(results)
results_df

Unnamed: 0,score,text
0,0.824573,What is Manufactr?\nXYZ IoT Manufactr provides...
1,0.79817,"Overview\n \nAt our company, we specialize in ..."
2,0.772615,Products\nXYZ IoT offers the following product...


In [22]:
print(f'{results_df.at[0, 'text']} \n score: {results_df.at[0, 'score']}')

What is Manufactr?
XYZ IoT Manufactr provides a one-stop solution for the Smart Manufacturing. This means that customers don't have to worry about connecting various parts of a complex systems. XYZ IoT Manufactr features and applications can be customized to match the business processes, requirements, and integrated into the company's database or backend systems. XYZ IoT Manufactr solution provides customized solutions to meet customer needs. The implementation of this solution begins with a site visit to assess the field conditions in order to provide the right Smart Manufacturing solutions and a suitable roadmap. 
 score: 0.8245730400085449


## More complex search with multi-hop retrieval using KG

In [24]:
db_mh = Neo4jVector(
    url=uri,
    username=username,
    password=password,
    embedding=OpenAIEmbeddings(model=embeddings_model, api_key=openai_api_key)
)

In [38]:
query2 = 'Who to contact for issues with cold chain monitoring systems?'
query2_vector = db_mh.embedding.embed_query(text=query2)

In [39]:
query2_vector[:5]

[-0.02918504737317562,
 0.012809954583644867,
 0.05352035164833069,
 -0.025230199098587036,
 -0.04387860745191574]

In [40]:
results_mh = db_mh.query('''
    CALL db.index.vector.queryNodes('ChunkVectorIndex', 3, $queryVector)
    YIELD node AS similarChunks, score
                         
    MATCH (similarChunks)-[:PARTS_OF*2]->(page:WebPage)
    MATCH (page)<-[:PARTS_OF]-(webpart:WebPart {content: 'People'})
    MATCH (webpart)-[:LINKED_TO]->(e:Employee)
    RETURN 
        e.name AS name, 
        e.email AS email, 
        e.role AS role, 
        score, 
        page.name AS reference
    ORDER BY score DESC
''', params={'queryVector': query2_vector}
)

In [41]:
results_mh

[{'name': 'Muhammad Arif Wicaksana',
  'email': 'arif@muarwi.onmicrosoft.com',
  'role': 'Head of Product',
  'score': 0.7429332733154297,
  'reference': 'IoT-Asset-Performance-Management.aspx'},
 {'name': 'Pramoedya Toer',
  'email': 'pram@muarwi.onmicrosoft.com',
  'role': 'Product Manager',
  'score': 0.7429332733154297,
  'reference': 'IoT-Asset-Performance-Management.aspx'},
 {'name': 'Muhammad Arif Wicaksana',
  'email': 'arif@muarwi.onmicrosoft.com',
  'role': 'Head of Product',
  'score': 0.6936264038085938,
  'reference': 'IoT-Environment.aspx'},
 {'name': 'Pramoedya Toer',
  'email': 'pram@muarwi.onmicrosoft.com',
  'role': 'Product Manager',
  'score': 0.6936264038085938,
  'reference': 'IoT-Environment.aspx'},
 {'name': 'Muhammad Arif Wicaksana',
  'email': 'arif@muarwi.onmicrosoft.com',
  'role': 'Head of Product',
  'score': 0.6929721832275391,
  'reference': 'IoT-Environment.aspx'},
 {'name': 'Pramoedya Toer',
  'email': 'pram@muarwi.onmicrosoft.com',
  'role': 'Product 