In [15]:
from dotenv import load_dotenv
import os

# Common data processing
import json
import textwrap

# Langchain
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import Docx2txtLoader


# Warning control
import warnings
warnings.filterwarnings("ignore")

In [16]:
# Load from environment
load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE') or 'neo4j'
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# Note the code below is unique to this course environment, and not a 
# standard part of Neo4j's integration with OpenAI. Remove if running 
# in your own environment.
OPENAI_ENDPOINT = os.getenv('OPENAI_BASE_URL') + '/embeddings'

# Global constants
VECTOR_INDEX_NAME = 'form_10k_chunks'
VECTOR_NODE_LABEL = 'Chunk'
VECTOR_SOURCE_PROPERTY = 'text'
VECTOR_EMBEDDING_PROPERTY = 'textEmbedding'

In [17]:
first_file_name = "C:/Users/SSK/Downloads/dwsample1-json.json"

In [18]:
first_file_as_object = json.load(open(first_file_name))

In [19]:
type(first_file_as_object)

dict

In [20]:
for k,v in first_file_as_object.items():
    print(k, type(v))

fruit <class 'str'>
size <class 'str'>
color <class 'str'>


In [21]:
print(first_file_as_object.keys())

dict_keys(['fruit', 'size', 'color'])


In [22]:
item1_text = first_file_as_object.get('item1')

In [23]:
if item1_text is not None:
    print(item1_text[0:1500])
else:
    print("item1_text is None, cannot slice.")


item1_text is None, cannot slice.


In [24]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap  = 200,
    length_function = len,
    is_separator_regex = False,
)

In [25]:
if item1_text is not None:
    item1_text_chunks = text_splitter.split_text(item1_text)
else:
    print("item1_text is None, cannot split text.")
    item1_text_chunks = []  # or handle it according to your needs


item1_text is None, cannot split text.


In [26]:
type(item1_text_chunks)

list

In [27]:
len(item1_text_chunks)

0

In [14]:
item1_text_chunks[0]

IndexError: list index out of range

In [28]:
import json

def split_form10k_data_from_file(file_name):
    # Load the JSON file content into a dictionary
    with open(file_name, 'r') as f:
        file_as_object = json.load(f)
    
    chunks = []  # To store all the chunks
    
    for item in ['item1', 'item1a', 'item7', 'item7a']:  # Iterate over the expected items
        print(f'Processing {item} from {file_name}') 
        item_text = file_as_object.get(item)  # Safely try to get the text for the current item
        
        if item_text:
            # Split the text into chunks if the item exists
            item_text_chunks = text_splitter.split_text(item_text)
            chunks.extend(item_text_chunks)  # Add the chunks to the list
            
            # If you need to perform additional processing on each chunk, you can do so here.
            for i, chunk in enumerate(item_text_chunks):
                print(f"Chunk {i} for {item}: {chunk[:100]}...")  # Preview first 100 chars
        else:
            print(f"Key '{item}' not found in {file_name}. Skipping...")
    
    return chunks  # Return the list of all chunks


In [29]:
first_file_chunks = split_form10k_data_from_file(first_file_name)

for i, chunk in enumerate(first_file_chunks):
    print(f"Chunk {i}: {chunk[:100]}...")  # Preview first 100 chars of each chunk


Processing item1 from C:/Users/SSK/Downloads/dwsample1-json.json
Key 'item1' not found in C:/Users/SSK/Downloads/dwsample1-json.json. Skipping...
Processing item1a from C:/Users/SSK/Downloads/dwsample1-json.json
Key 'item1a' not found in C:/Users/SSK/Downloads/dwsample1-json.json. Skipping...
Processing item7 from C:/Users/SSK/Downloads/dwsample1-json.json
Key 'item7' not found in C:/Users/SSK/Downloads/dwsample1-json.json. Skipping...
Processing item7a from C:/Users/SSK/Downloads/dwsample1-json.json
Key 'item7a' not found in C:/Users/SSK/Downloads/dwsample1-json.json. Skipping...


In [49]:
first_file_chunks[0]

IndexError: list index out of range

In [30]:
merge_chunk_node_query = """
MERGE(mergedChunk:Chunk {chunkId: $chunkParam.chunkId})
    ON CREATE SET 
        mergedChunk.names = $chunkParam.names,
        mergedChunk.formId = $chunkParam.formId, 
        mergedChunk.cik = $chunkParam.cik, 
        mergedChunk.cusip6 = $chunkParam.cusip6, 
        mergedChunk.source = $chunkParam.source, 
        mergedChunk.f10kItem = $chunkParam.f10kItem, 
        mergedChunk.chunkSeqId = $chunkParam.chunkSeqId, 
        mergedChunk.text = $chunkParam.text
RETURN mergedChunk
"""

In [31]:
kg = Neo4jGraph(
    url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, database=NEO4J_DATABASE
)

In [32]:
kg.query(merge_chunk_node_query, 
         params={'chunkParam':first_file_chunks[0]})

IndexError: list index out of range

In [33]:
kg.query("""
CREATE CONSTRAINT unique_chunk IF NOT EXISTS 
    FOR (c:Chunk) REQUIRE c.chunkId IS UNIQUE
""")


Forbidden: {code: Neo.ClientError.Security.Forbidden} {message: Creating new node label on database 'neo4j' is not allowed for user 'neo4j' with roles [PUBLIC]. See GRANT CREATE NEW NODE LABEL ON DATABASE `neo4j`...}

In [34]:
kg.query("SHOW INDEXES")

Forbidden: {code: Neo.ClientError.Security.Forbidden} {message: Show indexes on database 'neo4j' is not allowed for user 'neo4j' with roles [PUBLIC].}

In [35]:
node_count = 0
for chunk in first_file_chunks:
    print(f"Creating `:Chunk` node for chunk ID {chunk['chunkId']}")
    kg.query(merge_chunk_node_query, 
            params={
                'chunkParam': chunk
            })
    node_count += 1
print(f"Created {node_count} nodes")

Created 0 nodes


In [36]:
kg.query("""
         MATCH (n)
         RETURN count(n) as nodeCount
         """)

[{'nodeCount': 0}]

In [37]:
kg.query("""
         CREATE VECTOR INDEX `form_10k_chunks` IF NOT EXISTS
          FOR (c:Chunk) ON (c.textEmbedding) 
          OPTIONS { indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'    
         }}
""")

Forbidden: {code: Neo.ClientError.Security.Forbidden} {message: Creating new node label on database 'neo4j' is not allowed for user 'neo4j' with roles [PUBLIC]. See GRANT CREATE NEW NODE LABEL ON DATABASE `neo4j`...}

In [38]:
kg.query("SHOW INDEXES")

Forbidden: {code: Neo.ClientError.Security.Forbidden} {message: Show indexes on database 'neo4j' is not allowed for user 'neo4j' with roles [PUBLIC].}

In [63]:
kg.query("""
    MATCH (chunk:Chunk) WHERE chunk.textEmbedding IS NULL
    WITH chunk, genai.vector.encode(
      chunk.text, 
      "OpenAI", 
      {
        token: $openAiApiKey, 
        endpoint: $openAiEndpoint
      }) AS vector
    CALL db.create.setNodeVectorProperty(chunk, "textEmbedding", vector)
    """, 
    params={"openAiApiKey":OPENAI_API_KEY, "openAiEndpoint": OPENAI_ENDPOINT} )

ValueError: Generated Cypher Statement is not valid
{code: Neo.ClientError.Statement.SyntaxError} {message: Unknown function 'genai.vector.encode' (line 3, column 21 (offset: 75))
"    WITH chunk, genai.vector.encode("
                 ^}

In [57]:
def neo4j_vector_search(question):
  """Search for similar nodes using the Neo4j vector index"""
  vector_search_query = """
    WITH genai.vector.encode(
      $question, 
      "OpenAI", 
      {
        token: $openAiApiKey,
        endpoint: $openAiEndpoint
      }) AS question_embedding
    CALL db.index.vector.queryNodes($index_name, $top_k, question_embedding) yield node, score
    RETURN score, node.text AS text
  """
  similar = kg.query(vector_search_query, 
                     params={
                      'question': question, 
                      'openAiApiKey':OPENAI_API_KEY,
                      'openAiEndpoint': OPENAI_ENDPOINT,
                      'index_name':VECTOR_INDEX_NAME, 
                      'top_k': 10})
  return similar

In [58]:
search_results = neo4j_vector_search(
    'In a single sentence, tell me about Netapp.'
)

ValueError: Generated Cypher Statement is not valid
{code: Neo.ClientError.Statement.SyntaxError} {message: Unknown function 'genai.vector.encode' (line 2, column 6 (offset: 10))
"    WITH genai.vector.encode("
          ^}

In [59]:
neo4j_vector_store = Neo4jVector.from_existing_graph(
    embedding=OpenAIEmbeddings(),
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name=VECTOR_INDEX_NAME,
    node_label=VECTOR_NODE_LABEL,
    text_node_properties=[VECTOR_SOURCE_PROPERTY],
    embedding_node_property=VECTOR_EMBEDDING_PROPERTY,
)


APIConnectionError: Connection error.

In [None]:
from langchain_openai import ChatOpenAI
from langchain.agents import AgentExecutor, create_react_agent
from langchain.tools import Tool
from langchain import hub
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain.schema import StrOutputParser
from langchain_community.chat_message_histories import Neo4jChatMessageHistory
from langchain_community.graphs import Neo4jGraph
from uuid import uuid4

SESSION_ID = str(uuid4())
print(f"Session ID: {SESSION_ID}")

llm = ChatOpenAI(openai_api_key="sk-...")

graph = Neo4jGraph(
    url="bolt://localhost:7687",
    username="neo4j",
    password="12345678"
)

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a Neo4j expert having a conversation about how to create Cypher queries",
        ),
        ("human", "{input}"),
    ]
)

cypher_chat = prompt | llm | StrOutputParser()

def get_memory(session_id):
    return Neo4jChatMessageHistory(session_id=session_id, graph=graph)

tools = [
    Tool.from_function(
        name="Cypher Support",
        description="For when you need to talk about Cypher queries.",
        func=cypher_chat.invoke,
    )
]

agent_prompt = hub.pull("hwchase17/react-chat")
agent = create_react_agent(llm, tools, agent_prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools)

cypher_agent = RunnableWithMessageHistory(
    agent_executor,
    get_memory,
    input_messages_key="input",
    history_messages_key="chat_history",
)

while True:
    q = input("> ")

    response = cypher_agent.invoke(
        {
            "input": q
        },
        {"configurable": {"session_id": SESSION_ID}},
    )
    
    print(response["output"])