In [214]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [215]:
'''
!pip install python-dotenv
!pip install neo4j
!pip install pandas
!pip install langchain
!pip install transformers
!pip install langchain.chains
'''

# Note: will do a requirements.txt file later

'\n!pip install python-dotenv\n!pip install neo4j\n!pip install pandas\n!pip install langchain\n!pip install transformers\n!pip install langchain.chains\n'

In [216]:
from dotenv import load_dotenv
import os
from neo4j import GraphDatabase
import pandas as pd
from xlsx_to_csv import *
import warnings
warnings.filterwarnings("ignore")

load_dotenv()

NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')

URI = NEO4J_URI
AUTH = (NEO4J_USERNAME, NEO4J_PASSWORD)

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')


with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()

In [217]:
from kg import *

create_kg()

Knowledge Graph created with 33 nodes and 96 relationships.


In [218]:
from langchain_community.graphs import Neo4jGraph

graph = Neo4jGraph()

graph.refresh_schema()

print(graph.schema)

Node properties:
Person {name: STRING, embedding: LIST}
Relationship properties:

The relationships:
(:Person)-[:TEAMMATE]->(:Person)
(:Person)-[:FRIEND]->(:Person)
(:Person)-[:CLASSMATE]->(:Person)
(:Person)-[:FAMILY]->(:Person)
(:Person)-[:IMMEDIATE_FAMILY]->(:Person)
(:Person)-[:LOVER]->(:Person)
(:Person)-[:OPPONENT]->(:Person)
(:Person)-[:ENEMY]->(:Person)
(:Person)-[:TEACHER]->(:Person)


### Straightforward Querying

In [219]:
from langchain.chains import GraphCypherQAChain
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    model="gpt-4",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
)

chain = GraphCypherQAChain.from_llm(graph=graph, llm=llm, verbose=True)
response = chain.invoke({'query': 'List all the people who are not enemies with Harry'})
response



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Person) WHERE NOT (p)-[:ENEMY]->(:Person {name: 'Harry'}) RETURN p.name[0m
Full Context:
[32;1m[1;3m[{'p.name': 'Jordan'}, {'p.name': 'Piers'}, {'p.name': 'Harry'}, {'p.name': 'James'}, {'p.name': 'Lily'}, {'p.name': 'Petunia'}, {'p.name': 'Vernon Dursley'}, {'p.name': 'Dudley'}, {'p.name': 'Dumbledore'}, {'p.name': 'McGonagall'}][0m

[1m> Finished chain.[0m


{'query': 'List all the people who are not enemies with Harry',
 'result': 'The people who are not enemies with Harry are Jordan, Piers, James, Lily, Petunia, Vernon Dursley, Dudley, Dumbledore, and McGonagall.'}

In [220]:
response = chain.invoke({'query': 'Harry non enemies'})
response



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Person {name: 'Harry'})-[:ENEMY]->(enemy:Person)
WITH collect(enemy.name) as enemies
MATCH (non_enemy:Person)
WHERE NOT non_enemy.name IN enemies
RETURN non_enemy.name[0m
Full Context:
[32;1m[1;3m[{'non_enemy.name': 'Jordan'}, {'non_enemy.name': 'Piers'}, {'non_enemy.name': 'Harry'}, {'non_enemy.name': 'James'}, {'non_enemy.name': 'Lily'}, {'non_enemy.name': 'Petunia'}, {'non_enemy.name': 'Vernon Dursley'}, {'non_enemy.name': 'Dudley'}, {'non_enemy.name': 'Dumbledore'}, {'non_enemy.name': 'McGonagall'}][0m

[1m> Finished chain.[0m


{'query': 'Harry non enemies',
 'result': 'Jordan, Piers, Harry, James, Lily, Petunia, Vernon Dursley, Dudley, Dumbledore, and McGonagall are not enemies of Harry.'}

### Setting Embeddings as a property

In [221]:
def get_all_chars(tx):
    result = tx.run("MATCH (n) RETURN n")  
    return [record["n"] for record in result]

def set_node_embedding(tx, node_name, embedding):
    query = """
    MERGE (n {name: $node_name})  
    SET n.embedding = $embedding  
    RETURN n
    """
    result = tx.run(query, node_name=node_name, embedding=embedding)
    return result.single()[0]  

    
with driver.session() as session:
    nodes = session.write_transaction(get_all_chars)
    
    for node in nodes:
        session.write_transaction(set_node_embedding, node['name'], node['embedding'])



### Querying with Embeddings

In [222]:
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain_openai import OpenAIEmbeddings

In [223]:
embeddings = OpenAIEmbeddings()

vec_db = Neo4jVector.from_existing_graph(
    embedding=embeddings,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name="person_index",
    node_label="Person",
    text_node_properties=["name"],
    embedding_node_property="embedding",
)



Failed to write data to connection IPv4Address(('localhost', 7687)) (ResolvedIPv4Address(('127.0.0.1', 7687)))


In [224]:
vec_db.similarity_search(query='Harry\'s lover not himself')

[Document(page_content='\nname: Harry'),
 Document(page_content='\nname: Dumbledore'),
 Document(page_content='\nname: Malfoy'),
 Document(page_content='\nname: Voldemort')]

##### Embedding Examples

In [225]:
# vec_db.add_embeddings
print(vec_db.similarity_search_with_relevance_scores(query='Yosen\'s friends'))
print(vec_db.similarity_search_with_relevance_scores(query='Harry\'s relationship with Ron'))

[(Document(page_content='\nname: Lily'), 0.8902618885040283), (Document(page_content='\nname: Jordan'), 0.8895609378814697), (Document(page_content='\nname: Dudley'), 0.888965904712677), (Document(page_content='\nname: Ron'), 0.8885157108306885)]
[(Document(page_content='\nname: Harry'), 0.9426236748695374), (Document(page_content='\nname: Ron'), 0.9383189082145691), (Document(page_content='\nname: Hermione'), 0.9353052377700806), (Document(page_content='\nname: Malfoy'), 0.9282164573669434)]


In [226]:
res = vec_db.similarity_search(query='Harry\'s friends')

[doc for doc in res]

res[0].page_content.split(': ')[1]


'Harry'

In [227]:
vec_db.embedding.embed_query('Harry')[:3]

[-0.009504653513431549, -0.003827159060165286, -0.004058450926095247]

In [228]:
nodes[1]['name'], nodes[1]['embedding'][:3]

('Piers', [0.0015151082770898938, -0.034012846648693085, 0.006763292010873556])

In [229]:
response = vec_db.similarity_search(
    "Harry\'s friends?"
)
print(response[1].page_content)


name: Dumbledore


In [230]:
vector_response = vec_db.similarity_search('People who are on good terms with Harry')
vector_people = [res.page_content.split(': ')[1] for res in vector_response]
vector_people

['Harry', 'Hagrid', 'Dumbledore', 'Hermione']

In [231]:
vector_response = vec_db.similarity_search('Harry\'s best friends')
vector_people = [res.page_content.split(': ')[1] for res in vector_response]
vector_people

['Harry', 'Dumbledore', 'Hermione', 'Malfoy']

In [232]:
index_name = "person_index"  

store = Neo4jVector.from_existing_index(
    OpenAIEmbeddings(),
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name=index_name,
    text_node_property='name'
)

Failed to write data to connection IPv4Address(('localhost', 7687)) (ResolvedIPv4Address(('127.0.0.1', 7687)))


In [233]:
query = 'Harry\'s 3 best friends'
query_vec = embeddings.embed_query(query)
store.similarity_search(query, k=4)

[Document(page_content='Harry'),
 Document(page_content='Dumbledore'),
 Document(page_content='Hermione'),
 Document(page_content='Hagrid')]

In [234]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI
from langchain.schema import Document

def custom_kg_retriever(query_embedding):
    results = store.query_by_embedding(query_embedding)

    return [
        Document(page_content=result['name'], metadata=result)
        for result in results
    ]
    
def custom_neo4j_retriever(query_embedding):
    cypher_query = """
    WITH $query_embedding AS query_embedding
    MATCH (p:Person)
    WITH p, apoc.algo.cosineSimilarity(p.embedding, query_embedding) AS similarity
    RETURN p.name AS name, p.embedding AS embedding, similarity
    ORDER BY similarity DESC
    LIMIT 5
    """
    
    with driver.session() as session:
        result = session.run(cypher_query, {"query_embedding": query_embedding})

    # Convert results into LangChain Document objects
    documents = []
    for record in result:
        documents.append(Document(page_content=record['name'], metadata={
            "name": record['name'],
            "similarity": record['similarity'],
            "embedding": record['embedding']
        }))
    
    return documents



In [235]:
from langchain_core.documents import Document
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(temperature=0, model_name="gpt-4-turbo")

llm_transformer = LLMGraphTransformer(llm=llm)

text = """
Marie Curie, born in 1867, was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.
She was the first woman to win a Nobel Prize, the first person to win a Nobel Prize twice, and the only person to win a Nobel Prize in two scientific fields.
Her husband, Pierre Curie, was a co-winner of her first Nobel Prize, making them the first-ever married couple to win the Nobel Prize and launching the Curie family legacy of five Nobel Prizes.
She was, in 1906, the first woman to become a professor at the University of Paris.
"""
documents = [Document(page_content=text)]
graph_documents = llm_transformer.convert_to_graph_documents(documents)
print(f"Nodes:{graph_documents[0].nodes}")
print(f"Relationships:{graph_documents[0].relationships}")
print(type(graph_documents))

Nodes:[Node(id='Marie Curie', type='Person'), Node(id='Pierre Curie', type='Person'), Node(id='University Of Paris', type='Organization'), Node(id='Nobel Prize', type='Award')]
Relationships:[Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='Nobel Prize', type='Award'), type='WINNER'), Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='University Of Paris', type='Organization'), type='PROFESSOR'), Relationship(source=Node(id='Pierre Curie', type='Person'), target=Node(id='Nobel Prize', type='Award'), type='WINNER')]
<class 'list'>


In [236]:
from openai import OpenAI

In [244]:
def generate_creative_sentences(base_sentence):
    """
    Use OpenAI's API to generate creative variations of a base sentence.
    
    :param base_sentence: The input sentence to rephrase.
    :return: A creatively rephrased sentence.
    """
    prompt = f"Rephrase the following sentence in an concise manner:\n'{base_sentence}'"
    
    client = OpenAI(
        api_key=OPENAI_API_KEY,
    )

    completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="gpt-4",
    )

    return completion.choices[0].message.content

def get_query_embedding(question):
    """Convert the input question to an embedding using OpenAI."""
    embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY)
    embedding = embedding_model.embed_query(question)
    return embedding

def vector_search_natural_language(question, top_k=3):
    """
    Perform a vector search in Neo4j based on the input question and return results in natural language.
    
    :param question: The input question as a string.
    :param top_k: Number of top results to return based on similarity.
    :return: Natural language description of the most similar nodes.
    """
    query_vector = get_query_embedding(question)
    
    with driver.session() as session:
        # Cypher query to perform vector similarity search using GDS
        query = """
        WITH $query_vector AS queryVec
        CALL gds.knn.stream(
            'book1',  
            {
                nodeProperties: ['embedding'],
                topK: $top_k,                       
                similarityCutoff: 0.9
            }
        )
        YIELD node1, node2, similarity
        WITH gds.util.asNode(node1) AS node1, gds.util.asNode(node2) AS node2, similarity
        WHERE node1.name = 'Harry' 
        RETURN node2.name AS Node, similarity
        ORDER BY similarity DESC
        """
        
        # Set up parameters
        params = {
            "query_vector": query_vector,
            "top_k": top_k,
            "nodeProjection": "Person",  
            "nodeProperties": "embedding",  
        }
        
        # Run the query with the input parameters and retrieve top K similar nodes
        result = session.run(query, params)
        
        # Parse the results and return them in natural language
        responses = []

        
        for record in result:
            node2 = record["Node"]  
            
            base_sentence = f"Harry and {node2} are similar based on the query {question}."
            
            creative_sentences = generate_creative_sentences(base_sentence)
            
            responses.append(creative_sentences)
        
        return generate_creative_sentences("\n".join(responses))



In [245]:
question = "Who are the allies of Harry?" 
response = vector_search_natural_language(question)
print(response)


"In response to the question 'Who are Harry's allies?', Harry shares a similar relationship with Hermione as they are allies. 
In relation to the question, "Who are Harry's allies?", there's a similarity between Harry and James as they are allies.
Pertaining to the question 'Who are Harry's allies?', Harry and Ron bear a similarity by being allies."


In [246]:
query = "Who are Harry's best friends?"
query_embedding = embeddings.embed_query(query)
custom_retriever = custom_neo4j_retriever(query_embedding=query_embedding)

chain = RetrievalQAWithSourcesChain.from_chain_type(
    ChatOpenAI(temperature=0), chain_type="stuff", retriever=custom_retriever
)

CypherSyntaxError: {code: Neo.ClientError.Statement.SyntaxError} {message: Unknown function 'apoc.algo.cosineSimilarity' (line 4, column 13 (offset: 79))
"    WITH p, apoc.algo.cosineSimilarity(p.embedding, query_embedding) AS similarity"
             ^}

In [None]:
query = "Who are Harry's best friends?"
result = chain.invoke(query)

print(result)

In [None]:
def custom_retriever(query):

    results = store.as_retriever().get_relevant_documents(query)

    return [
        Document(page_content=result.metadata['name'], metadata=result.metadata)
        for result in results
    ]
    
query = "Who are Harry's best friends?"
custom_retriever(query)



ValueError: Make sure that none of the `text` properties on nodes with label `Person` are missing or empty

### To work on

In [None]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI

In [None]:
retriever = vec_db.as_retriever()
retriever.invoke(vec_db)

In [None]:
chain = RetrievalQAWithSourcesChain.from_chain_type(
    ChatOpenAI(temperature=0),
    chain_type="stuff",
    retriever=vec_db.as_retriever()
)

query = "Who is Harry\'s closest friend?"

chain.invoke(
    {"question": query},
    return_only_outputs=True,
)

ValueError: Document prompt requires documents to have metadata variables: ['source']. Received document with missing metadata: ['source'].

In [None]:
retriever = store.as_retriever()

NameError: name 'store' is not defined

### Backup Functions

In [None]:
def query_knowledge_graph(query):
    with driver.session() as session:
        result = session.run(
            """
            MATCH (n:Person)-[r]->(m:Person)
            WHERE n.name CONTAINS $query OR m.name CONTAINS $query
            RETURN n.name, r, m.name
            """,
            query=query
        )
        return result.data()


In [None]:
from openai import OpenAI
import os

client = OpenAI(
    api_key = os.getenv('OPENAI_API_KEY')
)
    
models = client.models.list()

models

SyncPage[Model](data=[Model(id='gpt-4-1106-preview', created=1698957206, object='model', owned_by='system'), Model(id='text-embedding-3-small', created=1705948997, object='model', owned_by='system'), Model(id='tts-1-1106', created=1699053241, object='model', owned_by='system'), Model(id='dall-e-2', created=1698798177, object='model', owned_by='system'), Model(id='tts-1', created=1681940951, object='model', owned_by='openai-internal'), Model(id='tts-1-hd-1106', created=1699053533, object='model', owned_by='system'), Model(id='tts-1-hd', created=1699046015, object='model', owned_by='system'), Model(id='dall-e-3', created=1698785189, object='model', owned_by='system'), Model(id='whisper-1', created=1677532384, object='model', owned_by='openai-internal'), Model(id='gpt-4-turbo', created=1712361441, object='model', owned_by='system'), Model(id='gpt-4-turbo-2024-04-09', created=1712601677, object='model', owned_by='system'), Model(id='gpt-4-0125-preview', created=1706037612, object='model', 