In [1]:
# !pip install langchain neo4j openai wikipedia tiktoken langchain-openai
# from https://github.com/tomasonjo/blogs/blob/master/llm/openaifunction_constructing_graph.ipynb
from langchain.graphs import Neo4jGraph
from openai import OpenAI
from dotenv import load_dotenv
import os
import json

In [2]:
load_dotenv(dotenv_path='secrets.env')
graph = Neo4jGraph(
    url=os.environ["NEO4J_URI"],
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"]
)

In [3]:
from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel

In [4]:
class Property(BaseModel):
    key: str = Field(..., description="key")
    value: str = Field(..., description="value")

class Node(BaseNode):
    properties: Optional[List[Property]] = Field(None, description="node properties")

class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(None, description="relationship properties")

class KnowledgeGraph(BaseModel):
    nodes: List[Node] = Field(..., description="nodes in the graph")
    rels: List[Relationship] = Field(..., description="relationships in the graph")

In [5]:
def format_property_key(s: str) -> str:
    words = s.split()
    if not words:
        return s
    first_word = words[0].lower()
    capitalized_words = [word.capitalize() for word in words[1:]]
    return first_word + "".join(capitalized_words)

def props_to_dict(props) -> dict:
    properties = {}
    if not props:
        return properties
    for p in props:
        properties[format_property_key(p.key)] = p.value
    return properties

def map_to_base_node(node: Node) -> BaseNode:
    properties = props_to_dict(node.properties) if node.properties else {}
    properties["name"] = node.id.title()
    return BaseNode(
        id=node.id.title(), type=node.type.capitalize(), properties=properties
    )

def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    source = map_to_base_node(rel.source)
    target = map_to_base_node(rel.target)
    properties = props_to_dict(rel.properties) if rel.properties else {}
    return BaseRelationship(
        source=source, target=target, type=rel.type, properties=properties
    )

In [6]:
from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
)
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)

def get_extraction_chain(
        allowed_nodes: Optional[List[str]] = None,
        allowed_rels: Optional[List[str]] = None
    ):
    prompt = ChatPromptTemplate.from_messages(
        [(
            "system",
            f"""# Knowledge Graph Instructions for GPT-4
    ## 1. Overview
    You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
    - **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
    - The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
    ## 2. Labeling Nodes
    - **Consistency**: Ensure you use basic or elementary types for node labels.
    - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
    - **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
    {'- **Allowed Node Labels:**' + ", ".join(allowed_nodes) if allowed_nodes else ""}
    {'- **Allowed Relationship Types**:' + ", ".join(allowed_rels) if allowed_rels else ""}
    ## 3. Handling Numerical Data and Dates
    - Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
    - **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
    - **Property Format**: Properties must be in a key-value format.
    - **Quotation Marks**: Never use escaped single or double quotes within property values.
    - **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
    ## 4. Coreference Resolution
    - **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
    If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
    always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
    Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
    ## 5. Strict Compliance
    Adhere to the rules strictly. Non-compliance will result in termination.
            """),
            ("human", "Use the given format to extract info from the following input: {input}"),
            ("human", "Tip: Make sure to answer in the correct format"),
        ])
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)

In [7]:
def extract_and_store_graph(
      document:Document,
      nodes:Optional[List[str]] = None,
      rels:Optional[List[str]]=None) -> GraphDocument:
    
    # Extract graph data using OpenAI functions
    extract_chain = get_extraction_chain(nodes, rels)
    data = extract_chain.invoke(document.page_content)['function']
    # Construct a graph document
    graph_document = GraphDocument(
      nodes = [map_to_base_node(node) for node in data.nodes],
      relationships = [map_to_base_relationship(rel) for rel in data.rels],
      source = document
    )
        
    graph.add_graph_documents([graph_document])


In [8]:
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import TokenTextSplitter

# Read the wikipedia article
raw_documents = WikipediaLoader(query="History of France").load()

# Define chunking strategy
text_splitter = TokenTextSplitter(chunk_size=600, chunk_overlap=24)

# Only take the first the raw_documents
documents = text_splitter.split_documents(raw_documents)[:10]


# Adversarial Examples

In [9]:
text = """
Event A came before Event B.
Event B came before Event C.
Event C came before Event A.
"""
documents = [Document(page_content=text)]

In [19]:
from tqdm import tqdm

for i, d in tqdm(enumerate(documents), total=len(documents)):
    extract_and_store_graph(d)

100%|██████████| 10/10 [03:24<00:00, 20.50s/it]


In [12]:
# Delete the graph
graph.query("MATCH (n) DETACH DELETE n")

[]

In [7]:
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [63]:
original_query = "What relationships have existed between France and China?"

In [50]:
query = "MATCH (n) RETURN n"
all_nodes = graph.query(query)
print(all_nodes)

[{'n': {'name': 'VIETNAM'}}, {'n': {'annexation': 'Southern Gaul by the Roman Republic in the late 2nd century BC', 'language': 'Gaulish', 'frenchRevolution': 'Monarchy and associated institutions were overthrown in the French Revolution', 'nationalHeroine': 'Joan of Arc', 'barbarianRaids': 'Subject to barbarian raids and migration, especially by the Germanic Franks', 'id': 'France', 'colonies': 'Greek, Roman, Carthaginian', 'power': 'Reached a zenith under the rule of Louis XIV', 'frenchEmpire': "Napoleon's French Empire was declared in 1804", 'monarchy': 'Transformed into a centralized absolute monarchy through the Renaissance and Reformation', 'war': 'Ended with a Valois victory in 1453', 'history': 'The first written records for the history of France appeared in the Iron Age.', 'conquest': 'Rest of Gaul by Julius Caesar in the Gallic Wars of 58–51 BC', 'unification': 'Unification under Clovis I in the late 5th century', 'kingdom': "Medieval Kingdom of France emerged from Charlemagn

In [52]:
graph.query("CREATE (n:Country {name: 'VIETNAM', id: 'Vietnam'}) RETURN n")

[{'n': {'name': 'VIETNAM', 'id': 'Vietnam'}}]

ValueError: Generated Cypher Statement is not valid
{code: Neo.ClientError.Statement.SyntaxError} {message: Invalid input 'What': expected 'FOREACH', 'ALTER', 'CALL', 'USING PERIODIC COMMIT', 'CREATE', 'LOAD CSV', 'START DATABASE', 'STOP DATABASE', 'DEALLOCATE', 'DELETE', 'DENY', 'DETACH', 'DROP', 'DRYRUN', 'FINISH', 'GRANT', 'INSERT', 'MATCH', 'MERGE', 'NODETACH', 'OPTIONAL', 'REALLOCATE', 'REMOVE', 'RENAME', 'RETURN', 'REVOKE', 'ENABLE SERVER', 'SET', 'SHOW', 'TERMINATE', 'UNWIND', 'USE' or 'WITH' (line 1, column 1 (offset: 0))
"What relationships exist between France and China? Vietnam"
 ^}

In [53]:
query = "MATCH (n) RETURN n"
all_nodes = graph.query(query)
all_nodes_list = []
for record in all_nodes:
    if (record.get('n').get('name') is not None):
        all_nodes_list.append(record.get('n').get('name'))
print(all_nodes_list)

['VIETNAM', 'VIETNAM', 'France', "Napoleon'S French Empire", 'Napoleonic Wars', 'Monarchy', 'Second Republic', 'Second Empire', 'French Third Republic', 'Triple Entente', 'World War I', 'Germany', 'Central Powers', 'Allied Powers', 'World War Ii', 'Nazi Germany', 'Third Republic', 'Vichy Government', 'Liberation', 'Fourth Republic', 'Baby Boom', 'Indochina', 'Algeria', 'French Fifth Republic', 'Decolonization', 'French Colonial Empire', 'Medievalkingdomoffrance', 'Frenchrevolution', 'Napoleonicera', 'Unitedkingdom', 'Industrialization', 'Imperialism', 'Late19Thandearly20Thcenturies', 'Welfarestate', 'Europeanunion', 'Eurozone', 'Nicolas Sarkozy', 'Conservative Government', 'Kingdom Of France', 'West Francia', 'Hugh Capet', 'Philip Ii', 'French Revolution', 'Kingdom Of Navarre', 'Brittany', 'Catalonia', 'Aquitaine', 'Lorraine', 'Burgundy', 'House Of Plantagenet', 'Kingdom Of England', 'Angevin Empire', "Hundred Years' War", 'French Throne', 'Italy', 'Spain', 'Holy Roman Empire', 'Italia

In [40]:
# get edges / relationships
# use cypher to get edges
# feed into decide_continue_response
# separate context pull function
# compare the extra context to the original query

query = "MATCH (n)-[r]-(m) RETURN distinct type(r)"
all_rel_types = graph.query(query)
all_rel_types_filtered = []
for record in all_rel_types:
    # split into list by :
    rel_type = record.get('type(r)').split(':')
    print(rel_type[0].lower())
    all_rel_types_filtered.append(rel_type[0].lower())

experienced
hashistorian
hasleader
hascolonialempire
involvedin
governedas
partof
hasruler
hasterritory
hasrivalry
hasconflict
defeated_in
opposed
paved_the_way_for
ruledas
opposedto
followedby
has
sought_to_extend_influence_into
hasinfluence
hashistoricalperiod
participated_in
underwent
controlledby
establishedafter
ended
tookplacein
hadimpacton
competedwith
introducedin
integratedwith
impact
cause
originated_from
first_king
abolished_by
ruled_in_union_with
authority_barely_felt_in
part_of
ruled_by
abolished
ruled
involved_in
claimed
defeated_by
founded_by
fought_against
descendedfrom
classifiedunder
exhibits
spokenby
influencedby
hasconcept
haslanguage
writtenfor
hassequence
hasinscription
variation
translation
derivedfrom
hasword


In [27]:
# all_rel_types_filtered.append("fought in a bloody conflict with")

In [59]:
# more loose version
decide_continue_response_with_context = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {'role': 'system', 'content': 'You are a highly intelligent agent whose task is to alter original prompts to make similar prompts which are worded differently. You will be given a list of relationship types. Use synonyms, paraphrases, and other techniques to alter the original prompts.'},
        {'role': 'user', 'content': 
            'Generate 9 prompts in a numbered list with no extra newlines, using the relationship types provided. Feel free to be more creative with the original query. For example, you may replace nodes in the query with nearby nodes. The original prompt is:\n' 
            + original_query + '\n'
            + 'The available relationship types in the knowledge graph are:\n' + '\n'.join(all_rel_types_filtered)
            + 'This is a list of all nodes in the knowledge graph:\n' + '\n'.join(all_nodes_list)
        }
    ]
)

response = decide_continue_response_with_context.choices[0].message.content
print(response)

# make a list of the 9 prompts
prompts = response.split('\n')
for i in range(len(prompts)):
    prompts[i] = prompts[i][3:]

for prompt in prompts:
    print(prompt)

1. What historical periods describe the connection between France and China? Vietnam
2. How has France interacted with China through the lens of colonial empires? Vietnam
3. Discuss the conflicts involving France and China. Vietnam
4. What influence has France sought to extend into China? Vietnam
5. In what ways did France compete with China historically? Vietnam
6. Describe how France and China are historically opposed. Vietnam
7. What involvement did France have in China's history? Vietnam
8. Outline the rulers that governed France and China. Vietnam
9. What rivalries have developed between France and China? Vietnam
What historical periods describe the connection between France and China? Vietnam
How has France interacted with China through the lens of colonial empires? Vietnam
Discuss the conflicts involving France and China. Vietnam
What influence has France sought to extend into China? Vietnam
In what ways did France compete with China historically? Vietnam
Describe how France and

In [28]:
decide_continue_response_with_context = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {'role': 'system', 'content': 'You are a highly intelligent agent whose task is to alter original prompts to make similar prompts which are worded differently. You will be given a list of relationship types. Use synonyms, paraphrases, and other techniques to alter the original prompts.'},
        {'role': 'user', 'content': 
            'Generate 9 prompts in a numbered list with no extra newlines, using the relationship types provided. The original prompt is:\n' 
            + original_query + '\n'
            + 'The available relationship types in the knowledge graph are:\n' + '\n'.join(all_rel_types_filtered)
            + 'This is a list of all nodes in the knowledge graph:\n' + '\n'.join(all_nodes_list)
        }
    ]
)

response = decide_continue_response_with_context.choices[0].message.content
print(response)

1. Which nations has France engaged in warfare with?
2. What countries has France been involved in military conflicts with?
3. Identify the states that have battled against France.
4. Which territories has France opposed in wartime?
5. Name the countries that France has fought in a bloody conflict with.
6. Which regions has France experienced war with?
7. What nations has France had military confrontations with?
8. List the countries that France has opposed in combat.
9. With which countries has France participated in armed conflicts?


In [67]:
decide_continue_response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {'role': 'system', 'content': 'You are a highly intelligent agent whose task is to alter original prompts to make similar prompts which are worded differently. Use synonyms, paraphrases, and other techniques to alter the original prompts.'},
        {'role': 'user', 'content': 
            'Generate 9 prompts in a numbered list with no extra newlines. Include in some query the following context: Vietnam is a country node in the graph, and a previous colony of the French. The original prompt is:\n' 
            + original_query
        }
    ]
)

response = decide_continue_response.choices[0].message.content
# make a list of the 9 prompts
prompts = response.split('\n')
for i in range(len(prompts)):
    prompts[i] = prompts[i][3:]

for prompt in prompts:
    print(prompt)

Describe the historical interactions between France and China. 
What connections have historically linked France to China? 
Outline the past associations between France and China.
How have France and China been related throughout history?
Examine the ties that have existed between France and China over the years.
Can you explore the historical relations between France and China?
Discuss the nature of the historical relationship between France and China.
Trace the evolution of the relationship between France and China.
What have been the various forms of interactions between France and China in the past?


1. With which nations has France engaged in warfare?
2. Identify the countries that have been at war with France.
3. List the states against which France has fought.
4. Name the nations France has been in conflict with.
5. Which entities have battled against France?
6. Specify the countries France has had military confrontations with.
7. Which territories has France opposed in war?
8. Detail the nations involved in wars with France.
9. Enumerate the states that France has fought against.

In [56]:
# Query the knowledge graph in a RAG application
from langchain.chains import GraphCypherQAChain

graph.refresh_schema()

cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=ChatOpenAI(temperature=0, model="gpt-4o"),
    qa_llm=ChatOpenAI(temperature=0, model="gpt-4o"),
    validate_cypher=True, # Validate relationship directions
    verbose=True
)
outputs = []
for prompt in prompts:
    try:
        res = cypher_chain.run(prompt)
    except Exception as e:
        print(f"An error occurred: {e}")
        continue
    if res is not "I don't know the answer.":
        outputs.append(res)

# merge outputs into string
context = "\n".join(outputs)

query_response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {'role': 'system', 'content': 'You are a highly intelligent agent whose task is to answer questions only based on the context you are given. Your answers must be concise and very short.'},
        {'role': 'user', 'content': 
            'The original question is: '+ original_query + '\n'
            'The given context is: ' + context + '\n'
            'Your task is to answer the original question using only the context you are given. Your answer must be concise and very short.'
        }
    ]
)

print(query_response.choices[0].message.content)


# embed query
# embed all edges 
# cosine similarity


  if res is not "I don't know the answer.":




[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (france:Country {name: "France"})-[:INVOLVEDIN]->(china:Country {name: "China"})
RETURN france, china
[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (france:Country {name: "France"})-[:SOUGHT_TO_EXTEND_INFLUENCE_INTO]->(china:Country {name: "China"})
RETURN france, china
[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (france:Country {name: "France"})-[:HASLEADER]->(lea

In [33]:
# original query for comparison
cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=ChatOpenAI(temperature=0, model="gpt-4o"),
    qa_llm=ChatOpenAI(temperature=0, model="gpt-4o"),
    validate_cypher=True, # Validate relationship directions
    verbose=True
)

res = cypher_chain.run("Did France have any relationship with China?")
print(res)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (france:Country {name: "France"})-[:INVOLVEDIN|HASRIVALRY|HASCONFLICT|OPPOSEDTO|SOUGHT_TO_EXTEND_INFLUENCE_INTO|COMPETEDWITH|DEFEATED_BY|DEFEATED_IN|PARTICIPATED_IN|UNDERWENT|CONTROLLEDBY|GOVERNEDAS|PARTOF|HASINFLUENCE|HASHISTORIAN|HASHISTORICALPERIOD|INTEGRATEDWITH]->(china:Country {name: "China"})
RETURN france, china
[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m
I don't know the answer.


cypher
MATCH (france:Country {name: "France"})-[:HASCONFLICT]->(war:War)<-[:PARTICIPATED_IN]-(country:Country)
RETURN DISTINCT country.name

cypher
MATCH (france:Country {name: "France"})-[:HASCONFLICT]->(conflictCountry:Country)
RETURN conflictCountry.name

In [8]:
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) 
def ask_gpt_if_same(node1: str, node2: str):
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[    
            # System basically tells the chat gpt model how to act
            {"role": "system", "content": "You are the most intelligent comparison agent in the world. You can compare any two pieces of text and accurately tell if they refer to the same thing or not."},
            {"role": "user", "content": "Do the following two nodes mean exactly the same thing? Node 1: " + node1 + ". Node 2: " + node2 + ". Answer yes or no."}
        ]
    ) 
    return response.choices[0].message.content.strip().lower() == 'yes'

In [35]:
query = "MATCH (n) RETURN n"
result = graph.query(query)
for record in result:
    print(record)


{'n': {'annexation': 'Southern Gaul by the Roman Republic in the late 2nd century BC', 'language': 'Gaulish', 'frenchRevolution': 'Monarchy and associated institutions were overthrown in the French Revolution', 'nationalHeroine': 'Joan of Arc', 'barbarianRaids': 'Subject to barbarian raids and migration, especially by the Germanic Franks', 'id': 'France', 'colonies': 'Greek, Roman, Carthaginian', 'power': 'Reached a zenith under the rule of Louis XIV', 'frenchEmpire': "Napoleon's French Empire was declared in 1804", 'monarchy': 'Transformed into a centralized absolute monarchy through the Renaissance and Reformation', 'war': 'Ended with a Valois victory in 1453', 'history': 'The first written records for the history of France appeared in the Iron Age.', 'conquest': 'Rest of Gaul by Julius Caesar in the Gallic Wars of 58–51 BC', 'unification': 'Unification under Clovis I in the late 5th century', 'kingdom': "Medieval Kingdom of France emerged from Charlemagne's Carolingian Empire", 'rei

In [22]:
query = """
MATCH (n)-[r]->(m)
RETURN n, r, m
"""
result = graph.query(query)

def format_string(s: str) -> str:
    return s.lower().replace('_', ' ')

def get_graph_as_text(graph):
    res = ""
    for relation in graph:
        res += '"' + relation.get('r')[0].get('name') + '" ' + format_string(relation.get('r')[1]) + ' "' + relation.get('r')[2].get('name') + '"' + "\n"
    return res

graphStr = get_graph_as_text(result)
print(graphStr)


TypeError: can only concatenate str (not "NoneType") to str

In [64]:
def ask_gpt_graph_consistent(graph: str):
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[    
            # System basically tells the chat gpt model how to act
            {"role": "system", "content": "You are the most intelligent consistency checker in the world. You can read any piece of text and answer whether it is logically consistent or not."},
            {"role": "user", "content": 'Check if the following text is consistent. Be concise. Concepts and nouns are surrounded in double quotes, such as "Event." The words in between two concepts are relationship identifiers that connect the two. If there are any logical errors, please list all logical errors. The text is: ' + "\n" + graph}
        ]
    ) 
    return response.choices[0].message.content

In [None]:
ask_gpt_graph_consistent(graphStr)

In [None]:
def get_all_nodes(graph):
    query = "MATCH (n) RETURN n"
    result = graph.query(query)
    return result
nodes = get_all_nodes(graph)
# print(nodes[1])
# compare each pair of nodes
for i, node in enumerate(nodes):
    for j, node2 in enumerate(nodes):
        if i != j:
            print(node)
            if (ask_gpt_if_same(node.get('n').get('name'), node2.get('n').get('name'))):
                print(f"Node {i} and Node {j} mean the same thing.")

# for i, node in enumerate(nodes):
#     print(i,node)

In [None]:
def remove_node_but_retain_edges(graph, node_id, new_node_id):
    # Reassign relationships to the new node
    query = f"""
    MATCH (n {{id: '{node_id}'}})-[r]->(m)
    CREATE (new {{id: '{new_node_id}'}})-[new_r:TYPE(r)]->(m)
    SET new_r = r
    WITH n, r
    MATCH (m)-[r2]->(n)
    CREATE (m)-[new_r2:TYPE(r2)]->(new {{id: '{new_node_id}'}})
    SET new_r2 = r2
    DELETE r, r2
    """
    graph.query(query)
    
    # Delete the original node
    delete_query = f"MATCH (n {{id: '{node_id}'}}) DELETE n"
    graph.query(delete_query)