In [1]:
# !pip install langchain neo4j openai wikipedia tiktoken langchain-openai
# from https://github.com/tomasonjo/blogs/blob/master/llm/openaifunction_constructing_graph.ipynb
from langchain.graphs import Neo4jGraph
from openai import OpenAI
from dotenv import load_dotenv
import os
import json

In [2]:
load_dotenv(dotenv_path='secrets.env')
graph = Neo4jGraph(
    url=os.environ["NEO4J_URI"],
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"]
)

In [5]:
from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel

In [27]:
class Node():
    def __init__(self, name, description):
        self.name = name
        self.id = os.urandom(16).hex()
        self.description = description
    
class Graph():
    # store Nodes in a hashMap: id -> Node
    # store the edges as lists in a dict: sourceId -> [(neighborId, relationship)]
    # store the edges as dicts in a dict: sourceId -> {neighborId -> (neighborNode, relationship)}

    def __init__(self):
        self.nodes = {}
        self.edgesMapToList = {}
        self.edgesMapToMap = {}
    
    def add_node(self, node: Node):
        self.nodes[node.id] = node
        self.edgesMapToList[node.id] = []
        self.edgesMapToMap[node.id] = {}

    def add_edge(self, source: Node, target: Node, relationship: str):
        if (self.nodes.get(source.id) == None):
            raise Exception("Source node not found")
        elif (self.nodes.get(target.id) == None):
            raise Exception("Target node not found")
        
        self.edgesMapToList[source.id].append((target.id, relationship))
        self.edgesMapToMap[source.id][target.id] = (target, relationship)

    def get_neighbors(self, node: Node):
        return self.edgesMapToList[node.id]
    
    def is_neighbor(self, node: Node, neighbor: Node):
        return self.edgesMapToMap[node.id].get(neighbor.id) != None
    
    def get_list_nodes(self):
        return self.nodes

In [28]:
# Test cases for above
test1 = Node("test1", "test node #1")
test2 = Node("test2", "test node #2")
test3 = Node("test3", "test node #3")
graph = Graph()
graph.add_node(test1)
graph.add_node(test2)
graph.add_node(test3)
graph.add_edge(test1, test2, "dislikes")
print(graph.is_neighbor(test1, test2));
print(graph.is_neighbor(test2, test3));
print(graph.is_neighbor(test2, test1));

True
False
False


In [23]:
def format_property_key(s: str) -> str:
    words = s.split()
    if not words:
        return s
    first_word = words[0].lower()
    capitalized_words = [word.capitalize() for word in words[1:]]
    return first_word + "".join(capitalized_words)

def props_to_dict(props) -> dict:
    properties = {}
    if not props:
        return properties
    for p in props:
        properties[format_property_key(p.key)] = p.value
    return properties

def map_to_base_node(node: Node) -> BaseNode:
    properties = props_to_dict(node.properties) if node.properties else {}
    properties["name"] = node.id.title()
    return BaseNode(
        id=node.id.title(), type=node.type.capitalize(), properties=properties
    )

def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    source = map_to_base_node(rel.source)
    target = map_to_base_node(rel.target)
    properties = props_to_dict(rel.properties) if rel.properties else {}
    return BaseRelationship(
        source=source, target=target, type=rel.type, properties=properties
    )

In [8]:
from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
)
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)

def get_extraction_chain(
        allowed_nodes: Optional[List[str]] = None,
        allowed_rels: Optional[List[str]] = None
    ):
    prompt = ChatPromptTemplate.from_messages(
        [(
            "system",
            f"""# Knowledge Graph Instructions for GPT-4
    ## 1. Overview
    You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
    - **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
    - The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
    ## 2. Labeling Nodes
    - **Consistency**: Ensure you use basic or elementary types for node labels.
    - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
    - **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
    {'- **Allowed Node Labels:**' + ", ".join(allowed_nodes) if allowed_nodes else ""}
    {'- **Allowed Relationship Types**:' + ", ".join(allowed_rels) if allowed_rels else ""}
    ## 3. Handling Numerical Data and Dates
    - Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
    - **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
    - **Property Format**: Properties must be in a key-value format.
    - **Quotation Marks**: Never use escaped single or double quotes within property values.
    - **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
    ## 4. Coreference Resolution
    - **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
    If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
    always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
    Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
    ## 5. Strict Compliance
    Adhere to the rules strictly. Non-compliance will result in termination.
            """),
            ("human", "Use the given format to extract info from the following input: {input}"),
            ("human", "Tip: Make sure to answer in the correct format"),
        ])
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)

In [9]:
def extract_and_store_graph(
    document: Document,
    nodes:Optional[List[str]] = None,
    rels:Optional[List[str]]=None) -> None:
    # Extract graph data using OpenAI functions
    extract_chain = get_extraction_chain(nodes, rels)
    data = extract_chain.invoke(document.page_content)['function']
    # Construct a graph document
    graph_document = GraphDocument(
      nodes = [map_to_base_node(node) for node in data.nodes],
      relationships = [map_to_base_relationship(rel) for rel in data.rels],
      source = document
    )
    # Store information into a graph
    graph.add_graph_documents([graph_document])

In [10]:
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import TokenTextSplitter

# Read the wikipedia article
raw_documents = WikipediaLoader(query="History of France").load()
# Define chunking strategy
text_splitter = TokenTextSplitter(chunk_size=2048, chunk_overlap=24)

# Only take the first the raw_documents
documents = text_splitter.split_documents(raw_documents[:3])

In [None]:
from tqdm import tqdm

for i, d in tqdm(enumerate(documents), total=len(documents)):
    extract_and_store_graph(d)

In [2]:
# Delete the graph
graph.query("MATCH (n) DETACH DELETE n")

NameError: name 'graph' is not defined

In [12]:
# Query the knowledge graph in a RAG application
from langchain.chains import GraphCypherQAChain

graph.refresh_schema()

cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=ChatOpenAI(temperature=0, model="gpt-4"),
    qa_llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo"),
    validate_cypher=True, # Validate relationship directions
    verbose=True
)
cypher_chain.run("Was France allied with the Axis powers?")

  warn_deprecated(




[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (c:Country {name: "France"})-[:HASALLIANCE]->(a:Alliance {name: "Axis Powers"}) RETURN c, a[0m
Full Context:
[32;1m[1;3m[{'c': {"hundredYears'War": "A succession crisis in 1328 led to the Hundred Years' War between the House of Valois and the House of Plantagenet. The war began in 1337 following Philip VI's attempt to seize the Duchy of Aquitaine from its hereditary holder, Edward III of England, the Plantagenet claimant to the French throne. Despite early Plantagenet victories, fortunes turned in favor of the Valois. A notable figure of the war was Joan of Arc, a French peasant girl who led forces against the English, establishing herself as a national heroine. The war ended with a Valois victory in 1453.", 'frenchRevolution': "In the late 18th century the monarchy and associated institutions were overthrown in the French Revolution. The Revolutionary Tribunal executed political opponents by g

'No, France was not allied with the Axis Powers.'

In [28]:
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) 
def ask_gpt_if_same(node1: str, node2: str):
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[    
            # System basically tells the chat gpt model how to act
            {"role": "system", "content": "You are the most intelligent comparison agent in the world. You can compare any two pieces of text and accurately tell if they refer to the same thing or not."},
            {"role": "user", "content": "Do the following two nodes mean exactly the same thing? Node 1: " + node1 + ". Node 2: " + node2 + ". Answer yes or no."}
        ]
    ) 
    return response.choices[0].message.content.strip().lower() == 'yes'

In [None]:
def get_all_nodes(graph):
    query = "MATCH (n) RETURN n"
    result = graph.query(query)
    return result
nodes = get_all_nodes(graph)
# print(nodes[1])
# compare each pair of nodes
for i, node in enumerate(nodes):
    for j, node2 in enumerate(nodes):
        if i != j:
            print(node)
            if (ask_gpt_if_same(node.get('n').get('name'), node2.get('n').get('name'))):
                print(f"Node {i} and Node {j} mean the same thing.")

# for i, node in enumerate(nodes):
#     print(i,node)

In [None]:
def remove_node_but_retain_edges(graph, node_id, new_node_id):
    # Reassign relationships to the new node
    query = f"""
    MATCH (n {{id: '{node_id}'}})-[r]->(m)
    CREATE (new {{id: '{new_node_id}'}})-[new_r:TYPE(r)]->(m)
    SET new_r = r
    WITH n, r
    MATCH (m)-[r2]->(n)
    CREATE (m)-[new_r2:TYPE(r2)]->(new {{id: '{new_node_id}'}})
    SET new_r2 = r2
    DELETE r, r2
    """
    graph.query(query)
    
    # Delete the original node
    delete_query = f"MATCH (n {{id: '{node_id}'}}) DELETE n"
    graph.query(delete_query)