<a href="https://colab.research.google.com/github/tomasonjo/blogs/blob/master/llm/openaifunction_constructing_graph.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install langchain neo4j openai wikipedia tiktoken



In [2]:
from langchain.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel


class Node(BaseNode):
    # Properties arrive from the LLM as one optional string (not a dict); the
    # extraction prompt in this notebook asks for "key: value; key: value".
    # NOTE(review): deliberately no class docstring here - presumably pydantic
    # would copy it into the schema given to the model; confirm before adding.
    properties: Optional[str] = Field(
        default=None,
        description="List of node properties",
    )

class Relationship(BaseRelationship):
    # Properties arrive from the LLM as one optional string (not a dict); the
    # extraction prompt in this notebook asks for "key: value; key: value".
    # NOTE(review): deliberately no class docstring here - presumably pydantic
    # would copy it into the schema given to the model; confirm before adding.
    properties: Optional[str] = Field(
        default=None,
        description="List of relationship properties",
    )

class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    # NOTE(review): the docstring above is kept verbatim; it presumably ends up
    # as the function description sent to the model - confirm before rewording.

    # Required (no default): the nodes of the extracted graph.
    nodes: List[Node] = Field(
        ...,
        description="List of nodes in the knowledge graph",
    )
    # Required (no default): the relationships of the extracted graph.
    rels: List[Relationship] = Field(
        ...,
        description="List of relationships in the knowledge graph",
    )

In [3]:
def format_property_key(s: str) -> str:
    """Turn a whitespace-separated phrase into a single camelCase-style key.

    The first word is lower-cased, every following word is capitalized
    (first character upper-case, the rest lower-case) and the words are joined
    without a separator. A single-word input is therefore fully lower-cased
    (``"birthDate"`` becomes ``"birthdate"``).

    Args:
        s: The raw key text.

    Returns:
        The formatted key, or ``s`` unchanged when it contains no words
        (empty or whitespace-only input).
    """
    parts = s.split()
    if len(parts) == 0:
        return s
    key = parts[0].lower()
    for part in parts[1:]:
        key += part.capitalize()
    return key

def string_to_dict(s: str) -> dict:
    """Convert a string of properties to a dictionary.

    The expected input looks like ``"age: 16; height: 175cm"``: items are
    separated by ``"; "`` and each item is ``"key: value"``.

    Each item is parsed on its own, so a malformed fragment (GPT-3.5 doesn't
    always follow the syntax) is skipped without discarding the well-formed
    items that come after it. Only the first ``": "`` of an item separates key
    from value, so values that themselves contain ``": "`` are kept whole.

    Args:
        s: The raw property string produced by the model.

    Returns:
        A dict mapping ``format_property_key(key)`` to ``value.capitalize()``
        (note that ``capitalize`` lower-cases everything after the first
        character). Items without a ``": "`` separator or with a blank key
        are left out.
    """
    properties = {}
    for item in s.split("; "):
        # partition never raises and splits on the first ": " only.
        key, separator, value = item.partition(": ")
        if not separator or not key.strip():
            # Malformed fragment: skip just this one, keep parsing the rest.
            continue
        properties[format_property_key(key)] = value.capitalize()
    return properties

def map_to_base_node(node: Node) -> BaseNode:
    """Map the KnowledgeGraph Node to the base Node.

    The property string (if any) is parsed with ``string_to_dict``, the id is
    title-cased and the type is capitalized. The title-cased id is also stored
    under the ``name`` property, replacing any ``name`` parsed from the string.
    """
    if node.properties:
        parsed_properties = string_to_dict(node.properties)
    else:
        parsed_properties = {}
    node_id = node.id.title()
    # Add name property for better Cypher statement generation
    parsed_properties["name"] = node_id
    node_type = node.type.capitalize()
    return BaseNode(id=node_id, type=node_type, properties=parsed_properties)


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Map the KnowledgeGraph Relationship to the base Relationship.

    Both endpoints go through ``map_to_base_node``; the relationship type is
    passed through unchanged and the property string (if any) is parsed with
    ``string_to_dict``.
    """
    start_node = map_to_base_node(rel.source)
    end_node = map_to_base_node(rel.target)
    if rel.properties:
        parsed_properties = string_to_dict(rel.properties)
    else:
        parsed_properties = {}
    return BaseRelationship(
        source=start_node,
        target=end_node,
        type=rel.type,
        properties=parsed_properties,
    )

In [4]:
import os
from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
)
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

import getpass

# Do not hard-code the key and do not clobber one that is already configured:
# reuse OPENAI_API_KEY from the environment when present, otherwise ask for it
# interactively (getpass does not echo the input).
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# Shared chat model used by get_extraction_chain below.
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)

def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
    ):
    """Build the chain that extracts a ``KnowledgeGraph`` from input text.

    Args:
        allowed_nodes: Optional node labels; when given (and non-empty) they
            are listed in the system prompt as the allowed node labels.
        allowed_rels: Optional relationship types; when given (and non-empty)
            they are listed in the system prompt as the allowed types.

    Returns:
        The result of ``create_structured_output_chain`` for the
        ``KnowledgeGraph`` schema, the module-level ``llm`` and the prompt
        assembled here (``verbose=False``).
    """
    # Optional bullets; each collapses to an empty string when no list is given.
    if allowed_nodes:
        node_label_rule = '- **Allowed Node Labels:**' + ", ".join(allowed_nodes)
    else:
        node_label_rule = ""
    if allowed_rels:
        rel_type_rule = '- **Allowed Relationship Types**:' + ", ".join(allowed_rels)
    else:
        rel_type_rule = ""

    # The leading whitespace on the continuation lines is part of the prompt
    # text, so the indentation inside this literal must not be changed.
    system_text = f"""# Knowledge Graph Instructions for GPT-4
          ## 1. Overview
          You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
          - **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
          - The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
          ## 2. Labeling Nodes
          - **Consistency**: Ensure you use basic or elementary types for node labels.
            - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
          - **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
          {node_label_rule}
          {rel_type_rule}
          ## 3. Handling Numerical Data and Dates
          - Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
          - **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
          - **Property Format**: Properties must be in a key-value format.
            - Example: `properties: "age: 16; height: 175cm; birthDate: 1988-05-08"`
          - **Quotation Marks**: Never use escaped single or double quotes within property values.
          - **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
          ## 4. Strict Compliance
          Adhere to the rules strictly. Non-compliance will result in termination.
          """

    # One system message followed by two human messages; {input} is filled in
    # when the chain is run.
    messages = [
        ("system", system_text),
        ("human", "Use the given format to extract information from the following input: {input}"),
        ("human", "Tip: Make sure to answer in the correct format"),
    ]
    prompt = ChatPromptTemplate.from_messages(messages)
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)

In [5]:
from langchain.graphs import Neo4jGraph

# Connection settings: environment variables take precedence so that real
# credentials stay out of the notebook; the fallbacks are the original
# placeholder values. `os` is imported in an earlier cell.
url = os.environ.get("NEO4J_URI", "neo4j+s://databases.neo4j.io")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "")
graph = Neo4jGraph(
    url=url,
    username=username,
    password=password
)

In [6]:
def extract_and_store_graph(
    document: Document,
    nodes:Optional[List[str]] = None,
    rels:Optional[List[str]]=None) -> None:
    """Extract a knowledge graph from one document and write it to ``graph``.

    Args:
        document: The document whose ``page_content`` is sent to the chain.
        nodes: Optional allowed node labels, forwarded to
            ``get_extraction_chain``.
        rels: Optional allowed relationship types, forwarded to
            ``get_extraction_chain``.
    """
    # Extract graph data using OpenAI functions
    extracted = get_extraction_chain(nodes, rels).run(document.page_content)

    # Convert the extracted nodes and relationships to the base graph types
    base_nodes = []
    for extracted_node in extracted.nodes:
        base_nodes.append(map_to_base_node(extracted_node))
    base_relationships = []
    for extracted_rel in extracted.rels:
        base_relationships.append(map_to_base_relationship(extracted_rel))

    # Store information into a graph, keeping the document as the source
    graph.add_graph_documents(
        [
            GraphDocument(
                nodes=base_nodes,
                relationships=base_relationships,
                source=document,
            )
        ]
    )

In [7]:
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import TokenTextSplitter

# Load the Wikipedia documents returned for the query
raw_documents = WikipediaLoader(query="Walt Disney").load()

# Chunking strategy: 512-token chunks with a 24-token overlap
text_splitter = TokenTextSplitter(
    chunk_size=512,
    chunk_overlap=24,
)

# Only the first three loaded documents are split into chunks
documents = text_splitter.split_documents(raw_documents[:3])

In [8]:
from tqdm import tqdm

# Extract and store a graph for every chunk, with no label restrictions.
# tqdm takes the total from len(documents); no index is needed.
for document in tqdm(documents):
    extract_and_store_graph(document)

100%|██████████| 6/6 [05:21<00:00, 53.58s/it]


In [9]:
# Optional reset: uncomment the next line to delete every node and relationship in the graph
#graph.query("MATCH (n) DETACH DELETE n")

In [10]:
# Specify which node labels should be extracted by the LLM
allowed_nodes = ["Person", "Company", "Location", "Event", "Movie", "Service", "Award"]

# NOTE(review): the delete query in the previous cell is commented out, so
# unless the graph was cleared manually this run writes on top of the
# unrestricted extraction - confirm that this is intended.
for document in tqdm(documents):
    extract_and_store_graph(document, nodes=allowed_nodes)

100%|██████████| 6/6 [06:00<00:00, 60.14s/it]


In [11]:
# Query the knowledge graph in a RAG application
from langchain.chains import GraphCypherQAChain

# Refresh the schema held by `graph` before building the QA chain
graph.refresh_schema()

# Separate models are configured for the Cypher step (gpt-4) and the QA step
# (gpt-3.5-turbo); verbose=True prints the intermediate chain output.
cypher_chain = GraphCypherQAChain.from_llm(
    cypher_llm=ChatOpenAI(model="gpt-4", temperature=0),
    qa_llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
    graph=graph,
    verbose=True,
    validate_cypher=True, # Validate relationship directions
)

In [12]:
# Ask a natural-language question; the chain's answer is shown as the cell output
question = "When was Walter Elias Disney born?"
cypher_chain.run(question)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Person {name: "Walter Elias Disney"}) RETURN p.birthdate[0m
Full Context:
[32;1m[1;3m[{'p.birthdate': '1901-12-05'}][0m

[1m> Finished chain.[0m


'Walter Elias Disney was born on December 5, 1901.'