<a href="https://colab.research.google.com/github/tomasonjo/blogs/blob/master/llm/openaifunction_constructing_graph.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install langchain neo4j openai wikipedia tiktoken



In [2]:
from langchain.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel


class Node(BaseNode):
    # Extraction-side node: `properties` is a single optional string here
    # (defaulting to None); map_to_base_node parses it with string_to_dict
    # before building the BaseNode that gets stored.
    # NOTE(review): no class docstring is added on purpose — pydantic may expose
    # it in the generated schema description; confirm before adding one.
    properties: Optional[str] = Field(
        default=None,
        description="List of node properties",
    )

class Relationship(BaseRelationship):
    # Extraction-side relationship: `properties` is a single optional string
    # (defaulting to None); map_to_base_relationship parses it with
    # string_to_dict before building the BaseRelationship that gets stored.
    # NOTE(review): no class docstring is added on purpose — pydantic may expose
    # it in the generated schema description; confirm before adding one.
    properties: Optional[str] = Field(
        default=None,
        description="List of relationship properties",
    )

class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    # This model is the output schema handed to create_structured_output_chain
    # in get_extraction_chain; both fields are required (Ellipsis default).
    # NOTE(review): the docstring above and the field descriptions presumably
    # reach the LLM through the generated schema — keep their wording stable.
    nodes: List[Node] = Field(
        default=...,
        description="List of nodes in the knowledge graph",
    )
    rels: List[Relationship] = Field(
        default=...,
        description="List of relationships in the knowledge graph",
    )

In [3]:
def format_property_key(s: str) -> str:
    """Collapse a whitespace-separated phrase into a camelCase-style key.

    The first word is lower-cased in full and every following word is passed
    through ``str.capitalize`` (first letter upper, rest lower), then all
    words are concatenated without separators.

    Args:
        s: Raw property key, e.g. ``"birth date"``.

    Returns:
        The joined key (``"birthDate"``). A string with no words at all
        (empty or whitespace only) is returned unchanged.
    """
    parts = s.split()
    if len(parts) == 0:
        return s
    head, *rest = parts
    # A single-word key is lower-cased entirely, so "birthDate" -> "birthdate".
    return head.lower() + "".join(word.capitalize() for word in rest)

def string_to_dict(s: str) -> dict:
    """Convert a string of properties to a dictionary.

    The expected input looks like ``"age: 16; height: 175cm"``: fragments
    separated by ``"; "``, each fragment being ``key: value``.

    Parsing is best-effort because the LLM output does not always follow the
    syntax. Each fragment is handled independently, so one malformed fragment
    no longer discards the valid fragments that come after it.

    Args:
        s: Property string produced by the extraction model.

    Returns:
        A dict mapping ``format_property_key(key)`` to ``value.capitalize()``
        for every well-formed fragment. Fragments without a ``": "`` separator
        or with a blank key are skipped. Later duplicates of a key overwrite
        earlier ones.
    """
    properties = {}
    for item in s.split("; "):
        # Split on the first ": " only, so a value may itself contain ": ".
        key, separator, value = item.partition(": ")
        if not separator or not key.strip():
            # GPT-3.5 doesn't always follow syntax: drop just this fragment.
            continue
        properties[format_property_key(key)] = value.capitalize()
    return properties

def map_to_base_node(node: Node) -> BaseNode:
    """Convert an extracted ``Node`` into the base ``Node`` used for storage.

    Args:
        node: Node from the extraction schema; its ``properties`` is either
            falsy or a property string understood by ``string_to_dict``.

    Returns:
        A ``BaseNode`` whose id is ``node.id.title()``, whose type is
        ``node.type.capitalize()`` and whose properties are the parsed dict
        with ``"name"`` set to the title-cased id.
    """
    if node.properties:
        node_properties = string_to_dict(node.properties)
    else:
        node_properties = {}
    display_name = node.id.title()
    # Add name property for better Cypher statement generation; this replaces
    # any "name" key that came out of the parsed property string.
    node_properties["name"] = display_name
    label = node.type.capitalize()
    return BaseNode(id=display_name, type=label, properties=node_properties)


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Convert an extracted ``Relationship`` into the base ``Relationship``.

    Both endpoints go through ``map_to_base_node``, so they receive the same
    id/type normalisation as stand-alone nodes. The relationship type is
    passed through unchanged.

    Args:
        rel: Relationship from the extraction schema; its ``properties`` is
            either falsy or a property string for ``string_to_dict``.

    Returns:
        A ``BaseRelationship`` with mapped source/target nodes, the original
        type and the parsed property dict (empty when there are none).
    """
    start_node = map_to_base_node(rel.source)
    end_node = map_to_base_node(rel.target)
    rel_properties = {}
    if rel.properties:
        rel_properties = string_to_dict(rel.properties)
    return BaseRelationship(
        source=start_node,
        target=end_node,
        type=rel.type,
        properties=rel_properties,
    )

In [4]:
import os
from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
)
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Keep an API key that is already exported in the environment; only when none
# is set, ask for it interactively. The key is never written into the notebook
# and a real key is no longer overwritten by a placeholder string.
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Shared chat model used by get_extraction_chain; temperature 0 is requested.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
    ):
    """Build the structured-output chain that extracts a ``KnowledgeGraph``.

    The system prompt is an f-string rendered at call time, so the optional
    label/type restrictions are baked into the text. The ``{input}``
    placeholder lives in a plain (non-f) string and is left for the prompt
    template.

    Args:
        allowed_nodes: When truthy, a line listing these node labels is added
            to the system prompt; otherwise that line is empty.
        allowed_rels: When truthy, a line listing these relationship types is
            added to the system prompt; otherwise that line is empty.

    Returns:
        The result of ``create_structured_output_chain`` for the
        ``KnowledgeGraph`` schema, the module-level ``llm`` and the prompt,
        with ``verbose=False``.
    """
    # Optional restriction lines; an empty string keeps the prompt layout.
    node_hint = 'Allowed node labels are:' + ", ".join(allowed_nodes) if allowed_nodes else ""
    rel_hint = 'Allowed relationship types are:' + ", ".join(allowed_rels) if allowed_rels else ""

    # The literal's indentation is part of the prompt text — do not reflow it.
    system_prompt = f"""You are a world class algorithm for extracting information in structured formats to construct a knowledge graph.
          Nodes are used to represent entities and concepts similar to Wikipedia nodes.
          When labeling nodes in the knowledge graph, please ensure that you consistently use elementary or basic types for node labels.
          For instance, if you encounter an entity that represents a person, always label it as "person" rather
          than using more specific terms like "mathematician" or "scientist."
          The goal is to keep node labels at a fundamental level to maintain clarity and simplicity in the knowledge graph.
          By doing so, we ensure that the graph remains accessible and understandable for a wide range of users.
          {node_hint}
          {rel_hint}
          Whenever you encounter age information or numerical data related to entities, ensure they are incorporated
          as attributes or properties of the respective nodes.
          Do not create separate nodes for dates or numbers, but always attach them as node attributes or properties.
          The properties should be given in a key: value structure. Example:
          properties: "age: 16; height: 175cm; birthDate: 1988-05-08"
          Never use escaped single or double quotes inside of properties value.
          Use camelCase format for property keys (birthDate).
          You must follow the rules or you will be terminated.
          """

    messages = [
        ("system", system_prompt),
        ("human", "Use the given format to extract information from the following input: {input}"),
        ("human", "Tip: Make sure to answer in the correct format"),
    ]
    prompt = ChatPromptTemplate.from_messages(messages)
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)

In [5]:
from langchain.graphs import Neo4jGraph

import os

# Connection settings: an environment variable wins when it is set; otherwise
# the notebook's original placeholder value is used, so existing behaviour is
# unchanged when nothing is exported. Real credentials should come from the
# environment rather than being typed into this cell.
url = os.environ.get("NEO4J_URI", "neo4j+s://databases.neo4j.io")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "password")

# Graph handle shared by the ingestion and query cells below.
graph = Neo4jGraph(
    url=url,
    username=username,
    password=password
)

In [6]:
def extract_and_store_graph(
    document: Document,
    nodes: Optional[List[str]] = None,
    rels: Optional[List[str]] = None,
) -> None:
    """Extract a knowledge graph from one document and write it to ``graph``.

    Args:
        document: Source document; its ``page_content`` is sent to the
            extraction chain and the document itself is attached as the
            graph document's source.
        nodes: Optional node labels forwarded to ``get_extraction_chain``.
        rels: Optional relationship types forwarded to
            ``get_extraction_chain``.

    Returns:
        None. The result is stored through the module-level ``graph``.
    """
    # Extract graph data using OpenAI functions
    chain = get_extraction_chain(nodes, rels)
    extracted = chain.run(document.page_content)

    # Normalise the extracted items into the base graph types.
    base_nodes = list(map(map_to_base_node, extracted.nodes))
    base_relationships = list(map(map_to_base_relationship, extracted.rels))

    # Store everything as a single graph document tied to its source.
    graph.add_graph_documents([
        GraphDocument(
            nodes=base_nodes,
            relationships=base_relationships,
            source=document,
        )
    ])

In [7]:
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import TokenTextSplitter

# Load the Wikipedia documents returned for the query "Walt Disney".
raw_documents = WikipediaLoader(query="Walt Disney").load()
# Define the chunking strategy: chunk_size=384 with no overlap between chunks
# (sizes are presumably counted in tokens, given TokenTextSplitter — confirm).
text_splitter = TokenTextSplitter(chunk_size=384, chunk_overlap=0)

# Only the first three loaded documents are chunked; `documents` is the list
# that the ingestion loops below iterate over.
documents = text_splitter.split_documents(raw_documents[:3])

In [8]:
from tqdm import tqdm

# Extract and store a graph for every chunk, without restricting labels.
# `documents` is a list, so tqdm reads the total from len(); the enumerate
# index was never used and is dropped.
for d in tqdm(documents):
    extract_and_store_graph(d)

100%|██████████| 9/9 [04:25<00:00, 29.51s/it]


In [None]:
# Delete the graph. DESTRUCTIVE: the Cypher below matches every node and
# DETACH DELETEs it, i.e. removes all nodes together with their relationships
# from the connected database. It clears the results of the unconstrained run
# above before the constrained run in the next cell.
graph.query("MATCH (n) DETACH DELETE n")

In [9]:
# Specify which node labels should be extracted by the LLM
allowed_nodes = ["Person", "Company", "Location", "Event", "Movie", "Service", "Award"]

# Same ingestion as before, now passing the label list as the `nodes` argument.
# `documents` is a list, so tqdm reads the total from len(); the enumerate
# index was never used and is dropped.
for d in tqdm(documents):
    extract_and_store_graph(d, allowed_nodes)

100%|██████████| 9/9 [04:39<00:00, 31.02s/it]


In [12]:
# Query the knowledge graph in a RAG application
from langchain.chains import GraphCypherQAChain

# NOTE(review): presumably re-reads the database schema so the chain sees the
# labels and properties written by the ingestion cells above — confirm.
graph.refresh_schema()

# Question-answering chain over the graph. Two separate chat models are passed,
# both with temperature 0: gpt-4 as `cypher_llm` and gpt-3.5-turbo as `qa_llm`.
# verbose=True makes a run print the generated Cypher and the full context.
cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=ChatOpenAI(temperature=0, model="gpt-4"),
    qa_llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo"),
    validate_cypher=True, # Validate relationship directions
    verbose=True
)

In [13]:
# Ask a natural-language question. Because the chain was built with
# verbose=True, the generated Cypher and the raw query result are printed
# before the final answer string is returned as the cell output.
cypher_chain.run("When was Walter Elias Disney born?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Person {name: "Walter Elias Disney"}) RETURN p.birthdate[0m
Full Context:
[32;1m[1;3m[{'p.birthdate': '1901-12-05'}][0m

[1m> Finished chain.[0m


'Walter Elias Disney was born on December 5, 1901.'