In [1]:
!pip install --upgrade --quiet neo4j langchain-experimental langchain_community langchain langchain-google-vertexai

In [2]:
import os

# Path of the service-account key file used for Google authentication.
# setdefault keeps a path that is already configured in the environment and only
# falls back to the local "credentials.json" when the variable is unset.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "credentials.json")

In [3]:
from typing import Any, List, Optional, Sequence

from langchain_community.graphs.graph_document import GraphDocument, Node, Relationship
from langchain_core.documents import Document
from langchain_core.language_models import BaseLanguageModel
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

# System message with the extraction rules given to the chat model. It is built from
# adjacent string literals, so every fragment must carry its own trailing space or
# "\n"; a missing separator silently glues two sentences/bullets together.
# NOTE(review): the heading still names GPT-4 although this notebook drives a Gemini
# model - left unchanged, confirm whether it should be made model-agnostic.
system_prompt = (
    "# Knowledge Graph Instructions for GPT-4\n"
    "## 1. Overview\n"
    "You are a top-tier algorithm designed for extracting information in structured "
    "formats to build a knowledge graph.\n"
    "Try to capture as much information from the text as possible without "
    "sacrificing accuracy. Do not add any information that is not explicitly "
    "mentioned in the text\n"
    "- **Nodes** represent entities and concepts.\n"
    "- The aim is to achieve simplicity and clarity in the knowledge graph, making it\n"
    "accessible for a vast audience.\n"
    "## 2. Labeling Nodes\n"
    "- **Consistency**: Ensure you use available types for node labels.\n"
    "Ensure you use basic or elementary types for node labels.\n"
    "- For example, when you identify an entity representing a person, "
    "always label it as **'person'**. Avoid using more specific terms "
    # The newline keeps the "Node IDs" bullet on its own line.
    "like 'mathematician' or 'scientist'\n"
    "  - **Node IDs**: Never utilize integers as node IDs. Node IDs should be "
    "names or human-readable identifiers found in the text.\n"
    "- **Relationships** represent connections between entities or concepts.\n"
    "Ensure consistency and generality in relationship types when constructing "
    "knowledge graphs. Instead of using specific and momentary types "
    "such as 'BECAME_PROFESSOR', use more general and timeless relationship types "
    "like 'PROFESSOR'. Make sure to use general and timeless relationship types!\n"
    "## 3. Coreference Resolution\n"
    "- **Maintain Entity Consistency**: When extracting entities, it's vital to "
    "ensure consistency.\n"
    'If an entity, such as "John Doe", is mentioned multiple times in the text '
    # Trailing space so the sentence does not read '"he"),always'.
    'but is referred to by different names or pronouns (e.g., "Joe", "he"), '
    "always use the most complete identifier for that entity throughout the "
    'knowledge graph. In this example, use "John Doe" as the entity ID.\n'
    "Remember, the knowledge graph should be coherent and easily understandable, "
    "so maintaining consistency in entity references is crucial.\n"
    "## 4. Strict Compliance\n"
    "Adhere to the rules strictly. Non-compliance will result in termination."
)

# Two-message chat prompt: the extraction rules are the system message, and the text
# to analyse is substituted for the {input} placeholder of the human message.
default_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        (
            "human",
            "Tip: Make sure to answer in the correct format and do "
            "not include any explanations. "
            "Use the given format to extract information from the "
            "following input: {input}",
        ),
    ]
)


def optional_enum_field(
    enum_values: Optional[List[str]] = None,
    description: str = "",
    is_rel: bool = False,
    **field_kwargs: Any,
) -> Any:
    """Create a required field, constrained to an enum when values are supplied.

    Args:
        enum_values: Allowed values. When non-empty they are passed to ``Field`` as
            ``enum`` and listed in the description; when ``None`` or empty the
            field is unconstrained and generic labelling guidance is appended.
        description: Base description of the field. A trailing period is optional.
        is_rel: Selects the guidance used when ``enum_values`` is empty:
            relationship-type guidance if true, node-label guidance otherwise.
        **field_kwargs: Extra keyword arguments forwarded unchanged to ``Field``.

    Returns:
        The object produced by ``Field`` (required, i.e. created with ``...``).
    """
    # Normalise the caller's sentence so the parts always join as
    # "<description>. <extra>". Plain concatenation produced run-ons such as
    # "start node.. Available options" and "end node.Ensure you use".
    base = description.strip().rstrip(".")

    if enum_values:
        extra = f"Available options are {enum_values}"
        return Field(
            ...,
            enum=enum_values,
            description=f"{base}. {extra}" if base else extra,
            **field_kwargs,
        )

    node_info = (
        "Ensure you use basic or elementary types for node labels.\n"
        "For example, when you identify an entity representing a person, "
        "always label it as **'Person'**. Avoid using more specific terms "
        "like 'Mathematician' or 'Scientist'"
    )
    rel_info = (
        "Instead of using specific and momentary types such as "
        "'BECAME_PROFESSOR', use more general and timeless relationship types like "
        "'PROFESSOR'. However, do not sacrifice any accuracy for generality"
    )
    extra = rel_info if is_rel else node_info
    return Field(
        ...,
        description=f"{base}. {extra}" if base else extra,
        **field_kwargs,
    )


def create_simple_model(
    node_labels: Optional[List[str]] = None, rel_types: Optional[List[str]] = None
) -> Any:
    """Build the schema class used for structured relationship extraction.

    The model carries no node or relationship properties; it only allows the
    node and/or relationship types to be limited.

    Args:
        node_labels: Allowed node types. When given, ``start_node_type`` and
            ``end_node_type`` are restricted to these values.
        rel_types: Allowed relationship types. When given, ``type`` is restricted
            to these values.

    Returns:
        The ``DynamicGraph`` class, whose ``relationships`` field is an optional
        list of ``SimpleRelationship`` entries.
    """

    class SimpleRelationship(BaseModel):
        """Represents a directed relationship between two nodes in a graph."""

        # Node IDs are free-text identifiers; only the *type* fields may be limited
        # to node_labels. The start-node pair mirrors the end-node pair below.
        start_node_id: str = Field(
            description="Name or human-readable unique identifier of start node"
        )
        start_node_type: str = optional_enum_field(
            node_labels, description="The type or label of the start node."
        )
        end_node_id: str = Field(
            description="Name or human-readable unique identifier of end node"
        )
        end_node_type: str = optional_enum_field(
            node_labels, description="The type or label of the end node."
        )
        type: str = optional_enum_field(
            rel_types, description="The type of the relationship.", is_rel=True
        )

    class DynamicGraph(BaseModel):
        """Represents a graph document consisting of nodes and relationships."""

        relationships: Optional[List[SimpleRelationship]] = Field(
            description="List of relationships"
        )

    return DynamicGraph


In [4]:
from langchain.pydantic_v1 import BaseModel
from langchain_google_vertexai import ChatVertexAI
from langchain_core.prompts import ChatPromptTemplate
from typing import List
from langchain_core.pydantic_v1 import BaseModel, Field

# No label/type lists are passed, so the schema leaves node and relationship types
# unconstrained.
schema = create_simple_model()

llm = ChatVertexAI(
    model_name="gemini-pro",
    temperature=0,
    convert_system_message_to_human=True,
)
# Wrap the model so that its reply is returned as an instance of `schema`.
llm_transformer = llm.with_structured_output(schema)

# Pipeline: fill in the prompt, then run the structured-output model on it.
chain = default_prompt | llm_transformer

# Sample passage (a short biography of Marie Curie) used as the extraction input.
text ="""
Maria Salomea Skłodowska-Curie[a] (Polish: [ˈmarja salɔˈmɛa skwɔˈdɔfska kʲiˈri] ⓘ; née Skłodowska; 7 November 1867 – 4 July 1934), known simply as Marie Curie (/ˈkjʊəri/ KURE-ee,[1] French: [maʁi kyʁi]), was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity. She was the first woman to win a Nobel Prize, the first person to win a Nobel Prize twice, and the only person to win a Nobel Prize in two scientific fields. Her husband, Pierre Curie, was a co-winner of her first Nobel Prize, making them the first-ever married couple to win the Nobel Prize and launching the Curie family legacy of five Nobel Prizes. She was, in 1906, the first woman to become a professor at the University of Paris.[2]
"""

# Run the chain; the passage is substituted for the prompt's {input} placeholder.
llm_output = chain.invoke({"input": text})

  from pandas.core import (


In [5]:
# Display the structured result returned by the chain.
llm_output

DynamicGraph(relationships=[SimpleRelationship(start_node_id='Maria Salomea Skłodowska-Curie', start_node_type='Person', end_node_id='Nobel Prize', end_node_type='Award', type='WINNER'), SimpleRelationship(start_node_id='Maria Salomea Skłodowska-Curie', start_node_type='Person', end_node_id='University of Paris', end_node_type='Organization', type='PROFESSOR'), SimpleRelationship(start_node_id='Maria Salomea Skłodowska-Curie', start_node_type='Person', end_node_id='Pierre Curie', end_node_type='Person', type='SPOUSE'), SimpleRelationship(start_node_id='Pierre Curie', start_node_type='Person', end_node_id='Nobel Prize', end_node_type='Award', type='WINNER')])

In [6]:
def map_to_base_relationship(rel: Any) -> Relationship:
    """Convert an extracted SimpleRelationship into a base Relationship.

    Node ids are title-cased, node types are capitalized, and the relationship
    type has spaces replaced by underscores and is upper-cased.
    """
    start_node = Node(
        id=rel.start_node_id.title(),
        type=rel.start_node_type.capitalize(),
    )
    end_node = Node(
        id=rel.end_node_id.title(),
        type=rel.end_node_type.capitalize(),
    )
    rel_type = rel.type.replace(" ", "_").upper()
    return Relationship(source=start_node, target=end_node, type=rel_type)

# `llm_output.relationships` is optional: a missing or empty value yields an empty list.
relationships = [
    map_to_base_relationship(extracted)
    for extracted in (llm_output.relationships or [])
]

# One graph document for the whole passage; the node list is left empty and the
# original text is attached as the source document.
graph_documents = [
    GraphDocument(
        nodes=[],
        relationships=relationships,
        source=Document(page_content=text),
    )
]

In [7]:
import os
from langchain_community.graphs import Neo4jGraph

# Connection settings for the Neo4j instance. setdefault leaves values that are
# already configured in the environment untouched, so real credentials can be
# supplied from outside the notebook; the literals below are only local-development
# fallbacks and should not be used for a shared or production database.
os.environ.setdefault("NEO4J_URI", "bolt://localhost:7687")
os.environ.setdefault("NEO4J_USERNAME", "neo4j")
os.environ.setdefault("NEO4J_PASSWORD", "password")

# Created without arguments, so the connection details come from the environment
# variables set above.
graph = Neo4jGraph()

In [8]:
# Write the extracted relationships (and the nodes they reference) to Neo4j.
graph.add_graph_documents(graph_documents)