In [1]:
%load_ext dotenv
%dotenv

In [2]:
from langchain_community.graphs import Neo4jGraph
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate


# Basic model

* enum for node labels and rel types
* no node or rel properties
* most openai models are terrible at following enum, so we might introduce optional postfiltering
* not limiting rel types between specific node labels

In [3]:
import os

os.environ["NEO4J_URI"] = 'neo4j+s://demo.neo4jlabs.com'
os.environ["NEO4J_USERNAME"] = 'recommendations'
os.environ["NEO4J_PASSWORD"] = 'recommendations'
os.environ["NEO4J_DATABASE"] = 'recommendations'
os.environ["OPENAI_API_KEY"] = 'sk-'

graph = Neo4jGraph()

In [4]:
llm=ChatOpenAI(temperature=0, model="gpt-4-0125-preview")

prompt = ChatPromptTemplate.from_messages(
        [(
          "system",
          """# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use available types for node labels.
  - If there are provided available values for node labels, use only those and nothing else. If the entity doesn't fit any of the included labels, do not return the entity!
  - **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
## 4. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
## 5. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination.
          """),
            ("human", "Use the given format to extract information from the following input: {input}"),
            ("human", "Tip: Make sure to answer in the correct format and do not include any "),
        ])
 

In [5]:
from typing import List, Union, Optional

from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, Field

def optional_enum_field(enum_values: Optional[List[str]] = None, **field_kwargs):
    """Utility function to conditionally create a field with an enum constraint."""
    if enum_values:
        return Field(..., enum=enum_values, **field_kwargs)
    else:
        return Field(..., **field_kwargs)

def create_simple_model(node_labels: Optional[List[str]] = None, rel_types:Optional[List[str]] = None) -> BaseModel:
    """
    Simple model allows to limit node and/or relationship types.
    Doesn't have any node or relationship properties.
    """
    class Node(BaseModel):
        """Represents a node in a graph with associated properties.
        """
        id: str = Field(description="A unique identifier for the node.")
        type: str = optional_enum_field(node_labels, description="The type or label of the node.")
    
    class Relationship(BaseModel):
        """Represents a directed relationship between two nodes in a graph.
        """
    
        source: Node = Field(description="The source node of the relationship.")
        target: Node = Field(description="The target node of the relationship.")
        type: str = optional_enum_field(rel_types, description="The type of the relationship.")
    
    class Graph(BaseModel):
        """Represents a graph document consisting of nodes and relationships.
        """
    
        nodes: Optional[List[Node]] = Field(description="List of nodes")
        relationships: Optional[List[Relationship]] = Field(description="List of relationships")

    return Graph

In [6]:
schema = create_simple_model(["Business", "Individual"], ["SUES"])

structured_llm = llm.with_structured_output(schema)
chain = prompt | structured_llm

  warn_beta(


In [7]:
chain.invoke({"input": "Elon Musk is suing OpenAI"})

Graph(nodes=[Node(id='Elon Musk', type='Individual'), Node(id='OpenAI', type='Business')], relationships=[Relationship(source=Node(id='Elon Musk', type='Individual'), target=Node(id='OpenAI', type='Business'), type='SUES')])

In [8]:
schema = create_simple_model()

structured_llm = llm.with_structured_output(schema)
chain = prompt | structured_llm

In [9]:
chain.invoke({"input": "Elon Musk is suing OpenAI"})

Graph(nodes=[Node(id='Elon Musk', type='Person'), Node(id='OpenAI', type='Organization')], relationships=[Relationship(source=Node(id='Elon Musk', type='Person'), target=Node(id='OpenAI', type='Organization'), type='is suing')])

# Extended

* Each node label is a separate class, so that we can define props per node label
* Split into two LLM calls (one for nodes, and one for rels)
* not limiting rel types between specific node labels

In [10]:
node_schema = [{k:graph.structured_schema["node_props"][k]} for k in graph.structured_schema["node_props"] if k not in ["Actor", "Director", "_Bloom_Perspective_", "_Bloom_Scene_"]]
node_schema

[{'Movie': [{'property': 'url', 'type': 'STRING'},
   {'property': 'runtime', 'type': 'INTEGER'},
   {'property': 'revenue', 'type': 'INTEGER'},
   {'property': 'plotEmbedding', 'type': 'LIST'},
   {'property': 'posterEmbedding', 'type': 'LIST'},
   {'property': 'imdbRating', 'type': 'FLOAT'},
   {'property': 'released', 'type': 'STRING'},
   {'property': 'countries', 'type': 'LIST'},
   {'property': 'languages', 'type': 'LIST'},
   {'property': 'plot', 'type': 'STRING'},
   {'property': 'imdbVotes', 'type': 'INTEGER'},
   {'property': 'imdbId', 'type': 'STRING'},
   {'property': 'year', 'type': 'INTEGER'},
   {'property': 'poster', 'type': 'STRING'},
   {'property': 'movieId', 'type': 'STRING'},
   {'property': 'tmdbId', 'type': 'STRING'},
   {'property': 'title', 'type': 'STRING'},
   {'property': 'budget', 'type': 'INTEGER'}]},
 {'Genre': [{'property': 'name', 'type': 'STRING'}]},
 {'User': [{'property': 'userId', 'type': 'STRING'},
   {'property': 'name', 'type': 'STRING'}]},
 {'Pe

In [11]:
from typing import Any, Dict, List, Optional, Type
from langchain_core.pydantic_v1 import BaseModel, create_model


def json_to_pydantic(json_data: List[Dict[str, List[Dict[str, str]]]]) -> Type[BaseModel]:
    type_mapping = {
        'STRING': (Optional[str], None),
        'INTEGER': (Optional[int], None),
        'FLOAT': (Optional[float], None),
        'LIST': (Optional[List[str]], None),
        'DATE': (Optional[str], None)
    }

    models = {}

    # Generate individual Pydantic models for each class
    for entity in json_data:
        for class_name, properties in entity.items():
            fields = {'id': (Optional[str], None)}
            fields.update({prop['property']: type_mapping[prop['type']] for prop in properties})
            model = create_model(class_name, **fields)
            models[class_name] = model

    # Generate a Nodes class containing lists of the above models as attributes
    nodes_fields = {model_name: (Optional[List[model]], None) for model_name, model in models.items()}
    Nodes = create_model('Nodes', **nodes_fields)

    return Nodes

In [12]:
node_schema_pydantic = json_to_pydantic(node_schema)

In [13]:
structured_llm_nodes = llm.with_structured_output(node_schema_pydantic)
nodes_chain = prompt | structured_llm_nodes

In [14]:
text = """
Dune (titled onscreen as Dune: Part One) is a 2021 American epic science fiction film directed and co-produced by Denis Villeneuve, who co-wrote the screenplay with Jon Spaihts and Eric Roth. It is the first of a two-part adaptation of the 1965 novel of the same name by Frank Herbert. Set in the distant future, the film follows Paul Atreides as his family, the noble House Atreides, is thrust into a war for the deadly and inhospitable desert planet Arrakis. The ensemble cast includes Timothée Chalamet, Rebecca Ferguson, Oscar Isaac, Josh Brolin, Stellan Skarsgård, Dave Bautista, Stephen McKinley Henderson, Zendaya, Chang Chen, Sharon Duncan-Brewster, Charlotte Rampling, Jason Momoa, and Javier Bardem.
"""

nodes_chain.invoke({"input":text})

Nodes(Movie=[Movie(id='Dune', url=None, runtime=None, revenue=None, plotEmbedding=None, posterEmbedding=None, imdbRating=None, released=None, countries=['United States'], languages=['English'], plot='Set in the distant future, the film follows Paul Atreides as his family, the noble House Atreides, is thrust into a war for the deadly and inhospitable desert planet Arrakis.', imdbVotes=None, imdbId=None, year=2021, poster=None, movieId=None, tmdbId=None, title=None, budget=None)], Genre=None, User=None, Person=[Person(id='Denis Villeneuve', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Denis Villeneuve', poster=None, tmdbId=None), Person(id='Jon Spaihts', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Jon Spaihts', poster=None, tmdbId=None), Person(id='Eric Roth', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Eric Roth', poster=None, tmdbId=None), Person(id='Frank Herbert', url=None, bornIn=None, bio=None,

### Extract rels as step 2 (simple)

* Here we hope that id's will be identical to the step 1 (no guarantees)
* No rel props

In [15]:
def extract_relsonly_model(node_labels: Optional[List[str]] = None, rel_types:Optional[List[str]] = None) -> BaseModel:
    """
    Simple model allows to limit node and/or relationship types.
    Doesn't have any node or relationship properties.
    """
    class Node(BaseModel):
        """Represents a node in a graph with associated properties.
        """
        id: str = Field(description="A unique identifier for the node.")
        type: str = optional_enum_field(node_labels, description="The type or label of the node.")
    
    class Relationship(BaseModel):
        """Represents a directed relationship between two nodes in a graph.
        """
    
        source: Node = Field(description="The source node of the relationship.")
        target: Node = Field(description="The target node of the relationship.")
        type: str = optional_enum_field(rel_types, description="The type of the relationship.")
    
    class Graph(BaseModel):
        """Represents a graph document consisting of nodes and relationships.
        """
        relationships: Optional[List[Relationship]] = Field(description="List of relationships")

    return Graph

In [16]:
node_labels = node_schema = [k for k in graph.structured_schema["node_props"] if k not in ["Actor", "Director", "_Bloom_Perspective_", "_Bloom_Scene_"]]
rel_types = [k["type"] for k in graph.structured_schema["relationships"] if k not in ["_Bloom_HAS_SCENE_"]]
rel_schema_pydantic = extract_relsonly_model(node_labels, rel_types)

structured_llm_rels = llm.with_structured_output(rel_schema_pydantic)
rel_chain = prompt | structured_llm_rels

In [17]:
rel_chain.invoke({"input": text})

Graph(relationships=[Relationship(source=Node(id='Dune', type='Movie'), target=Node(id='Science Fiction', type='Genre'), type='IN_GENRE'), Relationship(source=Node(id='Denis Villeneuve', type='Person'), target=Node(id='Dune', type='Movie'), type='DIRECTED'), Relationship(source=Node(id='Denis Villeneuve', type='Person'), target=Node(id='Dune', type='Movie'), type='ACTED_IN'), Relationship(source=Node(id='Jon Spaihts', type='Person'), target=Node(id='Dune', type='Movie'), type='ACTED_IN'), Relationship(source=Node(id='Eric Roth', type='Person'), target=Node(id='Dune', type='Movie'), type='ACTED_IN'), Relationship(source=Node(id='Frank Herbert', type='Person'), target=Node(id='Dune', type='Movie'), type='ACTED_IN'), Relationship(source=Node(id='Timothée Chalamet', type='Person'), target=Node(id='Dune', type='Movie'), type='ACTED_IN'), Relationship(source=Node(id='Rebecca Ferguson', type='Person'), target=Node(id='Dune', type='Movie'), type='ACTED_IN'), Relationship(source=Node(id='Oscar 

### Extract rels as step 2 (extended)

* Here we hope that id's will be identical to the step 1 (no guarantees)
* Each rel is a separate class, so we can add props and also between which node labels it appears
* Duplicates for multilabeled nodes

In [18]:
def create_relationships_class(input_data: Dict[str, Any]) -> Type[BaseModel]:
    rel_props = input_data['rel_props']
    relationships = input_data['relationships']

    # Create node classes with a required id property
    node_classes = {f"{node_name}Node": create_model(f"{node_name}Node", id=(str, ...)) 
                    for rel in relationships for node_name in [rel['start'], rel['end']]}

    # Initialize relationship models referencing node classes
    relationship_models = {}
    for rel in relationships:
        rel_type=rel["type"]
        # Initialize fields with optional source and target references
        fields = {'source': (Any, ...), 'target': (Any, ...)}
        
        # Add optional properties for this relationship type if they exist
        additional_props = rel_props.get(rel_type, [])
        for prop in additional_props:
            prop_type = prop['type']
            # Make all additional properties optional
            fields[prop['property']] = (Optional[str], None) if prop_type == 'STRING' else \
                                       (Optional[int], None) if prop_type == 'INTEGER' else \
                                       (Optional[float], None) if prop_type == 'FLOAT' else \
                                       (Optional[List[Any]], None)

        # Define source and target types based on the first occurrence in relationships
        for relationship in relationships:
            if relationship['type'] == rel_type:
                source_class = node_classes[f"{relationship['start']}Node"]
                target_class = node_classes[f"{relationship['end']}Node"]
                fields['source'] = (source_class, ...)
                fields['target'] = (target_class, ...)
                break  # Assuming one example is enough to define the model
        # In order to handle relationship with the same type between different node labels, we need to add source and target info to class name
        model_name = rel_type + rel["start"] + rel["end"]
        relationship_models[model_name] = create_model(model_name, **fields)

    # Define fields for the Relationship class containing lists of each relationship model
    relationship_fields = {rel_type: (List[model], []) for rel_type, model in relationship_models.items()}
    Relationship = create_model('Relationship', **relationship_fields)

    return Relationship

In [19]:
rel_extended_schema = create_relationships_class(graph.structured_schema)



In [20]:
structured_llm_rels_extended = llm.with_structured_output(rel_extended_schema)
rel_chain_extended = prompt | structured_llm_rels_extended

In [21]:
rel_chain_extended.invoke({"input": text})

Relationship(IN_GENREMovieGenre=[IN_GENREMovieGenre(source=MovieNode(id='Dune'), target=GenreNode(id='scienceFiction'))], RATEDUserMovie=[], ACTED_INActorMovie=[], DIRECTEDActorMovie=[], DIRECTEDDirectorMovie=[], ACTED_INDirectorMovie=[], ACTED_INPersonMovie=[ACTED_INPersonMovie(source=ActorNode(id='Timothée Chalamet'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Rebecca Ferguson'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Oscar Isaac'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Josh Brolin'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Stellan Skarsgård'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Dave Bautista'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Stephen McKinley Henderson'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=

In [25]:
# Combining should solve entity disambiguation between nodes and rels (work only with the latest gpt-4)
class Graph(BaseModel):
    """Represents a graph document consisting of nodes and relationships.
    """
    nodes: Optional[node_schema_pydantic]
    relationships: Optional[rel_extended_schema]

In [26]:
combined_extended_llm = llm.with_structured_output(Graph)
chain_extended_combined = prompt | combined_extended_llm

In [27]:
chain_extended_combined.invoke({"input": text})

Graph(nodes=Nodes(Movie=[Movie(id='Dune', url=None, runtime=None, revenue=None, plotEmbedding=None, posterEmbedding=None, imdbRating=None, released=None, countries=['United States'], languages=['English'], plot='Set in the distant future, the film follows Paul Atreides as his family, the noble House Atreides, is thrust into a war for the deadly and inhospitable desert planet Arrakis.', imdbVotes=None, imdbId=None, year=2021, poster=None, movieId=None, tmdbId=None, title=None, budget=None)], Genre=None, User=None, Person=[Person(id='Denis Villeneuve', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Denis Villeneuve', poster=None, tmdbId=None), Person(id='Jon Spaihts', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Jon Spaihts', poster=None, tmdbId=None), Person(id='Eric Roth', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Eric Roth', poster=None, tmdbId=None), Person(id='Frank Herbert', url=None, bornIn=Non