In [3]:
from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore

graph_store = Neo4jPropertyGraphStore(
    username="recommendations",
    password="recommendations",
    database="recommendations",
    url="neo4j+s://demo.neo4jlabs.com:7687",
    enhanced_schema=True,
    create_indexes=False
)

  from pandas.core import (


In [4]:
import os
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding


os.environ["OPENAI_API_KEY"] = "sk-"
embed_model = OpenAIEmbedding(model="text-embedding-3-small")
llm = OpenAI(model="gpt-4o-2024-11-20", temperature=0)

In [5]:
from llama_index.core import ChatPromptTemplate

system_prompt = """You are a data expert specializing in concise, natural language explanations. Your goal is to define properties of dataset elements in a way that emphasizes their real-world meaning and purpose. Include information about ranges or counts only if they are interesting or significant to the definition. Provide definitions that help answer practical questions users might ask, such as:

1. "How can I use this property to compare {label} Nodes?"
2. "What does this property reveal about the relationships or behavior of {label} Nodes?"
"""

user_prompt = """Summarize the following property of a {label} Node:

{prop}
"""

prop_msgs = [
    (
        "system",
        system_prompt,
    ),
    ("user", user_prompt),
]

prop_prompt = ChatPromptTemplate.from_messages(prop_msgs)

In [6]:
system_prompt = """You are a data expert specializing in concise, natural language explanations. Your goal is to describe the types of questions or filters that can be applied to Neo4j node labels based on their properties. Explain how the properties of the label enable querying or analysis, highlighting which kinds of questions the data can answer. Mention specific filters, patterns, ranges, or groupings that are particularly meaningful or useful for understanding and exploring the label.  
"""

user_prompt = """Summarize the meaning and purpose of the {label} node in a Neo4j database based on the following property descriptions:

{prop_descriptions}
"""
prop_msgs = [
    (
        "system",
        system_prompt,
    ),
    ("user", user_prompt),
]

label_summary_prompt = ChatPromptTemplate.from_messages(prop_msgs)

In [7]:
system_prompt = """"You are a data expert specializing in concise, natural language explanations. Your goal is to describe relationships in a Neo4j graph database based on their source node, target node, and relationship type. Explain how the properties of the source and target nodes, along with the relationship type, enable querying or analysis. Highlight specific filters, patterns, or traversals that are meaningful for understanding and exploring these relationships. Emphasize how this relationship contributes to the overall structure and insights of the graph."""
user_prompt = """Summarize the meaning and purpose of the relationship labeled {relationship_type} in a Neo4j database based on the following source and target node descriptions:

Source Node: {source_node_description}
Target Node: {target_node_description}"""

rel_msgs = [
    (
        "system",
        system_prompt,
    ),
    ("user", user_prompt),
]

rel_summary_prompt = ChatPromptTemplate.from_messages(rel_msgs)

In [8]:
#graph_store.get_schema()

In [9]:
# Actor, Director we ignore because multilabeled nodes
exclude_types = ["Actor", "Director"]

keys_to_remove_from_prompt = ["distinct_count"]

schema = graph_store.get_schema()
enriched_schema = {}

# Initialize the enriched schema with the original schema structure
enriched_schema.update(schema)
enriched_schema['node_props'] = {}

# Create a dictionary to store node neighbors
node_neighbors = {}
for rel in schema['relationships']:
    start_node = rel['start']
    end_node = rel['end']
    rel_type = rel['type']
    # Remove excluded types
    if any(el in exclude_types for el in [start_node, end_node, rel_type]):
        continue
    
    # Initialize lists if they don't exist
    if start_node not in node_neighbors:
        node_neighbors[start_node] = []
    if end_node not in node_neighbors:
        node_neighbors[end_node] = []
        
    # Todo/Maybe LLM generated descriptions
    """
    source_node_description
    target_node_description
    rel_description = await llm.achat(rel_summary_prompt.format_messages(
        source_node_description=source_node_description, 
        target_node_description=target_node_description, 
        relationship_type=rel_type
    ))
    rel_description = prop_description.message.content
    """
    rel_description = f"{start_node} is connected to {end_node} through relationship {rel_type}"
    
    # Add neighbors in both directions with more detailed structure
    node_neighbors[start_node].append({
        'start_node': start_node,
        'relationship_type': rel_type,
        'target_node': end_node,
        'neighbor_description': rel_description,
        'description_embedding': embed_model.get_text_embedding(rel_description),
        'direction': "NATURAL"
    })
    
    node_neighbors[end_node].append({
        'start_node': start_node,
        'relationship_type': rel_type,
        'target_node': start_node,
        'neighbor_description': rel_description,
        'description_embedding': embed_model.get_text_embedding(rel_description),
        'direction': "REVERSE"
    })

for node in schema['node_props']:
    # Remove excluded node types
    if node in exclude_types:
        continue

    prop_descriptions = []
    enriched_node_props = []
    # For every node label with properties
    for prop in schema['node_props'][node]:
        clean_prop = {k: v for k, v in prop.items() if k not in keys_to_remove_from_prompt}
        # Use LLM to generate description
        prop_description = await llm.achat(prop_prompt.format_messages(label=node, prop=clean_prop))
        prop_description = prop_description.message.content
        print(prop_description)
        # Store property description and its embedding
        clean_prop['description'] = prop_description
        clean_prop['description_embedding'] = embed_model.get_text_embedding(prop_description)
        
        prop_descriptions.append(prop_description)
        enriched_node_props.append(clean_prop)
    # Get node label descriptions
    label_description = await llm.achat(label_summary_prompt.format_messages(label=node, prop_descriptions=prop_descriptions))
    label_description = label_description.message.content

    enriched_schema['node_props'][node] = {
        'label_description': label_description,
        'label_embedding': embed_model.get_text_embedding(label_description),
        'properties': enriched_node_props,
        'neighbors': node_neighbors.get(node, [])  # Now contains list of neighbor objects
    }

The `url` property provides a direct link to the movie's page on The Movie Database (TMDb). This allows users to access detailed information about the movie, such as its cast, crew, reviews, and other metadata. It is useful for verifying data, exploring additional context, or comparing movies by their online presence.
The "runtime" property represents the total duration of a movie in minutes. It helps compare movies based on their length, revealing whether a film is short, standard, or unusually long. This can be useful for planning viewing time or understanding the pacing and scope of a movie.
The "revenue" property represents the total earnings a movie has generated, typically in US dollars. It helps compare the financial success of movies, revealing which ones performed better at the box office. The range spans from modest earnings to record-breaking figures like $2.78 billion.
The `plotEmbedding` property represents a numerical vector that encodes the essence of a movie's plot. It 

In [74]:
#print(graph_store.get_schema_str())

In [61]:
import numpy as np

def cosine_similarity_sort(embeddings_list, query_embedding):
    """
    Calculate cosine similarity between a query embedding and a list of embeddings.
    
    Args:
        embeddings_list: List of tuples, where each tuple contains (id, embedding)
        query_embedding: Single embedding to compare against
        
    Returns:
        List of tuples containing (id, similarity_score), sorted by similarity in descending order
    """
    # Separate IDs and embeddings
    ids = [item[0] for item in embeddings_list]
    embeddings = [item[1] for item in embeddings_list]
    
    # Convert to numpy arrays
    embeddings_array = np.array(embeddings)
    query_array = np.array(query_embedding)
    
    # Ensure query_array is 1D
    query_array = query_array.reshape(1, -1)
    
    # Calculate dot product (using matrix multiplication)
    dot_product = np.dot(embeddings_array, query_array.T).flatten()
    
    # Calculate magnitudes
    embeddings_norm = np.linalg.norm(embeddings_array, axis=1)
    query_norm = np.linalg.norm(query_array)
    
    # Calculate cosine similarity
    similarities = dot_product / (embeddings_norm * query_norm)
    
    # Create list of (id, similarity) tuples
    results = list(zip(ids, similarities))
    
    # Sort by similarity score in descending order
    results.sort(key=lambda x: x[1], reverse=True)
    
    return results

In [62]:
from datetime import datetime

def get_relevant_types(text, comparison_list, k=3):
    query_embedding = embed_model.get_text_embedding(text)
    sort = cosine_similarity_sort(comparison_list,query_embedding)
    return sort[:k]

In [63]:
def get_relevant_tools(input):
    output = []
    # Get relevant node types
    labels_list = [
        (node, enriched_schema["node_props"][node]["label_embedding"])
        for node in enriched_schema["node_props"]
    ]
    labels = get_relevant_types(input, labels_list)
    # For each node label get most relevant properties
    for label in labels:
        potential_properties = [
            ((prop["property"], prop["type"]), prop["description_embedding"])
            for prop in enriched_schema["node_props"][label[0]]["properties"]
            if not prop["type"] in ["EMBEDDING"]
        ]
        potential_neighbors = [
            ((prop["start_node"], prop["relationship_type"], prop["target_node"]), prop["description_embedding"])
            for prop in enriched_schema["node_props"][label[0]]["neighbors"]
            
        ]
        props = get_relevant_types(input, [*potential_properties, *potential_neighbors], 15)
        output.append({"node_label": label, "properties": props})
    return output

In [64]:
#%timeit get_relevant_tools("Where is Tom Hanks from")

In [122]:
#get_relevant_tools("How many genres are there")

In [123]:
#get_relevant_tools("Who is Tom Hanks?")

In [124]:
#get_relevant_tools("What's the name of Tom Hanks?")

In [68]:
# Storing the schema
import json

# Specify the file name
file_name = "schema.json"

# Write dictionary to JSON file
with open(file_name, 'w') as json_file:
    json.dump(schema, json_file, indent=4)

In [69]:
#enriched_schema['node_props']["Movie"]

In [94]:
from typing import Optional, List, Dict, Any, Union
from pydantic import BaseModel, create_model, Field
from enum import Enum

# Define operator enums for different property types
class StringOperator(str, Enum):
    CONTAINS = "CONTAINS"
    EQUALS = "EQUALS"
    STARTS_WITH = "STARTS_WITH"
    ENDS_WITH = "ENDS_WITH"

class NumberOperator(str, Enum):
    EQUALS = "EQUALS"
    GREATER_THAN = "GREATER_THAN"
    LESS_THAN = "LESS_THAN"
    GREATER_OR_EQUAL = "GREATER_OR_EQUAL"
    LESS_OR_EQUAL = "LESS_OR_EQUAL"

class BooleanOperator(str, Enum):
    IS_TRUE = "IS_TRUE"
    IS_FALSE = "IS_FALSE"

class ListOperator(str, Enum):
    EQUALS = "EQUALS"
    IN = "IN"

# Type mapping function with operator enums
def get_python_type_and_operator(prop_type: str):
    """Maps property types to Python types and corresponding operator enums."""
    type_mapping = {
        'STRING': (str, StringOperator),
        'TEXT': (str, StringOperator),
        'LIST': (List[str], ListOperator),
        'INT': (int, NumberOperator),
        'FLOAT': (float, NumberOperator),
        'BOOL': (bool, BooleanOperator),
    }
    return type_mapping.get(prop_type, (Any, None))

class Relationship(BaseModel):
    count: Optional[bool] = Field(None, description="Should we count the number of relationships")

def create_node_class(properties: List[tuple]) -> dict:
    """Creates field definitions for a node class based on properties."""
    field_definitions = {}
    
    for prop_tuple in properties:
        prop, confidence = prop_tuple  # Unpack the property tuple and its confidence score
        
        if len(prop) == 2:  # Regular property
            prop_name, prop_type = prop
            python_type, operator_enum = get_python_type_and_operator(prop_type)
    
            # Add the operator attribute for properties
            operator_field_name = f"{prop_name}_operator"
            field_definitions[str(prop_name)] = (Optional[python_type], Field(
                None,
                description=f"Property '{prop_name}' of type '{prop_type}'"
            ))
            if operator_enum:
                field_definitions[str(operator_field_name)] = (Optional[operator_enum], Field(
                    None,
                    description=f"Operator for '{prop_name}'"
                ))
        else:  # Relationship
            source_node, rel_type, target_node = prop
            # Create relationship field name
            relationship_field_name = f"{rel_type}___{target_node}"
            
            # Add relationship field with Relationship class instead of RelationshipProperties
            field_definitions[str(relationship_field_name)] = (
                Optional[Relationship],
                Field(
                    None,
                    description=f"Relationship {rel_type} from {source_node} to {target_node}"
                )
            )
            
    return field_definitions

def generate_schema_class(input_data: List[Dict[str, Any]]):
    """Generates the main Pydantic class with nested node classes."""
    # First, create individual node classes
    node_classes = {}
    for node in input_data:
        node_label, confidence = node['node_label']
        properties = node['properties']

        # Create the node class
        node_class = create_model(
            f"{node_label}Node",
            **create_node_class(properties)
        )
        node_classes[str(node_label)] = (Optional[node_class], Field(
            None,
            description=f"Node representing {node_label}"
        ))

    # Create the main schema class
    SchemaClass = create_model("Schema", **node_classes)
    return SchemaClass

In [96]:
schemaa = generate_schema_class(get_relevant_tools("What's the name of Tom Hanks?"))

In [97]:
# Recursive function to get non-null keys and values
def get_non_null_values(instance):
    if isinstance(instance, BaseModel):
        # Iterate through the model fields and recursively filter
        return {
            key: get_non_null_values(value)
            for key, value in instance.dict(exclude_none=True).items()
        }
    elif isinstance(instance, dict):
        # Handle nested dictionaries
        return {key: get_non_null_values(value) for key, value in instance.items()}
    else:
        # Return the value if it's not None
        return instance

In [114]:
def construct_cypher(non_null_keys):
    for key, value in non_null_keys.items():
        cypher = f"MATCH (n:`{key}`) "
        
        filters = []
        params = {}
        count_matches = []
        
        # Group properties and their operators
        properties = {}
        for prop_key, prop_value in value.items():
            if not prop_key.endswith('_operator'):
                operator_key = f"{prop_key}_operator"
                operator = value.get(operator_key, "EQUALS")
                properties[prop_key] = {
                    'value': prop_value,
                    'operator': operator
                }

        # Create filters and params
        for i, (prop, details) in enumerate(properties.items()):
            print(i, prop, details)
            # If we count relationships, probably need some better detection method
            if isinstance(details.get("value"), dict) and details.get("value").get("count"):
                rel_type, target_node = prop.split("___") # We hope nobody uses triple under
                print(rel_type, target_node)
                # Add a WITH clause to count relationships
                count_matches.append(
                    f"count{{ (n)-[:`{rel_type}`]->(:`{target_node}`)}} AS count_{rel_type}_{target_node}"
                )
            else: # If we filter of a node property
                # Define Cypher operator syntax mapping
                operator_mapping = {
                    "EQUALS": "=",
                    "CONTAINS": "CONTAINS",
                    "STARTS_WITH": "STARTS WITH",
                    "ENDS_WITH": "ENDS WITH",
                    "GREATER_THAN": ">",
                    "LESS_THAN": "<",
                    "GREATER_OR_EQUAL": ">=",
                    "LESS_OR_EQUAL": "<=",
                    "IN": "IN"
                }
                
                # Get the operator string value if it's an enum
                operator = str(details['operator']).split('.')[-1].strip("'")
                cypher_operator = operator_mapping.get(operator, "=")
                
                # Define parameter and filter
                prop_param = f"prop_{i}"
                filters.append(f"n.`{prop}` {cypher_operator} ${prop_param}")
                params[prop_param] = details['value']

        if filters:
            cypher += f"WHERE {' AND '.join(filters)} "


        if count_matches:
            cypher += "RETURN " + ", ".join(count_matches)
        else:
            # Counting return
            cypher += "RETURN count(*) AS count"
        return cypher, params

In [115]:
def retriever(input):
    # Generate Pydantic Schema
    schema_input = get_relevant_tools(input)
    #print(schema_input)
    SchemaObject = generate_schema_class(schema_input)
    # LLM with structured output
    tool_inputs = get_non_null_values((
        llm.as_structured_llm(SchemaObject)
        .complete(input)
        .raw
    ))
    #print(tool_inputs)
    # Deterministically generate Cypher statement
    cypher, params = construct_cypher(tool_inputs)
    print(cypher, params)
    # Execute Cypher statement
    results = graph_store.structured_query(cypher, param_map=params)
    return results

    

In [116]:
retriever("How many movies did Tom Hanks appear?")

0 ACTED_IN___Movie {'value': {'count': True}, 'operator': 'EQUALS'}
ACTED_IN Movie
1 name {'value': 'Tom Hanks', 'operator': <StringOperator.EQUALS: 'EQUALS'>}
MATCH (n:`Person`) WHERE n.`name` = $prop_1 RETURN count{ (n)-[:`ACTED_IN`]->(:`Movie`)} AS count_ACTED_IN_Movie {'prop_1': 'Tom Hanks'}


[{'count_ACTED_IN_Movie': 38}]

In [117]:
retriever("How many Tom Hanks in the db?")

0 name {'value': 'Tom Hanks', 'operator': <StringOperator.EQUALS: 'EQUALS'>}
MATCH (n:`Person`) WHERE n.`name` = $prop_0 RETURN count(*) AS count {'prop_0': 'Tom Hanks'}


[{'count': 1}]

In [118]:
retriever("How many movies in japan language?")

0 languages {'value': ['Japanese'], 'operator': <ListOperator.EQUALS: 'EQUALS'>}
MATCH (n:`Movie`) WHERE n.`languages` = $prop_0 RETURN count(*) AS count {'prop_0': ['Japanese']}


[{'count': 133}]

In [119]:
retriever("How many movies where at least one of the languages is japanese?")

0 languages {'value': ['Japanese'], 'operator': <ListOperator.IN: 'IN'>}
MATCH (n:`Movie`) WHERE n.`languages` IN $prop_0 RETURN count(*) AS count {'prop_0': ['Japanese']}


[{'count': 0}]

In [120]:
retriever("How many movies have imdb rating less than 7.5?")

0 imdbRating {'value': 7.5, 'operator': <NumberOperator.LESS_THAN: 'LESS_THAN'>}
MATCH (n:`Movie`) WHERE n.`imdbRating` < $prop_0 RETURN count(*) AS count {'prop_0': 7.5}


[{'count': 6746}]

In [121]:
retriever("How many person names start with H?")

0 name {'value': 'H', 'operator': <StringOperator.STARTS_WITH: 'STARTS_WITH'>}
MATCH (n:`Person`) WHERE n.`name` STARTS WITH $prop_0 RETURN count(*) AS count {'prop_0': 'H'}


[{'count': 536}]