In [2]:
from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore

graph_store = Neo4jPropertyGraphStore(
    username="recommendations",
    password="recommendations",
    database="recommendations",
    url="neo4j+s://demo.neo4jlabs.com:7687",
    enhanced_schema=True,
    create_indexes=False
)

  from pandas.core import (


In [38]:
import os
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding


os.environ["OPENAI_API_KEY"] = "sk-"
embed_model = OpenAIEmbedding(model="text-embedding-3-small")
llm = OpenAI(model="gpt-4o-2024-11-20", temperature=0)

In [4]:
from llama_index.core import ChatPromptTemplate

system_prompt = """You are a data expert specializing in concise, natural language explanations. Your goal is to define properties of dataset elements in a way that emphasizes their real-world meaning and purpose. Include information about ranges or counts only if they are interesting or significant to the definition. Provide definitions that help answer practical questions users might ask, such as:

1. "How can I use this property to compare {label} Nodes?"
2. "What does this property reveal about the relationships or behavior of {label} Nodes?"
"""

user_prompt = """Summarize the following property of a {label} Node:

{prop}
"""

prop_msgs = [
    (
        "system",
        system_prompt,
    ),
    ("user", user_prompt),
]

prop_prompt = ChatPromptTemplate.from_messages(prop_msgs)

In [5]:
system_prompt = """You are a data expert specializing in concise, natural language explanations. Your goal is to describe the types of questions or filters that can be applied to Neo4j node labels based on their properties. Explain how the properties of the label enable querying or analysis, highlighting which kinds of questions the data can answer. Mention specific filters, patterns, ranges, or groupings that are particularly meaningful or useful for understanding and exploring the label.  
"""

user_prompt = """Summarize the meaning and purpose of the {label} node in a Neo4j database based on the following property descriptions:

{prop_descriptions}
"""
prop_msgs = [
    (
        "system",
        system_prompt,
    ),
    ("user", user_prompt),
]

label_summary_prompt = ChatPromptTemplate.from_messages(prop_msgs)

In [6]:
# Actor, Director we ignore because multilabeled nodes
exclude_types = ["Actor", "Director"]

keys_to_remove_from_prompt = ["distinct_count"]

schema = graph_store.get_schema()
enriched_schema = {}

# Initialize the enriched schema with the original schema structure
enriched_schema.update(schema)
enriched_schema['node_props'] = {}

for node in schema['node_props']:
    # Remove excluded node types
    if node in exclude_types:
        continue

    prop_descriptions = []
    enriched_node_props = []
    # For every node label with properties
    for prop in schema['node_props'][node]:
        clean_prop = {k: v for k, v in prop.items() if k not in keys_to_remove_from_prompt}
        # Use LLM to generate description
        prop_description = await llm.achat(prop_prompt.format_messages(label=node, prop=clean_prop))
        prop_description = prop_description.message.content
        print(prop_description)
        # Store property description and its embedding
        clean_prop['description'] = prop_description
        clean_prop['description_embedding'] = embed_model.get_text_embedding(prop_description)
        
        prop_descriptions.append(prop_description)
        enriched_node_props.append(clean_prop)

    label_description = await llm.achat(label_summary_prompt.format_messages(label=node, prop_descriptions=prop_descriptions))
    label_description = label_description.message.content

    enriched_schema['node_props'][node] = {
        'label_description': label_description,
        'label_embedding': embed_model.get_text_embedding(label_description),
        'properties': enriched_node_props
    }

The `posterEmbedding` property represents a numerical encoding of a movie's poster image, capturing its visual features in a compact format. This embedding can be used to compare the visual similarity between movie posters or to group movies with similar poster styles. The values are part of a fixed-size vector, typically used in machine learning or recommendation systems.
The `url` property provides a direct link to the movie's page on The Movie Database (TMDb). This allows users to access detailed information about the movie, such as its cast, crew, reviews, and other metadata. It is useful for verifying data, exploring additional context, or comparing movies by their online presence.
The "runtime" property represents the total duration of a movie in minutes. It helps compare movies based on their length, revealing whether a film is short, standard, or unusually long. For example, a runtime of 910 minutes indicates an exceptionally long movie, while runtimes closer to 2 minutes sugge

In [74]:
#print(graph_store.get_schema_str())

In [13]:
import numpy as np

def cosine_similarity_sort(embeddings_list, query_embedding):
    """
    Calculate cosine similarity between a query embedding and a list of embeddings.
    
    Args:
        embeddings_list: List of tuples, where each tuple contains (id, embedding)
        query_embedding: Single embedding to compare against
        
    Returns:
        List of tuples containing (id, similarity_score), sorted by similarity in descending order
    """
    # Separate IDs and embeddings
    ids = [item[0] for item in embeddings_list]
    embeddings = [item[1] for item in embeddings_list]
    
    # Convert to numpy arrays
    embeddings_array = np.array(embeddings)
    query_array = np.array(query_embedding)
    
    # Ensure query_array is 1D
    query_array = query_array.reshape(1, -1)
    
    # Calculate dot product (using matrix multiplication)
    dot_product = np.dot(embeddings_array, query_array.T).flatten()
    
    # Calculate magnitudes
    embeddings_norm = np.linalg.norm(embeddings_array, axis=1)
    query_norm = np.linalg.norm(query_array)
    
    # Calculate cosine similarity
    similarities = dot_product / (embeddings_norm * query_norm)
    
    # Create list of (id, similarity) tuples
    results = list(zip(ids, similarities))
    
    # Sort by similarity score in descending order
    results.sort(key=lambda x: x[1], reverse=True)
    
    return results

In [14]:
from datetime import datetime

def get_relevant_types(text, comparison_list, k=3):
    query_embedding = embed_model.get_text_embedding(text)
    sort = cosine_similarity_sort(comparison_list,query_embedding)
    return sort[:k]

In [22]:
def get_relevant_tools(input):
    output = []
    # Get relevant node types
    labels_list = [(node, enriched_schema['node_props'][node]['label_embedding']) for node in enriched_schema['node_props']]
    labels = get_relevant_types(input, labels_list)
    # For each node label get most relevant properties
    for label in labels:
        potential_properties = [((prop['property'], prop['type']), prop['description_embedding']) for prop in enriched_schema['node_props'][label[0]]['properties']]
        props = get_relevant_types(input, potential_properties, 5)
        output.append({"node_label": label, "properties": props})
    return output

In [23]:
#%timeit get_relevant_tools("Where is Tom Hanks from")

In [24]:
get_relevant_tools("How many genres are there")

[{'node_label': ('Genre', 0.45076284553687046),
  'properties': [(('name', 'STRING'), 0.35702815318003717)]},
 {'node_label': ('Movie', 0.20537361062973428),
  'properties': [(('languages', 'LIST'), 0.2217072661005813),
   (('countries', 'LIST'), 0.22100465491268248),
   (('revenue', 'INTEGER'), 0.1789930268356038),
   (('budget', 'INTEGER'), 0.15955166327233392),
   (('runtime', 'INTEGER'), 0.1473851601939835)]},
 {'node_label': ('Person', 0.11470623864446454),
  'properties': [(('imdbId', 'STRING'), 0.05511708928345541),
   (('url', 'STRING'), 0.045275299278740556),
   (('bornIn', 'STRING'), 0.04500999886862818),
   (('bio', 'TEXT'), 0.04397884711788456),
   (('tmdbId', 'STRING'), 0.031159249972932273)]}]

In [25]:
get_relevant_tools("Who is Tom Hanks?")

[{'node_label': ('Person', 0.21214112251243697),
  'properties': [(('tmdbId', 'STRING'), 0.25392604230219945),
   (('imdbId', 'STRING'), 0.20609791533607974),
   (('url', 'STRING'), 0.20034625316889473),
   (('name', 'STRING'), 0.1683545045996092),
   (('bio', 'TEXT'), 0.15133074850980432)]},
 {'node_label': ('Movie', 0.18310718618633437),
  'properties': [(('tmdbId', 'STRING'), 0.24244398416650054),
   (('title', 'STRING'), 0.219017036751108),
   (('countries', 'LIST'), 0.2027850324367955),
   (('imdbId', 'STRING'), 0.1999078405675809),
   (('imdbVotes', 'INTEGER'), 0.19155435443834112)]},
 {'node_label': ('User', 0.12712079233912324),
  'properties': [(('name', 'STRING'), 0.1477346547297757),
   (('userId', 'STRING'), 0.13701414663348066)]}]

In [26]:
get_relevant_tools("What's the name of Tom Hanks?")

[{'node_label': ('Person', 0.20818689131971443),
  'properties': [(('tmdbId', 'STRING'), 0.2671541719414238),
   (('name', 'STRING'), 0.2365805322517942),
   (('imdbId', 'STRING'), 0.19971445001764604),
   (('url', 'STRING'), 0.19902483834005724),
   (('bio', 'TEXT'), 0.15449388828855956)]},
 {'node_label': ('Movie', 0.17939909347811545),
  'properties': [(('title', 'STRING'), 0.2884291802331046),
   (('tmdbId', 'STRING'), 0.26030864438159484),
   (('imdbId', 'STRING'), 0.2006082689237742),
   (('countries', 'LIST'), 0.19943722335068123),
   (('url', 'STRING'), 0.18734399969515683)]},
 {'node_label': ('User', 0.14750609411985074),
  'properties': [(('name', 'STRING'), 0.21169425244683782),
   (('userId', 'STRING'), 0.1287573271581887)]}]

In [111]:
# Storing the schema
import json

# Specify the file name
file_name = "schema.json"

# Write dictionary to JSON file
with open(file_name, 'w') as json_file:
    json.dump(schema, json_file, indent=4)

In [1]:
#enriched_schema['node_props']["Movie"]

In [70]:
from typing import Optional, List, Dict, Any, Union
from pydantic import BaseModel, create_model, Field
from enum import Enum

# Define operator enums for different property types
class StringOperator(str, Enum):
    CONTAINS = "CONTAINS"
    EQUALS = "EQUALS"
    STARTS_WITH = "STARTS_WITH"
    ENDS_WITH = "ENDS_WITH"

class NumberOperator(str, Enum):
    EQUALS = "EQUALS"
    GREATER_THAN = "GREATER_THAN"
    LESS_THAN = "LESS_THAN"
    GREATER_OR_EQUAL = "GREATER_OR_EQUAL"
    LESS_OR_EQUAL = "LESS_OR_EQUAL"

class BooleanOperator(str, Enum):
    IS_TRUE = "IS_TRUE"
    IS_FALSE = "IS_FALSE"

class ListOperator(str, Enum):
    EQUALS = "EQUALS"
    IN = "IN"

# Type mapping function with operator enums
def get_python_type_and_operator(prop_type: str):
    """Maps property types to Python types and corresponding operator enums."""
    type_mapping = {
        'STRING': (str, StringOperator),
        'TEXT': (str, StringOperator),
        'LIST': (List[str], ListOperator),
        'INT': (int, NumberOperator),
        'FLOAT': (float, NumberOperator),
        'BOOL': (bool, BooleanOperator),
    }
    return type_mapping.get(prop_type, (Any, None))

def create_node_class(properties: List[tuple]) -> dict:
    """Creates field definitions for a node class based on properties."""
    field_definitions = {}
    for (prop_name, prop_type), _ in properties:
        python_type, operator_enum = get_python_type_and_operator(prop_type)

        # Add the operator attribute for properties
        operator_field_name = f"{prop_name}_operator"
        field_definitions[prop_name] = (Optional[python_type], Field(
            None,
            description=f"Property '{prop_name}' of type '{prop_type}'"
        ))
        if operator_enum:
            field_definitions[operator_field_name] = (Optional[operator_enum], Field(
                None,
                description=f"Operator for '{prop_name}'"
            ))
    return field_definitions

def generate_schema_class(input_data: List[Dict[str, Any]]):
    """Generates the main Pydantic class with nested node classes."""
    # First, create individual node classes
    node_classes = {}
    for node in input_data:
        node_label = node['node_label'][0]
        properties = node['properties']

        # Create the node class
        node_class = create_model(
            f"{node_label}Node",
            **create_node_class(properties)
        )
        node_classes[node_label] = (Optional[node_class], Field(
            None,
            description=f"Node representing {node_label}"
        ))

    # Create the main schema class
    SchemaClass = create_model("Schema", **node_classes)
    return SchemaClass

In [71]:
# Recursive function to get non-null keys and values
def get_non_null_values(instance):
    if isinstance(instance, BaseModel):
        # Iterate through the model fields and recursively filter
        return {
            key: get_non_null_values(value)
            for key, value in instance.dict(exclude_none=True).items()
        }
    elif isinstance(instance, dict):
        # Handle nested dictionaries
        return {key: get_non_null_values(value) for key, value in instance.items()}
    else:
        # Return the value if it's not None
        return instance

In [78]:
def construct_cypher(non_null_keys):
    for key, value in non_null_keys.items():
        cypher = f"MATCH (n:`{key}`) "
        
        filters = []
        params = {}
        
        # Group properties and their operators
        properties = {}
        for prop_key, prop_value in value.items():
            if not prop_key.endswith('_operator'):
                operator_key = f"{prop_key}_operator"
                operator = value.get(operator_key, "EQUALS")
                properties[prop_key] = {
                    'value': prop_value,
                    'operator': operator
                }

        # Create filters and params
        for i, (prop, details) in enumerate(properties.items()):
            # Define Cypher operator syntax mapping
            operator_mapping = {
                "EQUALS": "=",
                "CONTAINS": "CONTAINS",
                "STARTS_WITH": "STARTS WITH",
                "ENDS_WITH": "ENDS WITH",
                "GREATER_THAN": ">",
                "LESS_THAN": "<",
                "GREATER_OR_EQUAL": ">=",
                "LESS_OR_EQUAL": "<=",
                "IN": "IN"
            }
            
            # Get the operator string value if it's an enum
            operator = str(details['operator']).split('.')[-1].strip("'")
            cypher_operator = operator_mapping.get(operator, "=")
            
            # Define parameter and filter
            prop_param = f"prop_{i}"
            filters.append(f"n.`{prop}` {cypher_operator} ${prop_param}")
            params[prop_param] = details['value']

        if filters:
            cypher += f"WHERE {' AND '.join(filters)} "

        # Counting return
        cypher += "RETURN count(*) AS count"
        return cypher, params

In [79]:
def retriever(input):
    # Generate Pydantic Schema
    schema_input = get_relevant_tools(input)
    #print(schema_input)
    SchemaObject = generate_schema_class(schema_input)
    # LLM with structured output
    tool_inputs = get_non_null_values((
        llm.as_structured_llm(SchemaObject)
        .complete(input)
        .raw
    ))
    #print(tool_inputs)
    # Deterministically generate Cypher statement
    cypher, params = construct_cypher(tool_inputs)
    print(cypher, params)
    # Execute Cypher statement
    results = graph_store.structured_query(cypher, param_map=params)
    return results

    

In [80]:
retriever("How many Tom Hanks in the db?")

MATCH (n:`Person`) WHERE n.`name` = $prop_0 RETURN count(*) AS count {'prop_0': 'Tom Hanks'}


[{'count': 1}]

In [81]:
retriever("How many movies in japan language?")

MATCH (n:`Movie`) WHERE n.`languages` = $prop_0 RETURN count(*) AS count {'prop_0': ['Japanese']}


[{'count': 133}]

In [82]:
retriever("How many movies where at least one of the languages is japanese?")

MATCH (n:`Movie`) WHERE n.`languages` IN $prop_0 RETURN count(*) AS count {'prop_0': ['Japanese']}


[{'count': 0}]

In [83]:
retriever("How many movies have imdb rating less than 7.5?")

MATCH (n:`Movie`) WHERE n.`imdbRating` < $prop_0 RETURN count(*) AS count {'prop_0': 7.5}


[{'count': 6746}]