In [1]:
!pip install --quiet llama-index-core llama-index-utils-workflow llama-index-llms-openai llama-index-graph-stores-neo4j 

In [1]:
import os
from llama_index.llms.openai import OpenAI
from llama_index.llms.anthropic import Anthropic
from neo4j.exceptions import CypherSyntaxError

from llama_index.core import ChatPromptTemplate
from pydantic import BaseModel, Field
from typing import List, Literal, Union, Optional

from llama_index.core.workflow import (
    StartEvent,
    StopEvent,
    Workflow,
    step,
    Event,
    Context,
)

  from pandas.core import (


In [2]:
from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore

# Connect to the "recommendations" database hosted on demo.neo4jlabs.com.
# NOTE(review): the credentials are hard-coded; they look like shared demo credentials —
# confirm before reusing this pattern against a private database.
graph_store = Neo4jPropertyGraphStore(
    username="recommendations",
    password="recommendations",
    database="recommendations",
    url="neo4j+s://demo.neo4jlabs.com:7687",
    enhanced_schema=True,
    create_indexes=False
)

In [275]:
# Take the Anthropic API key from the environment instead of hard-coding it, and
# prompt for it only when it is missing, so the secret is never stored in the notebook
# and an already-configured key is not overwritten.
if not os.environ.get("ANTHROPIC_API_KEY"):
    from getpass import getpass

    os.environ["ANTHROPIC_API_KEY"] = getpass("Anthropic API key: ")

# `llm` is used for query planning, Cypher generation and Cypher correction;
# `fast_llm` is used for the structured-output calls (guardrail, filter extraction).
llm = Anthropic(model="claude-3-5-sonnet-latest", max_tokens=8076)
fast_llm = Anthropic(model="claude-3-5-haiku-20241022", max_tokens=8076)

In [276]:
# Structured-output schema for the guardrail call: the model has to answer with one
# of two routing labels, "movie" or "end".
# NOTE(review): the docstring and the Field description are presumably forwarded to
# the LLM as part of the schema, so they are deliberately left untouched — confirm.
class Guardrail(BaseModel):
    """Guardrail"""

    # Routing label compared against 'end' by guardrails_step.
    decision: Literal["movie", "end"] = Field(
        description="Decision on whether the question is related to movies"
        
    )
guardrails_system_prompt = """You are helpful assistant"""
guardrails_user_prompt = """
You are an intelligent assistant specializing in identifying movie-related questions. Your task is to determine whether a given question is related to movies or not.

Here's the question you need to analyze:

<question>
{question}
</question>

Instructions:
1. Carefully read and understand the question provided.
2. Determine if the question is related to movies. This includes topics such as:
   - Specific films
   - Actors or actresses
   - Directors or producers
   - Film industry topics
   - Movie genres
   - Film history
   - Cinema technology
   - Any other movie-related subjects

3. In <analysis> tags, break down your thought process as follows:
   a. List key words or phrases from the question that might indicate it's movie-related.
   b. Consider arguments for why the question could be movie-related.
   c. Consider arguments for why the question might not be movie-related.
   d. Make a final decision based on the strength of these arguments.

4. After your analysis, provide your final decision:
   - If the question is related to movies, output exactly: movie
   - If the question is not related to movies, output exactly: end
"""
# Guardrail prompt: the system message sets the role and the user message carries the
# full classification instructions from `guardrails_user_prompt`, whose `{question}`
# placeholder is filled in by `guardrails_template.format(question=...)`.
chat_refine_msgs = [
    ("system", guardrails_system_prompt),
    ("user", guardrails_user_prompt),
]
guardrails_template = ChatPromptTemplate.from_messages(chat_refine_msgs)

In [277]:
# Structured-output schema describing a query plan as groups of sub-queries.
# NOTE(review): in the visible part of the notebook the planner reply is parsed from
# free text by extract_query_plan and this model is not referenced — confirm whether
# it is still needed.
class SubqueriesOutput(BaseModel):
    """Defines the output format for transforming a question into parallel-optimized retrieval steps."""
    # Free-text analysis the model is asked to write before producing the plan.
    question_breakdown: str = Field(description=("""Before creating the final query plan, please go through a thorough analysis phase. Wrap your analysis inside <question_breakdown> tags to show your thought process and break down the question. In your analysis:
1. Identify key entities and relationships mentioned in the question.
2. List potential Cypher patterns that might be useful for addressing the question.
3. Consider whether the question can be answered with a single Cypher statement or requires multiple steps.
4. If multiple steps are needed, consider potential optimization strategies, such as indexing or query rewriting.
5. Estimate the complexity of each proposed step (e.g., O(n), O(n^2), etc.).
6. Identify any opportunities for parallelization.
7. Identify potential constraints or limitations in the graph structure that might affect the query.
8. Consider different query execution paths and their trade-offs in terms of performance and resource usage.
9. Note any assumptions you're making about the graph structure or data availability.
10. Keep it concise enough"""))
    # Outer list: dependency-ordered groups; inner list: queries that can run in parallel.
    query_plan: List[List[str]] = Field(description=("""A list of query groups where:
        - Each group (inner list) contains queries that can be executed in parallel
        - Groups are ordered by dependency (earlier groups must be executed before later ones)
        - Each query must be a specific information retrieval request"""))

subqueries_system = """You are a helpful assistant"""

subqueries_user = """You are an advanced Query Planning Optimizer for Graph Databases, specifically designed to work with Cypher queries. Your task is to analyze complex questions about graph data and create efficient, structured query plans that can be translated into Cypher statements.

Here is the question you need to analyze and create a query plan for:

<question>
{question}
</question>

Please follow these steps to create an optimized query plan:

1. Analyze the Question:
Begin by thoroughly analyzing the question. Wrap your analysis in <question_analysis> tags. In your analysis:
a. Restate the question in your own words to ensure understanding.
b. List and number each entity and relationship mentioned in the question.
c. For each entity and relationship, note potential Cypher patterns and estimate their complexity.
d. Explicitly state whether the question requires a single Cypher statement or multiple steps.
e. For multi-step queries, list potential parallelization opportunities.
f. Enumerate any potential constraints or limitations in the graph structure.
g. Brainstorm potential edge cases that might affect the query results.

2. Create an Optimized Query Plan:
After your analysis, create an optimized query plan. Wrap your plan development process in <plan_development> tags. Your plan should:
- Utilize Cypher's strengths in handling relationships, aggregations, and filtering.
- Minimize sequential dependencies while maintaining logical correctness.
- Organize independent queries into parallel groups when possible.
- Use a single query for tasks that don't require breakdown into smaller steps.
- Ensure each step directly contributes to answering the user's question or preparing data for subsequent steps.
- Express steps as high-level queries or operations, not as specific Cypher statements.
- Address potential edge cases or limitations identified in the analysis.
- If a simple Cypher statement can address the entire question, implement it as a single operation in the query plan.
- For each step, list the input data required and the output data produced.
- Explicitly state any assumptions made about the graph structure.
- Note any potential performance bottlenecks and how they're addressed.

3. Present the Query Plan:
Present your final query plan in <query_plan> tags. The query plan should be a structured text format where:
- Each sequential step is numbered and clearly labeled with a descriptive name.
- Parallel operations within a step are indicated with appropriate indentation and markers.
- Dependencies between steps are clearly indicated.
- The order of execution is explicit and easy to follow.
- Step and operation names describe the information being retrieved or processed.
- Any considerations for edge cases or limitations are included in relevant steps.

Here's an example of the expected query plan structure:

Note: If the entire query can be efficiently executed in a single Cypher statement, present it as a single step with one operation.

<query_plan>
Step 1: Identify Tom Hanks' Most Frequent Coactor
  Purpose: Find the actor who has appeared most often with Tom Hanks
  Parallel: false
  Operations:
    - Operation 1.1: Match Tom Hanks in the database
    - Operation 1.2: Find all coactors and count shared movies
    - Operation 1.3: Select coactor with highest count
    - Operation 1.4: Store coactor information for next step

Step 2: Count Movies for Both Actors
  Purpose: Count total movies for both Leonardo DiCaprio and Tom Hanks' most frequent coactor
  Parallel: true
  Operations:
    - Operation 2.1: Count distinct movies for Leonardo DiCaprio
    - Operation 2.2: Count distinct movies for identified coactor from Step 1

Step 3: Compare Results
  Purpose: Compare movie counts and determine who made more movies
  Parallel: false
  Operations:
    - Operation 3.1: Compare movie counts
    - Operation 3.2: Format result with actor names and their respective movie counts

Notes:
- Edge Case Handling: If multiple coactors tie for most frequent, use the first one alphabetically
- Null Handling: Return appropriate message if Tom Hanks has no coactors
- Performance: Use indexes on Person labels and name properties if available
- Distinct Movies: Ensure counting unique movies only
</query_plan>

Please proceed with your analysis and query plan for the given question."""

# Planner prompt: a generic system message plus the detailed planning instructions,
# whose only placeholder is {question}.
query_decompose_msgs = [
    ("system", subqueries_system),
    ("user", subqueries_user)
]

# Rendered with format_messages(question=...) wherever a query plan is requested.
subquery_template = ChatPromptTemplate.from_messages(query_decompose_msgs)

In [364]:
import json
import re

def _line_value(pattern, text):
    """Return the stripped first capture of ``pattern`` in ``text``, or "" if absent."""
    found = re.search(pattern, text)
    return found.group(1).strip() if found else ""


def _parse_plan_steps(plan_text):
    """Split the text inside <query_plan> tags into one dict per "Step N:" section.

    Each dict has the keys "title" (str), "purpose" (str, "" when absent),
    "parallel" (True/False, or None when the step has no "Parallel:" line) and
    "operations" (list of str, possibly empty). All keys are always present.
    """
    parsed_steps = []
    # A step runs from its "Step N:" header up to the next header or the end of the text.
    for step_match in re.finditer(r'Step \d+: (.*?)(?=Step \d+:|$)', plan_text, re.DOTALL):
        step_content = step_match.group(0)

        # "." does not cross newlines in the patterns below, so "(.*)" captures the rest
        # of the line. No trailing newline is required, which keeps the last line of the
        # plan when the closing tag follows it directly.
        parallel_text = _line_value(r'Parallel: (.*)', step_content)
        parsed_steps.append({
            "title": _line_value(r'Step \d+: (.*)', step_content),
            "purpose": _line_value(r'Purpose: (.*)', step_content),
            "parallel": parallel_text.lower() == 'true' if parallel_text else None,
            "operations": [
                operation.strip()
                for operation in re.findall(r'Operation \d+\.\d+: (.*)', step_content)
            ],
        })
    return parsed_steps


def extract_query_plan(text, original_question):
    """Turn the planner's <query_plan> text into groups of sub-question prompts.

    Args:
        text: Planner reply as a string, or a list of strings that is joined with
            newlines first.
        original_question: The user's question; it is embedded in every prompt.

    Returns:
        A list with one inner list per plan step, in plan order. A step marked
        "Parallel: true" that has operations yields one prompt per operation. Any
        other step yields a single prompt that states the step's purpose (or its
        title when no purpose line exists) followed by its operations. Returns None
        when ``text`` contains no <query_plan>...</query_plan> block. A trailing
        "Notes:" section is not used.

    Raises:
        TypeError: If ``text`` is neither a string nor a list.
    """
    # Handle list input by joining with newlines
    if isinstance(text, list):
        text = '\n'.join(text)
    elif not isinstance(text, str):
        raise TypeError("Input must be either a string or a list of strings")

    # Find the content between <query_plan> tags
    plan_match = re.search(r'<query_plan>(.*?)</query_plan>', text, re.DOTALL)
    if not plan_match:
        return None

    prefix = f"The original question was: {original_question}. Here, you have to resolve: "
    plan_steps = []
    for step in _parse_plan_steps(plan_match.group(1)):
        operations = step["operations"]
        if step["parallel"] and operations:
            # Parallel step: every operation becomes its own independent prompt.
            plan_steps.append([prefix + operation for operation in operations])
        else:
            # Sequential step, or a step without a usable "Parallel:"/operations
            # section: one prompt describing the goal and the ordered operations.
            goal = step["purpose"] or step["title"]
            plan_steps.append(
                [f"{prefix}{goal}. Follow these steps:" + ", ".join(operations)]
            )

    return plan_steps

In [365]:
# Demo: ask the planner LLM about a simple question and print the parsed plan
# (a list of groups of sub-question prompts). Calls the Anthropic API.
print(extract_query_plan(llm.chat(subquery_template.format_messages(question="Which movie has the biggest cast?")).message.content, "Which movie has the biggest cast?"))

[['The original question was: Which movie has the biggest cast?. Here, you have to resolve: Identify the movie with the highest number of distinct actors. Follow these steps:Match all movie-actor relationships, Group by movie, Count distinct actors per movie, Order by actor count descending, Select top result with movie details and count']]


In [366]:
# Demo: a question whose answer depends on an intermediate result, to show a plan
# with several dependent groups. Calls the Anthropic API.
print(extract_query_plan(llm.chat(subquery_template.format_messages(question="Who made more movies, Leonardo di Caprio or Tom Hanks most frequent coactor??")).message.content,"Who made more movies, Leonardo di Caprio or Tom Hanks most frequent coactor??"))

[['The original question was: Who made more movies, Leonardo di Caprio or Tom Hanks most frequent coactor??. Here, you have to resolve: Identify the actor who has appeared most frequently with Tom Hanks. Follow these steps:Match Tom Hanks in database, Find all coactors and count shared movies, Order by count descending and name ascending (for ties), Select top coactor'], ['The original question was: Who made more movies, Leonardo di Caprio or Tom Hanks most frequent coactor??. Here, you have to resolve: Count DISTINCT movies for Leonardo DiCaprio', 'The original question was: Who made more movies, Leonardo di Caprio or Tom Hanks most frequent coactor??. Here, you have to resolve: Count DISTINCT movies for top coactor from Step 1'], ['The original question was: Who made more movies, Leonardo di Caprio or Tom Hanks most frequent coactor??. Here, you have to resolve: Determine who made more movies and format output. Follow these steps:Compare movie counts, Format result with actor names a

In [367]:
# Demo: plan for a question that combines a filter on directors with a follow-up
# listing of the matching movies. Calls the Anthropic API.
print(extract_query_plan(llm.chat(subquery_template.format_messages(question="List directors who have directed at least two movies starring the same actor. What are those movies?")).message.content, "List directors who have directed at least two movies starring the same actor. What are those movies?"))

[['The original question was: List directors who have directed at least two movies starring the same actor. What are those movies?. Here, you have to resolve: Identify directors who have worked with the same actor multiple times. Follow these steps:Match director-movie-actor patterns, Group by director and actor, Filter for pairs with at least two movies, Collect movie information'], ['The original question was: List directors who have directed at least two movies starring the same actor. What are those movies?. Here, you have to resolve: Structure the output in a readable format. Follow these steps:Format result set']]


In [369]:
def guardrails_step(question):
    """Route a user question: refuse it if it is off-topic, otherwise plan its sub-queries.

    Args:
        question: The user's natural-language question.

    Returns:
        A dict with the keys "next_event" and "arguments". When the guardrail decides
        'end', the event is "generate_final_answer" and the arguments carry a canned
        refusal as "context". Otherwise the event is "generate_cypher" and the
        arguments carry the "plan" (a list of groups of sub-question prompts). Both
        variants also carry the original "question".
    """
    guardrails_output = (
        fast_llm.as_structured_llm(Guardrail)
        .complete(guardrails_template.format(question=question))
        .raw
    ).decision
    if guardrails_output == 'end':
        context = "The question is not about movies or their cast, so I cannot answer this question"
        return {"next_event": "generate_final_answer", "arguments": {"context": context, "question": question}}
    # Refactor into separate step
    queries_output = llm.chat(subquery_template.format_messages(question=question)).message.content
    plan = extract_query_plan(queries_output, question)
    if not plan:
        # The planner reply had no parsable <query_plan> (None) or no steps (empty list).
        # Fall back to resolving the question in a single step instead of handing an
        # unusable plan to the next event.
        plan = [[question]]

    return {"next_event": "generate_cypher", "arguments": {"plan": plan, "question": question}}


In [370]:
# Demo: run the guardrail and the planner end to end on a multi-hop question and
# display the resulting routing dict. Calls the Anthropic API.
guardrails_step("Who has appeared in more movies: Leonardo DiCaprio or the actor who has co-starred most frequently with the director of Tom Hanks' most critically acclaimed movie??")

{'next_event': 'generate_cypher',
 'arguments': {'plan': [["The original question was: Who has appeared in more movies: Leonardo DiCaprio or the actor who has co-starred most frequently with the director of Tom Hanks' most critically acclaimed movie??. Here, you have to resolve: Find the highest-rated movie Tom Hanks appeared in. Follow these steps:Match Tom Hanks to his movies, Sort movies by rating/score, Select top-rated movie, Store movie information for next step"],
   ["The original question was: Who has appeared in more movies: Leonardo DiCaprio or the actor who has co-starred most frequently with the director of Tom Hanks' most critically acclaimed movie??. Here, you have to resolve: Identify the director of the most acclaimed movie. Follow these steps:Match director to selected movie, Store director information for next step"],
   ["The original question was: Who has appeared in more movies: Leonardo DiCaprio or the actor who has co-starred most frequently with the director of

In [371]:
from llama_index.core.schema import TextNode
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding

# Embedding model meant for the few-shot example index; the index construction
# further down in this cell is currently disabled, so nothing uses it yet.
embed_model = OpenAIEmbedding(model="text-embedding-3-small")


# Question -> Cypher pairs intended as few-shot examples for text2cypher.
# Every query uses the Person label for people, matching the schema passed to the
# LLM (which excludes the Actor and Director types). String values containing an
# apostrophe are written with double quotes so the Cypher stays valid.
examples = [
    {
        "question": "How many artists are there?",
        "query": "MATCH (a:Person)-[:ACTED_IN]->(:Movie) RETURN count(DISTINCT a)",
    },
    {
        "question": "Which actors played in the movie Casino?",
        "query": "MATCH (m:Movie {title: 'Casino'})<-[:ACTED_IN]-(a) RETURN a.name",
    },
    {
        "question": "How many movies has Tom Hanks acted in?",
        "query": "MATCH (a:Person {name: 'Tom Hanks'})-[:ACTED_IN]->(m:Movie) RETURN count(m)",
    },
    {
        "question": "List all the genres of the movie Schindler's List",
        "query": "MATCH (m:Movie {title: \"Schindler's List\"})-[:IN_GENRE]->(g:Genre) RETURN g.name",
    },
    {
        "question": "Which actors have worked in movies from both the comedy and action genres?",
        "query": "MATCH (a:Person)-[:ACTED_IN]->(:Movie)-[:IN_GENRE]->(g1:Genre), (a)-[:ACTED_IN]->(:Movie)-[:IN_GENRE]->(g2:Genre) WHERE g1.name = 'Comedy' AND g2.name = 'Action' RETURN DISTINCT a.name",
    },
    {
        "question": "Which directors have made movies with at least three different actors named 'John'?",
        "query": "MATCH (d:Person)-[:DIRECTED]->(m:Movie)<-[:ACTED_IN]-(a:Person) WHERE a.name STARTS WITH 'John' WITH d, COUNT(DISTINCT a) AS JohnsCount WHERE JohnsCount >= 3 RETURN d.name",
    },
    {
        "question": "Identify movies where directors also played a role in the film.",
        "query": "MATCH (p:Person)-[:DIRECTED]->(m:Movie), (p)-[:ACTED_IN]->(m) RETURN m.title, p.name",
    },
    {
        "question": "Find the actor with the highest number of movies in the database.",
        "query": "MATCH (a:Person)-[:ACTED_IN]->(m:Movie) RETURN a.name, COUNT(m) AS movieCount ORDER BY movieCount DESC LIMIT 1",
    },
]
# The few-shot index construction below is disabled by wrapping it in a string
# literal, so none of it runs.
# NOTE(review): if it is re-enabled, the f-string builds text that opens with "{"
# but ends with "))" — check the intended node text format first.
"""
few_shot_nodes = []
for line in examples:
    few_shot_nodes.append(TextNode(text=f"{{'query':{line['query']}, 'question': {line['question']}))"))

few_shot_index = VectorStoreIndex(few_shot_nodes, embed_model=embed_model)
few_shot_retriever = few_shot_index.as_retriever(similarity_top_k=5)
"""

def get_fewshots(question):
    """Return the few-shot examples for ``question``.

    Retrieval is switched off at the moment, so the result is always a new empty list.
    """
    # Disabled lookup: [el.text for el in few_shot_retriever.retrieve(question)]
    no_examples = []
    return no_examples

In [372]:
generate_system = """Given an input question, convert it to a Cypher query. No pre-amble.
Do not wrap the response in any backticks or anything else. Respond with a Cypher statement only!"""

generate_user = """You are a Neo4j expert. Given an input question, create a syntactically correct Cypher query to run.
Do not wrap the response in any backticks or anything else. Respond with a Cypher statement only!
Here is the schema information
{schema}

Below are a number of examples of questions and their corresponding Cypher queries.

{fewshot_examples}

User input: {question}
Cypher query:"""

# Text-to-Cypher prompt: the user message expects the placeholders {schema},
# {fewshot_examples} and {question}.
generate_cypher_msgs = [
    (
        "system",
        generate_system,
    ),
    ("user", generate_user),
]

text2cypher_prompt = ChatPromptTemplate.from_messages(generate_cypher_msgs)

In [373]:
# Schema text injected into the Cypher generation and correction prompts; the
# "Actor" and "Director" types are excluded from it.
schema = graph_store.get_schema_str(exclude_types=["Actor", "Director"])

async def generate_cypher(subquery):
    """Translate one sub-question into a Cypher statement with the main LLM.

    Args:
        subquery: Natural-language sub-question to convert.

    Returns:
        The text content of the model reply; the prompt asks for a bare Cypher
        statement.
    """
    # Render the examples as plain text, one per line. Passing the list itself would
    # put its Python repr (e.g. "[]" when there are no examples) into the prompt.
    fewshot_examples = "\n".join(str(example) for example in get_fewshots(subquery))
    resp = await llm.achat(
        text2cypher_prompt.format_messages(
            question=subquery, schema=schema, fewshot_examples=fewshot_examples
        )
    )
    return resp.message.content

In [374]:
validate_cypher_system = """You are a helpful assistant"""

validate_cypher_user = """You are a specialized parser designed to analyze Cypher query statements and extract node property filters. Your task is to identify and extract properties used in WHERE clauses and pattern matching conditions, but only when they contain explicit literal values.

Here is the Cypher statement you need to analyze:

<cypher_statement>
{cypher}
</cypher_statement>

Please follow these steps to analyze the Cypher statement and extract the relevant filters:

1. Carefully read through the Cypher statement.
2. Identify all node labels and their associated property filters.
3. For each property filter:
   a. Determine if it matches against a literal value (string, number, or boolean).
   b. If it does, extract the property key and its matching literal value.
   c. If it doesn't (e.g., property-to-property comparisons), ignore it.
4. Format the extracted filters as a JSON object containing a "filters" array.

Rules for extraction:
- Only extract filters that match against literal values:
  * Quoted strings (e.g., 'John', "London")
  * Numbers (e.g., 30, 42.5)
  * Booleans (true, false)
- Include property filters from both MATCH patterns (e.g., (p:Person {name: 'John'})) and WHERE clauses with literal values (e.g., WHERE p.age > 30)
- Handle both simple equality and comparison operators with literal values
- Ignore property-to-property comparisons (e.g., WHERE m1.rating = m2.rating)
- Ignore variable references or dynamic values

Before providing the final output, break down the Cypher statement in <cypher_breakdown> tags:

1. List all node labels and their associated properties from the MATCH clauses.
2. Identify and list all WHERE clause conditions.
3. For each property filter, explicitly state whether it matches a literal value and why.
4. Count the number of valid filters identified.

This breakdown will help ensure a thorough interpretation of the data.

Output Format:
After your analysis, provide the final output as a JSON object with a "filters" array. Each element in the array should be an object with the following properties:
- node_label: The label of the node (e.g., "Person")
- property_key: The property name being filtered (e.g., "age")
- property_value: The literal value being matched (e.g., 30)

Example of the desired output structure:
{
  "filters": [
    {
      "node_label": "ExampleNode",
      "property_key": "exampleProperty",
      "property_value": "exampleValue"
    }
  ]
}

Please proceed with your analysis and provide the extracted filters in the specified JSON format."""

# Filter-extraction prompt: the user message expects the {cypher} placeholder.
# NOTE(review): validate_cypher_user also contains a literal JSON example with
# braces — confirm the template formatter leaves those untouched when {cypher} is
# filled in.
validate_cypher_msgs = [
    (
        "system",
        validate_cypher_system,
    ),
    ("user", validate_cypher_user),
]

validate_cypher_prompt = ChatPromptTemplate.from_messages(validate_cypher_msgs)

# One extracted literal-value filter: which node label, which property, which value.
# NOTE(review): property_value is typed as str although the extraction prompt also
# asks for numbers and booleans — confirm how non-string values are expected to arrive.
class Property(BaseModel):
    """
    Represents a filter condition based on a specific node property in a graph in a Cypher statement.
    """

    node_label: str = Field(
        description="The label of the node to which this property belongs."
    )
    property_key: str = Field(description="The key of the property being filtered.")
    property_value: str = Field(
        description="The value that the property is being matched against."
    )


# Structured-output schema for the filter-extraction call made in validate_cypher.
class ValidateCypherOutput(BaseModel):
    """
    Represents the applied filters of a Cypher query's output.
    """
    # May be None or empty; validate_cypher only iterates it when it is truthy.
    filters: Optional[List[Property]] = Field(
        description="A list of property-based filters applied in the Cypher statement."
    )

In [375]:
from llama_index.graph_stores.neo4j import CypherQueryCorrector, Schema

# Cypher query corrector is experimental
# Build one Schema(start, type, end) entry per relationship reported by the graph
# store and hand the list to the corrector used by validate_cypher.
corrector_schema = [
    Schema(el["start"], el["type"], el["end"])
    for el in graph_store.get_schema().get("relationships")
]
cypher_query_corrector = CypherQueryCorrector(corrector_schema)

In [376]:
def _schema_property_type(node_props, node_label, property_key):
    """Return the schema type of ``node_label.property_key``, or None if it is unknown."""
    try:
        return [
            prop
            for prop in node_props[node_label]
            if prop["property"] == property_key
        ][0]["type"]
    except (KeyError, IndexError, TypeError):
        # The label or property is not in the schema (e.g. hallucinated by the LLM).
        return None


def validate_cypher(question, cypher):
    """
    Validates the Cypher statements and maps any property values to the database.

    Args:
        question: The question the statement was generated for (currently unused).
        cypher: The generated Cypher statement to validate.

    Returns:
        A dict with:
        - "next_action": "end" if a filtered string value was not found in the graph,
          otherwise "correct_cypher" if syntax or schema errors were collected,
          otherwise "execute_cypher".
        - "cypher_statement": the output of the query corrector for ``cypher``.
        - "cypher_errors": list of syntax/schema error messages.
        - "mapping_errors": list of messages for values missing from the graph.
        - "steps": ["validate_cypher"].
    """
    errors = []
    mapping_errors = []
    # Check for syntax errors
    try:
        graph_store.structured_query(f"EXPLAIN {cypher}")
    except CypherSyntaxError as e:
        errors.append(e.message)
    # Experimental feature for correcting relationship directions
    corrected_cypher = cypher_query_corrector(cypher)
    if not corrected_cypher:
        errors.append("The generated Cypher statement doesn't fit the graph schema")
    # Use LLM for mapping for values
    llm_output = (
        fast_llm.as_structured_llm(ValidateCypherOutput)
        .complete(validate_cypher_prompt.format(cypher=cypher))
        .raw
    )
    print(f"LLM:{llm_output}")
    if llm_output.filters:
        # Fetch the schema once; it does not change while the filters are checked.
        node_props = graph_store.get_schema().get("node_props") or {}
        for prop_filter in llm_output.filters:
            # Do mapping only for string values. Filters whose label or property is
            # not in the schema are skipped as well, so the label and key interpolated
            # into the query below are always names that exist in the schema.
            property_type = _schema_property_type(
                node_props, prop_filter.node_label, prop_filter.property_key
            )
            if property_type != "STRING":
                continue
            print(f"Mapping: {prop_filter}")
            mapping = graph_store.structured_query(
                f"MATCH (n:{prop_filter.node_label}) WHERE toLower(n.`{prop_filter.property_key}`) = toLower($value) RETURN 'yes' LIMIT 1",
                {"value": prop_filter.property_value},
            )
            if not mapping:
                print(
                    f"Missing value mapping for {prop_filter.node_label} on property {prop_filter.property_key} with value {prop_filter.property_value}"
                )
                mapping_errors.append(
                    f"Could not find node in graph with label '{prop_filter.node_label}' where property '{prop_filter.property_key}' equals '{prop_filter.property_value}'. "
                    f"Without this information, I cannot provide a complete answer to your question. "
                    f"If you meant something else, please rephrase your question or verify the specific {prop_filter.property_key} you're asking about. "
                    f"Would you like to try with a different {prop_filter.property_key} value?"
                )
    if mapping_errors:
        next_action = "end"
    elif errors:
        next_action = "correct_cypher"
    else:
        next_action = "execute_cypher"

    return {
        "next_action": next_action,
        "cypher_statement": corrected_cypher,
        "cypher_errors": errors,
        "mapping_errors": mapping_errors,
        "steps": ["validate_cypher"],
    }

In [377]:
correct_cypher_system = """You are a helpful assistant"""

correct_cypher_user = """You are an expert Cypher developer tasked with reviewing and correcting a Cypher statement written by a junior developer. Your goal is to provide an accurate and efficient Cypher statement that addresses all the identified errors while adhering to the given schema and answering the provided question.

Here is the database schema you should use as a reference:

<schema>
{schema}
</schema>

The question or task for which the Cypher statement was written:

<question>
{question}
</question>

The original Cypher statement written by the junior developer:

<original_cypher>
{cypher}
</original_cypher>

The errors identified in the original statement:

<errors>
{errors}
</errors>

Your task is to analyze the original Cypher statement, identify the issues based on the provided errors, and create a corrected version that adheres to the schema and answers the question effectively.

Please follow these steps:

1. Analyze the original Cypher statement and the provided errors.
2. Identify the specific issues that need to be addressed.
3. Develop a corrected Cypher statement that resolves all identified errors.
4. Ensure the corrected statement adheres to the given schema.
5. Verify that the corrected statement effectively answers the provided question.

Conduct your review inside <cypher_review> tags. In your review:
- List out each error and its corresponding correction
- Explain how each correction addresses the schema and question
- Consider any potential side effects of the corrections

After your review, provide the corrected Cypher statement without any additional explanations, comments, or formatting. Do not use backticks or any other wrapping characters.

Remember:
- Only output a Cypher statement as your final answer.
- Do not include any explanations or apologies in your final output.
- Do not respond to any questions that ask for anything other than constructing a Cypher statement.

Begin your response with your review in <cypher_review> tags, followed by the corrected Cypher statement inside <cypher_statement>."""

# Correct cypher
# Correction prompt: the user message expects the placeholders {schema},
# {question}, {cypher} and {errors}.
correct_cypher_msgs = [
    (
        "system",
        correct_cypher_system,
    ),
    ("user", correct_cypher_user),
]

correct_cypher_prompt = ChatPromptTemplate.from_messages(correct_cypher_msgs)

In [378]:
def extract_correct_cypher(text):
    """Pull the corrected Cypher statement out of an LLM correction reply.

    Args:
        text: Model reply expected to contain
            ``<cypher_statement>...</cypher_statement>``.

    Returns:
        The stripped text after the first ``<cypher_statement>`` tag, up to the next
        ``</cypher_statement>`` tag, or up to the end of ``text`` when the closing tag
        is missing. None when there is no opening tag.
    """
    start_tag = "<cypher_statement>"
    end_tag = "</cypher_statement>"

    _, opening, remainder = text.partition(start_tag)
    if not opening:
        return None

    # Look for the closing tag only after the opening tag, so that a closing tag
    # mentioned earlier in the reply cannot truncate the statement.
    statement, _, _ = remainder.partition(end_tag)
    return statement.strip()

In [379]:
async def correct_cypher(subquery, cypher, errors):
    """Ask the LLM to repair a Cypher statement and return the extracted result.

    Args:
        subquery: The question the statement is meant to answer.
        cypher: The statement that needs correcting.
        errors: Error descriptions to be addressed by the correction.

    Returns:
        Whatever ``extract_correct_cypher`` finds in the model's reply
        (``None`` when the reply carries no ``<cypher_statement>`` tag).
    """
    messages = correct_cypher_prompt.format_messages(
        question=subquery,
        schema=schema,
        errors=errors,
        cypher=cypher,
    )
    response = await llm.achat(messages)
    return extract_correct_cypher(response.message.content)

In [380]:
# Manual smoke test of the correction call: a statement that ignores the named
# person, plus a hand-written error message describing what is wrong with it.
print(await correct_cypher("Who is Tom Hanks?", "MATCH (a:Actor) RETURN a", ["The query doesn't match tom hanks specifically!"]))

MATCH (p:Person)-[r:ACTED_IN]->(m:Movie)
WHERE p.name = "Tom Hanks"
RETURN p.name, p.born, p.bornIn, p.bio, collect({movie: m.title, role: r.role}) as movies


In [381]:
# System persona for the information-check call.
information_check_system = """You are a helpful assistant"""

# User-turn template for the information check / re-planning call.
# Placeholders that must be supplied when formatting:
#   {subqueries}        - executed subqueries and their results
#   {dynamic_notebook}  - the condensed knowledge gathered so far
#   {plan}              - the remaining plan steps
#   {question}          - the user's original question
# The model is asked to answer inside <question_breakdown>, <information_review>,
# <dynamic_notebook_update> and <query_plan> tags (or <unsolvable_task>).
information_check_user = """
You are an advanced Query Planning Optimizer for Graph Databases, specifically designed to work with Cypher queries. Your task is to analyze complex questions about graph data, evaluate available information, and create efficient, structured query plans that can be translated into Cypher statements.

Here is the information you need to work with:

<subqueries>
{subqueries}
</subqueries>

<dynamic_notebook>
{dynamic_notebook}
</dynamic_notebook>

<current_plan>
{plan}
</current_plan>

<original_question>
{question}
</original_question>

Please follow these steps to evaluate the available information and create an optimized query plan:

1. Analyze the Original Question:
Begin by thoroughly analyzing the question. In your analysis:
a. Restate the question in your own words to ensure understanding.
b. List and number each entity and relationship mentioned in the question.
c. For each entity and relationship, note potential Cypher patterns and estimate their complexity.
d. Explicitly state whether the question requires a single Cypher statement or multiple steps.
e. For multi-step queries, list potential parallelization opportunities.
f. Enumerate any potential constraints or limitations in the graph structure.
g. Brainstorm potential edge cases that might affect the query results.

2. Review Available Information:
Examine the subqueries, their results, and the provided condensed information in the dynamic notebook. Assess if they collectively address all components of the question.

3. Identify Information Gaps:
Compare the requirements from the question against the available information. Highlight any missing details or incomplete data that must be retrieved to form a complete answer.

4. Update and Refine the Dynamic Notebook:
Treat the condensed information as a central knowledge base. Continuously update it with key details from subquery results and integrate new data to close gaps and establish connections between facts.

5. Create or Modify the Query Plan:
Based on your analysis, create or modify the query plan. Your plan should:
- Utilize Cypher's strengths in handling relationships, aggregations, and filtering.
- Minimize sequential dependencies while maintaining logical correctness.
- Organize independent queries into parallel groups when possible.
- Use a single query for tasks that don't require breakdown into smaller steps.
- Ensure each step directly contributes to answering the user's question or preparing data for subsequent steps.
- Express steps as high-level queries or operations, not as specific Cypher statements.
- Address potential edge cases or limitations identified in the analysis.
- For each step, list the input data required and the output data produced.
- Explicitly state any assumptions made about the graph structure.
- Note any potential performance bottlenecks and how they're addressed.

Wrap your analysis and plan in the following tags:

<question_breakdown>
[Your detailed analysis of the question, including:
1. Restatement of the question
2. Numbered list of entities and relationships
3. Cypher patterns for each entity/relationship
4. Single or multi-step query requirement
5. Parallelization opportunities
6. Potential constraints or limitations
7. Potential edge cases]
</question_breakdown>

<information_review>
[Your review of available information and identified gaps]
</information_review>

<dynamic_notebook_update>
[Updated version of the dynamic notebook, including old and new information and connections]
</dynamic_notebook_update>

<query_plan>
[Your optimized query plan, following this structure:

Step 1: [Step Name]
  Purpose: [Brief description of the step's purpose]
  Parallel: [true/false]
  Operations:
    - Operation 1.1: [Description of operation]
    - Operation 1.2: [Description of operation]
    [...]

Step 2: [Step Name]
  [...]

Notes:
- [Any additional notes, edge case handling, assumptions, or performance considerations]
]
</query_plan>

If the task is unsolvable due to critical missing information:
<unsolvable_task>
[Explanation of why the task cannot be completed, including specific missing data]
</unsolvable_task>

Remember:
- Focus only on information retrieval and avoid reasoning/analysis tasks in the query plan.
- Optimize for parallel execution whenever possible.
- Maintain sequential order only when necessary due to data dependencies.
- Centralize all knowledge in the dynamic notebook.
- If a simple Cypher statement can address the entire question, implement it as a single operation in the query plan.

Please proceed with your analysis and query plan for the given question and available information. 
"""

# Two-turn chat template for the information check: system persona, then the
# user turn holding the planning instructions.
information_check_msgs = [
    ("system", information_check_system),
    ("user", information_check_user),
]
information_check_prompt = ChatPromptTemplate.from_messages(information_check_msgs)

# Structured-output schema requested from the LLM in `information_check`
# (via `llm.as_structured_llm(IFOutput)`).
# NOTE(review): the docstring and Field descriptions below presumably end up in
# the schema shown to the model, so treat them as prompt text — confirm before
# rewording.
class IFOutput(BaseModel):
    """
    Represents the output of an information sufficiency evaluation process. 
    Contains either a condensed summary of the available information or additional subqueries needed to answer the original question.
    """

    # Running summary of everything learned so far; always required.
    dynamic_notebook: str = Field(
        description="A continuously updated and refined summary integrating subquery results and condensed information. Serves as the central knowledge base to address the original question and guide further subqueries if necessary."
    )
    # Remaining plan as groups of subqueries (outer list = sequential groups,
    # inner list = subqueries of one group). No default is given, so the field
    # must be present in the model output, but it may be null.
    modified_plan: Optional[List[List[str]]] = Field(
        description="Modified version of the remaining plan steps. Each group contains queries that can be executed in parallel. Null if no remaining plan exists, all gaps have been addressed, or the task is unsolvable due to missing critical information."
    )

In [382]:
def format_subqueries_for_prompt(information_checks: list) -> str:
    """
    Render executed subqueries and their results as a bullet list for a prompt.

    Only the first row of each ``database_output`` is included; a subquery with
    an empty (or missing) output is reported as "No result available.".

    Args:
        information_checks (List[InformationCheck]): Objects exposing
            ``subquery`` and ``database_output`` attributes.

    Returns:
        str: One "- Subquery: ... / Result: ..." entry per check, separated by
        newlines (an empty string for an empty list).
    """

    def _first_row(check):
        rows = check.database_output
        return rows[0] if rows else "No result available."

    return "\n".join(
        f"- Subquery: {check.subquery}\n  Result: {_first_row(check)}"
        for check in information_checks
    )

def information_check(subquery_events, original_question, dynamic_notebook, plan):
    """Ask the LLM whether the gathered data answers the question and re-plan.

    Args:
        subquery_events: Executed subquery events (each exposing ``subquery``
            and ``database_output``), rendered via ``format_subqueries_for_prompt``.
        original_question: The user's original question.
        dynamic_notebook: The condensed knowledge collected so far.
        plan: The remaining plan steps.

    Returns:
        dict: ``{'dynamic_notebook': str, 'modified_plan': Optional[List[List[str]]]}``
        taken from the structured ``IFOutput`` response.
    """
    subqueries = format_subqueries_for_prompt(subquery_events)
    print(f"Before: {dynamic_notebook}")
    print(f"Plan: {plan}")
    # The template's placeholder is `{question}` (inside <original_question>),
    # so the value has to be passed under that name; passing it as
    # `original_question=` left the placeholder without a value and the model
    # never saw the question it was supposed to plan for.
    prompt = information_check_prompt.format(
        subqueries=subqueries,
        question=original_question,
        dynamic_notebook=dynamic_notebook,
        plan=plan,
    )
    llm_output = llm.as_structured_llm(IFOutput).complete(prompt).raw
    print(f"After: {llm_output.dynamic_notebook}")
    print(f"New Plan: {llm_output.modified_plan}")
    return {'dynamic_notebook': llm_output.dynamic_notebook, 'modified_plan': llm_output.modified_plan}
        

In [383]:
# System persona for the final-answer call.
final_answer_system = """You are a helpful assistant."""

# User-turn template for the final-answer call.
# Placeholders that must be supplied when formatting:
#   {context}   - the information the answer has to be based on
#   {question}  - the question to answer
# The model is told to reason inside <thought_process> tags before answering.
final_answer_user = """
You are a highly capable AI assistant specializing in providing accurate and concise answers based on given information. Your task is to analyze a provided context and answer a specific question using only the information contained within that context.

Here is the context you will be working with:

<context>
{context}
</context>

And here is the question you need to answer:

<question>
{question}
</question>

Please follow these steps to complete your task:

1. Carefully read and analyze the context and question.
2. Wrap your thought process inside <thought_process> tags:
   - Quote key information from the context that is relevant to the question.
   - Consider whether the question is clear or if clarification is needed.
   - List potential answers to the question based on the context.
   - For each potential answer, provide arguments supporting it and evaluate the reliability and sufficiency of the information.
   - Determine if the context provides sufficient information to answer the question confidently.
   - If information is missing, specify what additional details are required.

It's OK for this section to be quite long.

3. Based on your analysis, do the following:
   a. Ensure your answer is clear, relevant, and directly addresses the user's question. Explain your reasoning within this section.

Remember:
- Focus solely on the provided context. Do not use external knowledge or make assumptions unless explicitly stated in the context.
- If the question is ambiguous, ask for clarification before proceeding.
- Be concise yet thorough in your responses.
- Maintain a professional and helpful tone throughout your response.

Please begin your thought process now.
"""

# Chat template for the final answer: system persona followed by the user turn
# that embeds the context and the question.
final_answer_msgs = list(
    zip(("system", "user"), (final_answer_system, final_answer_user))
)
final_answer_prompt = ChatPromptTemplate.from_messages(final_answer_msgs)

async def generate_final_answer(question, context):
    """Produce the user-facing answer from the collected context.

    Args:
        question: The question to answer.
        context: The information the answer must be based on.

    Returns:
        The text content of the ``fast_llm`` chat response.
    """
    messages = final_answer_prompt.format_messages(question=question, context=context)
    response = await fast_llm.achat(messages)
    return response.message.content
    

In [384]:
class GenerateCypher(Event):
    """Request to generate a Cypher statement for one subquery."""

    subquery: str
    
class ValidateCypher(Event):
    """Request to validate a generated (or corrected) Cypher statement."""

    subquery: str
    generated_cypher: str

class CorrectCypher(Event):
    """Request to repair a Cypher statement given the errors found for it."""

    cypher: str
    subquery: str
    errors: List[str]

class ExecuteCypher(Event):
    """Request to run a validated Cypher statement against the graph store."""

    validated_cypher: str
    subquery: str

class InformationCheck(Event):
    """Result of one executed subquery, awaiting the information check."""

    cypher: str
    subquery: str
    database_output: list
    
class GenerateFinalAnswer(Event):
    """Request to write the final answer from the given context text."""

    context: str

class ConcurrentFlow(Workflow):
    """Question answering over the graph with parallel text-to-Cypher subqueries.

    Flow:
        1. ``start`` runs the guardrail/planner and fans out the first group of
           subqueries as ``GenerateCypher`` events.
        2. Each subquery is generated, validated, optionally corrected and
           re-validated, then executed.
        3. ``information_check_step`` waits for the whole group, then either
           fans out the next group or hands the notebook to ``final_answer``.

    Keys kept in the workflow context: ``original_question``,
    ``dynamic_notebook``, ``subqueries_cypher_history``, ``count_of_subqueries``,
    ``plan`` and ``correction_attempts``.

    NOTE(review): ``guardrails_step``, ``validate_cypher``,
    ``graph_store.structured_query`` and ``information_check`` are called
    synchronously inside async steps, so they presumably block the event loop
    while running — confirm whether that limits the benefit of ``num_workers``.
    """

    # Upper bound on validate -> correct round trips per subquery. Without a cap
    # a statement the model cannot repair keeps cycling until the workflow
    # timeout fires.
    max_correction_attempts = 3

    @step
    async def start(self, ctx: Context, ev: StartEvent) -> GenerateCypher | GenerateFinalAnswer:
        """Initialise shared state, run the guardrail and dispatch the first subqueries."""
        original_question = ev.input
        await ctx.set("original_question", original_question)
        await ctx.set("dynamic_notebook", "")
        await ctx.set("subqueries_cypher_history", {})
        await ctx.set("correction_attempts", {})  # subquery -> corrections requested so far
        guardrails_output = guardrails_step(original_question)
        if guardrails_output.get("next_event") == "generate_final_answer":
            context = "The question is not about movies or cast, so I cannot answer the question"
            return GenerateFinalAnswer(context=context)

        # The plan is a list of groups; the first group runs now, the rest is
        # stored for the information check to schedule later.
        subqueries = guardrails_output["arguments"].get("plan")
        await ctx.set("count_of_subqueries", len(subqueries[0]))  # expected count for ctx.collect_events()
        await ctx.set("plan", subqueries[1:])  # remaining groups, used by the information check
        # Fan out: one GenerateCypher event per subquery of the first group.
        for subquery in subqueries[0]:
            print(subquery)
            ctx.send_event(GenerateCypher(subquery=subquery))

    @step(num_workers=4)
    async def generate_cypher_step(self, ctx: Context, ev: GenerateCypher) -> ValidateCypher:
        """Generate a Cypher statement for one subquery."""
        print("Running generate_cypher ", ev.subquery)
        generated_cypher = await generate_cypher(ev.subquery)
        return ValidateCypher(subquery=ev.subquery, generated_cypher=generated_cypher)

    @step(num_workers=4)
    async def validate_cypher_step(
        self, ctx: Context, ev: ValidateCypher
    ) -> GenerateFinalAnswer | ExecuteCypher | CorrectCypher | InformationCheck:
        """Validate a statement and route it to execution, correction or the final answer.

        When a subquery has already used up ``max_correction_attempts``, it is
        reported to the information check with an empty result instead of being
        sent for yet another correction.
        """
        print("Running validate_cypher ", ev)
        results = validate_cypher(ev.subquery, ev.generated_cypher)
        print(results)
        next_action = results['next_action']
        if next_action == "end":  # DB value mapping
            return GenerateFinalAnswer(context=str(results["mapping_errors"]))
        if next_action == "execute_cypher":
            return ExecuteCypher(subquery=ev.subquery, validated_cypher=ev.generated_cypher)
        if next_action == "correct_cypher":
            attempts = await ctx.get("correction_attempts")
            used = attempts.get(ev.subquery, 0)
            if used >= self.max_correction_attempts:
                # Give up on this subquery but still emit the event the
                # information check is counting on, so the group can complete.
                print(f"Giving up on correcting after {used} attempts: {ev.subquery}")
                return InformationCheck(subquery=ev.subquery, cypher=ev.generated_cypher, database_output=[])
            # Updated in place (no await between read and write) so concurrent
            # workers handling other subqueries cannot overwrite each other.
            attempts[ev.subquery] = used + 1
            await ctx.set("correction_attempts", attempts)
            return CorrectCypher(subquery=ev.subquery, cypher=ev.generated_cypher, errors=results['cypher_errors'])

    @step(num_workers=4)
    async def correct_cypher_step(self, ctx: Context, ev: CorrectCypher) -> ValidateCypher:
        """Ask the LLM to repair a statement and send the result back to validation."""
        print("Running correct_cypher ", ev)
        corrected = await correct_cypher(ev.subquery, ev.cypher, ev.errors)
        # The extraction returns None when the reply has no <cypher_statement>
        # tag; ValidateCypher.generated_cypher must be a string, so fall back to
        # the previous statement and let validation decide what happens next.
        return ValidateCypher(subquery=ev.subquery, generated_cypher=corrected or ev.cypher)

    @step
    async def execute_cypher_step(self, ctx: Context, ev: ExecuteCypher) -> InformationCheck:
        """Run a validated statement against the graph store."""
        print("Running execute_cypher_step ", ev)
        database_output = graph_store.structured_query(ev.validated_cypher)
        return InformationCheck(subquery=ev.subquery, cypher=ev.validated_cypher, database_output=database_output)

    @step
    async def information_check_step(self, ctx: Context, ev: InformationCheck) -> GenerateCypher | GenerateFinalAnswer:
        """Collect a full group of results, then schedule the next group or finish."""
        print("Running information_check_step", ev)
        # Buffer events until every subquery of the current group has reported.
        number_of_subqueries = await ctx.get("count_of_subqueries")
        result = ctx.collect_events(ev, [InformationCheck] * number_of_subqueries)
        if result is None:
            return None
        # Add executed cypher statements to global state
        subqueries_cypher_history = await ctx.get("subqueries_cypher_history")
        new_subqueries_cypher = {
            item.subquery: {
                "cypher": item.cypher,
                "database_output": item.database_output,
            }
            for item in result
        }
        await ctx.set("subqueries_cypher_history", {**subqueries_cypher_history, **new_subqueries_cypher})

        original_question = await ctx.get("original_question")
        dynamic_notebook = await ctx.get("dynamic_notebook")
        plan = await ctx.get("plan")

        # Do the information check
        data = information_check(result, original_question, dynamic_notebook, plan)

        # Drop empty groups: an empty first group would set the expected event
        # count to zero and dispatch nothing, leaving the run stuck until timeout.
        remaining_plan = [group for group in (data.get("modified_plan") or []) if group]
        if not remaining_plan:
            return GenerateFinalAnswer(context=data['dynamic_notebook'])

        # Go fetch additional information
        next_group = remaining_plan[0]
        await ctx.set("count_of_subqueries", len(next_group))  # expected count for ctx.collect_events()
        await ctx.set("dynamic_notebook", data["dynamic_notebook"])
        await ctx.set("plan", remaining_plan[1:])
        for subquery in next_group:
            ctx.send_event(GenerateCypher(subquery=subquery))

    @step
    async def final_answer(self, ctx: Context, ev: GenerateFinalAnswer) -> StopEvent:
        """Write the final answer and return it with the executed Cypher history."""
        original_question = await ctx.get("original_question")
        subqueries_cypher_history = await ctx.get("subqueries_cypher_history")
        print("Running final_answer ", ev)
        resp = await generate_final_answer(original_question, ev.context)
        return StopEvent(result={"text":resp, "subqueries_cypher_history": subqueries_cypher_history})

In [386]:
from datetime import datetime

# End-to-end run with step-level logging (verbose=True) and a 120-second
# workflow timeout; prints the result dict and the wall-clock duration.
start = datetime.now()
w = ConcurrentFlow(timeout=120, verbose=True)
result = await w.run(input="Who made more movies, Leonardo DiCaprio or Tom Hanks?")
print(result)
print(f"Took {datetime.now() - start}")

Running step start
The original question was: Who made more movies, Leonardo DiCaprio or Tom Hanks?. Here, you have to resolve: Count and compare movies for both actors in a single operation. Follow these steps:Match both actors and their movies, Count distinct movies for each actor, Compare counts and determine who made more movies, Return results with both counts for verification
Step start produced no event
Running step generate_cypher_step
Running generate_cypher  The original question was: Who made more movies, Leonardo DiCaprio or Tom Hanks?. Here, you have to resolve: Count and compare movies for both actors in a single operation. Follow these steps:Match both actors and their movies, Count distinct movies for each actor, Compare counts and determine who made more movies, Return results with both counts for verification
Step generate_cypher_step produced event ValidateCypher
Running step validate_cypher_step
Running validate_cypher  subquery='The original question was: Who made 

In [387]:
# Multi-hop variant of the comparison question, run without step logging.
# NOTE(review): the recorded output of this cell ends in a CancelledError after
# a 'correct_cypher' validation result — re-run and confirm before sharing.
start = datetime.now()
w = ConcurrentFlow(timeout=120, verbose=False)
result = await w.run(input="Who made more movies, Leonardo di Caprio or Tom Hanks most frequent coactor?")
print(result)
print(f"Took {datetime.now() - start}")

The original question was: Who made more movies, Leonardo di Caprio or Tom Hanks most frequent coactor?. Here, you have to resolve: Find the actor who has appeared most often with Tom Hanks. Follow these steps:Match Tom Hanks in database, Find all coactors and count shared movies, Order by frequency and alphabetically (for ties), Select top coactor
Running generate_cypher  The original question was: Who made more movies, Leonardo di Caprio or Tom Hanks most frequent coactor?. Here, you have to resolve: Find the actor who has appeared most often with Tom Hanks. Follow these steps:Match Tom Hanks in database, Find all coactors and count shared movies, Order by frequency and alphabetically (for ties), Select top coactor
Running validate_cypher  subquery='The original question was: Who made more movies, Leonardo di Caprio or Tom Hanks most frequent coactor?. Here, you have to resolve: Find the actor who has appeared most often with Tom Hanks. Follow these steps:Match Tom Hanks in database,

Exception in callback Dispatcher.span.<locals>.wrapper.<locals>.handle_future_result(span_id='Workflow.run...-29c8825cf485', bound_args=<BoundArgumen...t coactor?'})>, instance=<__main__.Con...t 0x377715bd0>, context=<_contextvars...t 0x3747350c0>)(<WorkflowHandler cancelled>) at /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/llama_index/core/instrumentation/dispatcher.py:273
handle: <Handle Dispatcher.span.<locals>.wrapper.<locals>.handle_future_result(span_id='Workflow.run...-29c8825cf485', bound_args=<BoundArgumen...t coactor?'})>, instance=<__main__.Con...t 0x377715bd0>, context=<_contextvars...t 0x3747350c0>)(<WorkflowHandler cancelled>) at /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/llama_index/core/instrumentation/dispatcher.py:273>
Traceback (most recent call last):
  File "/Users/tomazbratanic/anaconda3/lib/python3.11/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/Users/tomazbratanic/anaconda3/lib/p

{'next_action': 'correct_cypher', 'cypher_statement': 'WITH \n"Leonardo DiCaprio" AS actor1,\nMATCH (p1:Person {name: actor1})-[:ACTED_IN]->(m1:Movie)\nWITH actor1, COUNT(m1) as actor1_count,\nMATCH (p2:Person {name: "Tom Hanks"})-[:ACTED_IN]->(m:Movie)<-[:ACTED_IN]-(coactor:Person)\nWHERE coactor.name <> "Tom Hanks"\nWITH actor1, actor1_count, coactor, COUNT(m) as collaborations\nORDER BY collaborations DESC\nLIMIT 1\nMATCH (coactor)-[:ACTED_IN]->(m2:Movie)\nWITH actor1, actor1_count, coactor.name as coactor_name, COUNT(m2) as coactor_count\nRETURN \n    CASE \n        WHEN actor1_count > coactor_count THEN actor1 + \' with \' + toString(actor1_count) + \' movies\'\n        WHEN actor1_count < coactor_count THEN coactor_name + \' with \' + toString(coactor_count) + \' movies\'\n        ELSE \'Tie with \' + toString(actor1_count) + \' movies each\'\n    END as result', 'cypher_errors': ['Invalid input \'{\': expected an expression, \')\' or \',\' (line 3, column 18 (offset: 62))\n"MATC

CancelledError: 

In [None]:
# Deeply nested question with a 60-second timeout. This cell has no execution
# count and no recorded output, i.e. it was not run in the saved notebook.
start = datetime.now()
w = ConcurrentFlow(timeout=60, verbose=True)
result = await w.run(input="Who has appeared in more movies: Leonardo DiCaprio or the actor who has co-starred most frequently with the director of Tom Hanks' most critically acclaimed movie??")
print(result)
print(f"Took {datetime.now() - start}")

In [388]:
# Question about a person who is presumably not in the movie database —
# exercises how the workflow handles a name it cannot match (TODO confirm).
start = datetime.now()
w = ConcurrentFlow(timeout=60, verbose=True)
result = await w.run(input="How many movies did Tomaz Bratanic acted in?")
print(result)
print(f"Took {datetime.now() - start}")

Running step start
The original question was: How many movies did Tomaz Bratanic acted in?. Here, you have to resolve: Find and count all unique movies where Tomaz Bratanic acted. Follow these steps:Match Tomaz Bratanic in the database, Count distinct movies connected via ACTED_IN relationships, Return count with appropriate message
Step start produced no event
Running step generate_cypher_step
Running generate_cypher  The original question was: How many movies did Tomaz Bratanic acted in?. Here, you have to resolve: Find and count all unique movies where Tomaz Bratanic acted. Follow these steps:Match Tomaz Bratanic in the database, Count distinct movies connected via ACTED_IN relationships, Return count with appropriate message
Step generate_cypher_step produced event ValidateCypher
Running step validate_cypher_step
Running validate_cypher  subquery='The original question was: How many movies did Tomaz Bratanic acted in?. Here, you have to resolve: Find and count all unique movies whe

In [389]:
# Off-topic input: per the recorded output, the run goes straight to
# final_answer with the "not about movies or cast" context.
start = datetime.now()
w = ConcurrentFlow(timeout=30, verbose=False)
result = await w.run(input="What'up esse?")
print(result)
print(f"Took {datetime.now() - start}")

Running final_answer  context='The question is not about movies or cast, so I cannot answer the question'
{'text': '<thought_process>\nAnalysis of the context:\n- The context statement is: "The question is not about movies or cast, so I cannot answer the question"\n- This seems to be a generic response that doesn\'t provide any substantive information\n\nAnalysis of the question:\n- The question "What\'up esse?" appears to be an informal greeting\n- "Esse" is likely a slang term, possibly derived from "S" or "homie"\n- The context provides no relevant information to answer this specific greeting\n\nPotential answers:\n1. I cannot answer the question based on the given context\n2. The context does not provide any meaningful information to respond to the greeting\n\nEvaluation:\n- The context is not helpful in addressing the question\n- There are insufficient details to provide a meaningful response\n- The context seems like a placeholder or default response\n\nConclusion:\n- More contex

In [390]:
# Aggregation-style question (director/actor pairs sharing at least two movies),
# run with step logging and a 60-second timeout.
start = datetime.now()
w = ConcurrentFlow(timeout=60, verbose=True)
result = await w.run(input="List directors who have directed at least two movies starring the same actor. What are those movies?")
print(result)
print(f"Took {datetime.now() - start}")

Running step start
The original question was: List directors who have directed at least two movies starring the same actor. What are those movies?. Here, you have to resolve: Identify all combinations of directors, actors, and their shared movies. Follow these steps:Match directors with their movies, For each movie, match associated actors, Group by director and actor pairs, Count shared movies per director-actor pair
Step start produced no event
Running step generate_cypher_step
Running generate_cypher  The original question was: List directors who have directed at least two movies starring the same actor. What are those movies?. Here, you have to resolve: Identify all combinations of directors, actors, and their shared movies. Follow these steps:Match directors with their movies, For each movie, match associated actors, Group by director and actor pairs, Count shared movies per director-actor pair
Step generate_cypher_step produced event ValidateCypher
Running step validate_cypher_st

In [27]:
from llama_index.utils.workflow import (
    draw_all_possible_flows,
    draw_most_recent_execution,
)

# Write HTML visualisations of the workflow held in `w`: first the most recent
# execution, then every possible flow between steps.
# NOTE(review): the file names look like copy-paste leftovers ("joke_flow...",
# and "recenst" for the all-possible-flows graph) — consider renaming.
draw_most_recent_execution(w, filename="joke_flow_recent.html")
draw_all_possible_flows(w, filename="joke_flow_recenst.html")

joke_flow_recent.html
<class 'NoneType'>
<class '__main__.ValidateCypher'>
<class '__main__.InformationCheck'>
<class 'llama_index.core.workflow.events.StopEvent'>
<class '__main__.ValidateCypher'>
<class '__main__.GenerateCypher'>
<class '__main__.GenerateFinalAnswer'>
<class '__main__.GenerateCypher'>
<class '__main__.GenerateFinalAnswer'>
<class '__main__.GenerateFinalAnswer'>
<class '__main__.ExecuteCypher'>
<class '__main__.CorrectCypher'>
joke_flow_recenst.html
