<a href="https://colab.research.google.com/github/tomasonjo/blogs/blob/master/llm/graph_based_prefiltering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --quiet langchain langchain-community langchain-openai neo4j

In [2]:
import os
from typing import Dict, List, Optional, Tuple, Type

from langchain.agents import AgentExecutor
from langchain.agents.format_scratchpad import format_to_openai_function_messages
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
from langchain.callbacks.manager import CallbackManagerForToolRun
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.pydantic_v1 import BaseModel, Field
from langchain.schema import AIMessage, HumanMessage
from langchain.tools import BaseTool
from langchain.tools.render import format_tool_to_openai_function
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars

In [3]:
import os

# Credentials and connection settings consumed by the OpenAI and Neo4j clients.
# "sk-" is a placeholder: replace it with a real OpenAI API key before running.
os.environ.update(
    {
        "OPENAI_API_KEY": "sk-",
        "NEO4J_URI": "neo4j+s://demo.neo4jlabs.com",
        "NEO4J_USERNAME": "companies",
        "NEO4J_PASSWORD": "companies",
        "NEO4J_DATABASE": "companies",
    }
)

In [4]:
# Embedding client: passed to the vector index wrapper below and also used
# directly to embed topics in get_organization_news.
embeddings = OpenAIEmbeddings()
# Graph client used to run Cypher queries.
# NOTE(review): presumably picks up the NEO4J_* environment variables set
# earlier since no arguments are passed - confirm.
graph = Neo4jGraph()
# Attach to the already-existing "news" vector index.
vector = Neo4jVector.from_existing_index(
    embeddings,
    index_name="news"
)

In [5]:
def generate_full_text_query(input: str) -> str:
    """
    Generate a fuzzy full-text search query for a given input string.

    The input is sanitised with ``remove_lucene_chars`` and split on
    whitespace. Every remaining word gets the Lucene fuzzy suffix ``~2``
    (a maximum edit distance of two) and the words are combined with the AND
    operator, so all words must match while minor misspellings are tolerated.
    Useful for mapping organization names from user questions to database
    values.

    Args:
        input: Free-text name taken from the user question.

    Returns:
        The query string, e.g. ``"neon~2 AND software~2"``, or an empty
        string when the input contains no searchable words (previously this
        case raised ``IndexError``).
    """
    # str.split() without arguments never yields empty strings, so no extra
    # filtering is needed; joining an empty list simply gives "".
    words = remove_lucene_chars(input).split()
    return " AND ".join(f"{word}~2" for word in words)

candidate_query = """
CALL db.index.fulltext.queryNodes($index, $fulltextQuery, {limit: $limit})
YIELD node
WHERE node:Organization // Filter organization nodes
RETURN distinct node.name AS candidate
"""


def get_candidates(input: str, limit: int = 5) -> List[str]:
    """
    Retrieve candidate organization names from the database for an input string.

    The input is turned into a fuzzy full-text query and run against the
    ``entity`` full-text index; only ``Organization`` nodes are kept.

    Args:
        input: Organization name as written by the user.
        limit: Maximum number of index hits requested from the database.

    Returns:
        A list of organization names. If any candidate equals the input
        (case-insensitively) only those exact matches are returned, otherwise
        all candidates are returned. The list is empty when the input has no
        searchable words or nothing matches.
    """
    # Input that sanitises to nothing has no searchable terms; skip the
    # database round-trip instead of building an empty/invalid Lucene query.
    if not remove_lucene_chars(input).split():
        return []

    ft_query = generate_full_text_query(input)
    rows = graph.query(
        candidate_query, {"fulltextQuery": ft_query, "index": "entity", "limit": limit}
    )
    names = [row["candidate"] for row in rows]

    # If there is a direct match return only that, otherwise return all options
    wanted = input.lower()
    direct_match = [name for name in names if name.lower() == wanted]
    return direct_match or names

In [6]:
# Demo: "neo4" is not an exact organization name, so every fuzzy candidate is returned.
get_candidates("neo4")

['Net4', 'Neo4j', 'Neos', 'Neo', 'Neon Software']

In [7]:
def get_organization_news(
    topic: Optional[str] = None,
    organization: Optional[str] = None,
    country: Optional[str] = None,
    sentiment: Optional[str] = None,
) -> str:
    """
    Fetch news chunks, optionally pre-filtered by graph structure.

    When only ``topic`` is given, a plain vector similarity search is run.
    Otherwise a Cypher query is assembled: the graph filters (organization,
    country, sentiment) narrow down the articles first, and the remaining
    chunks are ranked by cosine similarity to the topic, or by article date
    when no topic is given. At most five chunks are returned.

    Args:
        topic: Free-text topic to rank chunks by; ``None`` means "latest".
        organization: Organization name; it is mapped to a database value via
            ``get_candidates``.
        country: Country name, matched exactly against ``Country.name``.
        sentiment: ``"positive"`` keeps articles with sentiment above 0.5;
            any other non-empty value keeps articles with sentiment below -0.5.

    Returns:
        The matching chunks formatted as text, or an instruction for the
        agent when the organization is ambiguous or cannot be found.
    """
    print(f"topic: {topic}")
    if topic and not organization and not country and not sentiment:
        # NOTE(review): this branch returns the raw result of
        # ``vector.similarity_search`` rather than a formatted ``str`` as
        # annotated - kept as-is to avoid changing what the agent receives.
        return vector.similarity_search(topic)

    where_queries = []
    params = {"k": 5}
    if organization:
        # Map to database
        candidates = get_candidates(organization)
        if not candidates:
            # Nothing in the database resembles the requested name; let the
            # agent recover instead of failing on candidates[0] below.
            return (
                f"No organization matching '{organization}' was found in the "
                "database. Ask the user to rephrase or choose a different "
                "organization."
            )
        if len(candidates) > 1:  # Ask for follow up if too many options
            return f"Ask a follow up question which of the available organizations did the user mean. Available options: {candidates}"
        where_queries.append(
            "EXISTS {(a)-[:MENTIONS]->(:Organization {name: $organization})}"
        )
        params["organization"] = candidates[0]
    if country:
        # No need to disambiguate
        where_queries.append(
            "EXISTS {(a)-[:MENTIONS]->(:Organization)-[:IN_CITY]->()-[:IN_COUNTRY]->(:Country {name: $country})}"
        )
        params["country"] = country

    if sentiment:
        if sentiment == "positive":
            where_queries.append("a.sentiment > $sentiment")
            params["sentiment"] = 0.5
        else:
            where_queries.append("a.sentiment < $sentiment")
            params["sentiment"] = -0.5

    if topic:  # Do vector comparison
        vector_snippet = " WITH c, a, vector.similarity.cosine(c.embedding,$embedding) AS score ORDER BY score DESC LIMIT toInteger($k) "
        params["embedding"] = embeddings.embed_query(topic)
    else:  # Just return the latest data
        vector_snippet = " WITH c, a ORDER BY a.date DESC LIMIT toInteger($k) "

    # Uses parallel runtime where available
    base_query = (
        "CYPHER runtime = parallel parallelRuntimeSupport=all "
        "MATCH (c:Chunk)<-[:HAS_CHUNK]-(a)"
    )
    # Only emit WHERE when there is at least one predicate; a dangling WHERE
    # (no filters supplied at all) is a Cypher syntax error.
    where_snippet = (" WHERE " + " AND ".join(where_queries)) if where_queries else ""

    return_snippet = "RETURN '#title ' + a.title + '\n#date ' + toString(a.date) + '\n#text ' + c.text AS output"

    complete_query = base_query + where_snippet + vector_snippet + return_snippet
    print(f"Cypher: {complete_query}\n")
    data = graph.query(complete_query, params)
    # The marker is a separator: it appears between articles, not before the first.
    return "###Article: ".join([el["output"] for el in data])

In [8]:
# Demo: no topic is given, so the latest positive-sentiment chunks mentioning the organization are returned.
get_organization_news(organization='neo4j', sentiment='positive')

topic: None
Cypher: CYPHER runtime = parallel parallelRuntimeSupport=all MATCH (c:Chunk)<-[:HAS_CHUNK]-(a) WHERE EXISTS {(a)-[:MENTIONS]->(:Organization {name: $organization})} AND a.sentiment > $sentiment WITH c, a ORDER BY a.date DESC LIMIT toInteger($k) RETURN '#title ' + a.title + '
#date ' + toString(a.date) + '
#text ' + c.text AS output



'#title Neo4j Announces New Product Integrations with Generative AI Features in Google Cloud Vertex AI\n#date 2023-06-07T13:00:00Z\n#text \'s partnership with Google represents a powerful union of graph technology and cloud computing excellence in a new era of AI," said Emil Eifrem, Co-Founder and CEO, Neo4j. "Together, we empower enterprises seeking to leverage generative AI to better innovate, provide the best outcome for their customers, and unlock the true power of their connected data at unprecedented speed."\nAbout Neo4j\nNeo4j, the Graph Database & Analytics leader, helps organizations find hidden relationships and patterns across billions of data connections deeply, easily and quickly. Customers leverage the structure of their connected data to reveal new ways of solving their most pressing business problems, from fraud detection, customer 360, knowledge graphs, supply chain, personalization, IoT, network management, and more – even as their data grows. Neo4j\'s full graph stac

In [9]:
# Few-shot examples appended to the ``topic`` field description below; each one
# pairs a user input with the topic string that should be extracted from it.
fewshot_examples = """{Input:What are the health benefits for Google employees in the news? Query: Health benefits}
{Input: What is the latest positive news about Google? Query: None}
{Input: Are there any news about VertexAI regarding Google? Query: VertexAI}
{Input: Are there any news about new products regarding Google? Query: new products}
"""

# Argument schema for NewsTool. All four fields are optional; the field
# descriptions are the guidance the model gets for filling them in.
class NewsInput(BaseModel):
    # Free-text subject of interest, excluding the three structured filters below.
    topic: Optional[str] = Field(
        description="Any specific information or topic besides organization, country, and sentiment that the user is interested in. Here are some examples: "
        + fewshot_examples
    )
    # Organization name as mentioned by the user.
    organization: Optional[str] = Field(
        description="Organization that the user wants to find information about"
    )
    # Country filter; the description asks for full country names.
    country: Optional[str] = Field(
        description="Country of organizations that the user is interested in. Use full names like United States of America and France."
    )
    # Sentiment filter, advertised as one of "positive" / "negative".
    sentiment: Optional[str] = Field(
        description="Sentiment of articles", enum=["positive", "negative"]
    )

class NewsTool(BaseTool):
    """Agent tool that exposes ``get_organization_news`` with ``NewsInput`` arguments."""

    name: str = "Information"
    # The description is what the model sees when deciding whether to call the
    # tool, so it must describe news about organizations (not movies/actors).
    description: str = (
        "useful for when you need to find news articles about organizations, "
        "optionally filtered by topic, country, or sentiment"
    )
    args_schema: Type[BaseModel] = NewsInput

    def _run(
        self,
        topic: Optional[str] = None,
        organization: Optional[str] = None,
        country: Optional[str] = None,
        sentiment: Optional[str] = None,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> str:
        """Use the tool.

        Args:
            topic: Free-text topic to search for.
            organization: Organization the news should mention.
            country: Country of the mentioned organizations.
            sentiment: Requested article sentiment.
            run_manager: Callback manager supplied by the caller; unused.

        Returns:
            Whatever ``get_organization_news`` returns for these filters.
        """
        return get_organization_news(topic, organization, country, sentiment)

    async def _arun(
        self,
        topic: Optional[str] = None,
        organization: Optional[str] = None,
        country: Optional[str] = None,
        sentiment: Optional[str] = None,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> str:
        """Use the tool asynchronously.

        Takes the same arguments and returns the same result as ``_run``.
        The underlying call is synchronous, so it runs inline in the
        coroutine rather than being awaited.
        """
        return get_organization_news(topic, organization, country, sentiment)

In [10]:
# Deterministic (temperature 0) streaming chat model used by the agent.
llm = ChatOpenAI(temperature=0, model="gpt-4-turbo", streaming=True)
tools = [NewsTool()]

# Expose the tools to the model as OpenAI function definitions.
llm_with_tools = llm.bind(functions=[format_tool_to_openai_function(t) for t in tools])

# The system prompt describes the actual domain (organizations and news); the
# clarification instructions match the follow-up messages the tool can return.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant that finds information about "
            "organizations and the news articles that mention them. "
            "If tools require follow up questions, "
            "make sure to ask the user for clarification. Make sure to include any "
            "available options that need to be clarified in the follow up questions. "
            "Do only the things the user specifically requested. ",
        ),
        MessagesPlaceholder(variable_name="chat_history"),
        ("user", "{input}"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)

def _format_chat_history(chat_history: List[Tuple[str, str]]):
    """Flatten (human, ai) text pairs into an alternating list of messages.

    Each pair contributes a ``HumanMessage`` followed by an ``AIMessage``,
    preserving the order of the pairs.
    """
    return [
        message
        for human_text, ai_text in chat_history
        for message in (
            HumanMessage(content=human_text),
            AIMessage(content=ai_text),
        )
    ]


# Agent pipeline: map the executor's input dict onto the prompt variables,
# render the prompt, call the function-enabled model, then parse its output.
agent = (
    {
        "input": lambda x: x["input"],
        # Chat history is optional: use an empty list when it is missing or empty.
        "chat_history": lambda x: _format_chat_history(x["chat_history"])
        if x.get("chat_history")
        else [],
        # Intermediate steps are converted into messages for the scratchpad placeholder.
        "agent_scratchpad": lambda x: format_to_openai_function_messages(
            x["intermediate_steps"]
        ),
    }
    | prompt
    | llm_with_tools
    | OpenAIFunctionsAgentOutputParser()
)

# Executor that runs the agent together with the tools it may call.
agent_executor = AgentExecutor(agent=agent, tools=tools)

  warn_deprecated(


In [11]:
# Demo: organization-only question; in the recorded run no topic was extracted, so the latest articles are used.
agent_executor.invoke({"input": "What are some news regarding neo4j?"})

topic: None
Cypher: CYPHER runtime = parallel parallelRuntimeSupport=all MATCH (c:Chunk)<-[:HAS_CHUNK]-(a) WHERE EXISTS {(a)-[:MENTIONS]->(:Organization {name: $organization})} WITH c, a ORDER BY a.date DESC LIMIT toInteger($k) RETURN '#title ' + a.title + '
#date ' + toString(a.date) + '
#text ' + c.text AS output



{'input': 'What are some news regarding neo4j?',
 'output': 'Neo4j, a leader in graph databases and analytics, has recently announced new product integrations with generative AI features in Google Cloud Vertex AI. This collaboration aims to empower enterprise customers by leveraging knowledge graphs for more accurate, transparent, and explainable generative AI outcomes. The integration allows for natural language interaction with knowledge graphs, transformation of unstructured data into structured knowledge graphs, real-time AI enrichment, support for vector embeddings, and grounding capabilities to validate AI responses against enterprise knowledge graphs. This strategic partnership with Google Cloud, initiated in 2019, continues to provide innovative solutions for a wide range of AI use cases.'}

In [12]:
# Demo: organization plus sentiment; in the recorded run the sentiment filter was added to the Cypher WHERE clause.
agent_executor.invoke({"input": "What are some positive news regarding neo4j?"})

topic: None
Cypher: CYPHER runtime = parallel parallelRuntimeSupport=all MATCH (c:Chunk)<-[:HAS_CHUNK]-(a) WHERE EXISTS {(a)-[:MENTIONS]->(:Organization {name: $organization})} AND a.sentiment > $sentiment WITH c, a ORDER BY a.date DESC LIMIT toInteger($k) RETURN '#title ' + a.title + '
#date ' + toString(a.date) + '
#text ' + c.text AS output



{'input': 'What are some positive news regarding neo4j?',
 'output': "Here are some positive news regarding Neo4j:\n\n1. **New Product Integrations with Generative AI Features in Google Cloud Vertex AI**:\n   - Neo4j has announced a new product integration with Google Cloud's Vertex AI, which includes generative AI features. This integration allows enterprise customers to leverage knowledge graphs for more accurate, transparent, and explainable AI outcomes. The collaboration highlights Neo4j's capabilities in enhancing AI systems with its graph database and analytics, providing a natural language interface to knowledge graphs, transforming unstructured data, and supporting vector embeddings for large language models.\n\n2. **Recognition at GraphSummit Australia**:\n   - During the 2023 GraphSummit in Australia, Neo4j announced the winners of the 2023 Graphie Awards, recognizing organizations and individuals for outstanding innovation in implementing Neo4j’s graph technology. DXC Techno

In [13]:
# Demo: country plus sentiment. The country filter matches any article that
# mentions an organization located in that country, so results need not be about that country itself.
agent_executor.invoke({"input": "What are some of the latest negative news about companies from France?"})

topic: None
Cypher: CYPHER runtime = parallel parallelRuntimeSupport=all MATCH (c:Chunk)<-[:HAS_CHUNK]-(a) WHERE EXISTS {(a)-[:MENTIONS]->(:Organization)-[:IN_CITY]->()-[:IN_COUNTRY]->(:Country {name: $country})} AND a.sentiment < $sentiment WITH c, a ORDER BY a.date DESC LIMIT toInteger($k) RETURN '#title ' + a.title + '
#date ' + toString(a.date) + '
#text ' + c.text AS output



{'input': 'What are some of the latest negative news about companies from France?',
 'output': "Here are some of the latest negative news about companies from France:\n\n1. **Vodafone-CK Hutchison's UK Merger Under CMA Scrutiny:**\n   - The merger of Vodafone Group PLC and CK Hutchison Holdings Ltd's UK telecommunication businesses has come under scrutiny by the UK's Competition and Markets Authority (CMA). There are concerns about the merger's approval under the National Security and Investment Act, which allows the British government to intervene in deals posing a risk. The merger faces a tough regulatory path ahead, with potential issues related to competition and consumer choice.\n\n2. **Boksburg Explosion:**\n   - A devastating fuel tanker explosion in Boksburg, Gauteng, resulted in 41 deaths. The truck was transporting liquefied petroleum gas and got stuck under a bridge near the Tambo Memorial Hospital. The investigation is complete, and the Gauteng police are waiting for a deci

In [14]:
# Demo: organization plus topic; in the recorded run "new products" was extracted and chunks were ranked by cosine similarity.
agent_executor.invoke({"input": "Is there any information about new products for Apple?"})

topic: new products
Cypher: CYPHER runtime = parallel parallelRuntimeSupport=all MATCH (c:Chunk)<-[:HAS_CHUNK]-(a) WHERE EXISTS {(a)-[:MENTIONS]->(:Organization {name: $organization})} WITH c, a, vector.similarity.cosine(c.embedding,$embedding) AS score ORDER BY score DESC LIMIT toInteger($k) RETURN '#title ' + a.title + '
#date ' + toString(a.date) + '
#text ' + c.text AS output



{'input': 'Is there any information about new products for Apple?',
 'output': 'Apple has been developing a new product called the Vision Pro, which is a mixed-reality (VR/AR) headset. This product is expected to transform the retail experience in Apple Stores, emphasizing a more immersive and personalized environment. The Vision Pro headset will feature advanced technology, including an electric focus drive system that automatically adjusts the headset’s lenses to ensure clear vision for users. Additionally, Apple has involved several suppliers, including Luxshare and other mainland suppliers, to assist in the production of this headset.'}