<a href="https://colab.research.google.com/github/tomasonjo/blogs/blob/master/llm/llama_diffbot_agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --quiet llama-index neo4j

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/302.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m297.0/302.0 kB[0m [31m20.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.8/176.8 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import getpass
import os
from typing import Annotated, Dict, List, Optional, Union

from llama_index.core.agent import AgentRunner
from llama_index.core.tools import FunctionTool
from llama_index.llms.openai import OpenAI
from neo4j import GraphDatabase
from pydantic import Field

In [5]:
# Module-level Neo4j driver for diffbot.neo4jlabs.com, shared by database_query.
# Username and password are prompted for with getpass when the cell runs, so
# they are never written into the notebook source.
graph = GraphDatabase.driver(
    "neo4j+s://diffbot.neo4jlabs.com:7687",
    auth=(getpass.getpass("Diffbot username:"), getpass.getpass("Diffbot password:")),
)

def database_query(
    query: str, params: Optional[Dict] = None, database: str = "companies"
) -> List[Dict]:
    """Run a Cypher query through the module-level ``graph`` driver.

    Args:
        query: Cypher statement to execute.
        params: Query parameters referenced as ``$name`` inside ``query``.
            ``None`` (the default) means the query takes no parameters.
        database: Name of the database the query is sent to.

    Returns:
        One plain dictionary per returned record (``Record.data()``).
    """
    # Pass the parameters as one mapping instead of unpacking them into
    # keyword arguments: an unpacked key such as "database_" would collide
    # with the driver's own configuration keywords.
    result = graph.execute_query(query, parameters_=params or {}, database_=database)
    return [record.data() for record in result.records]

Diffbot username:··········
Diffbot password:··········


In [6]:
# Prompt for the OpenAI key with getpass and expose it to this process through
# the OPENAI_API_KEY environment variable, keeping it out of the notebook source.
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

OpenAI API Key:··········


In [7]:
def remove_lucene_chars(text: str) -> str:
    """Replace Lucene query-syntax characters with spaces.

    Every occurrence of a special character is turned into a single space so
    that free-form user text can be embedded in a full-text query without
    being interpreted as query syntax. Leading and trailing whitespace is
    stripped from the result.

    Args:
        text: Raw text to sanitise.

    Returns:
        ``text`` with each special character replaced by a space, stripped.
    """
    # "/" is included because the Lucene classic query parser treats it as a
    # regular-expression delimiter; a lone "/" would otherwise reach the parser.
    special_chars = '+-&|!(){}[]^"~*?:\\/'
    replacements = {ord(char): " " for char in special_chars}
    return text.translate(replacements).strip()

def generate_full_text_query(input: str) -> str:
    """
    Generate a fuzzy full-text search query for a given input string.

    The input is cleaned with ``remove_lucene_chars`` and split on
    whitespace. Each word gets the fuzzy suffix ``~2`` and the words are
    combined with the AND operator, e.g. ``"Acme Corp"`` becomes
    ``"Acme~2 AND Corp~2"``. Useful for mapping organization names from user
    questions to database values, and allows for some misspellings.

    Returns an empty string when the input contains no searchable words
    (empty, whitespace-only or special-characters-only input).
    """
    words = remove_lucene_chars(input).split()
    # join() yields "" for an empty word list, so there is no last-word
    # special case to index into.
    return " AND ".join(f"{word}~2" for word in words)


# Cypher used by get_candidates: queries the full-text index named by $index
# with $fulltextQuery (at most $limit hits) and returns, for every hit, its
# display name (name, falling back to fullName), its id and its first label,
# ordered by the node's importance property, highest first.
candidate_query = """
CALL db.index.fulltext.queryNodes($index, $fulltextQuery, {limit: $limit})
YIELD node
RETURN coalesce(node.name, node.fullName) AS candidate,
       node.id AS id,
       labels(node)[0] AS label
ORDER BY node.importance DESC
"""


def get_candidates(input: str, limit: int = 3) -> List[Dict[str, str]]:
    """
    Retrieve a list of candidate entities from database based on the input string.

    The input is turned into a fuzzy full-text query and run against the
    ``org_name_fulltext`` index through ``candidate_query``.

    Args:
        input: Free-form organization name.
        limit: Maximum number of index hits to request.

    Returns:
        One dictionary per hit with the keys ``candidate``, ``id`` and
        ``label``; an empty list when the input contains nothing searchable.
    """
    # Empty or punctuation-only input leaves no words to search for; report
    # "no candidates" instead of trying to build a query from nothing.
    if not remove_lucene_chars(input).split():
        return []
    ft_query = generate_full_text_query(input)
    return database_query(
        candidate_query,
        {"fulltextQuery": ft_query, "index": "org_name_fulltext", "limit": limit},
    )


In [8]:
# Sanity check: list the fuzzy-matched candidates for "Neo4j" (default limit).
get_candidates("Neo4j")

[{'candidate': 'Neo4j',
  'id': 'https://diffbot.com/entity/Ee3Wn9kzHNf2TmXB0gpqx-w',
  'label': 'Organization'},
 {'candidate': 'neo42',
  'id': 'https://diffbot.com/entity/Ezf5spJoqP6qCbL9Ha3iX4w',
  'label': 'Organization'},
 {'candidate': 'Neo4T',
  'id': 'https://diffbot.com/entity/EUjppqZXPMZW3CqsbqRG0KQ',
  'label': 'Organization'}]

In [9]:
# Cypher used by get_company_information: matches one Organization by $org_id
# and returns its properties (with the 'embedding' key removed) plus up to 10
# names each of the nodes connected via PARTNERSHIP, HAS_SUBSIDIARY,
# HAS_SUPPLIER, HAS_COMPETITOR, BOARD_MEMBER and HAS_CEO relationships.
info_cypher_query = """
MATCH (o:Organization {id: $org_id})
RETURN apoc.map.removeKey(properties(o), 'embedding') AS properties,
      [(o)-[:PARTNERSHIP]-(r) | r.name][..10] AS partners,
      [(o)-[:HAS_SUBSIDIARY]->(r) | r.name][..10] AS subsidiaries,
      [(o)-[:HAS_SUPPLIER]->(r) | r.name][..10] AS suppliers,
      [(o)-[:HAS_COMPETITOR]-(r) | r.name][..10] AS competitor,
      [(o)-[:BOARD_MEMBER]->(r) | r.name][..10] AS board_members,
      [(o)-[:HAS_CEO]->(r) | r.name][..10] AS ceo
"""


def get_company_information(
    company: str = Field(
        description="The specified company for retrieving company information"
    )
) -> Union[str, List[Dict]]:
    """Look up an organization by name and return its stored details.

    The name is resolved through ``get_candidates``. The first candidate is
    used when it is the only one or when its name equals ``company`` exactly;
    otherwise the candidate names are returned so the caller can ask which
    organization was meant.

    Args:
        company: Organization name to look up.

    Returns:
        The rows produced by ``info_cypher_query`` for the resolved
        organization, or a plain message string when no organization is
        found, the name is ambiguous, or the lookup returns nothing.
    """
    candidates = get_candidates(company, limit=3)
    if not candidates:
        return "No organization found"

    is_unambiguous = len(candidates) == 1 or candidates[0]["candidate"] == company
    if not is_unambiguous:
        # Hand the options back so the agent can ask a follow-up question.
        return ("Which of these organizations do you mean? "
        f"Here are the options: {[el['candidate'] for el in candidates]}")

    org_id = candidates[0]["id"]
    info = database_query(info_cypher_query, {"org_id": org_id}, database="companies")
    if not info:
        return "Couldn't find any information"
    return info

# Description text handed to FunctionTool for the company-information tool.
info_description = "Useful for getting the information about specific organization."

# Wrap get_company_information as a tool the agent can call.
info_tool = FunctionTool.from_defaults(
    fn=get_company_information,
    description=info_description,
)

In [10]:
def map_type(type: str) -> str:
    """Translate a relationship keyword into its Cypher relationship pattern.

    Args:
        type: One of ``"supplier"``, ``"partnership"``, ``"subsidiary"`` or
            ``"competitor"``.

    Returns:
        The relationship fragment to place between two node patterns, e.g.
        ``"-[:HAS_SUPPLIER]->"``.

    Raises:
        ValueError: If ``type`` is not one of the supported keywords. The
            result is spliced into a Cypher string, so falling through with
            ``None`` would produce a malformed query.
    """
    patterns = {
        "supplier": "-[:HAS_SUPPLIER]->",
        "partnership": "-[:PARTNERSHIP]-",
        "subsidiary": "-[:HAS_SUBSIDIARY]->",
        "competitor": "-[:HAS_COMPETITOR]-",
    }
    if not isinstance(type, str) or type not in patterns:
        raise ValueError(
            f"Unsupported relationship type {type!r}; expected one of {sorted(patterns)}"
        )
    return patterns[type]

def get_latest_news(
    company: str = Field(
        description="The specified company for retrieving the latest news"
    ),
    related_entities: Annotated[
        Optional[str],
        Field(
            default=None,
            description="Specifies whether to include news about entities related to the specified company, such as competitors, suppliers, or partners.",
            enum=["supplier", "partnership", "subsidiary", "competitor"],
        )
    ] = None,
) -> Union[str, List[Dict]]:
    """Return the most recent article chunks tagged with an organization.

    The name is resolved through ``get_candidates``. The first candidate is
    used when it is the only one or when its name equals ``company`` exactly;
    otherwise the candidate names are returned so the caller can ask which
    organization was meant.

    Args:
        company: Organization name to look up.
        related_entities: When set, the news is searched for the
            organizations connected to ``company`` by that relationship
            (see ``map_type``) instead of for ``company`` itself.

    Returns:
        Up to five rows with ``title``, ``text`` and ``related_org``, newest
        article first, or a plain message string when no organization is
        found, the name is ambiguous, or there is no matching news.
    """
    candidates = get_candidates(company, limit=3)
    if not candidates:
        return "No organization found"
    if len(candidates) == 1 or candidates[0]["candidate"] == company:
        org_id = candidates[0]["id"]
    else:
        # Hand the options back so the agent can ask a follow-up question.
        return ("Which of these organizations do you mean? "
        f"Here are the options: {[el['candidate'] for el in candidates]}")

    # First part of the query: produce `org_ids`, the ids whose news we want.
    if related_entities:
        # Collect the ids of the organizations related to the resolved one.
        cypher_query = f"""CALL () {{
	USE diffbot.companies
  MATCH (o:Organization {{id: $org_id}}){map_type(related_entities)}(related_entities)
        RETURN collect(related_entities.id) AS org_ids
  }} """
    else:  # Otherwise just use the org id
        cypher_query = "WITH [$org_id] AS org_ids "

    # Second part: fetch the articles tagged with any of those ids. A plain
    # MATCH is used on purpose: OPTIONAL MATCH would emit one all-null row
    # when nothing matches, which made the "no news" check below unreachable.
    cypher_query += """
    CALL (org_ids) {
      USE diffbot.articles
        MATCH (c)<-[:HAS_CHUNK]-(a:Article)-[:HAS_TAG]->(t:Tag)
        WHERE t.id IN org_ids
        RETURN a.title AS title, c.text AS text, t.label AS related_org ORDER BY a.date DESC LIMIT 5
      }
    RETURN title, text, related_org"""
    news = database_query(cypher_query, {"org_id": org_id}, database="diffbot")
    if not news:
        return "Couldn't find any news"
    return news

news_description = """Useful for getting the latest news about specific organization or their related entities."""

news_tool = FunctionTool.from_defaults(get_latest_news, description=news_description)

In [11]:
# The two function tools made available to the agent.
tools = [news_tool, info_tool]

In [12]:
# Chat model that drives the agent.
llm = OpenAI(model="gpt-4")

# System prompt: restricts answers to what the tools return and tells the
# model to relay a tool's follow-up question (e.g. the "Which of these
# organizations do you mean?" disambiguation) back to the user.
system_prompt = """
You are a helpful assistant.
Your job is to find information using available tools and then use that information to generate the final answer.
Never answer with any information that isn't provided by the tools!
Always trust the tool output, even if it might not be completely related to question!
If the tool asks for a follow up question, ask the user that!"""

In [13]:
# Assemble the agent from the LLM, the tools and the system prompt.
# NOTE(review): verbose=True presumably produces the "=== Calling Function ==="
# trace visible in the chat cell outputs — confirm against the library docs.
agent = AgentRunner.from_llm(
    llm=llm,
    tools=tools,
    system_prompt=system_prompt,
    verbose=True,
)

In [14]:
# Company-information question: board members are part of what
# info_cypher_query returns, so this should go through get_company_information.
response = agent.chat("Who are the board members of Neo4j?")
print(response.response)

Added user message to memory: Who are the board members of Neo4j?
=== Calling Function ===
Calling function: get_company_information with args: {
  "company": "Neo4j"
}
Got output: [{'properties': {'summary': 'Software solution provider', 'isAcquired': False, 'facebookUri': 'facebook.com/neo4j.graph.database', 'totalInvestment': 755100032.0, 'blogUri': 'blog.neo4j.org', 'id': 'https://diffbot.com/entity/Ee3Wn9kzHNf2TmXB0gpqx-w', 'homepageUri': 'neo4j.com', 'description': "Neo Technology is the NOSQL database company for the enterprise. Neo4j is a robust, high performance, scalable graph database. Neo Technology is the industry's only NOSQL database that solves the complex, connected data challenges that enterprises face today. Proven by eight years of 24/7 production use, Neo4j is a fully transactional database, which enables customers, including Adobe and Cisco, to tackle complex data problems. Neo Technology is a privately held company funded by Fidelity Growth Partners Europe, Sunst

In [15]:
# News question about the company itself (no related_entities expected).
response = agent.chat("What's the latest news about Neo4j?")
response.response

Added user message to memory: What's the latest news about Neo4j?
=== Calling Function ===
Calling function: get_latest_news with args: {
  "company": "Neo4j"
}
Got output: [{'title': 'Neo4j Connections: Go From GenAI Pilot to Production Faster With a Knowledge Graph', 'text': 'Without knowledge graphs, you could face massive challenges as you look to transition your GenAI project from pilot to production. Knowledge graphs change all that.\nWith knowledge graphs, your GenAI project can avoid hallucinations and offer better predictions and insights. How do knowledge graphs provide the best, fastest path to move GenAI projects forward? They integrate seamlessly with systems and connect data across sources. They map relationships and improve search.\nWant to learn more? Don’t miss this virtual, half-day event, where industry experts will provide:\nReal-world insights on how to move GenAI from pilot to production\nA look at the enhanced search and data integration capabilities of knowledge

'The latest news about Neo4j is about their upcoming virtual, half-day event titled "Neo4j Connections: Go From GenAI Pilot to Production Faster With a Knowledge Graph". The event will focus on the benefits of using knowledge graphs in GenAI projects, including better predictions, insights, and improved search. The event will feature real-world insights on how to transition GenAI projects from pilot to production, an overview of the enhanced search and data integration capabilities of knowledge graphs, customer success stories, and an overview of how Neo4j knowledge graphs can enhance the accuracy, explainability, and scalability of GenAI projects. Attendees will also have the opportunity to hear how companies are successfully implementing GenAI.'

In [16]:
# News question about related organizations; the recorded run called
# get_latest_news with related_entities="partnership".
response = agent.chat("What's the latest news about Neo4j partners?")
response.response

Added user message to memory: What's the latest news about Neo4j partners?
=== Calling Function ===
Calling function: get_latest_news with args: {
  "company": "Neo4j",
  "related_entities": "partnership"
}
Got output: [{'title': 'Vatican and Microsoft Launch AI-Generated St Peter Basilica', 'text': '. Moreover, this digital model allows anyone to “visit” the basilica and discover its rich history from anywhere.\n3D model captures 22 petabytes of data, equal to five million DVDs\nThe highly detailed 3D model, built in partnership with digital preservation firm Iconem, holds a massive 22 petabytes of data—equivalent to about five million DVDs, according to Microsoft’s president.\nThis technology has already uncovered structural issues, like missing mosaic pieces and tiny cracks, which are invisible to the human eye, with speed and accuracy that surpass human abilities.\nPope Francis has encouraged the responsible use of AI. In his annual Peace Message, he called for an international agr

"Here are the latest news about Neo4j's partners:\n\n1. Microsoft has partnered with the Vatican to launch an AI-generated version of St. Peter’s Basilica. This digital model uses smart technology to help visitors explore the basilica and assists the Vatican in managing crowds and spotting areas that may need repair. The project was launched in time for the Vatican’s 2025 Jubilee.\n\n2. Google has rolled out an AI video maker available on select Workspace editions. Google Vids runs on Google’s AI model Gemini to create workplace and marketing videos from Google Drive files and descriptions. Users can start a video from scratch or use a pre-made template.\n\n3. Google's November update for Pixel devices running Android 15 is rolling out now. This update includes bug fixes and the latest security patches.\n\n4. Microsoft is rolling out a new Rewrite option for Notepad to Windows Insiders in Windows 11. The Rewrite option uses an AI model called GPT to revise sentences, modify the tone, o