# Q&A bot using LLM and Neo4j 

##### Sample notebook to generate Cypher queries and create text responses.

##### User Query -> LLM (to generate Cypher) -> Query Neo4j KG -> LLM (to generate a response from the KG result).

##### Author: Tanmayi Balla (tballa@iu.edu)

In [28]:
import os
import re
import openai
import time
import json
import datetime
import timeout_decorator
from langchain.text_splitter import RecursiveCharacterTextSplitter
import secret
from neo4j import GraphDatabase
from neo4j.exceptions import CypherSyntaxError
import openai

In [29]:
# The OpenAI key is read from the local `secret` module rather than written inline.
openai.api_key = secret.openai_api_key # "<Your API-KEY>"
# Chat model name passed to the OpenAI chat-completion calls; "gpt-4" is the alternative.
#model = "gpt-4" # "gpt-3.5-turbo"
model = "gpt-3.5-turbo"

In [30]:
# Neo4j connection settings. Each value can be overridden with an environment
# variable so credentials do not have to be edited into the notebook; the
# previous hard-coded values remain as fallbacks so existing runs still work.
# (An alternative instance used earlier: neo4j+s://88f69104.databases.neo4j.io,
# user "neo4j".)
# NOTE(review): the fallback password is still committed in plain text. Rotate
# it and move it out of source, e.g. next to the OpenAI key in `secret`.
url = os.environ.get("NEO4J_URI", "neo4j+s://linguistic.technology:7687")
user = os.environ.get("NEO4J_USERNAME", "l715")
password = os.environ.get("NEO4J_PASSWORD", "frozen-sharp-darwin-sponsor-weekend-6115")

# Single driver shared by the notebook; sessions are opened from it per query.
driver = GraphDatabase.driver(url, auth=(user, password))

In [4]:
# Cypher that lists, per node-label set, the property names reported by
# db.schema.nodeTypeProperties(); each row is a {labels, properties} map.
node_properties_query = """
call db.schema.nodeTypeProperties() yield nodeLabels,propertyName 
WITH nodeLabels AS nodeLabels, collect(propertyName) AS properties
RETURN {labels: nodeLabels, properties: properties} AS output
"""

# Same idea for relationship types, using db.schema.relTypeProperties().
rel_properties_query = """
call db.schema.relTypeProperties() yield relType,propertyName 
WITH relType AS relType, collect(propertyName) AS properties
RETURN {labels: relType, properties: properties} AS output
"""

# Distinct (source label, relationship type, target label) triples found in the
# graph. Only the first label of each node is used (head(labels(...))).
rel_query = """
MATCH (first)-[r]->(second)
RETURN DISTINCT {source:head(labels(first)),
       relationship:type(r),
       target:head(labels(second))} AS output
"""


In [5]:
def query_neo4j_db(cypher_query, params=None):
    """Run a Cypher query and return its rows preceded by a header row.

    Args:
        cypher_query: Cypher statement to execute.
        params: Optional dict of query parameters. Defaults to no parameters.

    Returns:
        A list whose first element is the result's column keys and whose
        remaining elements are the value lists of each returned record.
    """
    # `None` replaces the former mutable `{}` default, which was one dict
    # object shared by every call.
    with driver.session() as session:
        result = session.run(cypher_query, params or {})
        # Records must be read while the session is still open.
        rows = [record.values() for record in result]
        return [result.keys(), *rows]

def schema_text(node_props, rel_props, rels):
    """Render the three schema query results as the text block used in prompts.

    Each argument is interpolated with its default string formatting, on its
    own line below the matching heading.
    """
    lines = [
        "",
        "  This is the schema representation of the Neo4j database.",
        "  Node properties are the following:",
        f"  {node_props}",
        "  Relationship properties are the following:",
        f"  {rel_props}",
        "  Relationship point from source to target nodes",
        f"  {rels}",
        "  Make sure to respect relationship types and directions",
        "  ",
    ]
    return "\n".join(lines)

def generate_schema():
    """Query the database for its schema and return it as prompt text.

    Runs the node-property, relationship-property and relationship-triple
    queries in that order and formats their results with schema_text.
    """
    introspection_queries = (node_properties_query, rel_properties_query, rel_query)
    return schema_text(*(query_neo4j_db(q) for q in introspection_queries))


In [58]:
# Fetch the schema text once at cell execution; create_cypher reads this global
# when it builds its system prompt.
schema = generate_schema()

def create_cypher(query, history = None):
    """Ask the chat model to translate a natural-language question into Cypher.

    Args:
        query: The user's question, sent as the first user message.
        history: Optional list of extra chat messages (role/content dicts)
            appended after the user question.

    Returns:
        The text content of the model's first choice.
    """
    system_prompt = f"""
        Task: Generate Cypher queries to query a Neo4j graph database based on the provided schema definition.
        Instructions:
        Use only the provided relationship types and properties.
        Do not use any other relationship types or properties that are not provided.
        Note: Do not include any explanations or apologies in your responses.
        If you cannot generate a Cypher statement based on the provided schema, explain the reason to the user.
        Schema:
        {schema}

        Note: Do not include any explanations or apologies in your responses.
        """

    # System prompt first, then the question, then any follow-up turns.
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query},
        *(history or []),
    ]

    completion = openai.ChatCompletion.create(
        model=model,
        messages=conversation,
        temperature=0.0,
        max_tokens=1000,
    )
    return completion.choices[0].message.content


In [59]:
def run_pipeline(query, history = None, retry = True):
    """Generate Cypher for a question, run it, and retry once on a syntax error.

    Args:
        query: Natural-language question.
        history: Optional list of extra chat messages forwarded to
            create_cypher after the question.
        retry: When True, a CypherSyntaxError triggers one more attempt in
            which the failed query and the error text are sent back to the
            model.

    Returns:
        The rows from query_neo4j_db (header row first), or the string
        "Invalid Cypher syntax" if the query still fails with retry disabled.
    """
    # The history must reach the model; without it the retry would re-send the
    # same prompt and never see the error it is asked to fix.
    cypher = create_cypher(query, history)
    print(cypher)
    try:
        return query_neo4j_db(cypher)
    except CypherSyntaxError as e:
        if not retry:
            return "Invalid Cypher syntax"
        print("Retrying")
        feedback = (
            f"This query returns an error: {str(e)} \n"
            "                    Give me a improved query that works without any explanations or apologies"
        )
        # Keep any caller-supplied turns and append the failed attempt plus the
        # database error so the model can correct itself.
        retry_history = [
            *(history or []),
            {"role": "assistant", "content": cypher},
            {"role": "user", "content": feedback},
        ]
        return run_pipeline(query, retry_history, retry=False)

In [31]:
# Example 1: single-company question. run_pipeline prints the generated Cypher
# and returns the rows.
sample_res = run_pipeline("""
In which location is Apple Inc. incorporated?
""")

MATCH (c:Corporation {name: 'Apple Inc.'})-[:INCORPORATED_IN]->(s:State)
RETURN s.name as location


In [60]:
# Example 2: question about several companies at once. This overwrites sample_res.
sample_res = run_pipeline("""
What are the headquarters of the companies: Apple Inc., ELI LILLY & Co, BRISTOL MYERS SQUIBB CO, and Samsung?
""")

MATCH (c:Corporation)
WHERE c.name IN ['Apple Inc.', 'ELI LILLY & Co', 'BRISTOL MYERS SQUIBB CO', 'Samsung']
MATCH (c)-[:INCORPORATED_IN]->(s:State)
RETURN c.name AS Company, s.name AS Headquarters


In [61]:
# Show the rows for the headquarters question; the first row holds the column names.
sample_res

[['Company', 'Headquarters'],
 ['ELI LILLY & Co', 'Indiana'],
 ['BRISTOL MYERS SQUIBB CO', 'Delaware'],
 ['Apple Inc.', 'California']]

In [32]:
# Show the rows for the Apple Inc. incorporation question; the first row holds the column names.
sample_res

[['location'], ['California']]

In [59]:
# Example 3: relationship question. This overwrites sample_res.
sample_res = run_pipeline("""
Which organizations partnered with Rocket Lab?
""")

MATCH (o:Organization)-[:Partnered_with]->(p:Organization {name: 'Rocket Lab'}) 
RETURN o.name


In [60]:
# Show the rows for the Rocket Lab question; the first row holds the column name.
sample_res

[['o.name'], ['Silicon Valley Bank'], ['Hercules Capital, Inc.']]

In [33]:
# System prompt for the answer-generation step: the model must phrase the
# database results as a reply and must not add facts of its own.
system_dbtotext = (
    "\n"
    "You are an assistant that helps to generate text to form nice and human understandable answers based.\n"
    "The latest prompt contains the results retrieved from the database, and you need to generate a human readable response using only the results.\n"
    "Make it sound like the information are coming from an AI assistant, but don't add any information.\n"
    "If the results retrived are not related to the user query, they simple say that you don't know the answer and the results doesn't match with the same.\n"
    "Do not add any additional information that is not explicitly provided in the latest prompt.\n"
    "I repeat, do not add any information that is not explicitly given.\n"
)


def _strip_leading_apology(response):
    """Drop the first line (or first sentence) of a reply that contains an apology.

    The rest of the reply is kept intact, including its punctuation. If
    nothing would remain, the original reply is returned unchanged instead of
    an empty string.
    """
    # Case-insensitive so that "Apologies ..." is caught as well as "I apologize".
    if "apologi" not in response.lower():
        return response
    if "\n" in response:
        remainder = " ".join(response.split("\n")[1:])
    else:
        # partition keeps everything after the first period verbatim; splitting
        # on every "." and re-joining would strip the periods from the answer.
        _, _, remainder = response.partition(".")
    remainder = remainder.strip()
    return remainder or response


def generate_response(query, data):
    """Turn database rows into a natural-language answer via the chat model.

    Args:
        query: The user's original question.
        data: The rows retrieved from the graph database; interpolated into
            the prompt with its default string formatting.

    Returns:
        The model's reply, with a leading apology line or sentence removed.
    """
    # A newline separates the question from the result label so the two do
    # not run together in the prompt.
    prompt = f"User query: {query}\nGraph database result: {data}"
    messages = [
        {"role": "system", "content": system_dbtotext},
        {"role": "user", "content": prompt},
    ]

    completions = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0.0
    )
    return _strip_leading_apology(completions.choices[0].message.content)

In [34]:
# Verbalize the incorporation result. query_neo4j_db puts the column names in
# the first row, so [1:] keeps only the data rows.
query = "In which location is Apple Inc. incorporated?"
data = sample_res[1:]
print(generate_response(query, data))

Apple Inc. is incorporated in California.


In [62]:
# Verbalize the headquarters result; the header row is dropped before prompting.
query = "What are the headquarters of the companies: Apple Inc., ELI LILLY & Co, BRISTOL MYERS SQUIBB CO, and Samsung?"
data = sample_res[1:]
print(generate_response(query, data))

The headquarters of the companies you mentioned are as follows: ELI LILLY & Co is headquartered in Indiana, BRISTOL MYERS SQUIBB CO is headquartered in Delaware, and Apple Inc. is headquartered in California. Unfortunately, I don't have information about the headquarters of Samsung.


In [86]:
# Verbalize the Rocket Lab partnership result; the header row is dropped before prompting.
query = "Which organizations partnered with Rocket Lab?"
data = sample_res[1:]
print(generate_response(query, data))

Rocket Lab has partnered with two organizations: Silicon Valley Bank and Hercules Capital, Inc.


In [90]:
# NOTE(review): no run_pipeline call is shown for this question, so sample_res
# presumably still holds rows from an earlier query. This looks like a check of
# the "results are not related to the query" instruction in the system prompt.
# Confirm that is intended.
query = "Who invented telephone?"
data = sample_res[1:]
print(generate_response(query, data))

I'm sorry, but I couldn't find any relevant information about the invention of the telephone in the graph database. It seems that the results retrieved are not related to your query.
