# Imports and Preprocessing

In [28]:
import os
import dotenv
import json
from openai import AzureOpenAI
from langchain.graphs import Neo4jGraph
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain_openai import AzureOpenAIEmbeddings
from neo4j import GraphDatabase

# Setting up AzureAPI and Neo4j Database

Before running this code, make sure you have the Neo4j Desktop App installed and have imported the database dump into a Neo4j 5.17.0 instance with the APOC and Graph Data Science Library (GDSL) plugins installed.
You can set your username and password in the Desktop App.
The dump file can be downloaded from the following link: https://drive.google.com/file/d/1ofXzSIW6a_dGAHJdPnbKN41BUcX4i7Dm/view?usp=drive_link

In [29]:
# Read credentials from the local env file. override=True is passed so values
# from the file win over variables that are already set in the process.
dotenv.load_dotenv("GHD_cred.env", override=True)

# --- Azure OpenAI settings ---
api_version = "2023-12-01-preview"
resource_name = os.getenv("AZURE_RESOURCE_NAME")
chat_deployment_name = os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT")
embedding_deployment_name = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")
api_key = os.getenv("AZURE_OPENAI_API_KEY")
endpoint = f"https://{resource_name}.openai.azure.com"
api_url = f"https://{resource_name}.openai.azure.com/openai/deployments/{chat_deployment_name}/chat/completions?api-version={api_version}"

# --- Neo4j connection settings ---
url = os.getenv("NEO4J_URI")
username = os.getenv("NEO4J_USERNAME")
password = os.getenv("NEO4J_PASSWORD")

# Azure OpenAI client used by the chat-completion and embedding helpers below.
client = AzureOpenAI(
    azure_endpoint=endpoint,
    api_version=api_version,
    api_key=api_key,
)

# LangChain graph wrapper used to run Cypher queries against the database.
graph = Neo4jGraph(url=url, username=username, password=password)

# Create Chunks Text Embeddings and Entities

In [30]:
# Low-level Neo4j driver handle. Nothing else in this cell uses it; the name is
# kept because other cells may rely on it.
driver = GraphDatabase.driver(url, auth=(username, password), max_connection_lifetime=200)

# One embeddings client is shared by every label instead of building an
# identical client for each call.
azure_embeddings = AzureOpenAIEmbeddings(
    azure_deployment=embedding_deployment_name,
    openai_api_version=api_version,
    azure_endpoint=endpoint,
)

# (node label, node properties whose text is embedded), in processing order.
EMBEDDED_NODE_LABELS = [
    ("Part", ["title"]),
    ("SubPart", ["title"]),
    ("Section", ["title"]),
    ("Chunks", ["text"]),
]

# Build one Neo4jVector store per label; every store writes to the node
# property "embedding", which the similarity query in create_query reads.
# NOTE(review): no index_name is passed, so all four calls use the library's
# default index name — confirm that separate indexes are not needed per label.
vector_indexes = {}
for node_label, text_properties in EMBEDDED_NODE_LABELS:
    vector_indexes[node_label] = Neo4jVector.from_existing_graph(
        azure_embeddings,
        url=url,
        username=username,
        password=password,
        node_label=node_label,
        text_node_properties=text_properties,
        embedding_node_property="embedding",
    )

# Backward compatible: `vector_index` still ends up bound to the Chunks store,
# exactly as after the last of the original four assignments.
vector_index = vector_indexes["Chunks"]

# Descriptions of the node types in the regulation hierarchy. The dict is
# serialized with json.dumps into system_question_prompt, so both keys and
# values are part of what the chat model reads.
# NOTE(review): the key "Subpart" differs in case from the "SubPart" label used
# when building the vector index and in the table query, and "Section_Formula"
# differs from the "Formula" label matched by the formula query — confirm which
# spellings the database actually uses.
entity_types = {
    "Part": "Segment of a subchapter, detailing more specific topics, regulations, agencies and guidelines.",
    "Subpart": "Further division of a part, detailing very specific aspects or regulations and topic of interest.",
    "Section": "The most granular division, often representing specific area in individual regulations or guidelines.",
    "Section_Formula": "Include formulas related to enviornmental regulations mentioned in the corresponding section and includes their explanation in extraction.",
    "Chunks": "Chunks of texts of the corresponding section, provide most detailed information in regards regulation or guidelines."
}

# Descriptions of the relationships between those node types; also serialized
# into system_question_prompt.
# NOTE(review): "HAS_IMAGE" is described as linking a section to its formulas,
# while the formula lookup in define_response_query matches HAS_FORMULA —
# confirm the relationship type stored in the graph.
relation_types = {
    "HAS_SUBPART": "A part contains one or more subparts.",
    "HAS_SECTION": "A subpart contains one or more sections.",
    "HAS_IMAGE": "A section contains one or more formulas.",
    "HAS_TEXT": "A section contains one or more chunks of texts."
}

# Maps each entity type to the relationship(s) that lead one level down from it.
# No other code visible in this notebook reads this mapping.
entity_relationship_match = {
    "Part": "HAS_SUBPART",
    "Subpart": "HAS_SECTION",
    "Section": ["HAS_IMAGE", "HAS_TEXT"]
}

# Question Prompt 

In [31]:
# System prompt for define_query: asks the chat model to break a user question
# into a flat JSON object. entity_types and relation_types are interpolated as
# JSON; the doubled braces {{ }} are f-string escapes for literal braces.
# Each key of the returned object later becomes a Cypher parameter/variable
# name prefix in create_query and query_graph.
# NOTE(review): the rules ask for keys that match the entity types, but the
# worked example uses "subject" / "to_do" / "clarification" — confirm which key
# scheme is intended.
system_question_prompt = f"""
    You are a helpful agent designed to fetch information from a graph database structured around environmental regulations.

    The graph database organizes regulations into the following entity types:
    {json.dumps(entity_types, indent=0)}

    Each entity is connected through one of the following hierarchical relationships:
    {json.dumps(relation_types, indent=0)}

    Depending on the user prompt, determine if it is possible to answer with the graph database.

    The graph database can navigate through multiple layers of hierarchy to find specific sections of regulations.

    Example user input:
    "We have a continuous emission monitoring system for our acid gas units but no vent meter, what calculation method should we use?"

    There are multiple layers to analyse:
    1. The mention of "continuous emission monitoring system" indicates what subject matter the prompt is asking for.
    2. The mention of "what calculation method" indicates the action we want to perform on the subject matter.
    3. The mention of "acid gas units no vent meter" provides additional conditions to the subject matter to consider.


    Return a json object following these rules:
    For each layer of the hierarchy or specific query parameter mentioned, add a key-value pair with the key being a match for one of the entity types provided, and the value being the relevant detail from the user query.

    For the example provided above, the expected output would be:
    {{
        "subject" : "continuous emission monitoring system",
        "to_do" : "what calculation method",
        "clarification" : "acid gas units no vent meter"
    }}

    If there are no relevant entities or layers in the user prompt, return an empty json object.
"""

# Define functions for Question Interpretation and Result Mapping 

In [32]:
def define_query(prompt, model="chat35"):
    """
    Ask the Azure OpenAI chat deployment to interpret a user prompt.

    system_question_prompt is sent as the system message and `prompt` as the
    user message. The request uses temperature 0 and JSON-object response
    mode. Returns the message content of the first choice, unparsed.
    """

    conversation = [
        {"role": "system", "content": system_question_prompt},
        {"role": "user", "content": prompt},
    ]
    request_options = dict(
        model=model,
        temperature=0,
        response_format={"type": "json_object"},
        messages=conversation,
    )

    reply = client.chat.completions.create(**request_options)
    first_choice = reply.choices[0]
    return first_choice.message.content


def create_embedding(text):
    """
    Embed `text` with the Azure OpenAI embedding deployment named by
    embedding_deployment_name and return the embedding of the first result item.
    """

    response = client.embeddings.create(input=text, model=embedding_deployment_name)
    first_item = response.data[0]
    return first_item.embedding


def create_query(text, threshold=0.8):
    """
    Build the Cypher query that finds Chunks nodes similar to the interpreted prompt.

    Parameters
    ----------
    text : str
        JSON object, as a string, produced by define_query. Only its keys are
        used here: each key ``k`` becomes a query parameter ``$kEmbedding``
        that the caller must supply when running the query.
    threshold : float
        A chunk matches when the cosine similarity between its ``embedding``
        property and at least one supplied embedding is greater than this value.

    Returns
    -------
    str
        A Cypher query returning ``c.text`` and ``ID(c)`` for matching chunks.
        If the JSON object is empty, a valid query that matches nothing is
        returned instead of a malformed ``WITH``/``WHERE`` clause.

    Raises
    ------
    ValueError
        If `text` is not valid JSON, is not a JSON object, or contains a key
        that cannot be used as a Cypher variable name.
    """

    query_data = json.loads(text)
    if not isinstance(query_data, dict):
        raise ValueError("create_query expects a JSON object of interpreted prompt parts")

    keys = list(query_data)

    # The system prompt allows the model to answer with an empty object; build
    # a query that is still valid Cypher and simply returns no rows.
    if not keys:
        return "MATCH (c:Chunks)\nWHERE false\nRETURN c.text, ID(c)"

    # Keys come from model output and are spliced into the query text as
    # variable/parameter names, so reject anything that is not a plain identifier.
    for key in keys:
        if not key.isidentifier():
            raise ValueError(f"Cannot use {key!r} as a Cypher variable name")

    # Expose every embedding parameter under a plain variable name.
    with_clause = ",\n".join(f"${key}Embedding AS {key}Embedding" for key in keys)

    # A chunk qualifies if it is close enough to any one of the embeddings.
    where_clause = " OR ".join(
        f"gds.similarity.cosine(c.embedding, {key}Embedding) > {threshold}"
        for key in keys
    )

    return (
        f"WITH {with_clause}"
        "\nMATCH (c:Chunks)"
        f"\nWHERE {where_clause}"
        "\nRETURN c.text, ID(c)"
    )


def query_graph(prompt, model="chat35"):
    """
    Retrieve the chunks that match a user prompt.

    The prompt is interpreted by define_query, a Cypher query is built from
    that interpretation by create_query, every interpreted value is embedded
    and passed as the `<key>Embedding` query parameter, and the query is run
    on `graph`. Prints how many rows were found and returns the rows.
    """

    interpretation = define_query(prompt, model)
    cypher = create_query(interpretation)
    fields = json.loads(interpretation)

    embedding_params = {
        f"{field}Embedding": create_embedding(value)
        for field, value in fields.items()
    }

    matches = graph.query(cypher, params=embedding_params)
    print(f"Found {len(matches)} matching chunks\n")

    return matches

# Define functions for Answer Generation and Response System Prompt

In [33]:
def _run_json_analysis(system_prompt, user_content, model):
    """Send one system/user message pair to the chat model in JSON mode and return the raw reply text."""
    completion = client.chat.completions.create(
        model=model,
        temperature=0,
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
    )
    return completion.choices[0].message.content


def _fetch_formula_snippets(chunk_ids, limit):
    """Return [content, extraction, ...] for formulas of the sections that own the given chunks."""
    if not chunk_ids:
        # Nothing matched: skip both lookups instead of indexing an empty list.
        return []

    section_rows = graph.query(
        "MATCH (c:Chunks) WHERE id(c) IN $chunk_ids "
        "MATCH (s:Section)-[:HAS_TEXT]->(c) "
        "RETURN DISTINCT id(s) AS section_id",
        params={"chunk_ids": chunk_ids},
    )
    section_ids = [row["section_id"] for row in section_rows]
    if not section_ids:
        return []

    formula_rows = graph.query(
        "MATCH (s:Section)-[:HAS_FORMULA]->(f:Formula) WHERE id(s) IN $section_ids "
        "RETURN f.extraction, f.content",
        params={"section_ids": section_ids},
    )

    snippets = []
    for row in formula_rows[:limit]:
        snippets.append(row["f.content"])
        snippets.append(row["f.extraction"])
    return snippets


def _fetch_table_snippets(subpart_id):
    """Return [id, content, ...] for every table attached to the given SubPart node."""
    table_rows = graph.query(
        "MATCH (s:SubPart)-[:HAS_TABLE]->(t:Table) WHERE id(s) = $subpart_id "
        "RETURN t.id, t.content",
        params={"subpart_id": subpart_id},
    )

    snippets = []
    for row in table_rows:
        snippets.append(row["t.id"])
        snippets.append(row["t.content"])
    return snippets


def _result_as_text(raw_reply):
    """Extract the "result" value of a JSON reply as text, falling back to the raw reply."""
    try:
        value = json.loads(raw_reply)["result"]
    except (ValueError, KeyError, TypeError):
        # Not JSON, not an object, or no "result" key: use the reply as it is.
        return raw_reply
    return value if isinstance(value, str) else json.dumps(value)


def define_response_query(result, question, model="chat35", table_subpart_id=4205, max_results=80):
    """
    Turn the chunks matched for a question into a final answer.

    Three intermediate analyses are requested from the chat model (tables,
    chunk text, formulas) and then merged by a fourth request.

    Parameters
    ----------
    result : list of dict
        Rows returned by query_graph; each row is read through the keys
        "c.text" and "ID(c)".
    question : str
        The user's question; it is embedded in every system prompt.
    model : str
        Chat deployment name passed to every chat-completion request.
    table_subpart_id : int
        Internal node id of the SubPart whose tables are analysed. The default
        keeps the previously hard-coded value.
        NOTE(review): internal ids may change when the database is re-imported
        — confirm this value for your instance.
    max_results : int
        Upper bound on the number of matched chunks used, and on the number of
        formula rows used.

    Returns
    -------
    str
        Raw content of the final chat reply (requested as a JSON object with
        the key "result").
    """

    top_rows = result[:max_results]

    # Concatenate the chunk texts, each followed by a single space.
    result_text = "".join(row["c.text"] + " " for row in top_rows)

    # Formulas mentioned in the sections that contain the matched chunks.
    chunk_ids = [row["ID(c)"] for row in top_rows]
    formula_results_final = str(_fetch_formula_snippets(chunk_ids, max_results))

    # Tables of the configured SubPart.
    table_results_final = str(_fetch_table_snippets(table_subpart_id))

    # Summary and analysis of the tables.
    table_analysis_system_prompt = f"""
    You are a helpful agent designed to fetch information from a graph database structured around environmental regulations. 
    Here is the question asked: {question}
    Given the following input, which includes XML of tables, provide a comprehensive explanation of the tables.
    Present your analysis in a structured JSON object format under the key "result".
    The expected example output format would be:
    {{
        "result" : "The table describes how many devices should be registered for various categories."
    }}
    """

    table_analysis_result = _run_json_analysis(
        table_analysis_system_prompt, table_results_final, model
    )

    # Summary and analysis of the chunked texts.
    text_analysis_system_prompt = f"""
    You are an intelligent agent tasked with analyzing information from a graph database on environmental regulations.
    A question has been posed: "{question}"
    Based on the following text extracted from the database, analyze and summarize the content to answer the question with the following steps:
    1. Look for any numbers that are mentioned in the content result_text, they may be key to answering the question.
    2. Look for any enviornmental terms in the result text that closely resembles the question.
    3. Pay attention to any calculation, measures, methods mentioned in result_text if asked in the question, return or calculate with them. 
    Your response should be structured as a JSON object, encapsulating the summary and analysis relevant to the question:
    Expected example output format:
    {{
        "result": "The document mentioned several calucuation method which explains how to derive the emission from the data."
    }}
    """

    text_analysis_result = _run_json_analysis(
        text_analysis_system_prompt, result_text, model
    )

    # Summary and analysis of the formulas.
    formulas_analysis_system_prompt = f"""
    You are a helpful agent designed to fetch information from a graph database structured around environmental regulations. 
    Here is the question asked: {question}
    Given the following input, which includes LaTeX equations and its extraction or explanation, provide a comprehensive explanation of the LaTeX formulas. 
    Present your analysis in a structured JSON object format under the key "result".
    The expected example output format would be:
    {{
        "result" : "The equation describes E = COUNT * EF ....... The extraction explains that COUNT is the number of units and EF is the emission factor."
    }}
    """

    formulas_analysis_result = _run_json_analysis(
        formulas_analysis_system_prompt, formula_results_final, model
    )

    # Integration of the three analyses. As before, only the text analysis is
    # unwrapped from its JSON envelope; the formula and table replies are
    # appended as raw reply text.
    integrated_text = (
        _result_as_text(text_analysis_result)
        + formulas_analysis_result
        + table_analysis_result
    )

    final_analysis_system_prompt = f"""
    You are a helpful agent designed to fetch information from a graph database structured around environmental regulations. 
    Here is the question asked: {question}
    Given the following input, which includes the analysis of the text, formulas and relevant tables, provide a comprehensive explanation of the givien text to answer the question. 
    Output the final analysis in a structured JSON object format under the key "result".
    {{
        "result" : "The document mentioned several calucuation method which explains how to derive the emission from the data. The equation describes E = COUNT * EF ....... The extraction explains that COUNT is the number of units and EF is the emission factor."
    }}
    """

    return _run_json_analysis(final_analysis_system_prompt, integrated_text, model)

# Define Question and Testing

In [34]:
# Choose the prompt that is sent through the chatbot below.
#
# Example prompts:
#   - Do I have to count the number of pneumatic devices we have?
#   - What are my pneumatic device emissions? I have 100 high bleed devices.
#   - What is the emission reporting limit?
#   - What is the emission factor for an intermittent bleed device?
#   - We have a CEMS for our acid gas units but no vent meter, what calculation method should we use?
#   - How many calculation methods are available for G&B storage tanks?


example_question = "What are my pneumatic device emissions? I have 100 high bleed devices."

def chat_main(question):
    '''
    Run the chatbot end to end for one question: retrieve the matching chunks,
    have the model analyse them, print the final answer and return it.
    '''

    matched_chunks = query_graph(question)
    raw_answer = define_response_query(matched_chunks, question)

    # The reply is a JSON object; the answer text lives under "result".
    answer = json.loads(raw_answer)['result']
    print("Analyzed Results:", answer)
    return answer

# Run the whole pipeline once on the example question; chat_main also prints the answer.
chat_result = chat_main(example_question)


Found 121 matching chunks

Analyzed Results: The document provides detailed information about pneumatic device emissions, including high-bleed pneumatic devices, and the calculation of CH4 and CO2 volumetric emissions from continuous high bleed, continuous low bleed, and intermittent bleed natural gas pneumatic devices using Equation W–1. It also mentions the emissions from blowdown vent stacks and the calculation of emissions from blowdown valves and isolation valves on compressors. Therefore, based on the information provided, the emissions from 100 high-bleed pneumatic devices can be calculated using Equation W–1 and the relevant calculation methods mentioned in the document. The equation W-1 describes the calculation of CH4 and CO2 volumetric emissions from pneumatic devices, and it involves factors such as the number of units and the emission factor. The emission factor for high continuous bleed pneumatic device vents is 37.3 in the Eastern U.S. and 1.39 in the Western U.S. Theref