In [1]:
import os
import dotenv
import json
from openai import AzureOpenAI
from langchain.graphs import Neo4jGraph
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain_openai import AzureOpenAIEmbeddings
from neo4j import GraphDatabase
import time
# Load environment variables
dotenv.load_dotenv("GHD_cred.env", override=True)

# Load Azure OpenAI credentials
resource_name = os.environ.get("AZURE_RESOURCE_NAME")
if not resource_name:
    # Fail fast: otherwise the endpoint below silently becomes "https://None.openai.azure.com"
    raise RuntimeError(
        "AZURE_RESOURCE_NAME is not set; add it to GHD_cred.env or the environment."
    )
chat_deployment_name = os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT")
embedding_deployment_name = os.environ.get("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")
api_key = os.environ.get("AZURE_OPENAI_API_KEY")
api_version = "2023-12-01-preview"
endpoint = f"https://{resource_name}.openai.azure.com"
api_url = f"{endpoint}/openai/deployments/{chat_deployment_name}/chat/completions?api-version={api_version}"

# Load Neo4j credentials.
# Environment variables take precedence so that secrets can live in the .env file;
# the previous literals are kept only as local-development fallbacks.
# TODO: drop the password fallback once NEO4J_PASSWORD is set everywhere.
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "abcdefgh")
url = os.environ.get("NEO4J_URI", "bolt://localhost:7687")

# Creating AzureOpenAI client
client = AzureOpenAI(
    api_key=api_key,
    api_version=api_version,
    azure_endpoint=endpoint,
)

# Creating Neo4jGraph (used for every Cypher query in this notebook)
graph = Neo4jGraph(url=url, username=username, password=password)

# Low-level driver handle.
# NOTE(review): not referenced by any cell visible in this notebook and never closed —
# confirm whether it is still needed.
driver = GraphDatabase.driver(url, auth=(username, password), max_connection_lifetime=200)

# Creating vector indexes
# The four calls differ only in node label and text property, so they are driven
# from one table and share a single embedding client.
embedding_model = AzureOpenAIEmbeddings(
    azure_deployment=embedding_deployment_name,
    openai_api_version=api_version,
    azure_endpoint=endpoint,
)

# (node label, text properties to embed) — same order as the original calls
VECTOR_INDEX_SPECS = [
    ("Part", ["title"]),
    ("SubPart", ["title"]),
    ("Section", ["title"]),
    ("Chunks", ["text"]),
]

# NOTE(review): no index_name is passed, so every call relies on the library's
# default index name — confirm that one shared name across labels is intended.
vector_indexes = {
    label: Neo4jVector.from_existing_graph(
        embedding_model,
        url=url,
        username=username,
        password=password,
        node_label=label,
        text_node_properties=text_properties,
        embedding_node_property="embedding",
    )
    for label, text_properties in VECTOR_INDEX_SPECS
}

# Backward compatible: `vector_index` previously ended up bound to the last index created (Chunks).
vector_index = vector_indexes["Chunks"]

# Entity vocabulary. It is serialised with json.dumps into system_question_prompt,
# so every key and description below is text the chat model actually sees.
# NOTE(review): the key "Subpart" differs in case from the "SubPart" node label used
# when creating the vector index and in the table query — confirm which spelling the graph uses.
entity_types = {
    "Part": "Segment of a subchapter, detailing more specific topics, regulations, agencies and guidelines.",
    "Subpart": "Further division of a part, detailing very specific aspects or regulations and topic of interest.",
    "Section": "The most granular division, often representing specific area in individual regulations or guidelines.",
    "Section_Formula": "Include formulas related to enviornmental regulations mentioned in the corresponding section and includes their explanation in extraction.",
    "Chunks": "Chunks of texts of the corresponding section, provide most detailed information in regards regulation or guidelines."
}

# Relationship vocabulary, also serialised into system_question_prompt.
# NOTE(review): the Cypher in define_response_query traverses HAS_FORMULA and HAS_TABLE,
# neither of which is listed here, while HAS_IMAGE is described as linking formulas —
# confirm against the real graph schema.
relation_types = {
    "HAS_SUBPART": "A part contains one or more subparts.",
    "HAS_SECTION": "A subpart contains one or more sections.",
    "HAS_IMAGE": "A section contains one or more formulas.",
    "HAS_TEXT": "A section contains one or more chunks of texts."
}

# Outgoing relationship type(s) per entity type (a single name or a list of names).
# Not referenced by any other cell visible in this notebook.
entity_relationship_match = {
    "Part": "HAS_SUBPART",
    "Subpart": "HAS_SECTION",
    "Section": ["HAS_IMAGE", "HAS_TEXT"]
}

In [2]:
# System prompt used by define_query to turn a free-text question into a JSON object
# with the keys "subject", "action_requested" and "clarification".
# It is an f-string evaluated once, when this cell runs: entity_types and relation_types
# are embedded via json.dumps, and the doubled braces render as literal braces in the examples.
# NOTE(review): examples 2 and 3 tell the model to use entity-type names as keys, yet their
# expected outputs use subject/action_requested/clarification — confirm which is intended.
system_question_prompt = f"""
You're assisting users in understanding environmental regulations by querying a structured graph database. 

The graph database organizes regulations into the following entity types:
{json.dumps(entity_types, indent=0)}

Each entity is connected through one of the following hierarchical relationships:
{json.dumps(relation_types, indent=0)}

Depending on the user input, determine if it is possible to answer with the graph database.

The graph database can navigate through multiple layers of hierarchy to find specific sections of regulations.

We have 3 user inputs. Let's break them down.

Example user input 1:
"How does EPA define a ‘facility’ for petroleum and natural gas systems?"

Analysis breakdown:
1. The phrase "for petroleum and natural gas systems" specifies the subject matter of the inquiry.
2. The question "How does EPA define a ‘facility’" indicates the desired action related to the subject matter.
3. The term "EPA define" provides additional context or conditions to consider.

Your task is to generate a JSON object structured as follows:
- "subject": Specify the subject matter identified in the user input.
- "action_requested": Describe the action the user wants to perform.
- "clarification": Provide any additional context or conditions mentioned in the user input to narrow down search space and return specific and relevant answer.

For the given example, the expected output would be:
{{
    "subject": "for petroleum and natural gas systems",
    "action_requested": "How does EPA define a ‘facility’",
    "clarification": "EPA define"
}}

Example user input 2:
"What are my pneumatic device emissions? I have 100 high bleed devices."

There are multiple layers to analyze:
1. The mention of "pneumatic device emissions" indicates what subject matter the prompt is asking for.
2. The mention of "What are" indicates the action we want to perform on the subject matter.
3. The mention of "100 high bleed devices" provides additional conditions to the subject matter to consider.

Return a JSON object following these rules:
For each layer of the hierarchy or specific query parameter mentioned, add a key-value pair with the key being a match for one of the entity types provided, and the value being the relevant detail from the user query.

For the example provided above, the expected output would be:
{{
    "subject": "pneumatic device emissions",
    "action_requested": "What are",
    "clarification": "100 high bleed devices"
}}

Example user input 3:
"Which calculation from 98.233(o) should be used for centrifugal compressor venting at onshore petroleum and natural gas production facilities?"

There are multiple layers to analyze:
1. The mention of "centrifugal compressor venting at onshore petroleum and natural gas production facilities" indicates what subject matter the prompt is asking for.
2. The mention of "Which calculation" indicates the action we want to perform on the subject matter.
3. The mention of " from 98.233(o)" provides additional conditions to the subject matter to consider.

Return a JSON object following these rules:
For each layer of the hierarchy or specific query parameter mentioned, add a key-value pair with the key being a match for one of the entity types provided, and the value being the relevant detail from the user query.

For the example provided above, the expected output would be:
{{
    "subject": "centrifugal compressor venting at onshore petroleum and natural gas production facilities",
    "action_requested": "Which calculation",
    "clarification": "from 98.233(o)"
}}

If there are no relevant entities or layers in the user prompt, return an empty JSON object.
"""
def define_query(prompt, model="chat35"):
    """
    Ask the Azure OpenAI chat model to interpret a user question.

    The module-level system_question_prompt is sent as the system message and
    `prompt` as the user message, with JSON output mode and temperature 0.

    Returns the raw content string of the first completion choice.
    """
    conversation = [
        {"role": "system", "content": system_question_prompt},
        {"role": "user", "content": prompt},
    ]
    response = client.chat.completions.create(
        model=model,
        temperature=0,
        response_format={"type": "json_object"},
        messages=conversation,
    )
    first_choice = response.choices[0]
    return first_choice.message.content


In [3]:
def create_embedding(text):
    """
    Embed `text` with the Azure OpenAI embedding deployment.

    Returns the embedding of the first item in the response.
    """
    response = client.embeddings.create(input=text, model=embedding_deployment_name)
    first_item = response.data[0]
    return first_item.embedding

In [4]:
def create_query(text, threshold=0.8):
    """
    Build the Cypher query that finds Chunks nodes similar to the interpreted question.

    Args:
        text: JSON object (as a string) mapping a key such as "subject" to its text.
            For every key the query expects a parameter named "<key>Embedding".
        threshold: a chunk is kept when its cosine similarity to ANY of the
            embeddings is strictly greater than this value.

    Returns:
        The Cypher query string. It returns c.text, ID(c) and one
        "similarity_score_<key>" column per key.

    Raises:
        ValueError: if `text` is not a non-empty JSON object, or if a key is not a
            plain ASCII identifier. Keys come from model output and are spliced into
            the query as variable names, so anything else is rejected instead of
            producing broken (or injected) Cypher.
    """
    query_data = json.loads(text)
    if not isinstance(query_data, dict) or not query_data:
        raise ValueError(f"create_query expects a non-empty JSON object, got: {text!r}")

    keys = list(query_data)
    for key in keys:
        if not (key.isascii() and key.isidentifier()):
            raise ValueError(f"Key {key!r} cannot be used as a Cypher identifier")

    # Build each similarity expression once and reuse it in WHERE and RETURN
    similarity_exprs = {
        key: f"gds.similarity.cosine(c.embedding, {key}Embedding)" for key in keys
    }

    with_clause = ",\n".join(f"${key}Embedding AS {key}Embedding" for key in keys)
    where_clause = " OR ".join(
        f"{expr} > {threshold}" for expr in similarity_exprs.values()
    )
    return_clause = ", ".join(
        f"{expr} AS similarity_score_{key}" for key, expr in similarity_exprs.items()
    )

    return (
        "WITH " + with_clause
        + "\nMATCH (c:Chunks)"
        + "\nWHERE " + where_clause
        + "\nRETURN c.text, ID(c), " + return_clause
    )

def query_graph(prompt, model="chat35"):
    """
    Find the chunks in the graph database that match the user prompt.

    Steps: interpret the prompt with define_query, embed each extracted value,
    run the similarity query from create_query, then sort the rows.

    Args:
        prompt: the user's question.
        model: chat deployment name passed to define_query.

    Returns:
        A list of result rows sorted in descending order of the similarity score
        of the FIRST extracted key. An empty list is returned when the model
        extracted nothing usable, which the system prompt explicitly allows.
    """
    response = define_query(prompt, model)
    parsed = json.loads(response)

    # Drop null/blank values: they cannot be embedded meaningfully
    query_data = {}
    if isinstance(parsed, dict):
        for key, val in parsed.items():
            if val is None or (isinstance(val, str) and not val.strip()):
                continue
            query_data[key] = val

    if not query_data:
        print("Found 0 matching chunks\n")
        return []

    # Build the query from the filtered data so its parameters match embeddingsParams exactly
    query = create_query(json.dumps(query_data))
    embeddingsParams = {
        f"{key}Embedding": create_embedding(val) for key, val in query_data.items()
    }
    result = graph.query(query, params=embeddingsParams)

    # Sort the chunks by the similarity score of the first key, highest first
    # (the column name is computed once rather than per row)
    sort_column = f"similarity_score_{next(iter(query_data))}"
    result = sorted(result, key=lambda row: row[sort_column], reverse=True)

    print(f"Found {len(result)} matching chunks\n")

    return result


In [69]:
def _json_chat_completion(system_prompt, model, user_content=None):
    """
    Send one JSON-mode chat completion (temperature 0) and return the message content.

    A user message is only included when `user_content` is not None.
    """
    messages = [{"role": "system", "content": system_prompt}]
    if user_content is not None:
        messages.append({"role": "user", "content": user_content})
    completion = client.chat.completions.create(
        model=model,
        temperature=0,
        response_format={"type": "json_object"},
        messages=messages,
    )
    return completion.choices[0].message.content


def _fetch_table_context(subpart_id):
    """
    Return str() of a flat [id, content, id, content, ...] list for the tables of one SubPart node.
    """
    table_rows = graph.query(
        "MATCH (s:SubPart)-[:HAS_TABLE]->(t:Table) WHERE ID(s) = $subpart_id RETURN t.id,t.content",
        params={"subpart_id": subpart_id},
    )
    flattened = []
    for row in table_rows:
        flattened.append(row["t.id"])
        flattened.append(row["t.content"])
    return str(flattened)


def _fetch_formula_context(result, max_chunks, max_formulas):
    """
    Return str() of a flat [content, extraction, ...] list for the formulas of the
    sections that own the first `max_chunks` matched chunks ("[]" when there are none).
    """
    formulas = []
    chunk_ids = [row["ID(c)"] for row in result[:max_chunks]]
    if chunk_ids:
        # Ids are passed as query parameters instead of being concatenated into the Cypher text
        section_rows = graph.query(
            "MATCH (c:Chunks) WHERE ID(c) IN $chunk_ids "
            "MATCH (s:Section)-[:HAS_TEXT]->(c) RETURN ID(s)",
            params={"chunk_ids": chunk_ids},
        )
        # De-duplicate while keeping a stable order
        section_ids = list(dict.fromkeys(row["ID(s)"] for row in section_rows))
        if section_ids:
            formula_rows = graph.query(
                "MATCH (s:Section)-[:HAS_FORMULA]->(f:Formula) WHERE ID(s) IN $section_ids "
                "RETURN f.extraction, f.content",
                params={"section_ids": section_ids},
            )
            for row in formula_rows[:max_formulas]:
                formulas.append(row["f.content"])
                formulas.append(row["f.extraction"])
    return str(formulas)


def define_response_query(result, question, model="chat35", max_text_chunks=50,
                          max_formula_chunks=80, max_formulas=80, table_subpart_id=4205):
    """
    Answer `question` from the matched chunks plus related tables and formulas.

    Three independent analyses are run (chunk text, tables, formulas) and a fourth
    chat call merges them into one answer.

    Args:
        result: rows returned by query_graph; each row provides "c.text" and "ID(c)".
            May be empty, in which case the formula lookup is skipped.
        question: the user's question.
        model: chat deployment name used for every chat call.
        max_text_chunks: number of leading chunks concatenated for the text analysis
            (limits prompt size).
        max_formula_chunks: number of leading chunks whose sections are searched for formulas.
        max_formulas: maximum number of formula rows passed to the formula analysis.
        table_subpart_id: internal id of the SubPart node whose tables are analysed
            (previously hard-coded to 4205).

    Returns:
        The content string of the final chat completion (requested as a JSON object
        with the key "result").
    """

    # Concatenate the chunked text
    result_text = ""
    for res in result[:max_text_chunks]:
        result_text += res["c.text"] + " "

    # Perform text analysis.
    # The retrieved text is sent once, as the user message. It used to be embedded four
    # more times inside this system prompt, multiplying token usage.
    text_analysis_system_prompt = f"""
    You are an intelligent agent tasked with analyzing the content provided in the user message, which was extracted from a graph database on environmental regulations.
    A question has been posed "{question}"
    Analyze the content and directly give human-like answer to the question based on the relevant information with the following steps:
    1. Identify any environmental terms or keywords within the content that closely correspond to the question. These terms can guide you towards the relevant information needed for the analysis.
    2. Identify any numbers mentioned in the content. These numbers could be crucial for answering the question. If they are pertinent to the question, ensure to incorporate them into your answer.
    3. Please pay attention to all relevant factors when making your decision. Consider all available options before providing your response. Ensure you analyze all the related information, before making a decision.
    4. If the question involves any calculations, measures, or methods, ensure to address them in your answer.
    5. Additionally, you are required to directly provide a clear and definitive yes or no response based on your analysis to answer the question, if necessary. 

 
    Your response should be structured as a JSON object

    First example question would be:
    "What gases must be reported by oil and natural gas system facilities?"
    
    The answer you are expected to generate should look like:
    {{
        "result": "Summary of Source Types by Industry Segment.  Each facility must report:• Carbon Dioxide (CO2) and methane (CH4) emissions from equipment leaks and vented emissions.  The table below identifies each source type that industry segments are required to report.  For example, natural gas processing facilities must report emissions from seven specific source types, and underground storage must report for five source types.• CO2, CH4, and nitrous oxide (N2O) emissions from gas flares by following the requirements of subpart W.• CO2, CH4, and N2O emissions from stationary and portable fuel combustion sources in the onshore production industry segment following the requirements in subpart W.• CO2, CH4, and N2O emissions from stationary combustion sources in the natural gas distribution industry segment following the requirements in subpart W.• CO2, CH4, and N2O emissions from all other applicable stationary combustion sources following the requirements of 40 CFR 98 subpart C (General Stationary Fuel Combustion Sources)."
    }}
    
    Second example question would be:
    "My question concerns the calculation of standard temperature and pressure.  The rule stipulates what standard temperature and pressure are, but how, for an annual average, is actual temperature and pressure defined?"
    
    The answer you are expected to generate should look like:
    {{
        "result": "Actual temperature and pressure as defined for §98.233 is the “average atmospheric conditions or typical operating conditions.”  Therefore, the average temperature and pressure at a given location based on annual averages can be used for actual temperature and actual pressure."
    }}
    """
    text_analysis_result = _json_chat_completion(
        text_analysis_system_prompt, model, user_content=result_text
    )

    # Perform table analysis
    table_results_final = _fetch_table_context(table_subpart_id)

    table_analysis_system_prompt = f"""
    You are a helpful agent designed to analyze the table information provided.
    Here is the question asked: {question}
    Based on your analysis, directly give a human-like answer to the question based on the relevant information.
    If the question involves any calculations, measures, or methods, ensure to address them in your answer.
    
    Present your analysis in a structured JSON object format under the key "result".
    
    The example question would be:
    "Are the emissions factors listed in Table W-1A for both leaking components and non-leaking components? How do you calculate emissions from leaking components if onshore petroleum and natural gas source are not required to monitor components?"
    
    The answer you are expected to generate should look like:
    {{
        "result" : "Equipment leak emissions in onshore production are to be estimated using methods provided in 98.233(r)(2).  Hence, no leak detection of emissions is required for onshore production.  Table W-1A provides population emission factors, which represent the emissions on an average from the entire population of components – both leaking and non-leaking; please see section 6(d) of the Technical Support Document (http://www.epa.gov/ghgreporting/documents/pdf/2010/Subpart-W_TSD.pdf) for further details on the concept of population emission factors."
    }}
    """
    table_analysis_result = _json_chat_completion(
        table_analysis_system_prompt, model, user_content=table_results_final
    )

    # Getting the formulas mentioned in the sections
    formula_results_final = _fetch_formula_context(result, max_formula_chunks, max_formulas)

    # Perform formula analysis
    formulas_analysis_system_prompt = f"""
    You are a helpful agent designed to analyze the Latex math formula information provided.
    Here is the question asked: {question}
    Based on your analysis, directly give a human-like answer to the question based on the relevant information.
    If the question involves any calculations, measures, or use of math formulas, ensure to address them in your answer.
    
    Present your analysis in a structured JSON object format under the key "result".
    
    The example question would be:
    "What are my pneumatic device emissions? I have 100 high bleed devices."
    
    The answer you are expected to generate should look like:
    {{
        "result" : "To calculate CH4 and CO2 volumetric emissions from natural gas driven pneumatic pump venting , we need to use Equation W-2 of §98.233 where Es,i = Annual total volumetric GHG emissions at standard conditions in standard cubic feet per year from all natural gas driven pneumatic pump venting, for GHGi. Count = Total number of natural gas driven pneumatic pumps. EF = Population emissions factors for natural gas driven pneumatic pumps (in standard cubic feet per hour per pump) listed in Table W-1A of this subpart for onshore petroleum and natural gas production and onshore petroleum and natural gas gathering and boosting facilities. GHGi = Concentration of GHGi, CH4, or CO2, in produced natural gas as defined in paragraph (u)(2)(i) of this section. T = Average estimated number of hours in the operating year the pumps were operational using engineering estimates based on best available data. Default is 8,760 hours.The the following assumptions will be made to complete the calculation: 1) T will be the default value 86 hours. 2) The concentration of GHGi is 95% for CH4 and 1% for CO2. EF for High Continuous Bleed Pneumatic Device Vents is 37.3 scf/hour/component. Your pneumatic device emissions would be 31,041,060 scf CH4 which is calculated as 100 * 37.3 scf / hr / device * 0.95 * 8760 hours and 326,748 scf CO2 which is calculated as 100 * 37.3 scf / hr / device * 0.01 * 8760 hours."
    }}
    """
    formulas_analysis_result = _json_chat_completion(
        formulas_analysis_system_prompt, model, user_content=formula_results_final
    )

    # Perform final analysis to a final cohesive answer
    final_analysis_system_prompt = f"""
    You are an intelligent agent tasked with integrating the answers from text analysis, table analysis and formula analysis to generate a final cohesive answer.
    The question posed is: {question}
    Below are the individual analyses:

    Text Analysis Result:
    {text_analysis_result}

    Table Analysis Result:
    {table_analysis_result}
    
    Formula Analysis Result:
    {formulas_analysis_result}

    Please generate a final cohesive answer integrating the information from all the analyses performed.
    Ensure that your response is structured as a JSON object under the key "result".
    
    """
    # The final call carries everything in the system prompt; no user message is sent
    final_analysis_result = _json_chat_completion(final_analysis_system_prompt, model)

    return final_analysis_result

In [70]:
example_question = "What GHG emissions at oil and natural gas system facilities are covered under the rule?"
def chat_main(question):

    '''
    Run the chatbot end to end for one question.

    Retrieves the matching chunks with query_graph, builds the answer with
    define_response_query, prints it and returns it, so that callers which
    assign the result (as below) do not end up with None.
    '''

    result = query_graph(question)
    inter_result = define_response_query(result, question)
    print(inter_result)
    return inter_result

chat_result = chat_main(example_question)

Found 2207 matching chunks

{
    "result": "The GHG emissions at oil and natural gas system facilities covered under the rule include CO2, CH4, and N2O emissions from various source types such as natural gas pneumatic device venting, natural gas compressors, electrical generators, steam boilers, process heaters, onshore natural gas processing, gas flares, stationary and portable fuel combustion sources, gas distribution industry segment, reciprocating compressor venting, atmospheric pressure fixed roof storage tanks, residue gas compression equipment, and more. The emissions data for petroleum and natural gas sources must be quality assured as specified. Additionally, the facilities must report emissions from the natural gas distribution industry segment, uncontrolled GHG emissions, and various other emission sources as per the regulations. The specific details of GHG emissions are provided in tables W-1A, W-1E, W-2, W-3A, W-4A, W-5A, W-6A, and W-7, which specify emission factors for 

In [71]:
example_question = "Does the current GHGRP data include all U.S. emissions from petroleum and natural gas systems?"

# chat_main is already defined in the first example cell; reuse it instead of re-defining it here
chat_result = chat_main(example_question)

Found 1456 matching chunks

{
    "result": "The current GHGRP data includes the CO2 emissions in metric tons resulting from the complete combustion or oxidation of each exported petroleum product and natural gas liquid, as reported in paragraph (c)(2) of the section. It also covers emissions from fuel combusted in stationary or portable equipment at onshore petroleum and natural gas production facilities, as well as offshore petroleum and natural gas production facilities. Additionally, the data includes emissions from natural gas distribution and transmission pipeline industry segments, along with imported or exported petroleum products and natural gas liquids. The provided tables include default emission factors for various components and equipment in onshore petroleum and natural gas production, gathering, boosting, transmission, storage, LNG facilities, and natural gas distribution, covering both leaker emission factors and population emission factors for different services and eq

In [72]:
example_question = "In 98.233(j)(1) is the average thoroughput of oil an annual or daily average?"

# chat_main is already defined in the first example cell; reuse it instead of re-defining it here
chat_result = chat_main(example_question)

Found 37 matching chunks

{
    "result": "In 98.233(j)(1), the average throughput of oil is an annual average. This conclusion is supported by the text analysis, which states that the total annual oil/condensate throughput is reported on an annual basis, and the table analysis, which infers that the emission factors provided in the tables are specified in terms of hourly rates, not daily rates. Therefore, the average throughput is calculated over the entire year and reported as the total annual volume in barrels."
}


In [73]:
example_question = "How do I determine the gas to oil ratio (GOR) for my oil well operations to decide whether I need to report emissions from completions and workovers of oil wells with hydraulic fracturing?"

# chat_main is already defined in the first example cell; reuse it instead of re-defining it here
chat_result = chat_main(example_question)

Found 267 matching chunks

{
    "result": "To determine the gas to oil ratio (GOR) for oil well operations and decide whether emissions from completions and workovers of oil wells with hydraulic fracturing need to be reported, you should calculate the GOR for each well using the formula: GOR = Gas Production Rate (scf/day) / Oil Production Rate (bbl/day). If the calculated GOR is equal to or greater than 300 scf/STB (standard cubic feet per stock tank barrel), then emissions from completions and workovers of oil wells with hydraulic fracturing need to be reported. Additionally, you can use the default whole gas emission factors provided in Table W-1A to Subpart W of Part 98 to calculate the emissions from various components and estimate the total hydrocarbon emissions. It's important to ensure that you are using the appropriate emission factors and component counts based on the specific equipment and service type in your oil well operations. This comprehensive approach will help in de

In [74]:
example_question = "What are my pneumatic device emissions? I have 100 high bleed devices."

# chat_main is already defined in the first example cell; reuse it instead of re-defining it here
chat_result = chat_main(example_question)

Found 121 matching chunks

{
    "result": "Based on the information provided, if the facility contains any continuous high bleed natural gas pneumatic devices, it must report CO2, CH4, and N2O emissions from natural gas pneumatic device venting. The facility must also calculate CH4 and CO2 volumetric emissions from natural gas driven pneumatic pump venting, acid gas removal vents, dehydrator vents, blowdown vent stacks, and storage tank vented emissions. Additionally, the facility must report CH4 emissions from equipment leaks, storage tanks, loading operations, delayed coking units, and uncontrolled blowdown systems under this subpart. The combined CO2 and CH4 emissions for the natural gas pneumatic devices must be calculated using Equation W–1 of this subpart and § 98.233(a)(4), and reported in paragraph (b)(1)(i) of this section. The high continuous bleed pneumatic device vents emission factor for onshore petroleum and natural gas production and gathering in the Eastern U.S. is 37.

In [75]:
example_question = "Subpart W Table W-4 has four sections in the final Federal Register publication.  The first and third sections are both labeled 'Leaker Emission Factors Storage Station, Gas Service' and both tables have an entry for 'Open-ended Line'."

# chat_main is already defined in the first example cell; reuse it instead of re-defining it here
chat_result = chat_main(example_question)

Found 492 matching chunks

{
    "result": "In the final Federal Register publication, both the first and third sections of Subpart W Table W-4 have entries for 'Open-ended Line' under the label 'Leaker Emission Factors Storage Station, Gas Service'. The emission factors for the 'Open-ended Line' in both sections are 0.031 for the Eastern U.S. and 0.031 for the Western U.S. Therefore, 'Open-ended Line' must be reported for both sections in the final Federal Register publication."
}
