In [5]:
from typing import Any, Dict, List, Optional, Tuple, Union
import os
import re
import inspect

from arango import ArangoClient, AQLQueryExecuteError
from langchain.graphs import ArangoGraph
from langchain_openai import ChatOpenAI  # Updated import
from pydantic import ValidationError  # Import ValidationError
from langchain.chains import ArangoGraphQAChain

from api_key import openai_api_key, auth, url

os.environ["OPENAI_API_KEY"] = openai_api_key

# Print the signature of the ChatOpenAI class to understand expected parameters
print(inspect.signature(ChatOpenAI))

# Example initialization with detailed error output
try:
    llm = ChatOpenAI(temperature=0, model='gpt-4o')
    print("Initialization successful!")
except TypeError as e:
    print(f"Initialization failed: {e}")
except ValidationError as e:
    print(f"Validation error: {e}")

# Initialize the ArangoDB client and connect to the database
client = ArangoClient(hosts='http://127.0.0.1:8529')
db = client.db('spoke23_human', username='root', password='ph')

# Fetch the existing graph from the database
graph = ArangoGraph(db)

_VersionInfo = Tuple[int, int, int, Union[str, None], int]

class _Feature:
    def __init__(self, optionalRelease: _VersionInfo, mandatoryRelease: Optional[_VersionInfo], compiler_flag: int) -> None:
        self.optionalRelease = optionalRelease
        self.mandatoryRelease = mandatoryRelease
        self.compiler_flag = compiler_flag

    def getOptionalRelease(self) -> _VersionInfo:
        return self.optionalRelease

    def getMandatoryRelease(self) -> Optional[_VersionInfo]:
        return self.mandatoryRelease

absolute_import: _Feature
division: _Feature
generators: _Feature
nested_scopes: _Feature
print_function: _Feature
unicode_literals: _Feature
with_statement: _Feature
barry_as_FLUFL: _Feature
generator_stop: _Feature
annotations: _Feature

all_feature_names: List[str]  # undocumented

__all__ = [
    "all_feature_names",
    "absolute_import",
    "division",
    "generators",
    "nested_scopes",
    "print_function",
    "unicode_literals",
    "with_statement",
    "barry_as_FLUFL",
    "generator_stop",
    "annotations",
]

def execute_aql(qa_chain, query):
    """
    Execute the query using LangChain QA Chain.

    Parameters:
    qa_chain (ArangoGraphQAChain): An instance of ArangoGraphQAChain.
    query (str): The query string to process.

    Returns:
    tuple: A tuple containing the response and a success flag.
    """
    raw_response = qa_chain.run({qa_chain.input_key: query})

    # Ensure the response is a dictionary
    response = raw_response if isinstance(raw_response, dict) else {"result": raw_response}

    # Ensure the "aql_result" key is in the response
    response["aql_result"] = response.get("aql_result", [])

    # Check if the AQL result in the response is empty or not present
    if not response["aql_result"] or response["aql_result"] == "[]":
        return response, False
    else:
        return response, True

def execute_query_with_retries(query, max_attempts = 3):
    """
    Executes an AQL query with retries.

    Parameters:
    query (str): The query to be executed.
    max_attempts (int): Maximum number of attempts to execute the query.
    """
    attempt = 1
    success = False

    failure_message = ("The prior AQL query failed to return results. "
                       "Please think this through step by step and refine your AQL statement. "
                       "The original question is as follows:")

    while attempt <= max_attempts and not success:
        # Replace the below line with actual AQL execution logic
        response, success = execute_aql(qa_chain, query)  # Define execute_aql function or replace with actual call

        if success:
            print("AQL Query:", response["aql_query"])
            print("AQL Result:", response["aql_result"])
            print("LLM Response:", response["result"])
        else:
            if attempt < max_attempts:
                if attempt == 1:
                    query = f"{failure_message} {query}"
            else:
                print("No result found after", max_attempts, "tries.")
                print("AQL Query:", response["aql_query"])
                print("LLM Response:", response["result"])

        attempt += 1

# Instantiate the LangChain QA Chain with the model and graph
qa_chain = ArangoGraphQAChain.from_llm(llm, graph=graph, verbose=True, return_aql_query=True, return_aql_result=True)

graph_info = """
                ### Contextual Intro
                ArangoDB graph DB represents a biomedical entity network, structured with nodes & edges, each carrying biomedical data types. Nodes = entities like proteins, drugs, diseases, genes. Edges = relationships/interactions. Aim: facilitate complex queries for insights into drug discovery, disease understanding, bio research.

                ### Node Struct
                - **Sample Node**: `node_sample` JSON shows a protein node. Elements:
                - IDs: `_key`, `_id`, `_rev`.
                - `type`: "Protein".
                - `labels`: ["Protein"].
                - `properties`: Dict of relevant properties, e.g., `identifier` ("A0A1B0GTW7"), `gene`, `description`, `org_ncbi_id`, `name`, etc.

                ### Edge Struct
                - **Sample Edge**: `edge_sample` JSON. Elements:
                - IDs: `_key`, `_id`, `_rev`.
                - Connects: `_from`, `_to`.
                - `label`: Type of relationship, e.g., "INCLUDES_PCiC".
                - `properties`: Edge attributes, like `license`, `source`, `vestige`, `forward_degrees`, etc.
                - Nodes: `start`, `end` with their properties.

                ### Edge Labels
                - Variety of labels for relationship types, e.g., `ADVRESPONSE_TO_mGarC`, `ASSOCIATES_DaG`, etc.
                - Each label, like `INCLUDES_PCiC`, signifies a specific interaction or association.
                - Crucial for query construction; guide graph traversal linking entities.

                ### Aim
                By understanding node/edge structure & labels, construct effective AQL queries for exploring bio networks, uncovering insights in drug-target interactions, gene-disease associations, etc.

            """

available_edge_labels = """ADVRESPONSE_TO_mGarC
                            ASSOCIATES_DaG
                            ASSOCIATES_GaS
                            BINDS_CbP
                            BINDS_CbPD
                            CATALYZES_ECcR
                            CAUSES_CcSE
                            CAUSES_OcD
                            CLEAVESTO_PctP
                            CONSUMES_RcC
                            CONTAINS_CcG
                            CONTAINS_FcC
                            CONTRAINDICATES_CcD
                            DECREASEDIN_PdD
                            DOWNREGULATES_AdG
                            DOWNREGULATES_CdG
                            DOWNREGULATES_GPdG
                            DOWNREGULATES_KGdG
                            DOWNREGULATES_OGdG
                            ENCODES_GeM
                            ENCODES_GeP
                            EXPRESSEDIN_GeiCT
                            EXPRESSEDIN_GeiD
                            EXPRESSEDIN_PeCT
                            EXPRESSES_AeG
                            HAS_PhEC
                            INCLUDES_OiPW
                            INCLUDES_PCiC
                            INCREASEDIN_PiD
                            INTERACTS_PDiPD
                            INTERACTS_PiC
                            INTERACTS_PiP
                            ISA_AiA
                            ISA_CTiCT
                            ISA_DiD
                            ISA_ECiEC
                            ISA_FiF
                            ISA_OiO
                            ISA_PWiPW
                            LOCALIZES_DlA
                            MARKER_NEG_GmnD
                            MARKER_POS_GmpD
                            MEMBEROF_PDmPF
                            PARTICIPATES_CpR
                            PARTICIPATES_GpBP
                            PARTICIPATES_GpCC
                            PARTICIPATES_GpMF
                            PARTICIPATES_GpPW
                            PARTICIPATES_GpR
                            PARTICIPATES_PpR
                            PARTOF_ApA
                            PARTOF_CTpA
                            PARTOF_PDpP
                            PARTOF_PpC
                            PARTOF_RpPW
                            PRESENTS_DpS
                            PRODUCES_RpC
                            REDUCES_SEN_mGrsC
                            RESEMBLES_DrD
                            RESISTANT_TO_mGrC
                            RESPONSE_TO_mGrC
                            TARGETS_MtG
                            TRANSPORTS_PtC
                            TREATS_CtD
                            UPREGULATES_AuG
                            UPREGULATES_CuG
                            UPREGULATES_GPuG
                            UPREGULATES_KGuG
                            UPREGULATES_OGuG
                            """

few_shot = """<Example Question 1>Question 1: What are the known targets of the drug Metformin, and what diseases are these targets most commonly associated with? </Example Question 1>
              <Example Answer 1>AQL Statement 1: WITH Nodes, Edges
                                FOR compound IN Nodes
                                    FILTER 'Compound' IN compound.labels
                                    AND (
                                        compound.properties.identifier LIKE '%Metformin%'
                                        OR compound.properties.name LIKE '%Metformin%'
                                        OR compound.properties.synonyms LIKE '%Metformin%'
                                    )
                                    FOR edge IN Edges
                                        FILTER edge._from == compound._id
                                        FOR relatedNode IN Nodes
                                            FILTER relatedNode._id == edge._to
                                            RETURN {
                                                metformin: {
                                                    identifier: compound.properties.identifier,
                                                    name: compound.properties.name,
                                                    chembl_id: compound.properties.chembl_id
                                                },
                                                related: {
                                                    identifier: relatedNode.properties.identifier,
                                                    name: relatedNode.properties.name,
                                                    chembl_id: relatedNode.properties.chembl_id,
                                                    // Include any other fields you need from relatedNode
                                                },
                                                edgeLabel: edge.label
                                            }</Example Answer 1>
                
                <Example Question 2>Question 2: Which genes are most strongly associated with the development of Type 2 Diabetes, and what pathways do they influence?</Example Question 2>
                <Example Answer 2>AQL Statement 2: WITH Nodes, Edges
                                LET type2DiabetesGenes = (
                                    FOR disease IN Nodes
                                        FILTER 'Disease' IN disease.labels
                                        AND (
                                            (CONTAINS(LOWER(disease.properties.name), 'type 2') AND CONTAINS(LOWER(disease.properties.name), 'diabetes'))
                                            OR 
                                            (CONTAINS(LOWER(disease.properties.synonyms), 'type 2') AND CONTAINS(LOWER(disease.properties.synonyms), 'diabetes'))
                                        )
                                        FOR edge IN Edges
                                            FILTER edge._from == disease._id
                                            AND edge.label == 'ASSOCIATES_DaG'
                                            FOR geneNode IN Nodes
                                                FILTER geneNode._id == edge._to
                                                AND 'Gene' IN geneNode.labels
                                                COLLECT geneId = geneNode._id INTO genes
                                                RETURN geneId
                                )
                                FOR geneId IN type2DiabetesGenes
                                    FOR pathwayEdge IN Edges
                                        FILTER pathwayEdge._from == geneId
                                        AND pathwayEdge.label == 'PARTICIPATES_GpPW' // Assuming this label connects genes to pathways
                                        FOR pathwayNode IN Nodes
                                            FILTER pathwayNode._id == pathwayEdge._to
                                            AND 'Pathway' IN pathwayNode.labels
                                            RETURN {
                                                geneId: geneId,
                                                pathway: {
                                                    identifier: pathwayNode.properties.identifier,
                                                    name: pathwayNode.properties.name
                                                    // Add other properties you need
                                                }
                                            }</Example Answer 2>
"""

# Base Prompt
base_prompt = f"""
    <System Instructions>Answer the above question using the following data model and AQL query template.</System Instructions>
    
    <Graph Description>{graph_info}</Graph Description>

    <Edge Label Description>This is a list of the available edge labels in the graph. You can use these to filter edges in your AQL query.</Edge Label Description>
    <Available Edge Labels>{available_edge_labels}</Available Edge Labels>
    
    <Example Few-Shot Description>These questions and AQL queries demonstrate how to construct working AQL queries based on natural language questions using the provided node, edge, and edge label information. To adapt this query for different scenarios, modify the entity types, filter conditions, and return statements based on your specific data and question.</Example Few-Shot Description>
    <Example Few-Shot>{few_shot}</Example Few-Shot>
"""


(*, name: Optional[str] = None, cache: ForwardRef('Union[BaseCache, bool, None]') = None, verbose: bool = None, callbacks: ForwardRef('Callbacks') = None, tags: Optional[List[str]] = None, metadata: Optional[Dict[str, Any]] = None, custom_get_token_ids: Optional[Callable[[str], List[int]]] = None, callback_manager: Optional[langchain_core.callbacks.base.BaseCallbackManager] = None, client: Any = None, async_client: Any = None, model: str = 'gpt-3.5-turbo', temperature: float = 0.7, model_kwargs: Dict[str, Any] = None, api_key: Optional[pydantic.v1.types.SecretStr] = None, base_url: Optional[str] = None, organization: Optional[str] = None, openai_proxy: Optional[str] = None, timeout: Union[float, Tuple[float, float], Any, NoneType] = None, max_retries: int = 2, streaming: bool = False, n: int = 1, max_tokens: Optional[int] = None, tiktoken_model_name: Optional[str] = None, default_headers: Optional[Mapping[str, str]] = None, default_query: Optional[Mapping[str, object]] = None, http_cli

In [6]:
question = "Which genes are most strongly associated with the development of Type 2 Diabetes, and what pathways do they influence?"
execute_query_with_retries(question + base_prompt, 3)

  warn_deprecated(




[1m> Entering new ArangoGraphQAChain chain...[0m
AQL Query (1):[32;1m[1;3m
WITH Nodes, Edges
LET type2DiabetesGenes = (
    FOR disease IN Nodes
        FILTER 'Disease' IN disease.labels
        AND (
            (CONTAINS(LOWER(disease.properties.name), 'type 2') AND CONTAINS(LOWER(disease.properties.name), 'diabetes'))
            OR 
            (CONTAINS(LOWER(disease.properties.synonyms), 'type 2') AND CONTAINS(LOWER(disease.properties.synonyms), 'diabetes'))
        )
        FOR edge IN Edges
            FILTER edge._from == disease._id
            AND edge.label == 'ASSOCIATES_DaG'
            FOR geneNode IN Nodes
                FILTER geneNode._id == edge._to
                AND 'Gene' IN geneNode.labels
                COLLECT geneId = geneNode._id INTO genes
                RETURN geneId
)
FOR geneId IN type2DiabetesGenes
    FOR pathwayEdge IN Edges
        FILTER pathwayEdge._from == geneId
        AND pathwayEdge.label == 'PARTICIPATES_GpPW'
        FOR pathwayNo

KeyError: 'aql_query'