In [None]:
#!pip install --quiet langchain-anthropic langchain-neo4j cyVer langchain-google-genai json-repair "numpy<2"

In [None]:
from dotenv import load_dotenv, dotenv_values
import json
from tqdm import tqdm
import pandas as pd
import time
import os

from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI

from utils import (
    _value_sanitize,
    extract_json_from_markdown,
    sampling_query,
    validate_cypher,
    process_database,
    process_all_examples_with_limit,
    convert_datetime
)
from prompts import (
    system_prompt,
    simple_system_prompt,
)

In [None]:
# Read the key/value settings from the local run.env file into a mapping.
config = dotenv_values(dotenv_path="run.env")

In [None]:
# Databases to process, stored in run.env as a JSON list; each entry is
# documented as [Db Name, Login, Pwd, URI].
# Indexing (instead of .get) makes a missing key fail with a KeyError that
# names it, rather than json.loads(None) raising an opaque TypeError.
DATABASES = json.loads(config["DATABASES"])
# LLMs used to generate the dataset, stored as a JSON list of
# [LLM Name, Model, API key] entries; LLM Name can be GOOGLE or CLAUDE.
LLM_CREATES = json.loads(config["LLM_CREATES"])
# LLM used to write the text answers: GOOGLE or CLAUDE.
# These stay optional (.get) because the QA cell handles an unset provider.
LLM_QA = config.get('LLM_QA')
LLM_QA_MODEL = config.get('LLM_QA_MODEL')
LLM_QA_API_KEY = config.get('LLM_QA_API_KEY')

# Generate dataset

In [None]:
# LLM selection: build one chat client per entry of LLM_CREATES.
# Each entry is indexed as [provider, model name, API key].
models = []
for llm_spec in LLM_CREATES:
    provider = llm_spec[0]
    if provider == 'CLAUDE':
        # The key is exported to the environment before the client is built.
        os.environ["ANTHROPIC_API_KEY"] = llm_spec[2]
        models.append(ChatAnthropic(model=llm_spec[1]))
    elif provider == 'GOOGLE':
        os.environ["GOOGLE_API_KEY"] = llm_spec[2]
        models.append(ChatGoogleGenerativeAI(model=llm_spec[1]))
    else:
        # Unrecognised providers are reported and skipped.
        print("Incorrect LLM provided")

In [None]:
simple_batch_count = 1  # Number of iterations for simple queries
multi_batch_count = 1  # Number of iterations for complex queries

# Accumulates the records returned by every process_database call.
output = []

# Each pass pairs a batch count with the system prompt it runs with;
# the simple prompt is processed first, then the complex one.
generation_passes = (
    (simple_batch_count, simple_system_prompt),
    (multi_batch_count, system_prompt),
)

for model in models:
    print(model.model)
    for database in tqdm(DATABASES, desc="Processing databases"):
        for batch_count, prompt in generation_passes:
            database_records = process_database(
                database, model, batch_count, prompt
            )
            output.extend(database_records)

# Generate text answers

In [None]:
# Chat client used to write the text answers; stays None when LLM_QA does
# not name a supported provider.
qa_model = None

if LLM_QA not in ('CLAUDE', 'GOOGLE'):
    print("No LLM provided for generating the text answers")
elif LLM_QA == 'CLAUDE':
    # The key is exported to the environment before the client is built.
    os.environ["ANTHROPIC_API_KEY"] = LLM_QA_API_KEY
    qa_model = ChatAnthropic(model=LLM_QA_MODEL)
else:
    # Only 'GOOGLE' can reach this branch.
    os.environ["GOOGLE_API_KEY"] = LLM_QA_API_KEY
    qa_model = ChatGoogleGenerativeAI(model=LLM_QA_MODEL)

In [None]:
# Keep only the generated records whose "validated" entry is truthy.
validated = [record for record in output if record["validated"]]

In [None]:
# Display the number of records kept in `validated`.
len(validated)

In [None]:
# Generate text-based answers for the validated records using qa_model.
# NOTE(review): the return value is discarded and the following cell reads
# el['answer'], so this presumably adds an 'answer' key to each record in
# place — confirm in utils.process_all_examples_with_limit.
await process_all_examples_with_limit(validated, qa_model)

In [None]:
# Drop every record whose answer contains "UNKNOWN", i.e. the question
# could not be answered.
validated = [
    record for record in validated
    if "UNKNOWN" not in record['answer']
]

# Tabulate the remaining records, report the count and preview the first rows.
df = pd.DataFrame.from_records(validated)
print(f"Total QA pairs: {len(df)}")
df.head(n=5)

In [None]:
# Write the remaining QA records (`validated`) to a JSON file whose name
# carries the current local timestamp.
timestr = time.strftime("%Y%m%d-%H%M%S")
print(timestr)
with open(f"generated_dataset_{timestr}.json", mode="w") as out_file:
    # convert_datetime is json's `default` hook, called for values the
    # encoder cannot serialise on its own.
    json.dump(validated, out_file, indent=2, default=convert_datetime)