In [None]:
#!pip install --quiet langchain-anthropic langchain-neo4j cyVer langchain-google-genai json-repair "numpy<2"

In [1]:
from dotenv import load_dotenv

# Load environment variables from a local .env file, if one is found.
# NOTE(review): presumably this supplies the LLM API keys used below — confirm.
load_dotenv()

True

In [2]:
from tqdm import tqdm
import pandas as pd
import json

from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI

from utils import (
    _value_sanitize,
    extract_json_from_markdown,
    sampling_query,
    validate_cypher,
    process_database,
    process_all_examples_with_limit,
    convert_datetime
)
from prompts import (
    system_prompt,
    simple_system_prompt,
)

# Generate dataset

In [3]:
# LLMs used for generation; every model in this list is run against every database.
# To include Gemini as well, append: ChatGoogleGenerativeAI(model="gemini-2.5-pro")
models = [
    ChatAnthropic(model="claude-opus-4-20250514"),
]

# Neo4j demo server and the databases on it to generate examples for
db_url = "neo4j+s://demo.neo4jlabs.com"
databases = ["companies", "twitch", "network", "northwind", "ClinicalKnowledgeGraph"]


In [4]:
simple_batch_count = 1  # Number of iterations for simple queries
multi_batch_count = 1  # Number of iterations for complex queries

# One (iteration count, system prompt) pair per generation pass.
# Passes run in this order for every database: simple questions first, then complex ones.
generation_passes = (
    (simple_batch_count, simple_system_prompt),
    (multi_batch_count, system_prompt),
)

# Flat list collecting the records returned by every process_database call
output = []

for model in models:
    print(model.model)
    for database in tqdm(databases, desc="Processing databases"):
        for batch_count, prompt in generation_passes:
            output.extend(
                process_database(database, db_url, model, batch_count, prompt)
            )

claude-opus-4-20250514


Processing databases:   0%|                                 | 0/5 [00:00<?, ?it/s]
Iterations for companies:   0%|                             | 0/1 [00:00<?, ?it/s][A
Iterations for companies: 100%|█████████████████████| 1/1 [01:21<00:00, 81.19s/it][A
                                                                                  [A
Iterations for companies:   0%|                             | 0/1 [00:00<?, ?it/s][A
Iterations for companies: 100%|████████████████████| 1/1 [01:53<00:00, 113.25s/it][A
Processing databases:  20%|████▊                   | 1/5 [03:19<13:18, 199.74s/it][A
Iterations for twitch:   0%|                                | 0/1 [00:00<?, ?it/s][A
Iterations for twitch: 100%|████████████████████████| 1/1 [01:28<00:00, 88.42s/it][A
                                                                                  [A
Iterations for twitch:   0%|                                | 0/1 [00:00<?, ?it/s][A
Iterations for twitch: 100%|███████████████████████| 1/1 

# Generate text answers

In [5]:
# Model used to turn query results into text answers
qa_model = ChatAnthropic(model="claude-3-5-haiku-latest")

In [6]:
# Keep only the generated records whose "validated" flag is truthy
validated = list(filter(lambda record: record["validated"], output))

In [7]:
# Number of records that were kept by the "validated" filter
len(validated)

211

In [8]:
# Generate text-based answers for the validated records with qa_model.
# The return value is not captured; the next step reads el['answer'] from these
# same records, so presumably the helper adds an 'answer' key in place — TODO confirm.
await process_all_examples_with_limit(validated, qa_model)

Processing examples: 100%|██████████████████████| 211/211 [00:32<00:00,  6.56it/s]


In [11]:
# If the question cannot be answered (the answer contains "UNKNOWN"), remove the record
validated = [el for el in validated if "UNKNOWN" not in el["answer"]]

# Tabular view of the remaining records; the last expression previews the first rows
df = pd.DataFrame.from_records(validated)
print(f"Total QA pairs: {len(df)}")
df.head(5)

Total QA pairs: 205


Unnamed: 0,question,cypher,query_type,complexity,noise_applied,model,database,validated,result,answer,noise_type
0,What's Microsoft Corporation's revenue?,MATCH (o:Organization {name: 'Microsoft Corpor...,Direct Property Access,0-hop,False,anthropic-chat,companies,True,[{'company_revenue': 198270000000.0}],"According to the data, Microsoft Corporation's...",
1,How many employees does SAP have?,MATCH (o:Organization {name: 'SAP'}) RETURN o....,Direct Property Access,0-hop,False,anthropic-chat,companies,True,[{'employee_count': 111961}],"According to the data, SAP has 111,961 employees.",
2,Tell me Mircosoft Corporation's motto,MATCH (o:Organization {name: 'Microsoft Corpor...,Direct Property Access,0-hop,True,anthropic-chat,companies,True,[{'company_motto': 'We’re on a mission to empo...,"Microsoft Corporation's motto is: ""We're on a ...",typo
3,Is Cloud Sherpas still active?,MATCH (o:Organization {name: 'Cloud Sherpas'})...,Direct Property Access,0-hop,False,anthropic-chat,companies,True,[{'is_dissolved': True}],"No, Cloud Sherpas is not still active. The que...",
4,What's the revenue of New Energy Group?,MATCH (o:Organization {name: 'New Energy Group...,Direct Property Access,0-hop,False,anthropic-chat,companies,True,[{'company_revenue': 120000000.0}],"The revenue of New Energy Group is $120,000,000.",


In [12]:
# Write the filtered QA records to disk as JSON.
# convert_datetime is the fallback json calls for values it cannot serialize natively.
with open("generated_dataset.json", mode="w") as dataset_file:
    json.dump(validated, dataset_file, default=convert_datetime, indent=2)