In [1]:
# Optional setup: uncomment to install the dependencies into the running kernel's environment.
# %pip install --quiet langchain-anthropic langchain-neo4j cyVer langchain-google-genai json-repair "numpy<2"

In [2]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment (python-dotenv).
# NOTE(review): presumably this supplies the API keys / database credentials that the
# LLM clients and the helpers from utils read later — confirm.
load_dotenv()

True

In [3]:
from tqdm import tqdm
import pandas as pd

from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI

from utils import (
    _value_sanitize,
    extract_json_from_markdown,
    sampling_query,
    validate_cypher,
    process_database,
    process_all_examples_with_limit,
    convert_datetime
)
from prompts import (
    system_prompt,
    simple_system_prompt,
)

# Generate dataset

In [4]:
# Chat models used for generation; the generation cell runs every database once per model.
models = [
    ChatAnthropic(model="claude-opus-4-20250514"),
    ChatGoogleGenerativeAI(model="gemini-2.5-pro"),
]

# Connection URL of the Neo4j Labs demo server, followed by the entries that are
# handed one at a time to process_database (presumably database names on that
# server — confirm against utils).
db_url = "neo4j+s://demo.neo4jlabs.com"
databases = ["companies", "twitch", "network", "northwind", "ClinicalKnowledgeGraph"]


In [5]:
# Iteration counts handed to process_database for each prompt flavour.
simple_batch_count = 1  # iterations for simple queries
multi_batch_count = 3  # iterations for complex queries

# Collects everything returned by process_database across models and databases.
output = []

# Per database, the simple-question pass runs first, then the complex-question pass.
generation_passes = (
    (simple_batch_count, simple_system_prompt),
    (multi_batch_count, system_prompt),
)

for model in models:
    print(model.model)
    for database in tqdm(databases, desc="Processing databases"):
        for batch_count, prompt in generation_passes:
            output.extend(
                process_database(database, db_url, model, batch_count, prompt)
            )

claude-opus-4-20250514


Processing databases:   0%|                                 | 0/5 [00:00<?, ?it/s]
Iterations for companies:   0%|                             | 0/1 [00:00<?, ?it/s][A
Iterations for companies: 100%|████████████████████| 1/1 [02:36<00:00, 156.98s/it][A
                                                                                  [A
Iterations for companies:   0%|                             | 0/3 [00:00<?, ?it/s][A
Iterations for companies:  33%|██████▋             | 1/3 [03:17<06:34, 197.37s/it][A
Iterations for companies:  67%|█████████████▎      | 2/3 [06:18<03:07, 187.79s/it][A
Iterations for companies: 100%|████████████████████| 3/3 [10:52<00:00, 227.13s/it][A
Processing databases:  20%|████▊                   | 1/5 [13:34<54:17, 814.41s/it][A
Iterations for twitch:   0%|                                | 0/1 [00:00<?, ?it/s][A
Iterations for twitch: 100%|███████████████████████| 1/1 [02:26<00:00, 146.25s/it][A
                                                         

models/gemini-2.5-pro


Processing databases:   0%|                                 | 0/5 [00:00<?, ?it/s]
Iterations for companies:   0%|                             | 0/1 [00:00<?, ?it/s][A
Iterations for companies: 100%|████████████████████| 1/1 [02:36<00:00, 156.71s/it][A
                                                                                  [A
Iterations for companies:   0%|                             | 0/3 [00:00<?, ?it/s][A

Iterations for companies:  67%|█████████████▎      | 2/3 [11:47<05:50, 350.45s/it][A
Iterations for companies: 100%|████████████████████| 3/3 [17:12<00:00, 338.91s/it][A
Processing databases:  20%|████▏                | 1/5 [19:54<1:19:38, 1194.59s/it][A
Iterations for twitch:   0%|                                | 0/1 [00:00<?, ?it/s][A
Iterations for twitch: 100%|███████████████████████| 1/1 [02:38<00:00, 158.28s/it][A
                                                                                  [A
Iterations for twitch:   0%|                            

# Generate text answers

In [6]:
# Model passed to process_all_examples_with_limit in the next cell to write the text answers.
# NOTE(review): "-latest" looks like a floating alias, unlike the dated model id used for
# dataset generation — consider pinning a dated version so reruns are reproducible.
qa_model = ChatAnthropic(model='claude-3-5-haiku-latest')

In [7]:
# Generate text-based answers
# Awaited directly at cell level (top-level await is supported in IPython/Jupyter cells).
# The return value is discarded; presumably the helper updates the records in `output`
# in place with an `answer` field (the DataFrame built from `output` in the following
# cell shows that column) — TODO confirm in utils.
await process_all_examples_with_limit(output, qa_model)

Processing examples: 100%|████████████████████| 1982/1982 [04:12<00:00,  7.86it/s]


In [8]:
# Tabulate the generated records and preview the first five rows (head() defaults to 5).
df = pd.DataFrame.from_records(output)
df.head()

Unnamed: 0,question,cypher,query_type,complexity,noise_applied,model,validated,result,answer,noise_type
0,What's Microsoft Corporation's revenue?,MATCH (o:Organization {name: 'Microsoft Corpor...,Direct Property Access,0-hop,False,anthropic-chat,True,[{'company_revenue': 198270000000.0}],Microsoft Corporation's revenue is $198.27 bil...,
1,How many employees does SAP have?,MATCH (o:Organization {name: 'SAP'}) RETURN o....,Direct Property Access,0-hop,False,anthropic-chat,True,[{'employee_count': 111961}],"SAP has 111,961 employees.",
2,What's Mircosoft's motto?,MATCH (o:Organization {name: 'Microsoft Corpor...,Direct Property Access,0-hop,True,anthropic-chat,True,[{'company_motto': 'We’re on a mission to empo...,"Microsoft's motto is: ""We're on a mission to e...",typo
3,Is Cloud Sherpas still active?,MATCH (o:Organization {name: 'Cloud Sherpas'})...,Direct Property Access,0-hop,False,anthropic-chat,True,[{'is_dissolved': True}],"Based on the query result, Cloud Sherpas is no...",
4,Tell me IBM's summary,MATCH (o:Organization {name: 'IBM'}) RETURN o....,Direct Property Access,0-hop,False,anthropic-chat,True,[{'company_summary': 'American multinational t...,IBM is an American multinational technology an...,


In [14]:
import pandas as pd
import json
from datetime import datetime
import neo4j

def convert_datetime(obj):
    """Turn a temporal value into an ISO-8601 string for ``json.dump(default=...)``.

    ``json`` calls this hook only for objects it cannot serialize natively.
    Besides full timestamps, date-only and time-only values are accepted, so a
    record holding e.g. a ``datetime.date`` or a ``neo4j.time.Date`` no longer
    aborts the export.

    NOTE(review): a ``convert_datetime`` is also imported from ``utils`` in the
    imports cell; this definition shadows it once this cell runs — confirm which
    one is meant to be used.

    Args:
        obj: The object ``json`` could not serialize.

    Returns:
        str: ISO-8601 representation of ``obj``.

    Raises:
        TypeError: If ``obj`` is not one of the supported temporal types
            (the contract ``json`` expects from a ``default`` hook).
    """
    # Function-scope import: the notebook only imports `datetime` from the datetime module.
    from datetime import date, time

    # Standard-library temporals. `datetime` and `pd.Timestamp` are subclasses of
    # `date`; `pd.Timestamp` is kept in the tuple to make the intent explicit.
    if isinstance(obj, (pd.Timestamp, date, time)):
        return obj.isoformat()
    # Driver-native temporals returned by Cypher queries; `iso_format()` is the
    # driver's documented ISO-8601 formatter for these classes.
    if isinstance(obj, (neo4j.time.Date, neo4j.time.Time, neo4j.time.DateTime)):
        return obj.iso_format()
    raise TypeError(f"Object of type {type(obj)} is not JSON serializable")

# Persist the generated records (`output`, built by the generation cell and passed to
# the answer-generation cell) as pretty-printed JSON.
# Serialize to a string *before* opening the file: opening with "w" truncates
# immediately, so streaming with json.dump would leave a partial/empty file behind if
# a value turned out not to be serializable.
serialized_dataset = json.dumps(output, indent=2, default=convert_datetime)
with open("generated_dataset.json", "w", encoding="utf-8") as f:
    f.write(serialized_dataset)