In [None]:
import os
import json
import numpy as np
import pandas as pd

from langchain_openai import ChatOpenAI
import getpass
import os

def _set_env(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"{var}: ")

_set_env("OPENAI_API_KEY")

from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

from langchain_core.tools import tool

## Query Decomposition

In [2]:
import datetime
from typing import Literal, Optional, Tuple

from langchain_core.pydantic_v1 import BaseModel, Field


# Schema for one decomposed sub-question. It is bound to the LLM as a tool
# (llm.bind_tools([SubQuery])) and parsed back with PydanticToolsParser, so each
# tool call the model emits becomes one SubQuery instance.
# NOTE(review): the class docstring and the Field description are presumably what
# the model sees as the tool/argument descriptions - edit them as prompt text.
class SubQuery(BaseModel):
    """Search over a GeoSpatial Database and a Regulation Document for geospatial functions."""

    # The natural-language sub-question itself (required field).
    sub_query: str = Field(
        ...,
        description="A query against the geospatial database and regulation document.",
    )

In [3]:
# Few-shot examples for the query-decomposition prompt. Each entry pairs a user
# question with the ordered sub-queries the model is expected to emit as
# SubQuery tool calls. Kept as plain data so adding an example is a one-entry edit.
few_shot_decompositions = [
    (
        "What areas in London have at least 20,000 people per square kilometer and fewer than 5 fast-food restaurants per square kilometer?",
        [
            "What areas in London have at least 20,000 people per square kilometer?",
            "Check if each selected area has fewer than 5 fast-food restaurants per square kilometer",
        ],
    ),
    (
        "Where in London can I open a fast-food restaurant that is located at least 400 meters away from any school and has a population density of at least 20,000 people per square kilometer?",
        [
            "Find areas in London with a population density of at least 20,000 people per square kilometer.",
            "Check if each selected area is at least 400 meters away from any school.",
        ],
    ),
    (
        "Which areas in London have a population density of at least 20,000 people per square kilometer and fewer than 5 fast-food restaurants per square kilometer?",
        [
            "Find areas in London with a population density of at least 20,000 people per square kilometer.",
            # Typo fixed ("Check of" -> "Check if") so the model is not shown a malformed example.
            "Check if each selected area is with fewer than 5 fast-food restaurants",
        ],
    ),
    (
        "Can you suggest locations in London with more than 20,000 people per square kilometer and fewer than 5 fast-food restaurants and is not located within 400 meters of a school?",
        [
            "Find areas in London with a population density greater than 20,000 people per square kilometer.",
            "Check if each selected area is with fewer than 5 fast-food restaurants.",
            "Validate whether each selected area is not within 400 meters of a school.",
        ],
    ),
    (
        "What are some good locations in London to open a fast-food restaurant?",
        [
            "Find areas in London with a population density greater than 20,000 people per square kilometer.",
            "Validate whether each selected area has no more than 5 fast-food restaurants.",
        ],
    ),
    (
        "Which areas in London would be ideal for a fast-food restaurant?",
        [
            "Find areas in London with a population density greater than 20,000 people per square kilometer.",
            "Check whether each selected area has fewer than 5 fast-food restaurants.",
        ],
    ),
    (
        "What locations in London to open a fast-food restaurant that comply with urban regulations?",
        [
            "Find areas in London with a population density greater than 20,000 people per square kilometer.",
            "Check whether each selected area has fewer than 5 fast-food restaurants.",
            "Check if each selected area is not within 400 meters of a school.",
        ],
    ),
    (
        "Which areas in London would be possible to open a fast-food restaurant that adherence to local regulations?",
        [
            "Find areas in London with a population density greater than 20,000 people per square kilometer.",
            "Check whether each selected area has fewer than 5 fast-food restaurants.",
            "Check if each selected area is not within 400 meters of a school.",
        ],
    ),
]

examples = []
for question, sub_query_texts in few_shot_decompositions:
    queries = [SubQuery(sub_query=text) for text in sub_query_texts]
    examples.append({"input": question, "tool_calls": queries})

In [4]:
import uuid
from typing import Dict, List

from langchain_core.messages import (
    AIMessage,
    BaseMessage,
    HumanMessage,
    SystemMessage,
    ToolMessage,
)


def tool_example_to_messages(example: Dict) -> List[BaseMessage]:
    """Expand one few-shot example into the messages of a complete tool-calling turn.

    The returned list holds, in order: the human question (``example["input"]``),
    one AI message carrying an OpenAI-style function call per entry of
    ``example["tool_calls"]``, and one ToolMessage per call. Tool outputs come
    from ``example["tool_outputs"]`` when that key is present and non-empty;
    otherwise the same fixed confirmation text is used for every call.
    """
    # One function-call record per tool-call object, each with a fresh random id.
    openai_tool_calls = [
        {
            "id": str(uuid.uuid4()),
            "type": "function",
            "function": {
                "name": type(call).__name__,
                "arguments": call.json(),
            },
        }
        for call in example["tool_calls"]
    ]

    default_output = "This is an example of a correct usage of this tool. Make sure to continue using the tool this way."
    tool_outputs = example.get("tool_outputs") or [default_output] * len(openai_tool_calls)

    messages: List[BaseMessage] = [
        HumanMessage(content=example["input"]),
        AIMessage(content="", additional_kwargs={"tool_calls": openai_tool_calls}),
    ]
    # zip() pairs outputs with calls positionally and stops at the shorter list.
    messages.extend(
        ToolMessage(content=output, tool_call_id=call_record["id"])
        for output, call_record in zip(tool_outputs, openai_tool_calls)
    )
    return messages


# Flatten every example's message sequence into one list for the prompt's "examples" placeholder.
example_msgs = []
for ex in examples:
    example_msgs.extend(tool_example_to_messages(ex))

In [7]:
from langchain.output_parsers import PydanticToolsParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.prompts import MessagesPlaceholder

# System instructions for the query-decomposition step. A trailing backslash
# inside the triple-quoted string joins that line with the next one.
system = """You are an expert at converting user questions into natural language tasks. \
You have access to a geospatial database and a regulation document for geospatial functions. \

Perform query decomposition. Given a user question, break it down into the most specific sub questions you can \
which will help you answer the original question. \
Each sub question should be a basic task that can be answer by the geospatial database.
"""
# Prompt layout: system text, optional few-shot messages, then the user's question.
# NOTE: the name `prompt` is reassigned by later cells of this notebook.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        MessagesPlaceholder("examples", optional=True),
        ("human", "{question}"),
    ]
)

# The model is offered SubQuery as its only tool.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
llm_with_tools = llm.bind_tools([SubQuery])

# Turns the model's tool calls back into SubQuery instances.
parser = PydanticToolsParser(tools=[SubQuery])

# Full chain: prompt (few-shot examples pre-filled) -> tool-calling model -> parsed SubQuery list.
query_analyzer_with_examples = (
    prompt.partial(examples=example_msgs) | llm_with_tools | parser
)

In [8]:
# Decompose a question that does not mention regulations explicitly.
# NOTE: `tasks` is overwritten by the next invocation cell.
tasks = query_analyzer_with_examples.invoke(
    {
        "question": "Find three good locations in London to open a fast food restaurant"
    }
)
tasks

[SubQuery(sub_query='Identify areas in London with a population density greater than 20,000 people per square kilometer.'),
 SubQuery(sub_query='Determine areas with fewer than 5 fast-food restaurants.'),
 SubQuery(sub_query='Ensure selected areas are not within 400 meters of any school.')]

In [9]:
# Decompose the question that explicitly asks for regulatory compliance; the
# resulting `tasks` list is what the later stages of the notebook consume.
tasks = query_analyzer_with_examples.invoke(
    {
        "question": "Find three good locations in London to open a fast food restaurant and they comply with all the relevant regulations"
    }
)
tasks

[SubQuery(sub_query='Identify areas in London with a population density greater than 20,000 people per square kilometer.'),
 SubQuery(sub_query='Determine if each selected area has fewer than 5 fast-food restaurants.'),
 SubQuery(sub_query='Verify if each selected area is not within 400 meters of a school.')]

In [19]:
# Unwrap the parsed SubQuery objects into plain sub-query strings for the later stages.
# A new list is built and entries that are already strings pass through unchanged,
# so re-running this cell no longer raises AttributeError on `.sub_query`.
tasks = [getattr(task, "sub_query", task) for task in tasks]
tasks

['Identify areas in London with a population density greater than 20,000 people per square kilometer.',
 'Determine if each selected area has fewer than 5 fast-food restaurants.',
 'Verify if each selected area is not within 400 meters of a school.']

## GeoDomain and GeoSpatial Tools

In [10]:
# POP data
import rasterio
from rasterio.transform import rowcol

input_file = '../data/gbr_pd_2020_1km_UNadj.tif'
bbox = (-0.57, 51.25, 0.37, 51.72) # London bbox
with rasterio.open(input_file) as src:
    data = src.read(1)
    transform = src.transform

    # Pixel (row, col) of the bbox corners: (min x, max y) and (max x, min y).
    top_left = tuple(int(v) for v in rowcol(transform, bbox[0], bbox[3]))
    bottom_right = tuple(int(v) for v in rowcol(transform, bbox[2], bbox[1]))
    print(top_left, bottom_right)

    # Keep only cells with a positive value inside the window, then shift their
    # window-relative indices back into full-raster row/col coordinates.
    data_cropped = data[top_left[0]:bottom_right[0], top_left[1]:bottom_right[1]]
    pop_flat_cropped = data_cropped.flatten()
    indices_cropped = np.flatnonzero(pop_flat_cropped > 0)
    rows_cropped, cols_cropped = np.unravel_index(indices_cropped, data_cropped.shape)
    rows_adjusted = rows_cropped + top_left[0]
    cols_adjusted = cols_cropped + top_left[1]
    cells = list(zip(rows_adjusted, cols_adjusted))

    populations = [data[row, col] for row, col in cells]

    def _cell_points(offset):
        """Return the reversed xy() pair, i.e. (y, x), of `offset` for every kept cell."""
        return [
            tuple(reversed(rasterio.transform.xy(transform, row, col, offset=offset)))
            for row, col in cells
        ]

    center_coords = _cell_points('center')
    ul_coords = _cell_points('ul')
    ur_coords = _cell_points('ur')
    ll_coords = _cell_points('ll')
    lr_coords = _cell_points('lr')

print(len(populations))
# save to numpy file for London populations
london_pops_file = '../data/london_pop.npz'
np.savez(
    london_pops_file,
    populations=populations,
    center_coords=center_coords,
    ul_coords=ul_coords,
    ur_coords=ur_coords,
    ll_coords=ll_coords,
    lr_coords=lr_coords,
)

(1097, 969) (1153, 1082)
6325


In [11]:
def pop_top_k(city_name, top_k, desc=True, pop_file='../data/london_pop.npz'):
    """Return the ``top_k`` grid cells of a city ranked by their population value.

    Parameters:
        city_name: City to search. Only names containing "london"
            (case-insensitive) are supported; anything else yields empty results.
        top_k: Maximum number of cells to return. Values <= 0 yield empty results.
        desc: Rank from highest to lowest value when truthy, lowest to highest otherwise.
        pop_file: Path of the ``.npz`` archive providing the ``populations``,
            ``ul_coords`` and ``lr_coords`` arrays. Defaults to the file written
            by the population-extraction cell.

    Returns:
        ``(area_ids, bbox_list)``: the selected cell indices into the archive and,
        for each one, the tuple ``(lr[1], lr[0], ul[1], ul[0])``. With the (lat, lng)
        rows stored by the extraction cell this is ``(lng, lat, lng, lat)`` of the
        lower-right and upper-left corners.
    """
    area_ids = []
    bbox_list = []
    if 'london' in city_name.lower() and top_k > 0:
        # The context manager closes the archive's file handle; only the arrays
        # that are actually needed are read from it.
        with np.load(pop_file) as london_pops:
            pop = london_pops['populations'].tolist()
            ul_coords = london_pops['ul_coords']
            lr_coords = london_pops['lr_coords']

        # sorted() is stable, so ties keep their original relative order.
        ranked = sorted(range(len(pop)), key=pop.__getitem__, reverse=bool(desc))
        for idx in ranked[:top_k]:
            area_ids.append(idx)
            bbox_list.append((
                float(lr_coords[idx][1]), float(lr_coords[idx][0]),
                float(ul_coords[idx][1]), float(ul_coords[idx][0]),
            ))
    print('area_ids_candidate:', area_ids)
    return area_ids, bbox_list

def pop_by_density(city_name, pop_density, pop_file='../data/london_pop.npz'):
    """Return every grid cell of a city whose population value is at least ``pop_density``.

    Parameters:
        city_name: City to search. Only names containing "london"
            (case-insensitive) are supported; anything else yields empty results.
        pop_density: Minimum value (inclusive) a cell must have to be returned.
        pop_file: Path of the ``.npz`` archive providing the ``populations``,
            ``ul_coords`` and ``lr_coords`` arrays. Defaults to the file written
            by the population-extraction cell.

    Returns:
        ``(area_ids, bbox_list)``: matching cell indices ordered from highest to
        lowest value and, for each one, the tuple ``(lr[1], lr[0], ul[1], ul[0])``.
        With the (lat, lng) rows stored by the extraction cell this is
        ``(lng, lat, lng, lat)`` of the lower-right and upper-left corners.
    """
    area_ids = []
    bbox_list = []
    if 'london' in city_name.lower():
        # The context manager closes the archive's file handle; only the arrays
        # that are actually needed are read from it.
        with np.load(pop_file) as london_pops:
            pop = london_pops['populations'].tolist()
            ul_coords = london_pops['ul_coords']
            lr_coords = london_pops['lr_coords']

        ranked = sorted(range(len(pop)), key=pop.__getitem__, reverse=True)
        for idx in ranked:
            # Cells are visited in descending order, so the first one below the
            # threshold ends the search.
            if pop[idx] < pop_density:
                break
            area_ids.append(idx)
            bbox_list.append((
                float(lr_coords[idx][1]), float(lr_coords[idx][0]),
                float(ul_coords[idx][1]), float(ul_coords[idx][0]),
            ))
    print('area_ids_candidate:', area_ids)
    return area_ids, bbox_list

In [12]:
import requests
# https://github.com/geopy/geopy/issues/262
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="UrbanDevQA")
headers = geolocator.headers

def geo(bbox, query, limit=10, timeout=30):
    """Search OpenStreetMap Nominatim for ``query`` restricted to a bounding box.

    Parameters:
        bbox: Sequence whose first four values are sent, in order, as Nominatim's
            ``viewbox`` (documented by Nominatim as ``x1,y1,x2,y2``, i.e.
            longitude/latitude pairs of two opposite corners).
        query: Free-text search string.
        limit: Maximum number of results to request.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The list of result dictionaries returned by Nominatim, or ``[]`` when the
        response is not a result list (for example an ``{"error": ...}`` payload).

    Raises:
        requests.exceptions.RequestException: on network failures or timeouts.
        json.JSONDecodeError: when the response body is not valid JSON.
    """
    # Passing the parameters separately lets requests URL-encode them, so queries
    # containing characters such as '&' or '#' no longer corrupt the request.
    params = {
        "q": query,
        "format": "json",
        "limit": limit,
        "viewbox": ",".join(str(value) for value in bbox[:4]),
        "bounded": 1,
    }
    response = requests.get(
        "https://nominatim.openstreetmap.org/search",
        params=params,
        headers=headers,
        timeout=timeout,  # without a timeout a stalled request would hang forever
    )
    data = json.loads(response.text)
    # Successful searches return a JSON array; anything else is treated as "no results".
    if not isinstance(data, list):
        return []
    return data



In [13]:
from typing import Any, Dict, List, Optional, Tuple, Union, cast

@tool
def search_population_density_topk(city_name: str, top_k: int) -> List[Dict[str, Any]]:
    """Using this tool to find top areas with the highest population density in a city. Each area is 1 km square.

    Parameters:
        city_name: The city name (for example: London)
        top_k: the number of top areas (for example: 5)

    Returns:
        List[Dict[str, Any]]: A list of AreaObject dictionaries containing the area id and
            bounding box of the area with the format (lng0, lat0, lng1, lat1).
            lng0 and lat0 are the coordinates of the lower-right
            and lng1 and lat1 are the coordinates of the upper-left of the area.

    Example
    -------
        >>> search_population_density_topk("London", 5)
    """
    print('search_population_density_topk')
    area_ids, bbox_list = pop_top_k(city_name, top_k)
    # pop_top_k returns two parallel lists; pair them into AreaObject dictionaries.
    return [
        {'area_id': area_id, 'bbox': bbox}
        for area_id, bbox in zip(area_ids, bbox_list)
    ]

@tool
def search_population_density_value(city_name: str, value: float) -> List[Dict[str, Any]]:
    """Using this tool to find areas with the population at least a specific value. Each area is 1 km square.

    Parameters:
        city_name: The city name (for example: London)
        value: the population value to search for (for example: 20000)

    Returns:
        List[Dict[str, Any]]: A list of AreaObject dictionaries containing the area id and
            bounding box of the area with the format (lng0, lat0, lng1, lat1).
            lng0 and lat0 are the coordinates of the lower-right
            and lng1 and lat1 are the coordinates of the upper-left of the area.

    Example
    -------
        >>> search_population_density_value("London", 20000)
    """
    print('search_population_density_value')
    area_ids, bbox_list = pop_by_density(city_name, value)
    # pop_by_density returns two parallel lists; pair them into AreaObject dictionaries.
    return [
        {'area_id': area_id, 'bbox': bbox}
        for area_id, bbox in zip(area_ids, bbox_list)
    ]

@tool
def check_within_area(area_obj: Dict[str, Any], query: str, condition: str) -> bool:
    """check if the number of query objects within the area object satisfies a condition

    Parameters:
        area_obj(Dict[str, Any]): The area object to check within condition
        query: Query string to find objects. For example: use query='fast-food' to find fast-food restaurant within.
        condition: The condition to check, written as a comparison operator (<, <=, >, >=, ==, !=) followed by a number. For example: condition='< 5' to check if the number of query objects within an area is less than 5.

    Returns:
        True or False

    Example
    -------
        >>> check_within_area(area_obj, "fast-food", "< 5")
    """
    print('check_within_area')
    import operator
    import re

    comparators = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '==': operator.eq,
        '=': operator.eq,
        '!=': operator.ne,
    }
    # Accept the operator with or without a space before the number ("< 5", "<5").
    parsed = re.match(r'\s*(<=|>=|==|!=|<|>|=)\s*(\d+)', condition)
    if parsed is None:
        raise ValueError(
            f"Unsupported condition {condition!r}; expected an operator and a number, e.g. '< 5'."
        )
    compare = comparators[parsed.group(1)]
    threshold = int(parsed.group(2))

    bbox = area_obj['bbox']
    # Ask for one result more than the threshold: a count capped at threshold + 1
    # is enough to evaluate every supported comparison against the threshold.
    # NOTE(review): Nominatim caps `limit` server-side (its docs should give the
    # maximum), so very large thresholds cannot be checked reliably this way.
    results = geo(bbox, query, threshold + 1)

    return compare(len(results), threshold)

@tool
def check_near_by_area(area_obj: Dict[str, Any], query: str, condition: str) -> bool:
    """check if query objects do not exist nearby the area object

    Parameters:
        area_obj(Dict[str, Any]): The area object to check nearby condition
        query: Query string to find objects. For example: use query='school' to find schools nearby.
        condition: The condition to check. For example: condition='400m' to check if query objects do not exist nearby an area by 400m.

    Returns:
        True or False

    Example
    -------
        >>> check_near_by_area(area_obj, "school", "400m")
    """
    print('check_near_by_area')
    import re
    import geopy.distance

    # Take the distance from the last " > "-separated token: a number, read as
    # meters unless it is followed by a "k" (km / kilometers).
    distance_token = condition.split(' > ')[-1].strip()
    parsed = re.search(r'(\d+(?:\.\d+)?)\s*(k)?', distance_token, flags=re.IGNORECASE)
    if parsed is None:
        raise ValueError(
            f"Unsupported condition {condition!r}; expected a distance such as '400m'."
        )
    distance = float(parsed.group(1)) * (1000 if parsed.group(2) else 1)

    # The area bbox is (lng0, lat0, lng1, lat1): lower-right corner, then upper-left.
    lng0, lat0, lng1, lat1 = area_obj['bbox'][:4]
    step = geopy.distance.distance(meters=distance)

    # geopy points are (latitude, longitude). Grow the box by `distance` on every
    # side: the lower-right corner moves south and east, the upper-left corner
    # moves north and west.
    lower_right = (lat0, lng0)
    upper_left = (lat1, lng1)
    south = step.destination(lower_right, bearing=180).latitude
    east = step.destination(lower_right, bearing=90).longitude
    north = step.destination(upper_left, bearing=0).latitude
    west = step.destination(upper_left, bearing=270).longitude

    # Same (lng, lat, lng, lat) order as the input bbox.
    new_bbox = (east, south, west, north)
    # Only existence matters, so a single result is enough.
    results = geo(new_bbox, query, limit=1)
    return len(results) == 0  # True when no matching object is nearby
    

# Every tool available to the task translator and to the executor agent.
tools = [search_population_density_topk, search_population_density_value, check_within_area, check_near_by_area]


In [14]:
# Smoke test: call the tool directly on one area (this issues a live Nominatim request).
check_near_by_area.invoke({"area_obj": {'area_id': 2446, 'bbox': [0.04041666553887957,51.54124997447413,0.03208333223888005,51.54958330777413]}, "query":"fast-food", "condition":"400m"})


check_near_by_area


True

## Task Translator

In [15]:
from langchain.tools.render import render_text_description

# One "name - description" paragraph per tool, used inside the system prompts.
# Generator expressions keep the loop variable local: the previous `for tool in tools`
# loop rebound the module-level name `tool`, clobbering the imported @tool decorator
# and breaking any later re-run of the tool-definition cell.
rendered_tools = "".join(f"{t.name} - {t.description}\n\n" for t in tools)
tool_names = ",".join(t.name for t in tools)

In [16]:

from langchain_core.prompts import ChatPromptTemplate

# Tool descriptions are free text, but the system message below is parsed as a
# prompt template: any literal '{' or '}' in them would be read as a template
# variable. Doubling the braces makes them render literally.
escaped_tools = rendered_tools.replace("{", "{{").replace("}", "}}")

system_prompt = f"""You are an assistant that has access to the following set of tools. 

{escaped_tools}

Given the input question, you must return the correct tool to use.
Return your response as a JSON blob with 'name' and 'arguments' keys. Do not explain or give any comments.
"""

#print(system_prompt)

# Task-translator prompt: maps one sub-task (the "input" variable) to a tool call.
# NOTE: this reassigns the `prompt` name used earlier for query decomposition.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("user", "Question: {input}")]
)

In [21]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_openai import ChatOpenAI

# Chat model shared by the task-translator and action-orchestration chains.
model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

from operator import itemgetter
def tool_chain(model_output):
    """Build a runnable that extracts "arguments" from its input and passes them to the tool named by ``model_output["name"]``.

    Raises KeyError when no tool in ``tools`` has that name.
    """
    tools_by_name = dict((candidate.name, candidate) for candidate in tools)
    selected_tool = tools_by_name[model_output["name"]]
    return itemgetter("arguments") | selected_tool

# Translator chain: prompt -> model -> parsed JSON ({'name': ..., 'arguments': ...}).
# NOTE: `chain` is reassigned later by the action-orchestration cell.
chain = prompt | model | JsonOutputParser()

# Accumulates one translated action per sub-task.
action_list = []

In [22]:
# Translate every sub-task into a tool call. The accumulator is reset here so that
# re-running this cell does not append duplicate actions to a list created elsewhere.
action_list = []
for task in tasks:
    print(task)
    action = chain.invoke({"input": task})
    action_list.append(action)
    print(action)

Identify areas in London with a population density greater than 20,000 people per square kilometer.
{'name': 'search_population_density_value', 'arguments': {'city_name': 'London', 'value': 20000}}
Determine if each selected area has fewer than 5 fast-food restaurants.
{'name': 'check_within_area', 'arguments': {'area_obj': 'selected_area_object', 'query': 'fast-food', 'condition': '< 5'}}
Verify if each selected area is not within 400 meters of a school.
{'name': 'check_near_by_area', 'arguments': {'area_obj': '<area_object>', 'query': 'school', 'condition': '400m'}}


## Action Orchestration

In [23]:
# task translator
from langchain_core.prompts import ChatPromptTemplate

action_names = ",".join(a['name'] for a in action_list)

# Both interpolated values come from free text / model output, while the system
# message below is parsed as a prompt template: a literal '{' or '}' would be read
# as a template variable. Doubling the braces makes them render literally.
escaped_tools = rendered_tools.replace("{", "{{").replace("}", "}}")
escaped_action_names = action_names.replace("{", "{{").replace("}", "}}")

system_prompt = f"""You are an assistant that has access to the following tools: 

{escaped_tools}

You are given the list of generated actions from these tools: [{escaped_action_names}]
Given the input question, you must determine the order to run these actions to answer the question.
Return your response as a list of JSON format 'order_id' and 'action_name'. Do not explain or give any comments.
"""

#print(system_prompt)

# NOTE: `prompt` and `chain` are reassigned here. After this cell runs, `chain` is
# the orchestration chain, so the task-translation loop must not be re-run without
# first re-running the cells that build the translator chain.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("user", "Question: {input}")]
)
chain = prompt | model | JsonOutputParser()

In [24]:
# Ask the orchestration chain in which order the translated actions should run.
query = "Find three good locations in London to open a fast food restaurant and they comply with all the relevant regulations"
action_orders = chain.invoke({"input": query})
action_orders

[{'order_id': 1, 'action_name': 'search_population_density_value'},
 {'order_id': 2, 'action_name': 'check_within_area'},
 {'order_id': 3, 'action_name': 'check_near_by_area'}]

In [28]:
import json
# One JSON document per line, for embedding in the executor's system prompt.
action_list_str = "\n".join([json.dumps(action) for action in action_list])
action_list_str

'{"name": "search_population_density_value", "arguments": {"city_name": "London", "value": 20000}}\n{"name": "check_within_area", "arguments": {"area_obj": "selected_area_object", "query": "fast-food", "condition": "< 5"}}\n{"name": "check_near_by_area", "arguments": {"area_obj": "<area_object>", "query": "school", "condition": "400m"}}'

In [29]:
# Same one-JSON-per-line rendering for the execution order.
action_orders_str = "\n".join([json.dumps(order) for order in action_orders])
action_orders_str

'{"order_id": 1, "action_name": "search_population_density_value"}\n{"order_id": 2, "action_name": "check_within_area"}\n{"order_id": 3, "action_name": "check_near_by_area"}'

## Executor and Answer Generation

In [34]:
from langgraph.prebuilt import create_react_agent

# Sub-tasks, one per line, for the executor's system prompt (tasks must be plain strings here).
task_names = "\n".join(tasks)

# Executor instructions: the planned sub-tasks, the translated tool calls and their
# order are all injected so the agent only has to execute the plan.
system_prompt = f"""You already given the following:
Sub-Tasks list:
{task_names}

Actions List:
{action_list_str}

Actions Order:
{action_orders_str}

Please adhere the following instructions:
1. Please only use the provided action name and input arguments to run the tool by provided order.
2. Do not run multiple tools at the same time, only run by the provided order.
3. If the tool result only has true or false, generate your observation using tool description.
For example: check_near_by_area = true means the area is not within a distance with any query object and vice versa.
check_within_area = true means the area has fewer a query object within it and vice versa.
4. Generate the final answer combining all conditions showed by the sub-task list.
"""
# No explicit api_key: the notebook never defines a Python variable named
# OPENAI_API_KEY (it only sets the environment variable via _set_env), so passing
# it raised NameError on a fresh kernel. ChatOpenAI picks the key up from the
# environment, exactly as the other ChatOpenAI(...) calls in this notebook do.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.0)
graph = create_react_agent(llm, tools=tools, state_modifier=system_prompt)

def print_stream(stream):
    """Print the most recent message of every state yielded by ``stream``.

    Tuple messages are printed as-is; any other message is rendered through its
    own ``pretty_print()`` method.
    """
    for state in stream:
        latest = state["messages"][-1]
        if not isinstance(latest, tuple):
            latest.pretty_print()
            continue
        print(latest)

In [35]:
# Run the executor agent on the original question and print each step as it streams
# (stream_mode="values" yields the state after every step; print_stream shows its last message).
query = "Find three good locations in London to open a fast food restaurant and they comply with all the relevant regulations"
inputs = {"messages": [("user", query)]}
print_stream(graph.stream(inputs, stream_mode="values"))



Find three good locations in London to open a fast food restaurant and they comply with all the relevant regulations
Tool Calls:
  search_population_density_value (call_dNH0ihhexLmGbojFO30vvDPc)
 Call ID: call_dNH0ihhexLmGbojFO30vvDPc
  Args:
    city_name: London
    value: 20000
search_population_density_value
area_ids_candidate: [2644, 2643, 2446, 2757, 3215, 2660, 3221, 2559, 2758, 2672, 2673, 2657, 2448, 3207, 2319, 2659, 3320, 2887, 2761, 2774, 2886, 2208, 2558, 2662, 3443, 2320, 2445, 2560, 2661, 2664, 2665, 2561, 2888, 2443, 3117, 2202, 2318, 2775, 2094, 2778, 2646, 2531, 2654, 2431]
Name: search_population_density_value

[{"area_id": 2644, "bbox": [-0.19291666686111952, 51.52458330787413, -0.20125000016112082, 51.53291664117413]}, {"area_id": 2643, "bbox": [-0.20125000016112082, 51.52458330787413, -0.20958333346112035, 51.53291664117413]}, {"area_id": 2446, "bbox": [0.04041666553887957, 51.54124997447413, 0.03208333223888005, 51.54958330777413]}, {"area_id": 2757, "bbox": [-0

Tool Calls:
  check_within_area (call_XO7KGz8Moh5cI16tmhMRUs53)
 Call ID: call_XO7KGz8Moh5cI16tmhMRUs53
  Args:
    area_obj: {'area_id': 3215, 'bbox': [-0.13458333376112108, 51.48291664137413, -0.1429166670611206, 51.49124997467413]}
    query: fast-food
    condition: < 5
check_within_area
Name: check_within_area

true
Tool Calls:
  check_near_by_area (call_hQRljSVctbBy01jl6ljiFWGc)
 Call ID: call_hQRljSVctbBy01jl6ljiFWGc
  Args:
    area_obj: {'area_id': 3215, 'bbox': [-0.13458333376112108, 51.48291664137413, -0.1429166670611206, 51.49124997467413]}
    query: school
    condition: 400m
check_near_by_area
Name: check_near_by_area

false
Tool Calls:
  check_within_area (call_tEEheD1Gq8j1d7mjb4qz292T)
 Call ID: call_tEEheD1Gq8j1d7mjb4qz292T
  Args:
    area_obj: {'area_id': 2660, 'bbox': [-0.05958333406112004, 51.52458330787413, -0.06791666736111956, 51.53291664117413]}
    query: fast-food
    condition: < 5
check_within_area
Name: check_within_area

false
Tool Calls:
  check_within_