# DuckDB Spatial



## このノートブックの流れ

- 生成 AI に、自然言語から SQL を出力させる
- 出力された SQL を DuckDB で実行する
- 実行結果を GeoJSON に変換する
- GeoJSON を地図上に表示する

## 前準備

### 環境変数の読み込み

In [13]:
# Load environment variables for this notebook session.
# NOTE(review): presumably this supplies the API key for the Gemini client
# created in a later cell — confirm against the local .env file.
from dotenv import load_dotenv
load_dotenv()

True

### 出力を大きく表示する関数の用意

In [1]:
from IPython.display import Markdown, display
def print_large(text):
    """Show ``text`` in the notebook output as a level-3 Markdown heading.

    Args:
        text: Value interpolated after the ``### `` heading marker.
    """
    heading = Markdown(f"### {text}")
    display(heading)

## 対象となる自然言語

In [2]:
# Natural-language question that is later passed to the LLM chain as "input".
input_text = "世界で一番人口が多い国は？"
# Echo the question as a Markdown heading so it stands out in the notebook.
print_large(input_text)

### 世界で一番人口が多い国は？

## DuckDB のセットアップとデータの読み込み

In [None]:
# Load https://github.com/nvkelso/natural-earth-vector/blob/master/geojson/ne_110m_admin_0_countries.geojson into DuckDB

import duckdb
import requests
import json

# URL of the GeoJSON file
geojson_url = "https://github.com/nvkelso/natural-earth-vector/raw/master/geojson/ne_110m_admin_0_countries.geojson"

try:
    # Fetch the GeoJSON from the URL. The timeout keeps the cell from hanging
    # forever when the server does not answer.
    response = requests.get(geojson_url, timeout=30)
    response.raise_for_status()  # turn HTTP error statuses into exceptions

    # Parse the body as JSON; this fails early if the payload is not JSON
    geojson_data = response.json()

    # Create the DuckDB connection and install the required extensions
    conn = duckdb.connect()
    conn.execute("""
    INSTALL httpfs;
    INSTALL json;
    INSTALL spatial;
    """)
    # Load the extensions and materialize the remote file as the `countries` table
    conn.execute(f"""
    LOAD httpfs;
    LOAD json;
    LOAD spatial;
    CREATE TABLE countries AS SELECT * FROM ST_Read('{geojson_url}')
    """)

    # NOTE: do NOT call conn.close() here — closing the connection discards the
    # database contents, and later cells reuse `conn`.
except json.JSONDecodeError as e:
    # Must come before the RequestException handler: recent versions of
    # requests raise a JSON decode error that is also a RequestException, so
    # the opposite order would report parse failures as download failures.
    print(f"Error decoding JSON: {e}")
except requests.exceptions.RequestException as e:
    print(f"Error downloading GeoJSON: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

## データベースのテーブルスキーマを文字列にする

In [None]:
# Build a plain-text description of every table and its columns.
# NOTE: `conn` must be the connection that loaded the GeoJSON; opening a new
# connection here would not see the `countries` table.
summary_of_tables = ""
try:
    # List the tables with SHOW; index 2 of each row is used as the table name
    table_names = [entry[2] for entry in conn.execute("SHOW").fetchall()]

    for table_name in table_names:
        summary_of_tables += f"Table: {table_name}\n"
        # DESCRIBE TABLE rows: index 0 is the column name, index 1 its type
        columns = conn.execute(f"DESCRIBE TABLE {table_name}").fetchall()
        summary_of_tables += "".join(
            f"  Field: {column[0]}, {column[1]}\n" for column in columns
        )
    print(summary_of_tables)
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Table: countries
  Field: featurecla, VARCHAR
  Field: scalerank, INTEGER
  Field: LABELRANK, INTEGER
  Field: SOVEREIGNT, VARCHAR
  Field: SOV_A3, VARCHAR
  Field: ADM0_DIF, INTEGER
  Field: LEVEL, INTEGER
  Field: TYPE, VARCHAR
  Field: TLC, VARCHAR
  Field: ADMIN, VARCHAR
  Field: ADM0_A3, VARCHAR
  Field: GEOU_DIF, INTEGER
  Field: GEOUNIT, VARCHAR
  Field: GU_A3, VARCHAR
  Field: SU_DIF, INTEGER
  Field: SUBUNIT, VARCHAR
  Field: SU_A3, VARCHAR
  Field: BRK_DIFF, INTEGER
  Field: NAME, VARCHAR
  Field: NAME_LONG, VARCHAR
  Field: BRK_A3, VARCHAR
  Field: BRK_NAME, VARCHAR
  Field: BRK_GROUP, VARCHAR
  Field: ABBREV, VARCHAR
  Field: POSTAL, VARCHAR
  Field: FORMAL_EN, VARCHAR
  Field: FORMAL_FR, VARCHAR
  Field: NAME_CIAWF, VARCHAR
  Field: NOTE_ADM0, VARCHAR
  Field: NOTE_BRK, VARCHAR
  Field: NAME_SORT, VARCHAR
  Field: NAME_ALT, VARCHAR
  Field: MAPCOLOR7, INTEGER
  Field: MAPCOLOR8, INTEGER
  Field: MAPCOLOR9, INTEGER
  Field: MAPCOLOR13, INTEGER
  Field: POP_EST, DOUBLE
  Fie

## 生成 AI にスキーマと自然言語から SQL を出力させる

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

# Prompt text; {table_schema} and {input} are filled in when the chain runs
template = """You are an expert of PostgreSQL and PostGIS.
You output the best PostgreSQL query based on given table schema and input text.

You will always reply according to the following rules:
- Output valid PostgreSQL query.
- The query MUST be return name, value and geom columns. Use AS to rename columns.
- The query MUST use ST_AsGeoJSON function to output geom column.
- The query MUST be line delimited and surrounded by just three backquote to indicate that it is a code block.

** Table Schema: **
{table_schema}

User Input:
{input}
"""

# Chat model (temperature is set to 0) and the prompt object built from the template
model = ChatGoogleGenerativeAI(model="gemini-exp-1206", temperature=0)
prompt = ChatPromptTemplate.from_template(template)

# Pipe the rendered prompt into the model
chain = prompt | model

# Values substituted into the template placeholders
chain_inputs = {"table_schema": summary_of_tables, "input": input_text}
res = chain.invoke(chain_inputs)

# Keep the reply text without surrounding whitespace and show it
result = res.content.strip()
print(result)

```sql
SELECT
  NAME AS name,
  POP_EST AS value,
  ST_AsGeoJSON(geom) AS geom
FROM
  countries
ORDER BY
  POP_EST DESC
LIMIT
  1;
```


In [None]:
# Extract only the SQL from `result` by locating the ``` code fence

import re

# Capture everything between the opening fence line (an optional language tag
# may follow the backquotes) and the closing fence; DOTALL lets `.` span lines.
match = re.search(r"```[^\n]*\n(.*?)```", result, re.DOTALL)

if match:
    query = match.group(1).strip()
    print(query)
else:
    # Reset `query` so that a value left over from an earlier run of this cell
    # cannot be executed by mistake in the next cell.
    query = None
    print("SQLが見つかりませんでした。")

SELECT
  NAME AS name,
  POP_EST AS value,
  ST_AsGeoJSON(geom) AS geom
FROM
  countries
ORDER BY
  POP_EST DESC
LIMIT
  1;


## 出力された SQL を DuckDB で実行する

In [None]:
# Run the SQL extracted from the LLM reply on the shared DuckDB connection.
# NOTE(review): the query text is model-generated and executed unmodified —
# confirm this is acceptable before pointing `conn` at data that must be kept.
duckdb_result = conn.execute(query).fetchall()
# Bare expression so the notebook renders the fetched rows.
duckdb_result

[('China',
  1397715000.0,
  '{"type":"MultiPolygon","coordinates":[[[[109.47521,18.197701],[108.655208,18.507682],[108.626217,19.367888],[109.119056,19.821039],[110.211599,20.101254],[110.786551,20.077534],[111.010051,19.69593],[110.570647,19.255879],[110.339188,18.678395],[109.47521,18.197701]]],[[[80.25999,42.349999],[80.18015,42.920068],[80.866206,43.180362],[79.966106,44.917517],[81.947071,45.317027],[82.458926,45.53965],[83.180484,47.330031],[85.16429,47.000956],[85.720484,47.452969],[85.768233,48.455751],[86.598776,48.549182],[87.35997,49.214981],[87.751264,49.297198],[88.013832,48.599463],[88.854298,48.069082],[90.280826,47.693549],[90.970809,46.888146],[90.585768,45.719716],[90.94554,45.286073],[92.133891,45.115076],[93.480734,44.975472],[94.688929,44.352332],[95.306875,44.241331],[95.762455,43.319449],[96.349396,42.725635],[97.451757,42.74889],[99.515817,42.524691],[100.845866,42.663804],[101.83304,42.514873],[103.312278,41.907468],[104.522282,41.908347],[104.964994,41.59741]

## SQL の実行結果を GeoJSON に変換する

In [27]:
# duckdb_resultをGeoJSONに変換する

import json

def duckdb_result_to_geojson(duckdb_result):
    """Convert DuckDB query rows into a GeoJSON FeatureCollection string.

    Args:
        duckdb_result: Iterable of rows, each a ``(name, value, geom)`` triple
            in that order, where ``geom`` is a GeoJSON geometry encoded as a
            JSON string.

    Returns:
        The FeatureCollection serialized as an indented JSON string, or None
        if any row cannot be converted or the collection cannot be serialized.
        A diagnostic message is printed in the failure case.
    """
    geojson = {
        "type": "FeatureCollection",
        "features": []
    }

    for row in duckdb_result:
        try:
            name, value, geom_str = row  # columns are expected in this order
            geom = json.loads(geom_str)  # parse the geometry string as GeoJSON
            feature = {
                "type": "Feature",
                "geometry": geom,
                "properties": {
                    "name": name,
                    "value": value
                }
            }
            geojson["features"].append(feature)
        except (ValueError, TypeError, IndexError) as e:
            print(f"Error processing row {row}: {e}")
            return None  # one bad row fails the whole conversion
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return None

    try:
        return json.dumps(geojson, indent=2)
    except (TypeError, ValueError) as e:
        # A property value that JSON cannot represent must yield None as
        # documented, rather than escaping as an exception.
        print(f"Error serializing GeoJSON: {e}")
        return None


# Convert the rows fetched from DuckDB, then print either the GeoJSON text or
# a failure notice when the conversion produced nothing.
geojson_output = duckdb_result_to_geojson(duckdb_result)
print(geojson_output if geojson_output else "Failed to convert DuckDB result to GeoJSON.")

{
  "type": "FeatureCollection",
  "features": [
    {
      "type": "Feature",
      "geometry": {
        "type": "MultiPolygon",
        "coordinates": [
          [
            [
              [
                109.47521,
                18.197701
              ],
              [
                108.655208,
                18.507682
              ],
              [
                108.626217,
                19.367888
              ],
              [
                109.119056,
                19.821039
              ],
              [
                110.211599,
                20.101254
              ],
              [
                110.786551,
                20.077534
              ],
              [
                111.010051,
                19.69593
              ],
              [
                110.570647,
                19.255879
              ],
              [
                110.339188,
                18.678395
              ],
              [
                109.

## GeoJSON を地図上に表示する

In [28]:
import folium
import json

# Parse the GeoJSON text back into Python objects for folium
geojson_data = json.loads(geojson_output)

m = folium.Map(location=[0, 0], zoom_start=2)  # default world view

# Build the GeoJSON layer once and reuse it for both drawing and bounds
geojson_layer = folium.GeoJson(geojson_data)
geojson_layer.add_to(m)

# Fit the map to the layer only when every bound coordinate is known. For a
# collection without coordinates the bounds contain None entries; such a list
# is still truthy, so a plain `if bounds:` check would not filter it out.
bounds = geojson_layer.get_bounds()
if bounds and all(coord is not None for corner in bounds for coord in corner):
    m.fit_bounds(bounds)

# Display the map
display(m)
