# カタログやテーブルなどを作成

## 利用するライブラリをインストール

In [0]:
%pip install databricks-vectorsearch -q
# Restart the Python process after the install (presumably so the freshly
# installed package can be imported by the cells below).
dbutils.library.restartPython()

## 共通設定の読み取り

In [0]:
%run ./00_config

## カタログとスキーマを作成

In [0]:
# Create the catalog if it does not exist yet.
# If this statement fails, create the catalog manually instead.
catalog_ddl = f"CREATE CATALOG IF NOT EXISTS {catalog_name}"
print(catalog_ddl)
_ = spark.sql(catalog_ddl)

In [0]:
# Create the schema inside the catalog if it does not exist yet.
schema_ddl = f"CREATE SCHEMA IF NOT EXISTS {catalog_name}.{schema_name}"
print(schema_ddl)
_ = spark.sql(schema_ddl)

## Volume の作成とデータの準備

In [0]:
# Create the Volume inside the schema if it does not exist yet.
volume_ddl = f"CREATE VOLUME IF NOT EXISTS {catalog_name}.{schema_name}.{volume_name}"
print(volume_ddl)
_ = spark.sql(volume_ddl)

# Create the source folder (source_dir is not defined in this notebook;
# presumably it comes from ./00_config and points inside the Volume).
print("source_dir: ", source_dir)
dbutils.fs.mkdirs(source_dir)

In [0]:

import os
import shutil

# Copy everything under "source_data" in the current working directory into
# the Volume folder. py_source_dir is not defined in this notebook; presumably
# it is supplied by ./00_config as the local-filesystem path of the Volume.
current_dir = os.getcwd()
data_source_dir = os.path.join(current_dir, "source_data")

# Show what is about to be copied.
source_entries = os.listdir(data_source_dir)
print(source_entries)

# dirs_exist_ok=True lets the copy run into an already existing destination.
shutil.copytree(src=data_source_dir, dst=py_source_dir, dirs_exist_ok=True)

In [0]:
# Confirm the files were written to the Volume by printing the beginning of
# one of the copied metadata files.
print("-- medalion_site.metadata.json")
metadata_file_path = "".join(
    [
        source_dir,
        "/landing/file_context",
        "/audit__ingest_timestamp=2025-10-14T10:07:33Z",
        "/medalion_site.metadata.json",
    ]
)
metadata_head = dbutils.fs.head(metadata_file_path)
print(metadata_head)

## テーブルを作成

In [0]:
# Create (or replace) the table named by file_context_input_table_name in
# {catalog_name}.{schema_name}. It holds per-file attributes (id, version,
# directory, name, URL, extension, MIME type, size, raw file path), file
# timestamps, a free-form `value` string and the audit__* columns.
# Delta row tracking is enabled through TBLPROPERTIES.
# NOTE: CREATE OR REPLACE replaces any existing table of the same name.
file_context_input_table_ddl = f"""
CREATE OR REPLACE TABLE {catalog_name}.{schema_name}.{file_context_input_table_name}
(
    file_id STRING,
    file_version STRING,
    file_dir STRING,
    file_name STRING,
    file_sensitivity_label STRING,
    file_url STRING,
    file_extension STRING,
    file_mime_type STRING,
    size_in_bytes BIGINT,
    raw_file_path STRING,
    raw_file_path_by_service STRUCT < 
        databricks_volumes_file_path: STRING 
    >,
    file_created_timestamp TIMESTAMP,
    file_update_timestamp TIMESTAMP,
    value STRING,
    audit__ingest_timestamp TIMESTAMP,
    audit__update_timestamp TIMESTAMP,
    audit__delete_flg INT,
    audit__source_delete_flg INT
)
TBLPROPERTIES (
    delta.enableRowTracking = true
)
"""
_ = spark.sql(file_context_input_table_ddl)

In [0]:
# Create (or replace) the table named by file_info_table_name in
# {catalog_name}.{schema_name}. Columns: row_id, raw_file_path, a file_check
# struct (file_corrupt flag), file_md_content as an array of
# (slide_number, slide_content) structs, and two audit__* columns.
# Delta row tracking is enabled through TBLPROPERTIES.
# NOTE: CREATE OR REPLACE replaces any existing table of the same name.
file_info_table_ddl = f"""
CREATE OR REPLACE TABLE {catalog_name}.{schema_name}.{file_info_table_name}
(
    row_id LONG,
    raw_file_path STRING,
    file_check STRUCT<
        file_corrupt BOOLEAN
    >,
    file_md_content ARRAY<STRUCT<
        slide_number INT,
        slide_content STRING
    >>,
    audit__update_timestamp TIMESTAMP,
    audit__delete_flg INT
)
TBLPROPERTIES (
    delta.enableRowTracking = true
)
"""
_ = spark.sql(file_info_table_ddl)

In [0]:
# Create (or replace) the table named by file_context_output_table_name in
# {catalog_name}.{schema_name}. The column list is the one used for the
# file_context input table plus file_path, file_path_by_service, file_check
# and file_md_content (an array of (slide_number, slide_content) structs).
# Delta row tracking is enabled through TBLPROPERTIES.
# NOTE: CREATE OR REPLACE replaces any existing table of the same name.
file_context_output_table_ddl = f"""
CREATE OR REPLACE TABLE {catalog_name}.{schema_name}.{file_context_output_table_name}
(
    file_id STRING,
    file_version STRING,
    file_dir STRING,
    file_name STRING,
    file_sensitivity_label STRING,
    file_url STRING,
    file_extension STRING,
    file_mime_type STRING,
    size_in_bytes BIGINT,
    raw_file_path STRING,
    raw_file_path_by_service STRUCT <
        databricks_volumes_file_path: STRING
    >,
    file_path STRING,
    file_path_by_service STRUCT <
        databricks_volumes_file_path: STRING
    >,
    file_created_timestamp TIMESTAMP,
    file_update_timestamp TIMESTAMP,
    value STRING,
    file_check STRUCT < 
        file_corrupt: BOOLEAN
    >,
    file_md_content ARRAY<STRUCT<
        slide_number INT,
        slide_content STRING
    >>,
    audit__ingest_timestamp TIMESTAMP,
    audit__update_timestamp TIMESTAMP,
    audit__delete_flg INT,
    audit__source_delete_flg INT
)
TBLPROPERTIES (
    delta.enableRowTracking = true
)
"""
_ = spark.sql(file_context_output_table_ddl)

In [0]:
# Create (or replace) the table named by enriched_table_name in
# {catalog_name}.{schema_name}. Columns: per-file attributes, raw and
# processed file paths (each with a *_by_service struct), file timestamps,
# `value`, the file_check struct, file_md_content as an array of
# (slide_number, slide_content) structs, and the audit__* columns.
# Delta row tracking is enabled through TBLPROPERTIES.
# NOTE: CREATE OR REPLACE replaces any existing table of the same name.
enriched_table_name_ddl = f"""
CREATE OR REPLACE TABLE {catalog_name}.{schema_name}.{enriched_table_name}
(
    file_id STRING,
    file_version STRING,
    file_dir STRING,
    file_name STRING,
    file_sensitivity_label STRING,
    file_url STRING,
    file_extension STRING,
    file_mime_type STRING,
    size_in_bytes BIGINT,
    raw_file_path STRING,
    raw_file_path_by_service STRUCT <
        databricks_volumes_file_path: STRING
    >,
    file_path STRING,
    file_path_by_service STRUCT <
        databricks_volumes_file_path: STRING
    >,
    file_created_timestamp TIMESTAMP,
    file_update_timestamp TIMESTAMP,
    value STRING,
    file_check STRUCT < 
        file_corrupt: BOOLEAN
    >,
    file_md_content ARRAY<STRUCT<
        slide_number INT,
        slide_content STRING
    >>,
    audit__ingest_timestamp TIMESTAMP,
    audit__update_timestamp TIMESTAMP,
    audit__delete_flg INT,
    audit__source_delete_flg INT
)
TBLPROPERTIES (
    delta.enableRowTracking = true
)
"""
_ = spark.sql(enriched_table_name_ddl)

In [0]:
# Create (or replace) the table named by enriched_table_name in
# {catalog_name}.{schema_name}, with Delta row tracking enabled.
# NOTE(review): this cell is a verbatim repeat of the preceding cell (same
# variable name, same target table, same column list), so it simply replaces
# the table that was just created. Confirm whether it was meant to create a
# different table or can be removed.
enriched_table_name_ddl = f"""
CREATE OR REPLACE TABLE {catalog_name}.{schema_name}.{enriched_table_name}
(
    file_id STRING,
    file_version STRING,
    file_dir STRING,
    file_name STRING,
    file_sensitivity_label STRING,
    file_url STRING,
    file_extension STRING,
    file_mime_type STRING,
    size_in_bytes BIGINT,
    raw_file_path STRING,
    raw_file_path_by_service STRUCT <
        databricks_volumes_file_path: STRING
    >,
    file_path STRING,
    file_path_by_service STRUCT <
        databricks_volumes_file_path: STRING
    >,
    file_created_timestamp TIMESTAMP,
    file_update_timestamp TIMESTAMP,
    value STRING,
    file_check STRUCT < 
        file_corrupt: BOOLEAN
    >,
    file_md_content ARRAY<STRUCT<
        slide_number INT,
        slide_content STRING
    >>,
    audit__ingest_timestamp TIMESTAMP,
    audit__update_timestamp TIMESTAMP,
    audit__delete_flg INT,
    audit__source_delete_flg INT
)
TBLPROPERTIES (
    delta.enableRowTracking = true
)
"""
_ = spark.sql(enriched_table_name_ddl)

In [0]:
# Create (or replace) the table named by curated_table_name in
# {catalog_name}.{schema_name}. Columns: pk, file_id, slide_number, file_url,
# image_path, slide_content and an `embedding` array of doubles.
# Both Delta row tracking and the change data feed are enabled.
# NOTE(review): the change data feed is presumably enabled so a Vector Search
# index can sync from this table - confirm against the index-creation notebook.
# NOTE: CREATE OR REPLACE replaces any existing table of the same name.
curated_table_name_ddl = f"""
CREATE OR REPLACE TABLE {catalog_name}.{schema_name}.{curated_table_name}
(
    pk STRING,
    file_id STRING,
    slide_number INT,
    file_url STRING,
    image_path STRING,
    slide_content STRING,
    embedding ARRAY < DOUBLE >
)
TBLPROPERTIES (
    delta.enableRowTracking = true,
    delta.enableChangeDataFeed = true
)
"""
_ = spark.sql(curated_table_name_ddl)

## Mosaic AI Vector Search のエンドポイントを作成

In [0]:
# Create the Mosaic AI Vector Search endpoint if it does not exist yet, then
# wait until it is ready. Provisioning takes roughly 10 minutes, so leave this
# cell running and move on to the next notebook.
from databricks.vector_search.client import VectorSearchClient

client = VectorSearchClient()

# Only the lookup is guarded: a failure while waiting on an existing endpoint
# must not fall through to "create", which would try to create an endpoint
# that already exists and hide the real error.
try:
    client.get_endpoint(name=vector_search_name)
except Exception as lookup_error:
    # The lookup failed, which is treated as "endpoint does not exist".
    # The reason is printed so that other causes (e.g. permission or network
    # errors) are visible instead of being silently swallowed.
    print(f"[VS] endpoint '{vector_search_name}' が未作成。作成します…")
    print(f"[VS] get_endpoint: {type(lookup_error).__name__}: {lookup_error}")
    client.create_endpoint_and_wait(
        name=vector_search_name,
        endpoint_type="STANDARD",
    )
else:
    print(f"[VS] endpoint '{vector_search_name}' は既存。ONLINE待機…")
    client.wait_for_endpoint(name=vector_search_name)

In [0]:
 # end