# Knowledge Store の処理

## 利用するライブラリをインストール

In [0]:
%pip install databricks-vectorsearch faiss-cpu openai -q
dbutils.library.restartPython()

## 共通設定の読み取りとライブラリのインポート

In [0]:
%run ./00_config

In [0]:
from pyspark.sql import functions as F

## Gold テーブルの確認

In [0]:
# Load the curated (Gold) table by its three-part name and render it for a visual check.
curated_table_fqn = f"{catalog_name}.{schema_name}.{curated_table_name}"
curated_table_df = spark.table(curated_table_fqn)
curated_table_df.display()

## Databricks の Mosaic AI Vector Search の構築

In [0]:
from databricks.vector_search.client import VectorSearchClient

# Vector Search client shared by the endpoint and index cells below.
# NOTE(review): constructed with no arguments, so it presumably picks up the
# notebook's own workspace credentials — confirm when running outside a notebook.
client = VectorSearchClient()

In [0]:
# Create the Mosaic AI Vector Search endpoint if it does not exist yet.
# Creation takes roughly 10 minutes.
#
# Only the existence probe lives inside the ``try``. Previously the wait for an
# existing endpoint was inside it as well, so a failure while waiting was
# misreported as "endpoint not created" and triggered a creation attempt for an
# endpoint that already existed.
try:
    client.get_endpoint(name=vector_search_name)
except Exception:
    # NOTE(review): any lookup failure is treated as "endpoint missing"; if the
    # real cause is auth/network, the create call below will surface it.
    print(f"[VS] endpoint '{vector_search_name}' が未作成。作成します…")
    client.create_endpoint_and_wait(
        name=vector_search_name,
        endpoint_type="STANDARD",
    )
else:
    print(f"[VS] endpoint '{vector_search_name}' は既存。ONLINE待機…")
    client.wait_for_endpoint(name=vector_search_name)

In [0]:
# Check the endpoints: print a compact summary of each one.
import pprint

cle = client.list_endpoints()
fields = ["id", "name", "endpoint_type", "endpoint_status"]

# Keep only the fields of interest; missing keys show up as None.
result = []
for endpoint in cle.get("endpoints", []):
    result.append({field: endpoint.get(field) for field in fields})

pprint.pprint(result)

In [0]:
# Column names of the curated table.
# NOTE: "prmiary_key" is a misspelling of "primary_key"; the name is kept
# because the index-creation cell reads it.
prmiary_key = "pk"
embedding_vector_column = "embedding"

# Delete the index if it already exists so it can be recreated.
# Only the lookup is guarded: a failed lookup means there is nothing to delete,
# whereas a failed delete is a real problem and must not be swallowed
# (the previous bare ``except: pass`` hid it, along with KeyboardInterrupt).
index_full_name = f"{catalog_name}.{schema_name}.{index_table_name}"
try:
    client.get_index(
        endpoint_name=vector_search_name,
        index_name=index_full_name,
    )
except Exception:
    # NOTE(review): any lookup failure is treated as "index missing".
    print(f"[VS] index '{index_full_name}' は未作成。削除をスキップします。")
else:
    client.delete_index(
        endpoint_name=vector_search_name,
        index_name=index_full_name,
    )

In [0]:
# Create a Delta Sync index (pipeline type "TRIGGERED") over the curated table,
# using the precomputed vectors in the embedding column.
table_prefix = f"{catalog_name}.{schema_name}"
index_spec = {
    "endpoint_name": vector_search_name,
    "source_table_name": f"{table_prefix}.{curated_table_name}",
    "index_name": f"{table_prefix}.{index_table_name}",
    "pipeline_type": "TRIGGERED",
    # NOTE(review): presumably the size of the stored embedding vectors — confirm.
    "embedding_dimension": 1024,
    "primary_key": prmiary_key,
    "embedding_vector_column": embedding_vector_column,
}
index = client.create_delta_sync_index(**index_spec)

In [0]:
# Check the indexes: print a compact summary of each index on the endpoint.
import pprint

cli = client.list_indexes(name=vector_search_name)
fields = ["endpoint_name", "name", "primary_key", "index_type"]

# Keep only the fields of interest; missing keys show up as None.
cli_result = []
for vector_index in cli.get("vector_indexes", []):
    cli_result.append({field: vector_index.get(field) for field in fields})

pprint.pprint(cli_result)

In [0]:
# Print the index status message. Re-run this cell until the output reads
# "Index creation succeeded." before moving on.
from databricks.sdk import WorkspaceClient

w = WorkspaceClient()

index_full_name = f"{catalog_name}.{schema_name}.{index_table_name}"
idx = w.vector_search_indexes.get_index(index_name=index_full_name)

status_message = idx.status.message
print(status_message)

In [0]:
# Mosaic AI Vector Search 利用時の関数を定義
import os
from openai import OpenAI


def create_embedding(text, model="databricks-bge-large-en"):
    """Return the embedding vector for *text* from a workspace serving endpoint.

    The request goes through the OpenAI client pointed at this workspace's
    ``/serving-endpoints`` URL, authenticated with the notebook's API token.

    Args:
        text: Value passed as ``input`` to the embeddings API.
        model: Name passed as ``model`` to the embeddings API. Defaults to
            ``"databricks-bge-large-en"``, the value that used to be hard-coded.

    Returns:
        The ``embedding`` of the first item in the response's ``data``. Only
        that first item is returned, whatever ``text`` contained.
    """
    # API token taken from the current notebook context.
    token = (
        dbutils.notebook.entry_point.getDbutils()
        .notebook()
        .getContext()
        .apiToken()
        .get()
    )

    # Build the workspace URL from the Spark configuration.
    ws_url = "https://" + spark.conf.get("spark.databricks.workspaceUrl")

    # Named ``openai_client`` so it is not confused with the notebook-level
    # Vector Search ``client``.
    openai_client = OpenAI(
        api_key=token,
        base_url=f"{ws_url}/serving-endpoints",
    )

    response = openai_client.embeddings.create(
        input=text,
        model=model,
    )

    return response.data[0].embedding

In [0]:
# Retrieve the rows closest to the question from Mosaic AI Vector Search.
# If "error_code":"NOT_FOUND" is returned, wait a little and run the cell again.
query = "Delta Lake ?"

index_full_name = f"{catalog_name}.{schema_name}.{index_table_name}"
index = client.get_index(index_name=index_full_name)

# Request every column of the curated table in the search output.
curated_table_fqn = f"{catalog_name}.{schema_name}.{curated_table_name}"
all_columns = spark.table(curated_table_fqn).columns

query_vector = create_embedding(query)
results = index.similarity_search(
    query_vector=query_vector,
    columns=all_columns,
    num_results=2,
)

In [0]:
import pprint

# Show the first five values of the first returned row.
first_row = results["result"]["data_array"][0]
pprint.pprint(first_row[:5])

## Faiss の構築

In [0]:
# Fetch the data: embedding and slide-content columns of the curated table.
embedding_col = "embedding"
content_col = "slide_content"

src_faiss_df = spark.table(
    f"{catalog_name}.{schema_name}.{curated_table_name}"
).select(embedding_col, content_col)
src_faiss_df.display()

In [0]:
# Build the Faiss index.
import faiss
import numpy as np
import pandas as pd

# Convert the Spark DataFrame to pandas.
pdf = src_faiss_df.toPandas()

# Pull out the texts and the vectors; the vectors are cast to float32.
contents = pdf[content_col].tolist()
embeddings = np.asarray(pdf[embedding_col].tolist()).astype("float32")

# The vector dimensionality is the second axis of the embedding matrix.
dimension = embeddings.shape[1]

# Flat L2-distance index. NOTE: this rebinds ``index``, which earlier cells
# use for the Vector Search index.
index = faiss.IndexFlatL2(dimension)

# Add all vectors to the index.
index.add(embeddings)

In [0]:
# Use Faiss: embed the question and look up its k nearest stored vectors.
query = "Delta Lake ?"
k = 2

# Faiss expects a 2-D float32 array with one query per row.
query_vector = np.array(
    create_embedding(query),
    dtype="float32",
).reshape(1, -1)
distances, indices = index.search(query_vector, k)

# Faiss pads the label row with -1 when the index holds fewer than k vectors.
# Those placeholders are skipped; otherwise ``contents[-1]`` would silently
# return the last document as if it were a hit.
results = [contents[hit] for hit in indices[0] if hit >= 0]

In [0]:
# Show the content of the closest match.
closest_content = results[0]
print(closest_content)

In [0]:
# end