# 1. Setup
Ultralytics Explorer API 
探索数据集并利用语义搜索的强大功能：使用向量搜索甚至 SQL 查询来检查特定类型的标签。
* 相似性搜索（vector search + semantic search）：查找数据集中与给定数据点相似的图像。
* 自然语言搜索（natural language）：使用自然语言描述您要查找的数据点。
* SQL 查询（SQL queries）：使用 SQL 查询对数据集执行复杂的数据分析。
* 相似度指数（similarity index）：估计每个数据点与数据集中其余部分的相似程度。

In [None]:
# Install the required packages (ultralytics with the "explorer" extra, plus openai),
# then import ultralytics and call its checks() helper.
%pip install ultralytics[explorer] openai
import ultralytics
ultralytics.checks()

In [None]:
from ultralytics import Explorer

# Similarity search

In [8]:
# Create an Explorer object and load your dataset (here "VOC.yaml" with the "yolov8n.pt" model)
exp = Explorer("VOC.yaml", model="yolov8n.pt")
# Create the embeddings table
exp.create_embeddings_table()

[31m[1mrequirements:[0m Ultralytics requirements ['lancedb>=0.4.3', 'duckdb<=0.9.2'] not found, attempting AutoUpdate...


In [None]:
# Similarity search by dataset index: query with idx=1 and limit=10
similar = exp.get_similar(idx=1, limit=10)
# Alternative: query with one or more image paths instead of an index
# similar = exp.get_similar(img=["path/to/img1", "path/to/img2"], limit=10)
# Show the first rows of the result (presumably a DataFrame-like object — confirm against the Explorer API)
similar.head()

In [None]:
# Plot the samples similar to the image at index 6500 (limit=20)
exp.plot_similar(idx=6500, limit=20)
# exp.plot_similar(idx=[100,101], limit=10) # Can also pass list of idxs or imgs


In [None]:
# Any external image can also be passed as the query (here an image URL); labels=False is forwarded to the plot call
exp.plot_similar(img="https://ultralytics.com/images/bus.jpg", limit=10, labels=False)

# 2. Ask AI: Search or filter with Natural Language

In [None]:
# Search the data with a natural-language prompt using ask_ai()
df = exp.ask_ai("show me images containing more than 10 objects with at least 2 persons")
# Display the first 5 rows of the result
df.head(5)

In [None]:
# Plot the results of a natural-language search
from ultralytics.data.explorer import plot_query_result
from PIL import Image

# Bound to `query_plot` rather than `plt`: `plt` is the conventional alias for
# matplotlib.pyplot (imported under that name elsewhere in this notebook), and
# rebinding it to an image would break any pyplot call made after re-running this cell.
query_plot = plot_query_result(exp.ask_ai("show me 10 images containing exactly 2 persons"))
# Wrap the returned array in a PIL image so the notebook renders it
Image.fromarray(query_plot)

# 3. Run SQL queries on your Dataset!

In [None]:
# Run a SQL query on the dataset with sql_query(); the query string here starts at the WHERE clause
table = exp.sql_query("WHERE labels LIKE '%person, person%' AND labels LIKE '%dog%' LIMIT 10")
# Display the query result
table

In [None]:
# Plot the result of a SQL filter (persons together with a dog, at most 10 rows), passing labels=True
exp.plot_sql_query("WHERE labels LIKE '%person, person%' AND labels LIKE '%dog%' LIMIT 10", labels=True)

In [None]:
# Working with the embeddings table (advanced): LanceDB tables
# Use the Explorer.table object to access the underlying LanceDB embeddings table directly,
# e.g. to run raw queries or push down pre- and post-filters.
table = exp.table
# Display the table schema
table.schema

In [None]:
# In LanceDB, a metric is the way the distance between a pair of vectors is described:
#   - L2
#   - Cosine
#   - Dot
# Explorer's similarity search uses L2 by default.

# Placeholder query vector holding the integers 0..255 (256 values)
dummy_img_embedding = list(range(256))
# Run a raw query against the table, limited to 5 results, and convert the result to pandas
table.search(dummy_img_embedding).limit(5).to_pandas()

In [None]:
# Inter-conversion to popular data formats: the table exposes to_pandas() and to_arrow()
df = table.to_pandas()
pa_table = table.to_arrow()

In [None]:
# Pull the raw embeddings out of the LanceDB table for analysis.
# The image embeddings are stored in the "vector" column.
import numpy as np

# Column -> list of per-image vectors -> one NumPy array
embeddings = np.array(table.to_pandas()["vector"].tolist())

In [None]:
# Install scikit-learn so the embeddings can be reduced in dimensionality and drawn as a scatter plot
%pip install scikit-learn --q

In [None]:
%matplotlib inline
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Project the embeddings onto 3 PCA components so they can be viewed in 3D
pca = PCA(n_components=3)
reduced_data = pca.fit_transform(embeddings)

# One figure holding a single 3D axes
fig, ax = plt.subplots(figsize=(8, 6), subplot_kw={'projection': '3d'})

# The transposed array unpacks into the three per-component coordinate rows (x, y, z)
ax.scatter(*reduced_data.T, alpha=0.5)
ax.set(
    title='3D Scatter Plot of Reduced 256-Dimensional Data (PCA)',
    xlabel='Component 1',
    ylabel='Component 2',
    zlabel='Component 3',
)

plt.show()

# 4. Similarity Index 相似度指数

In [None]:
# Estimate how similar each data point is to the rest of the dataset.
# This is done by counting how many image embeddings lie closer than max_dist to the
# current image in the generated embedding space, considering top_k similar images at a time.
exp.plot_similarity_index(max_dist=0.2, top_k=0.01)

In [None]:
# Compute the similarity index and keep the result in sim_idx
# NOTE(review): force=False presumably reuses an already computed index — confirm against the Explorer API docs
import numpy as np

sim_idx = exp.similarity_index(max_dist=0.2, top_k=0.01, force=False)

In [None]:
# Display the computed similarity index
sim_idx

In [None]:
import numpy as np

# Take the "count" column as a NumPy array and use it as a boolean mask:
# select the 'im_file' entries whose count is greater than 30
sim_count = np.array(sim_idx["count"])
sim_idx['im_file'][sim_count > 30]

In [None]:
# Passing two indices: the query uses the average embedding of the 2 images
exp.plot_similar(idx=[7146, 14035])