In [None]:
import os
# Route Hugging Face downloads through a mirror to speed up model downloads
# for users in mainland China. Keep this ahead of the sentence_transformers
# import below — NOTE(review): the Hugging Face libraries presumably read
# HF_ENDPOINT when they are imported; confirm against the huggingface_hub docs.
# Set exactly one endpoint: a second assignment silently overrides the first.
# Public alternative mirror: https://hf-mirror.com
os.environ["HF_ENDPOINT"] = "https://huggingface.byteintl.com"


# 1. 导入依赖库
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# 2. Load the bge-m3 model exactly once (downloaded on first use, then served
#    from the local cache). Constructing SentenceTransformer a second time only
#    repeats the expensive load, so one instance is shared by everything below.
model = SentenceTransformer("baai/bge-m3")
model.max_seq_length = 512  # maximum input length, in tokens

# 3. Encode a single sentence. normalize_embeddings=True requests unit-length
#    vectors from encode().
text1 = "Elasticsearch 向量搜索使用 HNSW 算法"
embedding1 = model.encode(text1, normalize_embeddings=True)
print(f"单条文本 Embedding 维度：{embedding1.shape}")  # expected: (1024,)

# Batch demo: one short query plus the candidate passages it is compared with.
query = "speed of light"
texts = [
    "Science is a powerful tool that allows humans to observe, understand, and explain the world around them. Through centuries of inquiry and experimentation, scientists have uncovered countless facts about the universe, many of which shape the way we live today. Below are some fascinating scientific facts that highlight the beauty and complexity of nature.",
    """# The Universe Is Expanding

One of the most profound discoveries in modern astronomy is that the universe is constantly expanding. In 1929, Edwin Hubble observed that galaxies are moving away from each other, implying that the universe had a beginning. This discovery laid the foundation for the Big Bang theory, which suggests that the cosmos began around 13.8 billion years ago.
""",
    """# DNA: The Blueprint of Life
Deoxyribonucleic acid (DNA) contains the genetic instructions for building and maintaining life. Every living organism on Earth shares this molecular code, though arranged differently. A surprising fact is that humans share about 60% of their DNA with bananas, highlighting the interconnectedness of life on Earth.
""",
    """# The Human Brain’s Complexity
The human brain is considered the most complex structure known in the universe. It contains around 86 billion neurons, each connected to thousands of others, creating trillions of synaptic connections. These networks enable thought, memory, emotions, and consciousness. Remarkably, the brain uses about 20% of the body’s energy, even though it accounts for only 2% of body mass.""",
    """# Water Is Unique
Water has extraordinary properties that make life possible. Unlike most substances, water expands when it freezes, causing ice to float. This prevents bodies of water from freezing solid and allows aquatic life to survive under ice sheets. Water is also a universal solvent, enabling countless chemical reactions within cells.""",
    """# The Speed of Light
Light travels at approximately 299,792 kilometers per second (186,282 miles per second) in a vacuum. To put this into perspective, light from the Sun takes just over eight minutes to reach Earth. This incredible speed makes light the fastest thing in the universe and serves as a fundamental constant in physics.""",
    """# The Earth’s Protective Atmosphere
Earth’s atmosphere shields us from harmful radiation and space debris. The ozone layer, in particular, absorbs most of the Sun’s ultraviolet rays, protecting living organisms from genetic damage. Without the atmosphere, life as we know it would not exist.""",
    """# Evolution Shapes Life
Charles Darwin’s theory of evolution by natural selection remains one of the most important scientific discoveries. Over millions of years, species adapt to their environments, resulting in the incredible diversity of life we see today. For example, whales evolved from land-dwelling mammals that returned to the sea about 50 million years ago.""",
    """# Conclusion
Scientific facts remind us that the universe is vast, life is interconnected, and human knowledge continues to grow. From the tiniest molecule of DNA to the largest galaxies, science reveals the underlying order and beauty of existence. As we uncover more truths, our appreciation of the natural world deepens, inspiring innovation, curiosity, and respect for life."""
]
# Append one long "overview" document built from every section above. The
# sections are joined with newlines so that each "# heading" starts on its own
# line: most sections have no trailing newline, so joining with "" would glue
# those headings onto the end of the previous sentence.
texts.append("#Overview \n" + "\n".join(texts))
# Append an English summary of the whole article.
texts.append("Science serves as a pivotal tool for humans to observe, comprehend, and explain the natural world. Centuries of scientific inquiry and experimentation have yielded countless discoveries that shape modern life, revealing nature’s beauty and complexity through fascinating facts. This work highlights key scientific insights: the universe is continuously expanding (a 1929 discovery by Edwin Hubble that underpins the Big Bang theory, dating the cosmos to 13.8 billion years ago); DNA, the universal genetic blueprint of life, shows unexpected connections (e.g., humans share 60% of their DNA with bananas); the human brain—with 86 billion neurons and trillions of synaptic connections—uses 20% of the body’s energy despite accounting for only 2% of its mass; water’s unique properties (expanding when freezing, acting as a universal solvent) enable life on Earth; light travels at 299,792 km/s in a vacuum, the fastest speed in the universe, with sunlight reaching Earth in over 8 minutes; Earth’s atmosphere, particularly the ozone layer, shields life from harmful radiation and space debris; and Darwin’s theory of evolution by natural selection explains the diversity of life (e.g., whales evolved from land mammals 50 million years ago). In conclusion, these scientific facts underscore the universe’s vastness, the interconnectedness of life, and the ongoing growth of human knowledge, deepening our appreciation for the natural world and inspiring curiosity and innovation.")
# Append a bare keyword list (one keyword per line).
texts.append("""DNA
Science
Scientific Facts
Universe
Human Brain
Water
Speed of Light
Earth’s Atmosphere
Evolution
Natural World""")
# 4. Embed the query and every candidate text with the same model.
# The query is encoded as a single string and then reshaped into a one-row
# matrix, so it has the same 2-D layout as the batch of text embeddings.
query_embedding = model.encode(query, normalize_embeddings=True)
query_embedding = query_embedding.reshape(1, -1)  # one row: (1, dim)

# Encoding a list yields one row per text: (len(texts), dim).
texts_embeddings = model.encode(texts, normalize_embeddings=True)

# 12 texts are built in this file (9 sections + overview + summary + keywords),
# so with bge-m3's expected 1024-d vectors this should print (12, 1024).
print(f"批量 Embedding 形状：{texts_embeddings.shape}")

In [None]:
# IPython shell escape: add the plotting/widget packages to the uv project.
# NOTE(review): matplotlib and seaborn are already imported by the first cell,
# so in a fresh environment this cell has to be run before that one.
! uv add matplotlib seaborn ipywidgets jupyterlab-widgets

In [None]:
from matplotlib.font_manager import FontManager, FontProperties
import matplotlib.pyplot as plt

# Collect every font in the manager's ttflist as {font name: font file path}.
fm = FontManager()
all_fonts = {font.name: font.fname for font in fm.ttflist}

# Heuristic filter: keep a font when any keyword occurs in its name or in its
# file path. This is plain substring matching, not a check of glyph coverage.
chinese_keywords = ['Chinese', 'CJK', '中', '华', '宋', '黑', '楷', '微软', 'YaHei', 'Sim', 'Heiti']
available_chinese_fonts = {
    font_name: font_path
    for font_name, font_path in all_fonts.items()
    if any(
        keyword in candidate
        for keyword in chinese_keywords
        for candidate in (font_name, font_path)
    )
}

# Report the matches so a suitable name can be chosen for matplotlib's rcParams.
print("系统真实可用的中文字体：")
for name, path in available_chinese_fonts.items():
    print(f"- 字体名称：{name}\n  文件路径：{path}")

In [None]:
# 5. Cosine similarity between the query and every text.
# The result has one row (for the single query); row 0 is the score vector
# with one entry per text. Cosine similarity is mathematically bounded by
# [-1, 1]; the charts below assume the scores fall within 0~1.
similarities = cosine_similarity(query_embedding, texts_embeddings)[0]

# 6. Pair each text with its score, then order the table best match first.
df = pd.DataFrame({"文本内容": texts, "与 Query 相似度": similarities})
df = df.sort_values(by="与 Query 相似度", ascending=False)


# 7. Plot styling.
# Apply the seaborn style FIRST: a seaborn style carries its own
# "font.sans-serif" list and writes it into rcParams, so a CJK font configured
# before set_style() is overwritten and Chinese labels render as empty boxes.
sns.set_style("whitegrid")

# CJK-capable fonts in priority order; matplotlib picks the first family in the
# list that is installed. 'Heiti TC' ships with macOS only, so common Windows
# and Linux fonts are listed as fallbacks.
plt.rcParams['font.sans-serif'] = [
    'Heiti TC',             # macOS
    'PingFang SC',          # macOS
    'Microsoft YaHei',      # Windows
    'SimHei',               # Windows
    'Noto Sans CJK SC',     # Linux
    'WenQuanYi Micro Hei',  # Linux
    'Arial Unicode MS',
]
# Draw minus signs with the ASCII hyphen; CJK fonts often lack U+2212.
plt.rcParams['axes.unicode_minus'] = False

# 8. Draw the similarity charts: one figure, two side-by-side axes
#    (16 x 6 inches). The left axes is filled here.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

# Subplot 1: horizontal bar chart (horizontal bars leave room for text labels).
# One colour per text from the reversed RdYlBu palette
# (red = low similarity, blue = high similarity).
bar_positions = range(len(df))
colors = sns.color_palette("RdYlBu_r", len(texts))
bars = ax1.barh(y=bar_positions, width=df["与 Query 相似度"], color=colors, alpha=0.8)

# Tick labels: the first 10 characters of each text, with "..." appended when
# the text was cut, to keep the axis readable.
yticklabels = [
    text if len(text) <= 10 else text[:10] + "..."
    for text in df["文本内容"]
]
ax1.set_yticks(bar_positions)
ax1.set_yticklabels(yticklabels, fontsize=10)
ax1.set_xlabel("余弦相似度得分（0~1，越高越相似）", fontsize=12, fontweight="bold")
ax1.set_title(f"Query: {query}\n文本与 Query 相似度排名", fontsize=14, fontweight="bold", pad=20)
ax1.set_xlim(0, 1)  # fixed 0~1 score axis

# Print each score (3 decimals) just right of its bar, vertically centred.
for bar, score in zip(bars, df["与 Query 相似度"]):
    label_x = bar.get_width() + 0.01
    label_y = bar.get_y() + bar.get_height() / 2
    ax1.text(label_x, label_y, f"{score:.3f}", va="center", fontsize=9, fontweight="bold")

# Subplot 2: line chart of the score against rank (1 = first row of df).
ranks = range(1, len(df) + 1)
ax2.plot(
    ranks,
    df["与 Query 相似度"],
    color="#2E86AB",
    linewidth=2.5,
    marker="o",                 # circular markers
    markersize=8,
    markerfacecolor="#A23B72",  # marker fill colour
    markeredgecolor="white",
    markeredgewidth=2,
)

# Axis labels, title, one tick per rank, fixed 0~1 score axis, faint grid.
ax2.set_xlabel("文本排名（按相似度降序）", fontsize=12, fontweight="bold")
ax2.set_ylabel("余弦相似度得分", fontsize=12, fontweight="bold")
ax2.set_title("相似度分布趋势", fontsize=14, fontweight="bold", pad=20)
ax2.set_xticks(ranks)
ax2.set_ylim(0, 1)
ax2.grid(True, alpha=0.3)

# Annotate every marker with its score (3 decimals) in a yellow rounded box
# placed slightly above the point.
score_box = {"boxstyle": "round,pad=0.3", "facecolor": "yellow", "alpha": 0.7}
for rank, score in zip(ranks, df["与 Query 相似度"]):
    ax2.text(
        rank, score + 0.01,
        f"{score:.3f}",
        ha="center", va="bottom",
        fontsize=9, fontweight="bold",
        bbox=score_box,
    )

# Tighten the layout so labels of the two subplots do not overlap.
plt.tight_layout()

# Optional: write a high-resolution copy (300 dpi, tight bounding box) to the
# current working directory.
output_path = "query_text_similarity.png"
plt.savefig(output_path, dpi=300, bbox_inches="tight")

# Render the figure.
plt.show()

# 9. Print the ranked similarity table as plain text.
# Header: the query framed by two 80-character rules.
print("=" * 80, f"Query: {query}", "=" * 80, sep="\n")

# Work on a re-indexed copy, add a 1-based rank column, and put it first.
column_order = ["排名", "文本内容", "与 Query 相似度"]
df_display = df.reset_index(drop=True)
df_display["排名"] = range(1, len(df_display) + 1)
df_display = df_display[column_order]

# Floats are shown with three decimals; the index column is omitted.
print(df_display.to_string(index=False, float_format="{:.3f}".format))