In [2]:
import os

import fitz
from openai import OpenAI

## 从PDF中提取文本

In [3]:
def extract_text(path:str)->str:
    """
    Extract the plain text of every page of a document.

    Args:
        path: Path of the input file, passed straight to ``fitz.open``.

    Returns:
        The text of all pages, concatenated in page order.
    """
    # The context manager closes the document (and its underlying file handle)
    # even when text extraction fails part-way through.
    with fitz.open(path) as doc:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)

## 提取文本的分块

In [4]:
def chunk_text(text: str, n: int, overlap: int) -> list[str]:
    """
    Split text into fixed-size chunks where consecutive chunks overlap.

    Sizes are measured in characters (the text is sliced directly), not tokens.

    Args:
        text: Raw text to split.
        n: Maximum length of each chunk, in characters. Must be positive.
        overlap: Number of characters shared by two consecutive chunks.
            Must satisfy ``0 <= overlap < n``.

    Returns:
        The chunks in order. The last chunk(s) may be shorter than ``n``.
        An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``n`` is not positive, or ``overlap`` is negative or
            not smaller than ``n``.
    """
    if n <= 0:
        raise ValueError(f"n must be positive, got {n}")
    # overlap == n would give a zero step, overlap > n a negative step (no
    # chunks at all), and a negative overlap would silently skip text between
    # chunks, so reject all three up front with a clear message.
    if not 0 <= overlap < n:
        raise ValueError(f"overlap must satisfy 0 <= overlap < n, got overlap={overlap}, n={n}")

    step = n - overlap
    return [text[start:start + n] for start in range(0, len(text), step)]
    

## 初始化OpenAI客户端

In [5]:



# Credentials are read from the environment so that secrets never live in the
# notebook, its outputs, or version control. Set DEEPSEEK_API_KEY and
# DASHSCOPE_API_KEY before running this cell; a missing variable raises
# KeyError here instead of failing later with an authentication error.
# NOTE(review): the keys that used to be hardcoded in this cell have been
# exposed and should be revoked.

# Client used for chat completions.
client = OpenAI(
    api_key=os.environ["DEEPSEEK_API_KEY"],
    base_url="https://api.deepseek.com/beta",
)
# Client used for creating embeddings.
client_embedding = OpenAI(
    api_key=os.environ["DASHSCOPE_API_KEY"],
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

## 从PDF文件中提取和分块文本

In [6]:
# Pull the raw text out of the PDF, then cut it into overlapping chunks.
text = extract_text("data/AI_Information.pdf")
chunk_list = chunk_text(text, n=1500, overlap=150)

# Report how many chunks were produced.
print(f"生成的文本片段数量: {len(chunk_list)}")

# Show the first chunk under its heading (one item per line).
print("第一个文本片段:", chunk_list[0], sep="\n")



生成的文本片段数量: 25
第一个文本片段:
Understanding Artificial Intelligence 
Chapter 1: Introduction to Artificial Intelligence 
Artificial intelligence (AI) refers to the ability of a digital computer or computer-controlled robot 
to perform tasks commonly associated with intelligent beings. The term is frequently applied to 
the project of developing systems endowed with the intellectual processes characteristic of 
humans, such as the ability to reason, discover meaning, generalize, or learn from past 
experience. Over the past few decades, advancements in computing power and data availability 
have significantly accelerated the development and deployment of AI. 
Historical Context 
The idea of artificial intelligence has existed for centuries, often depicted in myths and fiction. 
However, the formal field of AI research began in the mid-20th century. The Dartmouth Workshop 
in 1956 is widely considered the birthplace of AI. Early AI research focused on problem-solving 
and symbolic methods. The 

## 创建文本块的嵌入

In [7]:
def create_embeddings(text: "str | list[str]", model: str = "text-embedding-v2") -> "CreateEmbeddingResponse":
    """
    Create embedding vectors for one text or a batch of texts.

    Args:
        text: A single string or a list of strings to embed. Both forms are
            used in this notebook (the chunk list and a single query).
        model: Name of the embedding model to request.
            Defaults to "text-embedding-v2".

    Returns:
        The response object returned by ``client_embedding.embeddings.create``
        (not a plain dict). Callers in this notebook read the vectors via
        ``response.data[i].embedding`` or convert it with ``response.model_dump()``.
    """
    return client_embedding.embeddings.create(input=text, model=model)


In [8]:
from pprint import pprint
# Embed every chunk with one call; `response` is reused by later cells.
response = create_embeddings(chunk_list)
# Pretty-print the whole response as a plain dict.
# NOTE(review): this prints every embedding vector, so the cell output is very large.
pprint(response.model_dump())

{'data': [{'embedding': [0.012403097935020924,
                         -0.034879300743341446,
                         -0.013709306716918945,
                         -5.6126151321223006e-05,
                         0.03876262158155441,
                         -0.004280481021851301,
                         0.014638950116932392,
                         0.0017901529790833592,
                         -0.012403097935020924,
                         -0.018639950081706047,
                         -0.0055278511717915535,
                         -0.03217273950576782,
                         -0.004921817686408758,
                         0.039445146918296814,
                         -0.015874553471803665,
                         -0.020640449598431587,
                         0.014709556475281715,
                         0.0022623296827077866,
                         -0.004795315209776163,
                         0.005822042468935251,
                         -0.00307135540060699

In [9]:
import json 

# Inspect the embedding response. The response is converted to a plain dict
# explicitly instead of subscripting it inside a bare `except:`, which used an
# exception as control flow and would also have hidden unrelated errors
# (including KeyboardInterrupt).
response_dict = response if isinstance(response, dict) else response.model_dump()
first_item = response_dict["data"][0]

print('-' * 50)
print("embedding的数量:",len(response_dict["data"]))
print(first_item.keys())
print("embedding:",first_item["embedding"])
print("embedding的维度:", len(first_item["embedding"]))
print(first_item["index"])
print(first_item["object"])




--------------------------------------------------
embedding的数量: 25
dict_keys(['embedding', 'index', 'object'])
embedding: [0.012403097935020924, -0.034879300743341446, -0.013709306716918945, -5.6126151321223006e-05, 0.03876262158155441, -0.004280481021851301, 0.014638950116932392, 0.0017901529790833592, -0.012403097935020924, -0.018639950081706047, -0.0055278511717915535, -0.03217273950576782, -0.004921817686408758, 0.039445146918296814, -0.015874553471803665, -0.020640449598431587, 0.014709556475281715, 0.0022623296827077866, -0.004795315209776163, 0.005822042468935251, -0.00307135540060699, 0.017639700323343277, 0.006431018002331257, 0.015297938138246536, -0.001353279105387628, -0.032784659415483475, 0.010608531534671783, -0.0024285477120429277, 0.0017842691158875823, -0.04547017812728882, -0.008913991041481495, -0.001529793837107718, -0.017098387703299522, 0.015792179852724075, 0.014250618405640125, -0.029960423707962036, 0.0023079293314367533, -0.03476162254810333, 0.0295603238046

## 执行语义搜索

In [20]:
import numpy as np
def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> np.floating:
    """
    Compute the cosine similarity of two vectors.

    Args:
        vec1: First vector (assumed 1-D).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)`` as a NumPy float scalar.
        If either vector has zero norm the similarity is undefined; 0.0 is
        returned in that case instead of NaN so results stay sortable.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # Avoid 0/0 -> NaN (and the accompanying RuntimeWarning).
        return np.float64(0.0)
    return np.dot(vec1, vec2) / norm_product


In [23]:
def semantic_search(query:str, embedding: list, text_chunks: list[str], n: int = 5) -> list[str]:
    """
    Return the text chunks that are most similar to a query.

    Args:
        query: The search query; it is embedded with ``create_embeddings``.
        embedding: One embedding record per chunk, in the same order as
            ``text_chunks``. Each record is either an object exposing an
            ``.embedding`` attribute or a dict with an ``"embedding"`` key.
        text_chunks: The chunks to search.
        n: Number of chunks to return.

    Returns:
        Up to ``n`` chunks, ordered from most to least similar by cosine
        similarity. An empty list when ``n`` is not positive.
    """
    # A non-positive n asks for nothing; returning early also keeps a negative
    # n from being interpreted as a "drop the last items" slice below.
    if n <= 0:
        return []

    # Embed the query once and convert it to an array outside the loop.
    query_vector = np.array(create_embeddings(query).data[0].embedding)

    # Score every chunk embedding against the query.
    similarity_scores = []
    for i, chunk_embedding in enumerate(embedding):
        if isinstance(chunk_embedding, dict):
            chunk_vector = chunk_embedding["embedding"]
        else:
            chunk_vector = chunk_embedding.embedding
        similarity_scores.append((i, cosine_similarity(query_vector, np.array(chunk_vector))))

    # Highest similarity first.
    similarity_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Map the best-scoring indices back to their chunks.
    return [text_chunks[index] for index, _ in similarity_scores[:n]]



## 对提取的块运行查询

In [27]:
# 加载json验证数据

from openai import embeddings


# Read the validation set with an explicit encoding so decoding does not
# depend on the platform's default locale.
with open("data/val.json", encoding="utf-8") as f:
    data = json.load(f)

# Use the first validation question as the user query.
query = data[0]['question']
print(query)

# Run the semantic search for the two most relevant chunks and show them.
answer_chunks = semantic_search(query, response.data, chunk_list,2)
for i, chunk in enumerate(answer_chunks):
    print(f"Context{i+1}\n{chunk}\n-----------------")

What is 'Explainable AI' and why is it considered important?
Context1
ent 
investments drive innovation and foster collaboration. 
International Cooperation 
International cooperation is essential for addressing the global challenges and opportunities 
presented by AI. This includes sharing knowledge, developing standards, and promoting 
responsible AI practices across borders. 
Public Engagement and Education 
Engaging the public in discussions about AI is crucial for building trust and ensuring that AI 
development aligns with societal values. Education and awareness campaigns inform the public 
about AI, its impacts, and its potential. 
Chapter 19: AI and Ethics 
Principles of Ethical AI 
Ethical AI principles guide the development and deployment of AI systems to ensure they are fair, 
transparent, accountable, and beneficial to society. Key principles include respect for human 
rights, privacy, non-discrimination, and beneficence. 
 
 
Addressing Bias in AI 
AI systems can inherit 

## 基于检索块生成回复

In [30]:
from fitz import message


# System prompt that restricts the assistant to the supplied context and
# gives it a fixed fallback sentence when the context is insufficient.
system_prompt = (
    "You are an AI assistant that strictly answers based on the given context. "
    "If the answer cannot be derived directly from the provided context, "
    "respond with: 'I do not have enough information to answer that.'"
)

def generate_response(system_prompt, user_message, model="deepseek-chat"):
    """
    Request a chat completion for a system prompt and a user message.

    Args:
        system_prompt: Content of the "system" message.
        user_message: Content of the "user" message.
        model: Name of the chat model to use. Defaults to "deepseek-chat".

    Returns:
        The response object returned by ``client.chat.completions.create``;
        callers in this notebook read the text from
        ``response.choices[0].message.content``.
    """
    response = client.chat.completions.create(
        # Honour the caller's choice; the model name used to be hardcoded
        # here, which made the `model` parameter a no-op.
        model=model,
        messages = [
            {"role": "system","content":system_prompt},
            {"role": "user", "content": user_message}
        ],
        temperature=0
    )
    return response

# Build the user prompt: one numbered "Context" section per retrieved chunk,
# followed by the question itself.
user_prompt = "\n".join(
    f"Context {position}:\n{piece}\n=====================================\n"
    for position, piece in enumerate(answer_chunks, start=1)
) + f"\nQuestion: {query}"

# Ask the model for an answer and show its text.
ai_response = generate_response(system_prompt, user_prompt)
print(ai_response.choices[0].message.content)

Based on the provided context, 'Explainable AI' (XAI) refers to techniques that aim to make AI decisions more understandable. It is considered important because transparency and explainability are essential for building trust in AI systems, enabling users to assess their fairness and accuracy.

**Answer:**  
Explainable AI (XAI) comprises techniques designed to make the decisions of AI systems more understandable. It is important because transparency and explainability are essential for building trust in AI, allowing users to evaluate the fairness and accuracy of AI decisions.


## 评估AI回复

In [35]:
# System prompt for the evaluator: it grades the assistant's answer against
# the reference answer with a score of 1, 0.5 or 0.
# (Typo "assigna" in the text sent to the model fixed to "assign a".)
evaluate_system_prompt = '''
You are an intelligent evaluation system tasked with assessing the AI assistant's responses.\
    If the AI assistant's response is very close to the true response, assign a score of 1. \
    If the response is incorrect or unsatisfactory in relation to the true response, assign a score of 0. 
    If the response is partially aligned with the true response, assign a score of 0.5.
'''

# Assemble the evaluation prompt from the user query, the AI answer, the
# reference answer and the evaluator instructions, one part per line.
evluate_prompt = (
    f"User Query: {query}\n"
    f"AI Response: {ai_response.choices[0].message.content}\n"
    f"Real Answer: {data[0]['ideal_answer']}\n"
    f"{evaluate_system_prompt}"
)

# Have the evaluator model grade the answer and print the score it returns.
evaluation_res = generate_response(evaluate_system_prompt, evluate_prompt)
print(evaluation_res.choices[0].message.content)

1
