In [8]:
import PyPDF2
import os

# Read the text of a single PDF
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The page texts concatenated in page order with no separator.
        A page for which no text is extracted contributes nothing.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` keeps the concatenation from raising TypeError should
        # extract_text() hand back None for a page (TODO confirm whether the
        # installed PyPDF2 release can do so). join() also avoids rebuilding
        # the whole string once per page. The pages are consumed while the
        # file is still open.
        return "".join(page.extract_text() or "" for page in reader.pages)

def extract_text_from_pdfs(pdf_dir):
    """Extract the text of every PDF file directly inside a directory.

    Args:
        pdf_dir: Directory to scan (not recursive).

    Returns:
        dict: Maps each PDF file name to its extracted text. Entries are
        inserted in sorted file-name order so downstream chunk order is
        reproducible between runs.
    """
    pdf_texts = {}
    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(pdf_dir)):
        pdf_path = os.path.join(pdf_dir, filename)
        # Match the extension case-insensitively (".PDF" is common) and skip
        # directories that merely happen to be named like a PDF.
        if filename.lower().endswith('.pdf') and os.path.isfile(pdf_path):
            pdf_texts[filename] = extract_text_from_pdf(pdf_path)
    return pdf_texts

# Cut text into small chunks
def split_into_chunks(text, max_chunk_size=512):
    """Group the sentences of a text into chunks for embedding.

    Newlines, tabs and ASCII periods are deleted first (note: this also
    deletes decimal points inside numbers), then the text is split on the
    Chinese full stop '。', which is itself dropped. Consecutive sentences are
    joined with a single space for as long as the running chunk stays below
    max_chunk_size characters.

    Args:
        text: The text to split.
        max_chunk_size: Character budget per chunk. A single sentence that is
            longer than this is kept whole, so such a chunk can exceed it.

    Returns:
        list[str]: Stripped, non-empty chunks in document order. Empty or
        whitespace-only input yields an empty list.
    """
    for char in ('\n', '\t', '.'):
        text = text.replace(char, '')

    chunks = []
    current_chunk = ""
    for sentence in text.split('。'):
        if len(current_chunk) + len(sentence) < max_chunk_size:
            current_chunk += " " + sentence
            continue
        # The running chunk is full. Flush it only when it has content: it is
        # still empty when the very first sentence already reaches the limit,
        # and emitting "" would put a blank entry into the index.
        finished = current_chunk.strip()
        if finished:
            chunks.append(finished)
        current_chunk = sentence

    # current_chunk may be only the joining space (e.g. for empty input), so
    # test the stripped value rather than the raw string.
    remainder = current_chunk.strip()
    if remainder:
        chunks.append(remainder)

    return chunks

# Load the text of every PDF found in pdf_dir into a {file name: text} dict.
pdf_dir = 'Final_Project_Documents/test'  # the "test" folder holds a single small PDF, used for testing
pdf_texts = extract_text_from_pdfs(pdf_dir)


In [None]:
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.datasets import SentencesDataset
from sentence_transformers import InputExample
from torch.utils.data import DataLoader
import torch

# Start from the pretrained all-mpnet-base-v2 checkpoint; it is fine-tuned below.
# NOTE(review): the name `model` is rebound to the GPT-2 language model in the
# generation cell, so re-running cells out of order changes what `model` refers to.
model = SentenceTransformer('all-mpnet-base-v2')


def split_text_into_sentences(text):
    """Split text into sentences on the Chinese full stop '。'.

    Newlines, tabs and ASCII periods are deleted before splitting (this also
    deletes decimal points inside numbers). The '。' delimiter is dropped.

    Args:
        text: The text to split.

    Returns:
        list[str]: Stripped, non-empty sentences in document order. Blank
        fragments (for example the one after a trailing '。') are omitted so
        they cannot end up as training examples.
    """
    for char in ('\n', '\t', '.'):
        text = text.replace(char, '')
    stripped = (fragment.strip() for fragment in text.split('。'))
    return [fragment for fragment in stripped if fragment]


# Collect the training sentences from every loaded document.
sentences = []
for doc in pdf_texts.keys():
    sentences.extend(split_text_into_sentences(pdf_texts[doc]))

# Drop blank strings and duplicates (first occurrence wins, order preserved).
# With in-batch negatives a sentence repeated inside one batch would be
# treated as a negative of itself.
sentences = list(dict.fromkeys(s.strip() for s in sentences if s.strip()))
if not sentences:
    raise ValueError("No training sentences were extracted from pdf_texts")

# Unsupervised SimCSE: every sentence is paired with itself. The two copies are
# encoded with different dropout masks, and the other sentences of the batch
# serve as negatives.
train_examples = [InputExample(texts=[sentence, sentence]) for sentence in sentences]
train_data = SentencesDataset(train_examples, model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=16)

# MultipleNegativesRankingLoss is the loss SimCSE relies on. The previous
# CosineSimilarityLoss on identical pairs with label 1.0 had no negatives and
# therefore gave the model almost nothing to learn from.
train_loss = losses.MultipleNegativesRankingLoss(model)

NUM_EPOCHS = 3
# A fixed 100 warm-up steps can exceed the whole training run on a small corpus,
# in which case the learning rate never reaches its target. Use 10% of the run,
# capped at the original 100 steps.
total_steps = len(train_dataloader) * NUM_EPOCHS
warmup_steps = min(100, total_steps // 10)

# Fine-tune all-mpnet-base-v2 without labels (the author observed only a
# limited improvement with the earlier setup).
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=NUM_EPOCHS,
    warmup_steps=warmup_steps,
)

model.save('fine_tuned_simcse_model')


In [11]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load the embedding model used for both indexing and querying.
# NOTE(review): the documents and queries in this notebook are Chinese; confirm
# that this checkpoint is a suitable encoder for Chinese text.
embedder = SentenceTransformer('all-mpnet-base-v2') # or 'fine_tuned_simcse_model', the directory written by the fine-tuning cell

# Build the FAISS index
def create_embeddings_and_index(pdf_texts, max_chunk_size=512):
    """Chunk every document, embed the chunks and index them with FAISS.

    Args:
        pdf_texts: Mapping of document name to document text. Only the texts
            are used; chunks of all documents go into one flat list.
        max_chunk_size: Character budget per chunk, forwarded to
            split_into_chunks.

    Returns:
        tuple: (all_chunks, index). all_chunks is the list of chunk strings
        and index is a faiss.IndexFlatL2 holding one vector per chunk, in the
        same order, so a position returned by index.search() can be used to
        look up all_chunks.

    Raises:
        ValueError: If no non-blank chunk could be produced.
    """
    all_chunks = []
    for text in pdf_texts.values():
        # Blank chunks carry no content; keep them out of the index.
        all_chunks.extend(
            chunk for chunk in split_into_chunks(text, max_chunk_size) if chunk.strip()
        )

    # Fail early with a clear message instead of an IndexError on
    # embeddings.shape[1] further down.
    if not all_chunks:
        raise ValueError("No text chunks to index: pdf_texts is empty or contains no text")

    embeddings = embedder.encode(all_chunks, convert_to_numpy=True)

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    return all_chunks, index

# Index all loaded PDFs, using 128-character chunks instead of the function's default of 512.
all_chunks, index = create_embeddings_and_index(pdf_texts, max_chunk_size=128)

# Search function: retrieve the chunks most similar to a query
def search(query, all_chunks, index, k=3):
    """Return the chunks closest to the query, nearest first.

    Args:
        query: Query string; it is embedded with the module-level embedder.
        all_chunks: Chunk strings, in the order their vectors were added to
            the index.
        index: FAISS index to search.
        k: Maximum number of results.

    Returns:
        list[tuple]: (chunk, distance) pairs as reported by index.search().
        Fewer than k pairs are returned when the index holds fewer vectors;
        an empty index or k <= 0 yields an empty list.
    """
    # Never ask for more neighbours than the index holds.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    query_embedding = embedder.encode([query], convert_to_numpy=True)
    D, I = index.search(query_embedding, k)
    # FAISS fills missing result slots with label -1. Skipping those prevents
    # all_chunks[-1] from silently returning the last chunk as a match.
    return [(all_chunks[i], distance) for distance, i in zip(D[0], I[0]) if i >= 0]

# Example query (Chinese for "why air exerts pressure on the surface of an object").
query = "空气对物体表面产生压力的原因"
retrieved_chunks = search(query, all_chunks, index, k=3)
# The score is the distance reported by the IndexFlatL2 index: lower means more similar.
for chunk, score in retrieved_chunks:
    print(f"Chunk: {chunk}\nScore: {score}\n")


Chunk: 空气对物体表面产生压力的原因有两个：一个是上层空气的重量对下层空气造成了压力，在垂直方向上，越向上，大气压强就越低 另一个原因是空气分子不规则的热运动
Score: 0.3539135456085205

Chunk: 由于空气的黏性产生阻滞力一层一层的向外影响下去，就在机体表面形成了沿机体表面法向方向，流速由零逐渐增加到外界气流流速的薄薄的一层空气层，这就叫做附面层
Score: 0.36287349462509155

Chunk: 图1-4典型热气球结构热气球升力来源于球囊内热空气与环境空气的密度差，升力的大小与密度差成正比
Score: 0.4387640357017517



In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM


# GPT-2 performs very poorly here. The author's plan is to swap in another
# generative model (they mention "t5-small" and "gpt-3") or to call a hosted
# large model online later; this code is kept mainly as a placeholder.
# NOTE(review): "t5-small" is an encoder-decoder model and "gpt-3" is not a
# downloadable checkpoint, so neither would load through AutoModelForCausalLM
# as written; confirm before swapping model_name.
# NOTE(review): this rebinds `model`, which the fine-tuning cell uses for the
# SentenceTransformer.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Text generation helper
def generator(input_text, max_length=200, num_return_sequences=1):
    """Sample a continuation of input_text from the module-level causal LM.

    Args:
        input_text: Prompt text. It is truncated to 1024 tokens.
        max_length: Passed to model.generate() as max_length. In transformers
            this bounds prompt tokens plus generated tokens, so a prompt that
            already has max_length tokens leaves no room for new text.
        num_return_sequences: Number of sequences to sample. Only the first
            one is decoded and returned; the others are discarded.

    Returns:
        str: The first generated sequence, decoded with special tokens
        removed. For the causal LM loaded above the sequence begins with the
        prompt itself.
    """
    inputs = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)

    # GPT-2 defines no padding token. Passing one explicitly does what
    # transformers would otherwise do implicitly, without the
    # "Setting `pad_token_id` to `eos_token_id`" warning on every call.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=2,  # forbid repeating any 2-gram
        top_p=0.95,  # nucleus sampling
        top_k=50,    # limit the number of sampling candidates
        temperature=0.1,  # low temperature: little diversity
        do_sample=True,  # enable sampling
        pad_token_id=pad_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def generate_answer(query, retrieved_chunks):
    """Build a prompt from the query and the retrieved chunks and generate from it.

    Args:
        query: The user's question.
        retrieved_chunks: Iterable of (chunk, score) pairs; only the chunk
            texts are used, one per line, in the given order.

    Returns:
        Whatever generator() returns for the assembled prompt.
    """
    passages = []
    for passage, _score in retrieved_chunks:
        passages.append(passage)
    context = "\n".join(passages)

    # Prompt layout: question, retrieved passages, then an open "answer:" slot.
    prompt = f"问题: {query}\n相关信息:\n{context}\n答案:"
    return generator(prompt, max_length=1024, num_return_sequences=1)

# Generate the answer. (Author's note: what on earth is this GPT-2 producing?)
answer = generate_answer(query, retrieved_chunks)
print(answer)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


问题: 旋翼航空器的定义
相关信息:
这是一种双模态的航空器，应视为飞机，也可将其作为有条件的特殊构型双旋翼航空器
空气对物体表面产生压力的原因有两个：一个是上层空气的重量对下层空气造成了压力，在垂直方向上，越向上，大气压强就越低 另一个原因是空气分子不规则的热运动
空气作用在与之有相对运动物体上的力称为空气动力 飞机飞行时，作用在飞机各部件上的空气动力的合力叫做飞机的总空气动力，用R表示
答案: 主何经程度等着發现算節和品啊唯点系喜均基比类的限知道。
更笑网: 安全都是那样的放没最大。 他们以世界的让线。 全自己的建跟确。 我从什么的话,但是付仅介任今的无法。 这美候增經的事。 邺于骑士的东边。 环塔拳疑的门间。 大提供場后,还是像次的人爱。 "The first time I saw the world, I was a little bit scared. I thought that I would never see it again. But I did. It was the first day I ever saw it. And I didn't know what to do. The first thing I said to myself was, 'I'm going to go and see the thing.' And it was so beautiful. So beautiful that it made me feel like I had seen it before. That I could see that there was something there. There was nothing there, but I knew that something was there."
首精神: 管理紅的是起来的。长接結的一格。嘴把突研需要质。虽然被持活虑身。白轻须撃联。您清普江暴的地方。令份仍价据的未村。 未柔本的发生。进街,涅娅。不是圣王国的感谢。是过去,队是期得到些色。你仰仙仮仏的迪術离。如演是慢业的我操。或者是不会落获。场近亚,战
