## Langchain

LangChain是一个强大的框架，旨在帮助开发人员使用语言模型构建端到端的应用程序。它提供了一套工具、组件和接口，可简化创建由大型语言模型 (LLM) 和聊天模型提供支持的应用程序的过程。LangChain 可以轻松管理与语言模型的交互，将多个组件链接在一起，并集成额外的资源

In [None]:
!pip install langchain faiss-cpu peft
!pip install transformers sentence_transformers sentencepiece cpm_kernels llama-cpp-python
!pip install google-search-results -i pypi.douban.com/simple --trusted-host pypi.douban.com

### LLM

In [None]:
from langchain.llms import LlamaCpp
from langchain import PromptTemplate, LLMChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
llm = LlamaCpp(model_path="models/chinese-alpaca-7b-int4/ggml-model-q4_0.bin", callback_manager=callback_manager, verbose=True)

template = """Question: {question}
Answer: Let's work this out in a step by step way to be sure we have the right answer."""
prompt = PromptTemplate(template=template, input_variables=["question"])
llm_chain = LLMChain(prompt=prompt, llm=llm)
llm_chain.run("What NFL team won the Super Bowl in the year Justin Bieber was born?")

### google-search

In [None]:
from langchain.agents import load_tools
import os

os.environ["SERPAPI_API_KEY"] = '你的api key'

tools = load_tools(["serpapi", "python_repl", "llm-math"], llm=llm)
agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)
agent.run("当前金价是多少一克？")

### prompt 提示
填入内容来引导大模型输出

In [None]:
from langchain.prompts import (
    ChatPromptTemplate,
    PromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

template = """什么是{query},还有如何真正做到并细说实现步骤"""
prompt_tem = PromptTemplate(input_variables=["query"], template=template)
prompt = prompt_tem.format(query='阶级跳跃')

response, history = llm(prompt=prompt, history=[])
print(prompt, '-->', response)

In [None]:
system_template = "你是一个把{input_language}翻译成{output_language}的助手"
system_message_prompt = SystemMessagePromptTemplate.from_template(system_template)
human_template = "{text}"
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
messages = chat_prompt.format_prompt(input_language="英语", output_language="汉语", text="I love programming.")

messages.to_messages()

### Chains 链
链接多个组件处理一个特定的下游任务

In [None]:
from langchain.chains.base import Chain

class DemoChain():
    def __init__(self, llm, prompt, history) -> None:
        self.llm = llm
        self.prompt = prompt
        self.history = history

    def run(self, query, history, context=None) -> Any:
        if context is not None:
            prompt = self.prompt.format(query=query, context=context)
        else:
            prompt = self.prompt.format(query=query)

        response, history = self.llm(prompt, history)
        return response, history

chain = DemoChain(llm=llm, prompt=prompt_tem, history=[])
response, history = chain.run(query="阶级跳跃", history=[])
print(response, history)

chain = DemoChain(llm=llm, prompt=prompt_tem, history=[])
response, history = chain.run(query="阶级跳跃", history=[])
print(response, history)

### 文档加载

In [None]:
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.document_loaders import TextLoader
from langchain.document_loaders import JSONLoader                    #pip install jq
from langchain.document_loaders import UnstructuredMarkdownLoader    #pip install unstructured
from langchain.document_loaders import PyPDFLoader                   #pip install pypdf
from langchain.document_loaders import NotebookLoader
from langchain.document_loaders import DirectoryLoader

# loader = CSVLoader(file_path='./example_data/mlb_teams_2012.csv')
# loader = UnstructuredHTMLLoader("example_data/fake-content.html")
# loader = JSONLoader(file_path='./example_data/facebook_chat.json', jq_schema='.messages[].content')
# loader = UnstructuredMarkdownLoader('../README.md')
# loader = PyPDFLoader("example_data/layout-parser-paper.pdf")
# loader = NotebookLoader("example_data/notebook.ipynb")

# loader = DirectoryLoader('data/pdf', glob="*.pdf", loader_cls=PyPDFLoader, show_progress=True, use_multithreading=True)
loader = DirectoryLoader('data/pdf', glob="*.PDF", loader_cls=PyPDFLoader, show_progress=True, use_multithreading=True)
documents = loader.load()

documents[0]

In [None]:
# 文本分割器
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language       #代码分割
from langchain.text_splitter import MarkdownHeaderTextSplitter

# with open('data/ner/test.txt') as f:
#     state_of_the_union = f.read()

# text_splitter = CharacterTextSplitter(separator = "\n\n", chunk_size = 1000,chunk_overlap  = 200,length_function = len)
# texts = text_splitter.create_documents([state_of_the_union])

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

# text_splitter = RecursiveCharacterTextSplitter.from_language(language=Language.PYTHON, chunk_size=50, chunk_overlap=0)
# texts = text_splitter.create_documents(documents)

texts[0]

### Embedding
外部信息编码成一个高维向量

In [None]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.embeddings import LlamaCppEmbeddings

# embeddings = HuggingFaceEmbeddings(model_name="shibing624/text2vec-base-chinese")
embeddings = LlamaCppEmbeddings(model_path="model/chinese-alpaca-7b-int4/ggml-model-q4_0.bin")

query_result = embeddings.embed_query("阶级跳跃")

In [None]:
#SentenceTransformers 文本和图像嵌入，语义文本相似性、语义搜索和同义词挖掘
import sentence_transformers #pip install -U sentence-transformers

embeddings.client = sentence_transformers.SentenceTransformer(embeddings.model_name, device='mps')

In [None]:
# 向量库与检索
from langchain.vectorstores import Chroma, FAISS                            #pip install faiss-cpu

db = Chroma.from_documents(split_docs, embeddings,persist_directory="./vectors/chroma")
db.persist()
db = Chroma(persist_directory="./vectors/chroma", embedding_function=embeddings)
db = FAISS.from_documents(split_docs, embeddings)
db.save_local("./vectors/faiss")
db = FAISS.load_local("./vectors/faiss",embeddings=embeddings)

vector_store = FAISS.load_local(vs_path, embeddings)
related_docs_with_score = vector_store.similarity_search_with_score(query="阶级跳跃", k=2)
context = ""
for pack in related_docs_with_score:
    doc, socre = pack
    content = doc.page_content
    print("检索到的知识=%s, from=%s, socre=%.3f"%(content, doc.metadata.get("from"), socre))
    context += content

### 文档问答

In [None]:
from langchain.chains import RetrievalQA

qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=db.as_retriever())
qa.run("2022年腾讯营收多少")

### agents代理

In [None]:
# pip install wikipedia
from langchain.agents import load_tools
from langchain.agents import initialize_agent
from langchain.agents import AgentType

tools = load_tools(["wikipedia", "llm-math"], llm=llm)
agent = initialize_agent(tools,
                         llm,
                         agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
                         verbose=True)


agent.run("奥巴马的生日是哪天? 到2023年他多少岁了?")

# 实例

## 信息抽取

In [None]:
from typing import Any, List, Mapping, Optional
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM
from transformers import AutoTokenizer, AutoModel
from langchain.chains import SimpleSequentialChain, SequentialChain
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains.base import Chain
from typing import Dict, List

In [None]:
class CustomLLM(LLM):
    def __init__(self, model_name, quantization_bit=4):
        print('----1', model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        print('----2', model_name)
        self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True).half().cuda().eval()
        # self.model = self.model.quantize(quantization_bit)

    @property
    def _llm_type(self) -> str:
        return "custom"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
    ) -> str:
        response, history = self.model.chat(self.tokenizer, prompt)
        return response, history

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return {}

In [None]:
llm = CustomLLM(model_name="THUDM/chatglm-6B-int4")
llm("This is a foobar thing")

In [None]:
text = "阿尔茨海默病(Alzheimer's disease, AD),俗称老年痴呆症,是一种全身性神经退行性疾病，它是由大脑神经退行性变性引起的，\
主要表现为记忆力减退、思维能力减退、行为变化等。阿尔茨海默病的原因尚不十分清楚，但是研究表明，阿尔茨海默病可能与遗传因素、环境因素、\
营养不良、压力过大、不良生活习惯等有关。根据世界卫生组织的统计数据，全球有超过4700万人患有阿尔茨海默病，其中美国有超过600万人患有阿尔茨海默病，\
欧洲有超过1000万人患有阿尔茨海默病，亚洲有超过2500万人患有阿尔茨海默病，其中中国有超过1000万人患有阿尔茨海默病。阿尔茨海默病的发病率与年龄有关，\
随着年龄的增长而增加，65岁以上的人群为主要受害群体，占比高达80%，其中45-65岁的人群占比为15%，20-45岁的人群占比为5%。65岁以上的人群发病率约为10%，\
75岁以上的人群发病率约为20%，85岁以上的人群发病率约为30%。根据统计，男性患病率高于女性，男性患病比例为1.4：1，即男性患病率比女性高出40%。\
根据统计，阿尔茨海默病在不同的人种中分布情况也有所不同。白人患病率最高，占总患病率的70%，黑人患病率次之，占总患病率的20%，\
其他少数民族患病率最低，占总患病率的10%。阿尔茨海默病在不同的饮食习惯中分布情况也有所不同。维生素B12缺乏的人群患病率更高，\
而均衡膳食的人群患病率较低。阿尔茨海默病不仅会给患者带来记忆力减退、思维能力减退、行为变化等症状，还会给患者的家庭带来巨大的心理负担。\
因此，患者应尽快就医，及时进行治疗。治疗阿尔茨海默病的方法有药物治疗、行为治疗、认知行为治疗等，具体治疗方案要根据患者的具体情况而定。"

#创建模板
fact_extraction_prompt = PromptTemplate(
    input_variables=["text_input"],
    template="从下面的本文中提取关键事实。尽量使用文本中的统计数据来说明事实:\n\n {text_input}"
)

#定义chain
fact_extraction_chain = LLMChain(llm=llm, prompt=fact_extraction_prompt)
facts = fact_extraction_chain.run(text)
print(facts)

In [None]:
class ConcatenateChain(Chain):
    chain_1: LLMChain
    chain_2: LLMChain

    @property
    def input_keys(self) -> List[str]:
        # Union of the input keys of the two chains.
        all_input_vars = set(self.chain_1.input_keys).union(set(self.chain_2.input_keys))
        return list(all_input_vars)

    @property
    def output_keys(self) -> List[str]:
        return ['concat_output']

    def _call(self, inputs: Dict[str, str]) -> Dict[str, str]:
        output_1 = self.chain_1.run(inputs)
        output_2 = self.chain_2.run(inputs)
        return {'concat_output': output_1 + output_2}

prompt_1 = PromptTemplate(
    input_variables=["product"],
    template="What is a good name for a company that makes {product}?",
)
chain_1 = LLMChain(llm=llm, prompt=prompt_1)

prompt_2 = PromptTemplate(
    input_variables=["product"],
    template="What is a good slogan for a company that makes {product}?",
)
chain_2 = LLMChain(llm=llm, prompt=prompt_2)

concat_chain = ConcatenateChain(chain_1=chain_1, chain_2=chain_2)
concat_output = concat_chain.run("colorful socks")
print(f"Concatenated output:\n{concat_output}")


concat_chain = ConcatenateChain(chain_1=chain_1, chain_2=chain_2)
concat_output = concat_chain.run("colorful socks")


In [None]:
text = "阿尔茨海默病(Alzheimer's disease, AD),俗称老年痴呆症,是一种全身性神经退行性疾病，它是由大脑神经退行性变性引起的，\
主要表现为记忆力减退、思维能力减退、行为变化等。阿尔茨海默病的原因尚不十分清楚，但是研究表明，阿尔茨海默病可能与遗传因素、环境因素、\
营养不良、压力过大、不良生活习惯等有关。根据世界卫生组织的统计数据，全球有超过4700万人患有阿尔茨海默病，其中美国有超过600万人患有阿尔茨海默病，\
欧洲有超过1000万人患有阿尔茨海默病，亚洲有超过2500万人患有阿尔茨海默病，其中中国有超过1000万人患有阿尔茨海默病。阿尔茨海默病的发病率与年龄有关，\
随着年龄的增长而增加，65岁以上的人群为主要受害群体，占比高达80%，其中45-65岁的人群占比为15%，20-45岁的人群占比为5%。65岁以上的人群发病率约为10%，\
75岁以上的人群发病率约为20%，85岁以上的人群发病率约为30%。根据统计，男性患病率高于女性，男性患病比例为1.4：1，即男性患病率比女性高出40%。\
根据统计，阿尔茨海默病在不同的人种中分布情况也有所不同。白人患病率最高，占总患病率的70%，黑人患病率次之，占总患病率的20%，\
其他少数民族患病率最低，占总患病率的10%。阿尔茨海默病在不同的饮食习惯中分布情况也有所不同。维生素B12缺乏的人群患病率更高，\
而均衡膳食的人群患病率较低。阿尔茨海默病不仅会给患者带来记忆力减退、思维能力减退、行为变化等症状，还会给患者的家庭带来巨大的心理负担。\
因此，患者应尽快就医，及时进行治疗。治疗阿尔茨海默病的方法有药物治疗、行为治疗、认知行为治疗等，具体治疗方案要根据患者的具体情况而定。"


#创建模板
fact_extraction_prompt = PromptTemplate(
    input_variables=["query"],
    template="从下面的本文中提取关键事实。尽量使用文本中的统计数据来说明事实:\n\n {query}"
)

fact_extraction_chain = DemoChain(llm=llm, prompt=fact_extraction_prompt, history=[])
response, history = fact_extraction_chain.run(query=text, history=[])
print(response, history)

In [None]:
doctor_prompt = PromptTemplate(
    input_variables=["query"],
    template="你是神经内科医生。根据以下阿尔茨海默病的事实统计列表，为您的病人写一个简短的预防阿尔茨海默病的建议。 不要遗漏关键信息：\n\n {query}"
)

doctor_chain = DemoChain(llm=llm, prompt=doctor_prompt, history=history)
response, history = doctor_chain.run(query=response, history=history)
print(response, history)

In [None]:
#定义SimpleSequentialChain
full_chain = SimpleSequentialChain(chains=[fact_extraction_chain, doctor_chain], verbose=True)
response, history = full_chain.run(text)
print(response, history)


## 知识库智能问答

In [None]:
!pip install langchain tiktoken pypdf faiss-cpu chromadb ctransformers sentence-transformers llama-cpp-python -q
!pip install torch tabulate tqdm transformers accelerate sentencepiece chatglm-cpp pydantic-settings -q

In [None]:
import torch
EMBEDDING_DEVICE = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
if torch.cuda.is_available():
    !pip install ctransformers[cuda]
EMBEDDING_DEVICE

In [None]:
from langchain import PromptTemplate
from langchain.vectorstores import FAISS, Chroma
from langchain.chains import LLMChain
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
from langchain.llms import CTransformers
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

**1 文档加载**

In [None]:
loader = DirectoryLoader('data/pdf/', glob="*.PDF", loader_cls=PyPDFLoader, show_progress=True, use_multithreading=True)
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

len(texts)

**2 embeddings**

In [None]:
embeddings = HuggingFaceEmbeddings(model_name="shibing624/text2vec-base-chinese", model_kwargs={'device': EMBEDDING_DEVICE})

In [None]:
# !pip install llama-cpp-python
# from langchain.embeddings import LlamaCppEmbeddings

# embeddings = LlamaCppEmbeddings(model_path="models/Chinese-Llama-2-7b-ggml-q4.bin")
# embeddings.embed_query(text)

**3 vector save or load**


In [None]:
vectordb = FAISS.from_documents(texts, embeddings)
vectordb.save_local('vectordb/db_faiss')
# FAISS.load_local('vectordb/db_faiss', embeddings)

In [None]:
vectordb = FAISS.load_local("vectordb/db_faiss", embeddings)
vectordb.similarity_search("非流动资产处置损益", k=1)

**4 load llm**

In [None]:
# !wget https://huggingface.co/TheBloke/Chinese-Alpaca-2-7B-GGUF/resolve/main/chinese-alpaca-2-7b.Q4_K_M.gguf
!wget https://huggingface.co/Sanfor/chatglm-3-quantize/resolve/main/chatglm3-6b-q4_0-ggml.bin

Llama2

In [None]:
# from langchain.llms import LlamaCpp
# from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# llm = LlamaCpp(
#     model_path='chinese-alpaca-2-7b.Q6_K.gguf',
#     n_ctx=2048,
#     callbacks=[StreamingStdOutCallbackHandler()],
#     verbose=False
#     )

In [None]:
llm = CTransformers(
    model='chinese-alpaca-2-7b.Q4_K_M.gguf',
    model_type='llama',
    config={'max_new_tokens': 256, 'repetition_penalty': 1.1, 'temperature': 0.1, 'stream': True},
    callbacks=[StreamingStdOutCallbackHandler()],
    # gpu_layers=110 #110 for 7b, 130 for 13b
    )
llm("请列举5条文明乘车的建议")

ChatGLM3

In [None]:
# import chatglm_cpp

# pipeline = chatglm_cpp.Pipeline("chatglm3-6b-q4_0-ggml.bin")
# pipeline.chat(["你好"])

In [None]:
%%script bash
CMAKE_ARGS="-DGGML_CUBLAS=ON" pip install -U chatglm-cpp

In [None]:
%%script bash --bg
MODEL=./chatglm3-6b-q4_0-ggml.bin uvicorn chatglm_cpp.langchain_api:app --host 127.0.0.1 --port 8000 &

In [None]:
from langchain.llms import ChatGLM
llm = ChatGLM(endpoint_url="http://127.0.0.1:8000")
llm.predict("你好")

**5 PromptTemplate**

In [None]:
template ="""根据下面上下文（context）内容回答问题。
如果你不知道答案，就回答不知道，不要试图编造答案。
答案最多3句话，保持答案简洁。
总是在答案结束时说“谢谢你的提问！”
{context}
问题：{question}
答案："""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

**6 检索问答**

In [None]:
embeddings = HuggingFaceEmbeddings(model_name="shibing624/text2vec-base-chinese", model_kwargs={'device': EMBEDDING_DEVICE})
vectordb = FAISS.load_local("vectordb/db_faiss", embeddings)

# RetrievalQAWithSourcesChain
qa_chat = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=vectordb.as_retriever(search_kwargs={'k':2}),
    return_source_documents=True,
    chain_type_kwargs={'prompt': QA_CHAIN_PROMPT}
    )

qa_chat

In [None]:
qa_chat("迈瑞医疗的研发副总经理是谁？")

## 翻译


In [None]:
!wget https://huggingface.co/Sanfor/chatglm-3-quantize/resolve/main/chatglm3-6b-q4_0-ggml.bin
!pip install langchain pdfplumber PyPDF2 chatglm-cpp reportlab pdfminer.six uvloop>=0.14.0 -q

In [None]:
!wget https://huggingface.co/Sanfor/chatglm-3-quantize/resolve/main/chatglm3-6b-q8_0-ggml.bin

In [None]:
%%script bash
CMAKE_ARGS="-DGGML_CUBLAS=ON" pip install -U chatglm-cpp #GPU

In [1]:
%%script bash --bg
MODEL=./chatglm3-6b-q4_0-ggml.bin uvicorn chatglm_cpp.langchain_api:app --host 127.0.0.1 --port 8000 &

In [2]:
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import ChatGLM
import pdfplumber
from PyPDF2 import PdfWriter, PdfReader

In [4]:
%%time
template = "你是一个翻译助手,请将下面的{source_language}翻译成{target_language}: \n{text}"
translate_prompt = PromptTemplate(input_variables=["source_language", "target_language", "text"], template=template)

llm = ChatGLM(endpoint_url="http://127.0.0.1:8000")
# llm.predict("你好")

translated_chain = LLMChain(llm=llm, prompt=translate_prompt)
translated_text = translated_chain.run({
    "text": "1.董事会、监事会及董事、监事、高级管理人员保证季度报告的真实、准确、完整，不存在虚假记载、误导性陈述或重大遗漏，并承担个别和连带的法律责任。",
    "source_language": "中文",
    "target_language": "英文",
    })
translated_text

CPU times: user 95 ms, sys: 4.5 ms, total: 99.5 ms
Wall time: 33.6 s


'The board of directors, the board of supervisors, and the directors, supervisors, and senior management personnel guarantee the authenticity, accuracy, and completeness of the quarterly reports, and they shall not contain any false statements, misleading statements, or significant omissions, and they shall bear individual and joint legal liabilities.'

In [None]:
with pdfplumber.open("data/pdf/迈瑞医疗：2023年三季度报告.PDF") as pdf:
    # 获取PDF文件的页面数量
    num_pages = len(pdf.pages)
    # 获取页面对象
    page = pdf.pages[0]
    # 提取页面的文本内容
    text = page.extract_text()
    table = page.extract_table()
    images = page.images
    print(text)

In [23]:
from io import BytesIO
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfgen import canvas


def replace_text_in_pdf(input_pdf, output_pdf, replacements):
    memory_file = BytesIO()
    c = canvas.Canvas(memory_file)
    pages = list(extract_pages(input_pdf))
    for page_layout in pages:
        # 获取页面大小
        width, height = page_layout.width, page_layout.height
        
        # 绘制原始页面内容
        for element in page_layout:
            if isinstance(element, LTTextBoxHorizontal):
                print(type(element), element)
                for line in element._objs:
                    print(type(line), dir(line), line)
                    if hasattr(line, 'get_text'):
                        font = element.fontname
                        size = line.size
                        text = line.get_text()
                        x, y = line.bbox[0], height - line.bbox[3]
                        c.setFont(font, size)
                        c.drawString(x, y, text)
        break

#         # 在相同位置插入替换的文本
#         for replacement in replacements.get(page_layout.pageid, []):
#             font = replacement['fontname']
#             size = replacement['size']
#             text = replacement['text']
#             x, y = replacement['x'], replacement['y']
#             c.setFont(font, size)
#             c.drawRightString(x, height - y, text)
#         c.showPage()
#     c.save()

#     # 将生成的PDF写入文件
#     with open(output_pdf, 'wb') as f:
#         f.write(memory_file.getvalue())


# 示例用法
input_pdf = 'data/pdf/迈瑞医疗：2023年三季度报告.PDF'
output_pdf = 'data/pdf/迈瑞医疗：2023年三季度报告222.PDF'
replacements = {
    1: [{'x': 200, 'y': 300, 'fontname': 'Helvetica', 'size': 12, 'text': '替换后的文本'}]
}

replace_text_in_pdf(input_pdf, output_pdf, replacements)


<class 'pdfminer.layout.LTTextBoxHorizontal'> <LTTextBoxHorizontal(0) 57.075,754.078,533.170,766.978 '证券代码：300760                              证券简称：迈瑞医疗                        公告编号：2023-052 \n'>


In [28]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

def replace_text_in_pdf(input_pdf, output_pdf, replacements):
    # 打开输入PDF文件
    with open(input_pdf, 'rb') as file:
        parser = PDFParser(file)
        document = PDFDocument(parser)

        # 创建PDF资源管理器和设备对象
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # 打开输出PDF文件
        with open(output_pdf, 'wb') as output:
            # 遍历每一页
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                layout = device.get_result()

                # 遍历每个元素并替换文本
                for element in layout:
                    if isinstance(element, LTTextBoxHorizontal):
                        for old_text, new_text in replacements.items():
                            element._objs = [obj if obj.get_text() != old_text else obj.replace_with(new_text) for obj in element._objs]

                # 将页面添加到输出PDF文件
                output.write(device.get_result().to_xml())
                
replacements = {
    '旧文本1': '新文本1',
    '旧文本2': '新文本2',
}

input_pdf = 'data/pdf/迈瑞医疗：2023年三季度报告.PDF'
output_pdf = 'data/pdf/迈瑞医疗：2023年三季度报告222.PDF'
replace_text_in_pdf(input_pdf, output_pdf, replacements)

AttributeError: 'LTPage' object has no attribute 'to_xml'

In [2]:
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from io import StringIO
from reportlab.pdfgen import canvas

def extract_text_from_pdf(pdf_path):
    resource_manager = PDFResourceManager()
    return_string = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(resource_manager, return_string, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(resource_manager, device)

    with open(pdf_path, 'rb') as file:
        for page in PDFPage.get_pages(file):
            interpreter.process_page(page)

    text = return_string.getvalue()
    device.close()
    return_string.close()

    return text

def generate_pdf(text, output_path):
    c = canvas.Canvas(output_path)
    lines = text.split('\n')
    y = 750  # 设置初始的y坐标
    for line in lines:
        c.drawString(50, y, line)  # 在指定位置绘制文本
        y -= 15  # 调整y坐标，使下一行文本向上移动
    c.save()


pdf_path = 'data/pdf/迈瑞医疗：2023年三季度报告.PDF'
output_path = 'data/pdf/迈瑞医疗：2023年三季度报告2222.PDF'
text = extract_text_from_pdf(pdf_path)
generate_pdf(text, output_path)