#### 一、安装依赖组件

In [None]:
# Install python-dotenv
!pip install python-dotenv
# Install the PDF parsing library
!pip install pdfminer.six
# Install the Elasticsearch client
!pip install elasticsearch
# Install NLTK (text-processing library)
!pip install nltk
# Install the utility library
# NOTE(review): the PyPI package named `helpers` is presumably not the same
# thing as `elasticsearch.helpers` (which ships with the elasticsearch client);
# confirm which one the later cells actually need.
!pip install helpers

#### 二、示例代码

In [None]:
# 导入依赖库
from dotenv import load_dotenv, find_dotenv
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
from elasticsearch import Elasticsearch
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import helpers
import re
import os, time
import warnings



In [None]:
# Load the environment variables defined in the .env file located by find_dotenv().
# The result is bound to `_` because only the side effect matters here.
_ = load_dotenv(find_dotenv())

# Elasticsearch connection settings read from the environment.
# Each value is None when the corresponding variable is not set.
ELASTICSEARCH_BASE_URL = os.environ.get('ELASTICSEARCH_BASE_URL')
ELASTICSEARCH_USERNAME = os.environ.get('ELASTICSEARCH_USERNAME')
ELASTICSEARCH_PASSWORD = os.environ.get('ELASTICSEARCH_PASSWORD')

In [None]:
# Functions for extracting paragraph text from a PDF
def _merge_lines_into_paragraphs(lines, min_line_length):
    '''Group consecutive text lines into paragraphs.

    A line shorter than ``min_line_length`` (an empty line included) ends the
    current paragraph and is itself discarded.  Lines that are kept are joined
    with a single space, except that a line ending in ``-`` is treated as a
    word broken across lines: its trailing hyphen(s) are removed and the next
    kept line is appended without a separating space.

    Every line is prefixed with its separator, so a paragraph normally starts
    with one space; this matches the output callers have always received.

    Args:
        lines: Iterable of text lines without trailing newlines.
        min_line_length: Minimum length for a line to count as paragraph text.

    Returns:
        A list of paragraph strings, in input order.
    '''
    paragraphs = []
    parts = []               # pieces of the paragraph currently being built
    prev_hyphenated = False  # did the previous kept line end with a hyphen?
    for line in lines:
        if len(line) < min_line_length:
            # Short/blank line: paragraph boundary.
            if parts:
                paragraphs.append(''.join(parts))
                parts = []
            prev_hyphenated = False
            continue
        ends_with_hyphen = line.endswith('-')
        # Only trailing hyphens are removed; leading ones are real content.
        text = line.rstrip('-') if ends_with_hyphen else line
        if ends_with_hyphen and not text:
            # The line consisted solely of hyphens: it contributes no text and
            # must not glue its neighbours together.
            prev_hyphenated = False
            continue
        # No space after a hyphenated line, so the split word is rejoined.
        parts.append(('' if prev_hyphenated else ' ') + text)
        prev_hyphenated = ends_with_hyphen
    if parts:
        paragraphs.append(''.join(parts))
    return paragraphs


def extract_text_from_pdf(filename, page_numbers=None, min_line_length=1):
    '''Extract the text of a PDF file (optionally only some pages) as paragraphs.

    Args:
        filename: The PDF to read; passed straight to pdfminer's
            ``extract_pages``.
        page_numbers: Optional container of zero-based page indices to keep.
            ``None`` keeps every page; an empty container keeps none.
        min_line_length: Lines shorter than this are treated as paragraph
            separators and dropped.

    Returns:
        A list of paragraph strings.  Words hyphenated across a line break are
        rejoined; other lines are joined with single spaces.
    '''
    chunks = []
    for page_index, page_layout in enumerate(extract_pages(filename)):
        # Skip pages outside the requested set, if one was given.
        if page_numbers is not None and page_index not in page_numbers:
            continue
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                # The extra newline keeps separate layout elements apart.
                chunks.append(element.get_text() + '\n')
    # Join once instead of growing a string with += inside the loop.
    lines = ''.join(chunks).split('\n')
    return _merge_lines_into_paragraphs(lines, min_line_length)

In [None]:

# Extract paragraphs from the PDF, treating lines shorter than 10 characters
# as paragraph separators.
paragraphs = extract_text_from_pdf("../llama2.pdf", min_line_length=10)

# Preview the first four paragraphs, each followed by a blank line.
for para in paragraphs[:4]:
    print(para, end="\n\n")