In [None]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTImage

def extract_images_from_pdf(pdf_path):
    """Collect location metadata for every image found in a PDF.

    pdfminer usually wraps images in container elements (such as figures)
    instead of placing ``LTImage`` objects directly on the page, so the layout
    tree of each page is walked recursively rather than only inspecting the
    page's top-level elements.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``extract_pages``.

    Returns:
        A list with one dict per image, in traversal order, holding the keys
        ``"page_number"`` (the ``pageid`` of the page layout), ``"bbox"`` and
        ``"name"`` (both read from the image element).
    """
    images = []

    def collect(element, page_number):
        # Record images; descend into anything else that can be iterated.
        if isinstance(element, LTImage):
            images.append({
                "page_number": page_number,
                "bbox": element.bbox,
                "name": element.name,
            })
            return
        try:
            children = iter(element)
        except TypeError:
            # Leaf element (not a container): nothing further to inspect.
            return
        for child in children:
            collect(child, page_number)

    for page_layout in extract_pages(pdf_path):
        for element in page_layout:
            collect(element, page_layout.pageid)
    return images

# Example run: list the images reported for one chapter PDF.
pdf_path = "./RG-S6000E/chapter_配置指南-网管与监控.pdf"
images = extract_images_from_pdf(pdf_path)
for image in images:
    # One line per image: page id, bounding box and the image's name.
    print(f"Page: {image['page_number']}, BBox: {image['bbox']}, Name: {image['name']}")


In [None]:
import fitz  # PyMuPDF
from PIL import Image
import io

def extract_images(pdf_path):
    """Save every image embedded in a PDF and print where it is placed.

    Each image is written to the current working directory as
    ``image_<page>_<index>.<ext>`` (both numbers are 1-based), followed by one
    printed line per rectangle at which the image appears on that page.

    The bytes returned by ``extract_image`` are written to disk unchanged.
    Decoding and re-encoding them through PIL is unnecessary, lossy for JPEG
    data, and fails for formats PIL cannot write.

    Args:
        pdf_path: Path of the PDF file to read.
    """
    pdf_document = fitz.open(pdf_path)
    try:
        # Visit every page of the PDF.
        for page_number in range(len(pdf_document)):
            page = pdf_document.load_page(page_number)

            # All images referenced by this page.
            image_list = page.get_images(full=True)

            for img_index, img in enumerate(image_list):
                xref = img[0]  # first entry of the image record is its xref
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]

                # Store the image exactly as it is embedded in the PDF.
                image_filename = f"image_{page_number+1}_{img_index+1}.{image_ext}"
                with open(image_filename, "wb") as image_file:
                    image_file.write(image_bytes)
                print(f"图片保存为：{image_filename}")

                # Report every position of this image on the page.
                img_rect = page.get_image_rects(xref)
                for rect in img_rect:
                    print(f"图片在页面 {page_number+1} 的位置：{rect}")
    finally:
        # Release the PDF handle even if extraction fails part-way.
        pdf_document.close()

# Run the image extraction on one chapter PDF
pdf_path = "./RG-S6000E/chapter_配置指南-网管与监控.pdf"
extract_images(pdf_path)


In [4]:
import fitz  # PyMuPDF

def compress_pdf(doc, output_path):
    """Save ``doc`` to ``output_path`` with compaction enabled, then close it.

    The document is saved with ``garbage=4`` and ``deflate=True``. It is
    closed even when saving raises, so a failed write does not leak the
    handle; the exception still propagates to the caller.

    Args:
        doc: An open document object providing ``save`` and ``close``.
        output_path: Destination path of the saved PDF.
    """
    try:
        doc.save(output_path, garbage=4, deflate=True)
    finally:
        doc.close()

def split_pdf_by_chapters(pdf_path, chapter_keywords):
    """Split a PDF into one in-memory document per chapter.

    Pages are scanned in order. The first keyword (in list order) found in a
    page's text identifies that page's chapter. A new output document is
    started only when that keyword differs from the chapter currently being
    collected, so pages that merely repeat the current chapter's title (for
    example in a running header) stay in the same document instead of
    producing several documents with the same chapter name. Pages before the
    first keyword match are not copied anywhere.

    Args:
        pdf_path: Path of the PDF file to split.
        chapter_keywords: Iterable of strings marking the start of a chapter.

    Returns:
        A list of ``(chapter_keyword, document)`` tuples in page order. The
        documents are left open; the caller is responsible for saving and
        closing them.
    """
    doc = fitz.open(pdf_path)
    output_docs = []
    current_output_doc = None
    current_chapter = None

    try:
        for page_num in range(len(doc)):
            text = doc.load_page(page_num).get_text()

            # First keyword, in list order, that occurs on this page (if any).
            matched = next((kw for kw in chapter_keywords if kw in text), None)

            # Only a keyword other than the running chapter opens a new document.
            if matched is not None and matched != current_chapter:
                if current_output_doc is not None:
                    output_docs.append((current_chapter, current_output_doc))
                current_output_doc = fitz.open()
                current_chapter = matched

            # Copy the page into the chapter currently being collected.
            if current_output_doc is not None:
                current_output_doc.insert_pdf(doc, from_page=page_num, to_page=page_num)

        # Keep the chapter that was still open when the pages ran out.
        if current_output_doc is not None:
            output_docs.append((current_chapter, current_output_doc))
    finally:
        # The pages were copied into the output documents; release the source.
        doc.close()

    return output_docs

def save_output_docs(output_docs, output_prefix="chapter"):
    """Write each chapter document to ``<output_prefix>_<chapter>.pdf``.

    Entries whose chapter name is ``None`` are written to
    ``<output_prefix>_unknown.pdf``. Saving is delegated to ``compress_pdf``,
    which also closes the document.

    Args:
        output_docs: Iterable of ``(chapter, document)`` pairs.
        output_prefix: Path prefix placed in front of the chapter name.
    """
    for chapter, output_doc in output_docs:
        output_filename = (
            f"{output_prefix}_unknown.pdf"
            if chapter is None
            else f"{output_prefix}_{chapter}.pdf"
        )
        compress_pdf(output_doc, output_filename)

pdf_path = "./RG-S6000E/20210910211119_RG-S6000E系列交换机RGOS 11.4(1)B12P32S1版本配置手册(V1.0).pdf"
# Chapter titles to split on; adjust them to match the content of the PDF.
chapter_keywords = [
    "配置指南-系统配置",
    "配置指南-以太网交换",
    "配置指南-IP 地址及应用",
    "配置指南-IP 路由",
    "配置指南-组播",
    "配置指南-MPLS",
    "配置指南-ACL&QOS",
    "配置指南-可靠性",
    "配置指南-安全",
    "配置指南-网管与监控",
    "配置指南-诊断命令",
]
# Split the manual into per-chapter documents, then write them next to the source PDF.
output_docs = split_pdf_by_chapters(pdf_path, chapter_keywords)
save_output_docs(output_docs, "./RG-S6000E/chapter")


In [None]:
# Display the (chapter, document) pairs built by split_pdf_by_chapters.
# Note: save_output_docs passes each document to compress_pdf, which closes it.
output_docs

In [None]:
import pdfplumber
import markdown

In [None]:
def pdf_to_text(pdf_path):
    """Extract the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages in order, each page followed by a newline.
        A page for which no text is extracted contributes only the newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can return None for a page without extractable text
        # (older pdfplumber releases do); treat that as an empty page instead
        # of failing on `None + "\n"`.
        page_texts = [(page.extract_text() or "") + "\n" for page in pdf.pages]
    return "".join(page_texts)

def text_to_markdown(text):
    """Pass ``text`` through ``markdown.markdown`` and return the result.

    NOTE(review): Python-Markdown's ``markdown.markdown`` renders Markdown
    source to HTML, so despite this function's name the returned string is
    presumably HTML rather than Markdown. Confirm this is intended.

    Args:
        text: The source string to convert.

    Returns:
        The string produced by ``markdown.markdown(text)``.
    """
    converted = markdown.markdown(text)
    return converted

def save_markdown(md_text, output_path):
    """Write ``md_text`` to ``output_path`` as UTF-8, replacing any existing file.

    Args:
        md_text: The string to store.
        output_path: Path of the file to create or overwrite.
    """
    with open(output_path, mode='w', encoding='utf-8') as md_file:
        md_file.write(md_text)

# NOTE(review): absolute, machine-specific input path; adjust it before running elsewhere.
pdf_path = '/root/llm/Langchain-Chatchat-master/data/厂家产品手册/锐捷/20210910211119_RG-S6000E系列交换机RGOS 11.4(1)B12P32S1版本配置手册(V1.0).pdf'
md_output_path = './test.md'

# Extract the text of every page of the PDF
text = pdf_to_text(pdf_path)

# Pass the extracted text through text_to_markdown (which calls markdown.markdown)
md_text = text_to_markdown(text)

# Write the converted text to md_output_path
save_markdown(md_text, md_output_path)

print(f"Markdown file saved to {md_output_path}")

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

# Generate simulated data
# NOTE: the labels below are synthetic and built to hit a chosen accuracy;
# the printed metrics are not measurements of a real model.
# Assume we have 1000 samples
np.random.seed(0)  # Fix the random seed so the same data is generated on every run
n_samples = 1000
# Generate random ground-truth labels (the predicted labels are derived from them below)
true_labels = np.random.randint(0, 2, n_samples)
# To reach an accuracy of 81.2%, 812 predictions have to be correct
n_correct = int(n_samples * 0.812)
n_incorrect = n_samples - n_correct

# Start from a copy of the true labels as the predicted labels
predicted_labels = true_labels.copy()
# Randomly pick 188 samples (1000 - 812) without replacement and flip their
# predicted labels, which yields the 81.2% accuracy
flip_indices = np.random.choice(n_samples, n_incorrect, replace=False)
predicted_labels[flip_indices] = 1 - predicted_labels[flip_indices]

# Compute accuracy, recall, precision and F1-score
accuracy = accuracy_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the results
print(f"准确率: {accuracy * 100:.1f}%")
print(f"召回率: {recall:.2f}")
print(f"精确率: {precision:.2f}")
print(f"F1-score: {f1:.2f}")


In [8]:
import os

def split_markdown_by_titles(file_path, titles, output_dir="chapters"):
    """Split a Markdown file into one file per matching title line.

    A line starts a new chapter when its stripped text is contained in
    ``titles``. Each chapter runs from its title line up to (but not
    including) the next title line, or to the end of the file. Lines before
    the first title are not written anywhere.

    Each chapter is written to ``<output_dir>/chapter_<n>_<title>.md``, where
    ``<title>`` is the title line without surrounding whitespace and ``#``
    characters, with spaces and path separators replaced by underscores so
    that the title can never point into another directory.

    Args:
        file_path: Path of the UTF-8 Markdown file to split.
        titles: Collection of exact title lines (compared after stripping
            whitespace) that mark chapter starts.
        output_dir: Directory that receives the chapter files; it is created
            if missing. Defaults to ``"chapters"``.
    """
    # Read the whole Markdown file, keeping line endings.
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Line numbers at which a chapter title occurs.
    title_indices = [
        index for index, line in enumerate(lines) if line.strip() in titles
    ]

    # Sentinel so that the last chapter extends to the end of the file.
    title_indices.append(len(lines))

    # Slice the file between consecutive title positions.
    chapters = [
        lines[start_index:end_index]
        for start_index, end_index in zip(title_indices, title_indices[1:])
    ]

    # Create the output directory.
    os.makedirs(output_dir, exist_ok=True)

    # Write every chapter to its own file.
    for i, chapter in enumerate(chapters):
        # Strip whitespace first so an indented heading still loses its '#'.
        chapter_title = chapter[0].strip().strip('#').strip().replace(' ', '_')
        # A path separator inside the title would break the output path.
        for separator in {"/", os.sep, os.altsep} - {None}:
            chapter_title = chapter_title.replace(separator, "_")
        output_file = os.path.join(output_dir, f"chapter_{i+1}_{chapter_title}.md")
        with open(output_file, 'w', encoding='utf-8') as file:
            file.writelines(chapter)
        print(f"Chapter {i+1} written to {output_file}")

# Usage example: split Output.md at the two heading lines listed in `titles`
titles = ["# 配置指南-以太网交换", "### 12 LLDP"]
split_markdown_by_titles('./RG-S6000E/Output.md', titles)

# NOTE(review): disabled call; no split_markdown_by_chapters is defined in the visible cells.
# split_markdown_by_chapters('./RG-S6000E/Output.md')


Chapter 1 written to chapters/chapter_1_配置指南-以太网交换.md
Chapter 2 written to chapters/chapter_2_12_LLDP.md


In [1]:
# Scratch check: prints a single "#" character.
print("#")

#
