In [None]:
!pip install PyPDF2

In [None]:
!pip install pdfplumber

In [26]:
import pdfplumber
from PyPDF2 import PdfReader


class DataProcess(object):
    """Extract text from a PDF and cut it into chunks stored in ``self.data``.

    Three strategies are offered: ``ParseBlock`` (pdfplumber, groups words by
    font size), ``ParseOnePageWithRule`` (PyPDF2, page by page) and
    ``ParseAllPage`` (PyPDF2, sliding window over the whole document).
    A chunk that is already present in ``self.data`` is never added twice.
    """

    def __init__(self, pdf_path):
        self.pdf_path = pdf_path
        self.data = []

    def _add_chunk(self, chunk):
        """Append *chunk* to ``self.data`` unless it is empty or already stored."""
        if chunk and chunk not in self.data:
            self.data.append(chunk)

    @staticmethod
    def _strip_noise(text):
        """Remove newlines, ASCII commas and tabs from *text*."""
        return text.replace("\n", "").replace(",", "").replace("\t", "")

    @staticmethod
    def _clean_page_text(raw_text):
        """Join the useful lines of one extracted page into a single string.

        Lines that are blank, purely numeric, or that contain a dot leader
        ("....................") or the word "目录" are dropped.
        """
        kept = []
        # ``or ""`` guards against an extractor that yields None for a page.
        for raw_line in (raw_text or "").split("\n"):
            line = raw_line.strip()
            if not line or line.isdigit():
                continue
            if "...................." in line or "目录" in line:
                continue
            kept.append(line)
        return "".join(kept)

    # Sliding window over a list of sentences.
    def SlidingWindow(self, sentences, kernel = 512, stride = 1):
        """Append overlapping windows of *sentences* to ``self.data``.

        Sentences are accumulated (each re-terminated with "。") until adding
        the next one would push the window past *kernel* characters. At that
        point the window plus that sentence is stored as a chunk and the
        window slides forward.

        Args:
            sentences: sentence strings without their trailing "。"; empty
                strings (e.g. the last item of ``str.split("。")``) are skipped.
            kernel: character budget that triggers emitting a chunk.
            stride: number of leading sentences dropped from the window after
                each emitted chunk; values below 1 are treated as 1.
        """
        step = max(1, stride)
        window = []       # sentences currently inside the window, oldest first
        pending = False   # True while the newest sentence is in no stored chunk

        for sentence in sentences:
            if not sentence:
                continue
            cur = "".join(item + "。" for item in window)
            if len(cur) + len(sentence) > kernel:
                # Compare exactly what gets stored, so the duplicate check works.
                self._add_chunk(cur + sentence + "。")
                # Drop whole sentences from the front; the list keeps the text
                # and the bookkeeping in step even for an oversized sentence.
                del window[:step]
                pending = False
            else:
                pending = True
            window.append(sentence)

        # Flush the tail, otherwise the last sentences (or a whole short
        # document) would never reach ``self.data``.
        if pending:
            self._add_chunk("".join(item + "。" for item in window))

    # Data filter: split an over-long block on its item markers, then store the pieces.
    def Datafilter(self, line, header, pageid, max_seq = 1024):
        """Clean *line* and store it, splitting it first when it is too long.

        Args:
            line: text block to store; blocks shorter than 6 characters are
                ignored.
            header: accepted for interface compatibility; not used here.
            pageid: accepted for interface compatibility; not used here.
            max_seq: maximum number of characters of a stored piece.
        """
        if len(line) < 6:
            return

        if len(line) <= max_seq:
            self._add_chunk(self._strip_noise(line))
            return

        # Pick the first marker present in the block; fall back to the full stop.
        for delimiter in ("■", "•", "\t"):
            if delimiter in line:
                break
        else:
            delimiter = "。"

        for piece in line.split(delimiter):
            piece = piece.replace("\n", "")
            # ``<=`` so a piece of exactly max_seq characters is kept, matching
            # the bound applied to a whole line above. Longer pieces are dropped.
            if 5 < len(piece) <= max_seq:
                self._add_chunk(self._strip_noise(piece))

    # Extract the page header, i.e. the level-1 title.
    def GetHeader(self, page):
        """Return the header text of *page*, or None when the page is skipped.

        None is returned when word extraction fails, when the page has no
        words, or when a word containing "目录" or a dot leader is met before
        a header is found. Otherwise the first word whose ``top`` lies
        strictly between 17 and 20 is returned (presumably the header band of
        this document's layout), falling back to the first word of the page.
        """
        try:
            words = page.extract_words()
        except Exception:
            # Best effort: a page that cannot be parsed is simply skipped.
            return None
        if not words:
            return None
        for word in words:
            text = word["text"]
            if "目录" in text or ".........." in text:
                return None
            if 17 < word["top"] < 20:
                return text
        return words[0]["text"]

    # Extract content block by block on every page, together with the level-1 title.
    def ParseBlock(self, max_seq = 1024):
        """Group the words of each page into blocks and store them.

        Consecutive words whose font size is equal (compared to 5 decimals)
        are concatenated into one block. A change of size closes the block,
        unless the block is still shorter than 15 characters. The markers
        "警告！", "注意！" and "说明！" close the block and are dropped.
        Finished blocks go through ``Datafilter``.
        """
        with pdfplumber.open(self.pdf_path) as pdf:
            for page_no, page in enumerate(pdf.pages):
                header = self.GetHeader(page)
                if header is None:
                    continue

                words = page.extract_words(use_text_flow=True, extra_attrs = ["size"])

                block = ""
                last_size = 0

                for idx, word in enumerate(words):
                    # The first word is always skipped (presumably the header).
                    if idx < 1:
                        continue
                    text = word["text"]
                    # A numeric second word is skipped (presumably the page number).
                    if idx == 1 and text.isdigit():
                        continue
                    size = word["size"]

                    if text in ("□", "•"):
                        continue
                    if text in ("警告！", "注意！", "说明！"):
                        if block:
                            self.Datafilter(block, header, page_no, max_seq = max_seq)
                        block = ""
                    elif format(last_size, ".5f") == format(size, ".5f"):
                        block = block + text
                    else:
                        last_size = size
                        if 0 < len(block) < 15:
                            # Too short to stand alone: keep growing the block.
                            block = block + text
                        else:
                            if block:
                                self.Datafilter(block, header, page_no, max_seq = max_seq)
                            block = text

                if block:
                    self.Datafilter(block, header, page_no, max_seq = max_seq)

    # Split each page on the full stop, then chunk it by maximum length.
    def ParseOnePageWithRule(self, max_seq = 512, min_len = 6):
        """Store every page as one chunk, or as several when it is long.

        Pages with fewer than *min_len* characters are ignored. Pages shorter
        than *max_seq* are stored whole. Longer pages are split on "。" and
        regrouped; each new chunk starts with the sentence that closed the
        previous one.
        """
        for page in PdfReader(self.pdf_path).pages:
            page_content = self._clean_page_text(page.extract_text())
            if len(page_content) < min_len:
                continue
            if len(page_content) < max_seq:
                self._add_chunk(page_content)
                continue

            cur = ""
            pending = False   # True while ``cur`` holds text not stored yet
            for sentence in page_content.split("。"):
                if len(cur) + len(sentence) > max_seq:
                    self._add_chunk(cur + sentence)
                    # Always restart from this sentence, even when the chunk
                    # was a duplicate, so the buffer cannot grow unbounded.
                    cur = sentence
                    pending = False
                else:
                    cur = cur + sentence
                    pending = pending or bool(sentence)
            # Flush the remainder so the end of the page is not lost.
            if pending:
                self._add_chunk(cur)

    # Sliding-window paragraph extraction:
    #  1. treat the whole PDF as a single string
    #  2. split it on the full stop into a list of sentences
    #  3. slide a window over that list (see SlidingWindow)
    def ParseAllPage(self, max_seq = 512, min_len = 6):
        """Concatenate all pages and store sliding-window chunks of the result.

        Pages with fewer than *min_len* characters after cleaning are left
        out. *max_seq* is passed to ``SlidingWindow`` as its ``kernel``.
        """
        parts = []
        for page in PdfReader(self.pdf_path).pages:
            page_content = self._clean_page_text(page.extract_text())
            if len(page_content) >= min_len:
                parts.append(page_content)
        sentences = "".join(parts).split("。")
        self.SlidingWindow(sentences, kernel = max_seq)

In [None]:
def remove_urls(text):
    """Return *text* with URLs and promotional boilerplate phrases removed.

    First every http/https URL is deleted, then every occurrence of the
    phrases listed in ``specific_text_pattern``.
    """
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    # Replace every matched URL with the empty string.
    text_without_urls = re.sub(url_pattern, '', text)
    specific_text_pattern = re.compile(r'扫描下方二维码关注公众号|提取码|关注|科学上网|回复关键词|侵权|版权|致谢|引用|LICENSE'
                                   r'|组队打卡|任务打卡|组队学习的那些事|学习周期|开源内容|打卡|组队学习|链接')
    # Filter the URL-free text; filtering the raw input again would throw
    # away the URL removal done above.
    text_without_urls = re.sub(specific_text_pattern, '', text_without_urls)
    return text_without_urls
# Run the block-based parser on the training PDF and show the resulting chunks.
dp =  DataProcess(pdf_path = "train_a.pdf")
dp.ParseBlock(max_seq = 1024)
# Alternative run with a smaller maximum chunk length:
# dp.ParseBlock(max_seq = 512)
print(dp.data)

In [None]:
def remove_urls(text):
    """Return *text* with every http/https URL deleted."""
    url_matcher = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    # Substitute the empty string for each URL found.
    return url_matcher.sub('', text)
def SlidingWindow(sentences, kernel = 512, stride = 1):
    """Return overlapping, URL-free windows built from *sentences*.

    Sentences are accumulated (each re-terminated with "。") until adding the
    next one would push the window past *kernel* characters. At that point
    the window plus that sentence is passed through ``remove_urls`` and
    stored, and the window slides forward.

    Args:
        sentences: sentence strings without their trailing "。"; empty
            strings (e.g. the last item of ``str.split("。")``) are skipped.
        kernel: character budget that triggers emitting a chunk.
        stride: number of leading sentences dropped from the window after
            each emitted chunk; values below 1 are treated as 1.

    Returns:
        The list of chunks, in order, without duplicates.
    """
    step = max(1, stride)
    data = []
    window = []       # sentences currently inside the window, oldest first
    pending = False   # True while the newest sentence is in no stored chunk

    def emit(chunk):
        # Deduplicate on the cleaned text, i.e. on what is actually stored.
        cleaned = remove_urls(chunk)
        if cleaned not in data:
            data.append(cleaned)

    for sentence in sentences:
        if not sentence:
            continue
        cur = "".join(item + "。" for item in window)
        if len(cur) + len(sentence) > kernel:
            emit(cur + sentence + "。")
            # Drop whole sentences from the front; the list keeps the text and
            # the bookkeeping in step even for an oversized sentence.
            del window[:step]
            pending = False
        else:
            pending = True
        window.append(sentence)

    # Flush the tail, otherwise the last sentences (or a whole short input)
    # would never be returned.
    if pending:
        emit("".join(item + "。" for item in window))
    return data
# Build one string from the whole PDF, then cut it into sliding-window chunks.
all_content = ""
for idx, page in enumerate(PdfReader("train_a.pdf").pages):
    page_content = ""
    text = page.extract_text()
    words = text.split("\n")
    # NOTE: the inner loop reuses the name ``idx``; neither index is read.
    for idx, word in enumerate(words):
        text = word.strip().strip("\n")
        
        # Skip lines containing a dot leader or the word "目录".
        if("...................." in text or "目录" in text):
            continue
        # Skip blank lines.
        if(len(text) < 1):
            continue
        # Skip purely numeric lines.
        if(text.isdigit()):
            continue
        page_content = page_content + text
    # Ignore pages with fewer than 6 characters of remaining text.
    if(len(page_content) < 6):
        continue
    all_content = all_content + page_content
# Split on the full stop; SlidingWindow re-appends "。" to each sentence.
sentences = all_content.split("。")
# The returned list is not assigned to a name.
SlidingWindow(sentences, kernel = 512)

In [51]:
import re

# Sample text
text = "这是一个包含 URL 的示例：http://example.com 和 https://another-example.org。这是不删除的示例,扫描下方二维码关注公众号"

# Alternation of the boilerplate phrases to strip from the text.
specific_text_pattern = re.compile(r'扫描下方二维码关注公众号|提取码|关注|科学上网|回复关键词|侵权|版权|致谢|引用|LICENSE'
                                   r'|组队打卡|任务打卡|组队学习的那些事|学习周期|开源内容|打卡|组队学习|链接')
# Replace every occurrence of those phrases with the empty string.
re.sub(specific_text_pattern, '', text)

'这是一个包含 URL 的示例：http://example.com 和 https://another-example.org。这是不删除的示例,'