In [1]:
import json

from bs4 import BeautifulSoup
from tqdm import tqdm
from textrank4zh import TextRank4Sentence

In [2]:
# Raw input files; process_origin_file reads each one line by line and parses every line as JSON.
origin_test_path = "2021_2_data/doc_quality_data_test.json"
origin_train_path = "2021_2_data/doc_quality_data_train.json"

# Output files; process_origin_file writes one enriched JSON record per line to each.
processed_test_path = "2021_2_data/processed_test.json"
processed_train_path = "2021_2_data/processed_train.json"

In [3]:
def remove_upprintable_chars(s):
    """Remove every non-printable character from ``s`` while keeping newlines.

    A character is kept when ``str.isprintable()`` is true for it or when it
    is a line feed; everything else (tabs, control characters, zero-width
    characters, non-ASCII spaces, ...) is dropped.

    Args:
        s: The text to clean.

    Returns:
        A new string made of the kept characters, in their original order.
    """
    kept = []
    for ch in s:
        # '\n' is not printable per str.isprintable(), but paragraph
        # splitting elsewhere relies on it, so it is kept explicitly.
        if ch.isprintable() or ch == '\n':
            kept.append(ch)
    return ''.join(kept)

In [4]:
def get_paragraphs_list(body):
    """Split an article body (possibly HTML) into cleaned paragraphs.

    The text nodes of ``body`` are extracted with BeautifulSoup's
    ``html.parser``, stripped of non-printable characters, concatenated, and
    split on newlines. Each resulting line is stripped of surrounding
    whitespace and dropped when it is:

    * empty,
    * shorter than 30 characters and fully wrapped in (full- or half-width)
      parentheses, or
    * shorter than 50 characters and starting with "资料图" or "图".

    Args:
        body: Raw article body text or markup.

    Returns:
        A tuple ``(paragraphs, text_node_count)`` where ``paragraphs`` is the
        list of kept lines and ``text_node_count`` is the number of text nodes
        BeautifulSoup found in ``body``.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # `.strings` is the public accessor for the text-node generator; it
    # replaces the call to the private `_all_strings()` helper.
    text_nodes = [remove_upprintable_chars(s) for s in soup.strings]

    paragraphs = []
    for line in "".join(text_nodes).split('\n'):
        line = line.strip()
        if not line:
            continue
        # Short, fully parenthesised line.
        is_parenthetical = (
            len(line) < 30
            and line[0] in ('（', '(')
            and line[-1] in ('）', ')')
        )
        # Short line starting with a picture marker.
        is_picture_caption = len(line) < 50 and line.startswith(("资料图", "图"))
        if is_parenthetical or is_picture_caption:
            continue
        paragraphs.append(line)
    return paragraphs, len(text_nodes)

In [5]:
def get_remove_title_paragraphs_list(body, title):
    """Return the body's paragraphs with a leading copy of the title removed.

    Paragraphs come from ``get_paragraphs_list(body)``. Only the leading
    paragraphs are searched for the title: the first one, or the first two
    when the body produced more than one text node. A title occurrence is cut
    (together with everything before it) only when it starts within the first
    ``2 * len(title)`` characters, which covers forms such as
    "原标题: <title>". Paragraphs that end up empty are dropped.

    Args:
        body: Raw article body text or markup.
        title: The cleaned article title to remove.

    Returns:
        The list of remaining, whitespace-stripped, non-empty paragraphs.
    """
    paragraphs, text_node_count = get_paragraphs_list(body)
    # More than one text node means the body contained HTML tags; the title
    # may then sit in either of the first two paragraphs (e.g. inside a
    # <p class="ori_titlesource"> element).
    head_size = 2 if text_node_count > 1 else 1
    title_len = len(title)

    cleaned = []
    for position, paragraph in enumerate(paragraphs):
        if position < head_size:
            # Two passes, so a title that appears twice in a row is removed
            # both times.
            for _attempt in (0, 1):
                hit = paragraph.find(title)
                if 0 <= hit < 2 * title_len:
                    paragraph = paragraph[hit + title_len:]
        paragraph = paragraph.strip()
        if paragraph:
            cleaned.append(paragraph)
    return cleaned

In [6]:
# Only this many leading characters of a paragraph are searched for a marker.
_SOURCE_HEAD_CHARS = 6

# Characters that end a source name, in priority order. The flag says whether
# the marker itself is kept as part of the extracted source.
_SOURCE_MARKERS = (
    ("社", True),
    ("网", True),
    ("报", True),
    ("刊", True),
    ("讯", False),
)

# Leading labels removed from the extracted text, applied in this order.
_SOURCE_LABEL_PREFIXES = ('文/', '来源：', '图源：')

# For each of these (in order), everything up to and including its first
# occurrence is discarded.
_SOURCE_LEADING_SEPARATORS = ("，", "。", "：", "据", "从")

# Single characters deleted wherever they occur in the extracted text.
_SOURCE_NOISE_TABLE = str.maketrans("", "", "】【][（）()◎■□。*◆@《》<>“”\"#△")

# Extracted strings that are known false positives.
_SOURCE_BLACKLIST = frozenset([
    '对于篮网', '有网', '本社', '本网', '本报', '本刊', '当今社', '不少网', '该报', '人社', '很多网', '在互联网', '北极星售电网', '美国媒体报', '此前报', '海报', '台湾媒体报',
    '最近有网', '图片来自网', '现在网', '据港媒报', '天猫目前报', '从网', '国外媒体报', '随着现在社', '具体预报', '在现代社', '中国政府网', '通报', '针对网', '现代社', '核心提示：报',
    '据报',
])


def _extract_source_prefix(sentence):
    """Return the head of ``sentence`` up to its first source marker, or None."""
    head = sentence[:_SOURCE_HEAD_CHARS]
    for marker, keep_marker in _SOURCE_MARKERS:
        position = head.find(marker)
        if position > -1:
            return head[:position + 1] if keep_marker else head[:position]
    return None


def get_source(paragraphs_list):
    """Guess the article's source name from its first and last paragraphs.

    The first two paragraphs are inspected, plus the last one when the
    article has more than two paragraphs. For each candidate, the first six
    characters are searched for one of the markers 社 / 网 / 报 / 刊 / 讯
    (in that priority order); the first candidate with a marker supplies the
    raw source text and the search stops. The raw text is then cleaned:
    label prefixes, leading separators, non-printable characters, bracket and
    symbol noise and the word "近日" are removed. Results of length <= 1 and
    known false positives are discarded.

    Args:
        paragraphs_list: Sequence of paragraph strings.

    Returns:
        The cleaned source name, or ``""`` when none could be extracted.
    """
    candidates = list(paragraphs_list[:2])
    # Check the length of the full list: the two-element slice can never be
    # longer than 2, so testing it would never add the last paragraph.
    if len(paragraphs_list) > 2:
        candidates.append(paragraphs_list[-1])

    source = ""
    for sentence in candidates:
        prefix = _extract_source_prefix(sentence)
        if prefix is not None:
            # The first paragraph with a marker wins, even if the extracted
            # text is empty (marker "讯" at position 0).
            source = prefix
            break

    for label in _SOURCE_LABEL_PREFIXES:
        if source.startswith(label):
            source = source[len(label):]

    for separator in _SOURCE_LEADING_SEPARATORS:
        index = source.find(separator)
        if index > -1:
            source = source[index + 1:]

    source = remove_upprintable_chars(source)
    # Delete the noise characters first, then the word "近日", then trim.
    source = source.translate(_SOURCE_NOISE_TABLE).replace("近日", "").strip()

    if len(source) <= 1 or source in _SOURCE_BLACKLIST:
        return ""
    return source

In [7]:
def get_key_sentences(text, max_len=500):
    """Concatenate key sentences of ``text`` until a length budget is passed.

    Sentences are taken in the order returned by
    ``TextRank4Sentence.get_key_sentences`` and each is followed by "。".
    Collection stops as soon as the accumulated string is longer than
    ``max_len``, so the result may exceed ``max_len`` by up to one sentence.

    Args:
        text: The document text to analyse.
        max_len: Length threshold (in characters) after which no further
            sentences are appended. Defaults to 500.

    Returns:
        The concatenated key sentences, or ``""`` when none are returned.
    """
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')

    key_sentences = ""
    # num=9999 requests far more sentences than needed; max_len is what
    # actually bounds the output.
    for item in tr4s.get_key_sentences(num=9999):
        key_sentences += item.sentence + "。"
        # Use the caller-supplied budget instead of a hard-coded 500.
        if len(key_sentences) > max_len:
            break
    return key_sentences

In [8]:
def get_line_count(input_file_path):
    """Count the lines of a UTF-8 text file by iterating over it once.

    Args:
        input_file_path: Path of the file to read.

    Returns:
        The number of lines yielded when iterating over the file.
    """
    with open(input_file_path, 'r', encoding='utf-8') as handle:
        return sum(1 for _line in handle)

In [9]:
def process_origin_file(origin_path, processed_path):
    """Enrich every JSON record of ``origin_path`` and write it to ``processed_path``.

    The input is read line by line, each line being parsed as one JSON
    object with at least the keys ``title`` and ``body``. The following keys
    are added (or overwritten) on each record before it is written back as
    one JSON line (non-ASCII characters are written unescaped):

    * ``process_title``  - stripped title without non-printable characters
    * ``process_body``   - cleaned paragraphs joined by newlines
    * ``paragraphs_num`` - number of cleaned paragraphs
    * ``pic_num``        - occurrences of ``<img `` in the raw body
    * ``source``         - result of ``get_source`` on the paragraphs
    * ``key_sentences``  - result of ``get_key_sentences`` on the cleaned body

    Args:
        origin_path: Path of the input JSON-lines file.
        processed_path: Path of the output file; it is overwritten.
    """
    # Count the lines up front so the progress bar can show a total.
    total_lines = get_line_count(origin_path)
    with open(origin_path, 'r', encoding='utf-8') as origin_file, \
            open(processed_path, 'w', encoding='utf-8') as processed_file:
        for raw_line in tqdm(origin_file, total=total_lines):
            record = json.loads(raw_line)

            clean_title = remove_upprintable_chars(record['title'].strip())
            raw_body = record['body']
            paragraphs = get_remove_title_paragraphs_list(raw_body, clean_title)
            body_text = "\n".join(paragraphs)

            record.update({
                'process_title': clean_title,
                'process_body': body_text,
                'paragraphs_num': len(paragraphs),
                'pic_num': raw_body.count('<img '),
                'source': get_source(paragraphs),
                'key_sentences': get_key_sentences(body_text),
            })

            processed_file.write(json.dumps(record, ensure_ascii=False) + "\n")

In [10]:
# Preprocess the test split first, then the train split.
for split_origin_path, split_processed_path in (
    (origin_test_path, processed_test_path),
    (origin_train_path, processed_train_path),
):
    process_origin_file(split_origin_path, split_processed_path)

  0%|          | 0/45285 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.191 seconds.
Prefix dict has been built successfully.
  0%|          | 0/45285 [00:01<?, ?it/s]
  0%|          | 0/576454 [00:00<?, ?it/s]
