# 制作评估语料

注意：输入的 Context tokens 长度要小于模型最大生成序列长度的一半！

## 代码准备

### Imports

In [1]:
import json
import math
import os
import sys
import random
from contextlib import ExitStack
from datetime import timedelta
from functools import partial
from glob import glob, iglob
from itertools import chain, cycle, islice, count
from multiprocessing import Pool
from time import time

import numpy as np
import sentencepiece as spm
from tqdm.auto import tqdm


### Constants

In [2]:
# Token budget that proc_line uses when packing sentences into one sample.
SEQ_LENGTH = 1024
# proc_line adds half of this value to SEQ_LENGTH to form its packing limit.
MIN_CTX_LEN = 128

# Path of the SentencePiece model file loaded into SP below.
SPM_MODEL = '../data/spm/gpt2_huamei_corpus_bpe_32k_v2.model'

# Module-level SentencePiece processor; proc_line uses it to count tokens.
SP = spm.SentencePieceProcessor()
SP.load(SPM_MODEL)

True

### Functions

In [3]:
def text_files_line_iterator(paths):
    """Return a lazy iterator over the lines of every file in *paths*.

    Files are visited in the order given and a tqdm progress bar tracks the
    number of files consumed. Each file is opened only when its lines are
    first requested and is closed as soon as it has been read to the end,
    so no file handles are left open behind the iterator.

    Args:
        paths: Iterable of text file paths.

    Returns:
        An ``itertools.chain`` object yielding lines (newline included).
    """
    def _iter_lines(path):
        # The with-block closes the handle once the file is exhausted,
        # which a bare ``open(path)`` inside the chain never did.
        with open(path) as fd:
            yield from fd

    return chain.from_iterable(
        _iter_lines(path)
        for path in tqdm(paths, '[iter files]', unit='file')
    )


def single_text_file_line_count(path, show_progress_bar=False):
    """Count the lines of one text file.

    Args:
        path: Path of the file to read.
        show_progress_bar: When true, wrap the file in a tqdm progress bar
            while counting.

    Returns:
        The number of lines in the file.
    """
    n_lines = 0
    with open(path) as fd:
        lines = fd
        if show_progress_bar:
            lines = tqdm(fd)
        for _ in lines:
            n_lines += 1
    return n_lines
        

def text_files_line_count(paths):
    """Sum the line counts of all files in *paths* using a process pool.

    Each file is counted by ``single_text_file_line_count`` in a worker
    process; results are collected in completion order and added up.

    Args:
        paths: Iterable of text file paths. If it supports ``len()``, the
            length is used as the total of the "reduce" progress bar.

    Returns:
        The total number of lines across all files.
    """
    # Generators and other unsized iterables have no length; the progress
    # bar then runs without a known total.
    total = None
    try:
        total = len(paths)
    except (AttributeError, TypeError):
        pass

    with Pool() as pool:
        per_file_counts = pool.imap_unordered(
            single_text_file_line_count,
            tqdm(paths, '[map: files]', unit='file'),
        )
        progress = tqdm(
            per_file_counts, '[reduce: sum lines]', unit='file', total=total
        )
        return sum(progress)


def proc_line(line):
    """Pack the sentences of one JSON corpus line into token-bounded samples.

    The line is parsed as JSON and treated as an iterable of paragraphs, each
    an iterable of sentence strings. Sentences are stripped, empty ones are
    skipped, and the rest are concatenated (without separator) into a sample
    until adding the next sentence would reach
    ``SEQ_LENGTH + MIN_CTX_LEN // 2`` tokens, at which point the sample is
    emitted and a new one is started with that sentence.

    Args:
        line: One raw line of the corpus file.

    Returns:
        A list of ``{'text': str, 'length': int}`` dicts, where ``length`` is
        the sum of the per-sentence SentencePiece token counts. A blank line
        yields an empty list.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    result = []
    line = line.strip()
    if not line:
        return result
    paragraphs = json.loads(line)
    max_tokens = SEQ_LENGTH + MIN_CTX_LEN // 2
    pieces = []  # sentences of the sample currently being built
    n_text = 0
    for sentence in chain.from_iterable(paragraphs):
        sentence = sentence.strip()
        if not sentence:
            continue
        n_sentence = len(SP.encode_as_ids(sentence))
        # Only flush when something has been collected: a first sentence that
        # alone reaches the budget must not produce an empty record.
        if pieces and n_text + n_sentence >= max_tokens:
            result.append({'text': ''.join(pieces), 'length': n_text})
            pieces = []
            n_text = 0
        pieces.append(sentence)
        n_text += n_sentence
    if n_text:
        result.append({'text': ''.join(pieces), 'length': n_text})
    return result
    

## 语料文件

### 输入文件

### 列出输入文件

In [1]:
# Accepted corpus file extensions, lower-case and WITH the leading dot,
# because os.path.splitext returns the extension including the dot
# (e.g. '.jsonl'); entries without a dot could never match.
INPUT_FILE_EXTENSIONS = ('.json', '.jsonl', '.jsonlines', '.jsonline')

# Collect every regular file directly under the corpus directory whose
# extension is one of the accepted JSON / JSON-lines variants.
INPUT_FILES = [
    path
    for path in tqdm(iglob(
        "/nfs/server01_public/豆瓣/情感相关的小组/data.json/*",
        recursive=True
    ))
    if os.path.isfile(path) and os.path.splitext(path)[1].lower() in INPUT_FILE_EXTENSIONS
]


print(f'源语料文件数：{len(INPUT_FILES):,d}')

NameError: name 'tqdm' is not defined

## 文件采样

由于只是用于评估，所以只使用很少的文件

In [5]:
# Number of corpus files to draw for the evaluation set.
K = 5

print(f'选取 {K} 个输入语料文件')

# random.sample draws WITHOUT replacement, so no file is picked twice
# (random.choices could return duplicates, which would make two workers
# write the same '.fix' file later on). The size is capped at the number of
# available files so a small corpus directory does not raise ValueError.
SRC_FILES = sorted(random.sample(INPUT_FILES, k=min(K, len(INPUT_FILES))))

SRC_FILES

选取 5 个输入语料文件


['/nfs/server01_public/豆瓣/情感相关的小组/data.json/情感相关的小组-话题_url_0(3).jsonl',
 '/nfs/server01_public/豆瓣/情感相关的小组/data.json/情感相关的小组-话题_url_0(33).jsonl',
 '/nfs/server01_public/豆瓣/情感相关的小组/data.json/情感相关的小组-话题_url_0(53).jsonl',
 '/nfs/server01_public/豆瓣/情感相关的小组/data.json/情感相关的小组-话题_url_1(12).jsonl',
 '/nfs/server01_public/豆瓣/情感相关的小组/data.json/情感相关的小组-话题_url_1(2).jsonl']

### 文件修复

由于格式错误，需要修复！修复后的文件保存到来源文件相同的目录

In [7]:
# Each repaired file sits next to its source file, with '.fix' appended.
FIXED_FILES = [f'{src_fn}.fix' for src_fn in SRC_FILES]


def fix_json(args):
    """Rewrite one corpus file so that every non-blank line is valid JSON.

    Lines that already parse as JSON are re-serialized unchanged in content.
    Lines that do not are parsed as a Python literal (e.g. a dict written
    with single quotes) and then serialized as JSON. Blank lines are dropped.

    Args:
        args: ``(src_fn, dst_fn)`` pair -- path of the file to read and path
            of the repaired file to write. Passed as one tuple so the
            function can be used with ``Pool.imap_unordered``.

    Raises:
        ValueError, SyntaxError: If a line is neither valid JSON nor a plain
            Python literal.
    """
    import ast  # local import keeps this cell self-contained

    src_fn, dst_fn = args
    with open(src_fn) as src_fp, open(dst_fn, 'w') as dst_fp:
        for line in src_fp:
            line = line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                # SECURITY: this fallback used to be eval(line), which would
                # execute arbitrary expressions found in the crawled corpus.
                # ast.literal_eval accepts only literals (dict, list, str,
                # numbers, True/False/None) and raises on anything else.
                data = ast.literal_eval(line)
            print(json.dumps(data, ensure_ascii=False), file=dst_fp)

# Repair all sampled files in parallel. The first progress bar tracks jobs
# handed to the pool, the second tracks jobs that have finished.
with Pool() as pool:
    jobs = tqdm(zip(SRC_FILES, FIXED_FILES), 'map', total=len(SRC_FILES))
    it = pool.imap_unordered(fix_json, jobs)
    # fix_json returns nothing; the loop only drains the iterator so that
    # every job completes before the pool is torn down.
    for _ in tqdm(it, 'wait', total=len(SRC_FILES)):
        continue


print(f'修复后的文件：{FIXED_FILES}')

HBox(children=(IntProgress(value=0, description='map', max=5, style=ProgressStyle(description_width='initial')…




HBox(children=(IntProgress(value=0, description='wait', max=5, style=ProgressStyle(description_width='initial'…


修复后的文件：['/nfs/server01_public/豆瓣/情感相关的小组/data.json/情感相关的小组-话题_url_0(3).jsonl.fix', '/nfs/server01_public/豆瓣/情感相关的小组/data.json/情感相关的小组-话题_url_0(33).jsonl.fix', '/nfs/server01_public/豆瓣/情感相关的小组/data.json/情感相关的小组-话题_url_0(53).jsonl.fix', '/nfs/server01_public/豆瓣/情感相关的小组/data.json/情感相关的小组-话题_url_1(12).jsonl.fix', '/nfs/server01_public/豆瓣/情感相关的小组/data.json/情感相关的小组-话题_url_1(2).jsonl.fix']


### 统计输入文件总行数

In [8]:
%%time

# Count the lines of all repaired files (in parallel) and report the total.
total_lines = text_files_line_count(FIXED_FILES)
print(f'源语料行数：{total_lines:,d}')

HBox(children=(IntProgress(value=0, description='[map: files]', max=5, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='[reduce: sum lines]', max=5, style=ProgressStyle(description_…



源语料行数：7,788
CPU times: user 73.5 ms, sys: 38 ms, total: 112 ms
Wall time: 160 ms


## 处理

我们目前的评估目标

1. 进行人工评估，输出列表文件进行比对
1. 输入文字为回答数据的逗号前的半句

### 处理函数定义

In [12]:



def proc_line(line):
    """Process one corpus line (unfinished draft).

    Returns an empty list for a blank line. For any other input the body
    ends without a return statement, so the function returns None.

    NOTE(review): this definition replaces the earlier ``proc_line`` in this
    notebook once the cell is run -- confirm that is intended.
    """
    result = []
    line = line.strip()
    if not line:
        return result
    # TODO: the processing of a non-blank line has not been written yet.
    

In [110]:
import re

from bs4 import BeautifulSoup


def ptb_detokenizer(string):
    """Apply a fixed sequence of literal replacements to PTB-style text.

    The replacements are applied in the listed order; each one operates on
    the result of the previous one.
    """
    substitutions = (
        (" '", "'"),
        (" \n", "\n"),
        ("\n ", "\n"),
        (" n't", "n't"),
        (" N ", "1 "),
        ("$ 1", "$1"),
        ("# 1", "#1"),
    )
    for old, new in substitutions:
        string = string.replace(old, new)
    return string


def wikitext_detokenizer(string):
    """Undo WikiText-style tokenization artifacts in *string*.

    The steps run in a fixed order: contractions, number separators
    (`` @-@ ``, `` @,@ ``, `` @.@ ``), spaces before punctuation, padding
    inside brackets and quotes, then miscellaneous cleanups (heading
    markers, the degree sign, spaces around newlines, `` N `` placeholders
    and `` 's``).

    Args:
        string: Tokenized text.

    Returns:
        The detokenized text.
    """
    # contractions
    string = string.replace("s '", "s'")
    # Join an apostrophe with a following digit ("' 90s" -> "'90s"). The old
    # pattern was written JavaScript-style with surrounding slashes, so it
    # only matched literal "/" characters and would have inserted the
    # literal text "[0-9]" instead of the digit.
    string = re.sub(r"' ([0-9])", r"'\1", string)

    literal_steps = (
        # number separators
        (" @-@ ", "-"),
        (" @,@ ", ","),
        (" @.@ ", "."),
        # punctuation
        (" : ", ": "),
        (" ; ", "; "),
        (" . ", ". "),
        (" ! ", "! "),
        (" ? ", "? "),
        (" , ", ", "),
    )
    for old, new in literal_steps:
        string = string.replace(old, new)

    # strip padding just inside paired brackets and quotes
    pair_steps = (
        (r"\(\s*([^\)]*?)\s*\)", r"(\1)"),
        (r"\[\s*([^\]]*?)\s*\]", r"[\1]"),
        (r"{\s*([^}]*?)\s*}", r"{\1}"),
        (r"\"\s*([^\"]*?)\s*\"", r'"\1"'),
        (r"'\s*([^']*?)\s*'", r"'\1'"),
    )
    for pattern, repl in pair_steps:
        string = re.sub(pattern, repl, string)

    # miscellaneous; longest heading marker first so shorter ones do not
    # break it up
    misc_steps = (
        ("= = = =", "===="),
        ("= = =", "==="),
        ("= =", "=="),
        (" " + chr(176) + " ", chr(176)),
        (" \n", "\n"),
        ("\n ", "\n"),
        (" N ", " 1 "),
        (" 's", "'s"),
    )
    for old, new in misc_steps:
        string = string.replace(old, new)

    return string


# A character in the range U+2E80..U+9FFF followed by one or more whitespace
# characters; the character is captured as group "c".
CJK_WHITESPACE_REGEX = re.compile(r'(?P<c>[\u2E80-\u9FFF])(\s+)')


def remove_cjk_whitespace(s: str) -> str:
    """Strip *s*, then delete any whitespace run that follows a CJK-range character."""
    trimmed = s.strip()
    return CJK_WHITESPACE_REGEX.sub(r'\g<c>', trimmed)


# Matches the Unicode replacement character U+FFFD.
REPLACEMENT_CHARACTER_REGEX = re.compile(r'\uFFFD')


def replace_character(s: str, repl: str) -> str:
    """Return *s* with every U+FFFD replaced by *repl*.

    *repl* is used as a regex replacement template, so backslash escapes
    such as ``\\n`` in it are interpreted by ``re``.
    """
    return REPLACEMENT_CHARACTER_REGEX.sub(repl, s)


# The replacement character itself, as a one-character string.
REPLACEMENT_CHARACTER = '\uFFFD'

def normalize(s):
    """Clean one raw text snippet (unfinished draft).

    Visible steps: strip the input, extract the text from the HTML markup
    with BeautifulSoup (passing ``os.linesep`` to ``get_text``), run a
    whitespace clean-up pass, then call ``remove_cjk_whitespace``.

    NOTE(review): the whitespace clean-up pass is incomplete -- the dangling
    ``if sn`` statement is a syntax error, and nothing is ever appended to
    ``_s``, so ``s = _s`` would discard the text. The intended rule needs to
    be confirmed with the author before this function can be used.
    """
    s = s.strip()
    # remove html tags
    s = BeautifulSoup(s).get_text(os.linesep)
    # remove empty lines and blank whitespace
    _s = ''
    for sn in s.split():
        sn = sn.strip()
        if not sn: continue
        if sn 
    s = _s
    # remove whitespace between Chinese characters
    s = remove_cjk_whitespace(s)
    # detokenizers below are currently disabled
#     s = ptb_detokenizer(s)
#     s = wikitext_detokenizer(s)
    return s

In [92]:
# Scratch check: substitute the U+FFFD replacement character in a sample
# string with a newline (the r'\n' template is expanded by re.sub).
s = "'2010-05-25 09:28:17 安米(心有阳光�他们喜欢"
REPLACEMENT_CHARACTER_REGEX = re.compile(r'\uFFFD')
re.sub(REPLACEMENT_CHARACTER_REGEX, r'\n', s)

"'2010-05-25 09:28:17 安米(心有阳光\n他们喜欢"

In [111]:
# Scratch check: run normalize on a sample answer that contains <br> tags.
s = normalize(' 2010-05-25 09:28:17 安米 (心有阳光，自暖人) 他们喜欢突然冷下来，让你措手不及<br>======<br>是的！<br>前一秒可以跟你疯的要死。下一秒直接转头不认人<br>')
s




In [80]:
# List the answer texts of the loaded record. Some answer entries carry no
# 'anwser' key at all, so those are skipped instead of raising KeyError.
[d['anwser'] for d in data['anwsers'] if 'anwser' in d]

['他们喜欢突然冷下来，让你措手不及',
 '今天双鱼跟我分手，因为年龄的原因，他的父母无法接受，他短期说服不了。。。<br>他能等，我不能等。。。。于是害怕耽误我。。。。<br>怎么办<br>',
 '你给鱼一分的好，他会还你十分的爱，但不鱼很敏感，甚至自欺，不要让他感觉你冷落了他，你越粘他，越在乎他，让他感觉到',
 '我家的鱼腩是属于不喜欢你过多干涉他的 过多的关心他会烦<br>反而比较喜欢把心思花在我身上 逗我玩<br>提到他的事 就比较温吞和犹疑了 我关心几句他会很开心 多了他就会说知道了知道了 然后转移话题<br>我生气的时候也说过 相处不来就别相处了 是很冷静的说的 <br>然后他就会贱贱的说 不要嘛老婆 <br>有时候被我逼急了 他也会不开心 摔电话<br>但很快就会打来电话 百般求饶 <br>唉。。。。。。。他的好脾气是我的最爱']

In [11]:
# Load only the first record of the first repaired file for inspection.
with open(FIXED_FILES[0]) as fp:
    for line in islice(fp, 1):
        data = json.loads(line)
data


{'topic_id': '10935952',
 'url': 'www.douban.com/group/topic/10935952/',
 'date': '2010-04-22 00:48:17',
 'title': '直播鱼之相处心理分析',
 'text': '"这里设定三种。一鱼和你互为真爱，你们只计较过去，担心未来。二你爱鱼很多，它爱你一般。三鱼爱你很多，你烦。（第三种就不用分析了吧）<br>我是第一种。起码现阶段是。以后只可能有第二种趋势。烦劳众夜光鱼不吝赐教。共商鱼际。什么偏题的水楼咱们尽量少好哇。<br><br>我来提第一个。<br>近日进入和鱼的深入了解阶段。什么叫深入了解，就是不停留在纯粹的思想交流上，互相表达爱慕之情（我更多些）上。已经开始打情骂俏，晚上希望能腻歪几句的地步。可是！我鱼说：“守着我”。这种彻头彻尾的索取鬼越亲我越担心他会一得不到足够的爱感就劈腿。我家鱼不比在座的各位逊色在暧昧这方面<br>所以：我的担心是对的不？我该怎么做？我很爱很爱鱼。想和他一辈子。',
 'author': '夏澈澈',
 'anwsers': [{'anwserer': '"杰米"',
   'likes': 0,
   'anwsertime': '"2010-05-25 09:28:17',
   'anwser': '他们喜欢突然冷下来，让你措手不及',
   'directtext': ''},
  {'anwserer': '"多啦ZZZZ梦"',
   'likes': 0,
   'anwsertime': '"2010-05-26 19:38:48',
   'directtext': ''},
  {'anwserer': '"monicpan"',
   'likes': 0,
   'anwsertime': '"2010-05-26 20:36:56',
   'anwser': '今天双鱼跟我分手，因为年龄的原因，他的父母无法接受，他短期说服不了。。。<br>他能等，我不能等。。。。于是害怕耽误我。。。。<br>怎么办<br>',
   'directtext': ''},
  {'anwserer': '"有个外号叫小白"',
   'likes': 0,
   'anwsertime': '"2010-05-26 2

### 输出文件

In [6]:
# Path of the JSON-lines file the evaluation corpus is meant to be written to.
OUTPUT_FILE = '../data/gpt2_huamei_corpus_emotion.jsonl'