# 制作评估语料

注意：输入的 Context tokens 长度要小于模型最大生成序列长度的一半！

这个 notebook 针对的输入 corpus 文件是：

- 每个文档一行
- 文档包含段落数组
- 段落包含句子数组
- 从逗号中间截断，得出输入文本

的 Loose JSON 文件

## 代码准备

### Imports

In [None]:
import csv
import json
import math
import os
import sys
import re
import random
from contextlib import ExitStack
from datetime import timedelta
from functools import partial
from glob import glob, iglob
from itertools import chain, cycle, islice, count, accumulate, compress, repeat
from multiprocessing import Pool
from time import time

import numpy as np
import sentencepiece as spm
from ftfy import fix_text
from tqdm.auto import tqdm


### Constants

In [None]:
# Bounds on the cumulative SentencePiece token count of a context: proc_line
# keeps a cut point only when MIN_LEN < token_count <= MAX_LEN.
MAX_LEN = 128
MIN_LEN = 16

# Path of the SentencePiece model file loaded below.
SPM_MODEL = '../data/spm/gpt2_huamei_corpus_bpe_32k_v2.model'

# Module-level tokenizer; proc_line calls SP.encode_as_ids to count tokens.
SP = spm.SentencePieceProcessor()
SP.load(SPM_MODEL)

### Functions

In [None]:
def text_files_line_iterator(paths):
    """Iterate lazily over the lines of several text files, file after file.

    Args:
        paths: Iterable of file paths. It is wrapped in a ``tqdm`` progress
            bar that advances once per file.

    Returns:
        An iterator yielding every line (line ending included) of every
        file, in the order given by *paths*.

    Each file is opened only when iteration reaches it and is closed as soon
    as its last line has been yielded, so handles are not left open until
    garbage collection.
    """
    def _lines(progress):
        for path in progress:
            # The context manager closes the file once it is exhausted (or
            # when the generator itself is closed early).
            with open(path) as fd:
                yield from fd

    # Create the progress bar at call time rather than on first iteration.
    return _lines(tqdm(paths, '[iter files]', unit='file'))


def single_text_file_line_count(path, show_progress_bar=False):
    """Return the number of lines in the text file at *path*.

    When *show_progress_bar* is true, the file is iterated through a
    ``tqdm`` progress bar while counting.
    """
    with open(path) as fd:
        lines = fd
        if show_progress_bar:
            lines = tqdm(fd)
        n_lines = 0
        for _ in lines:
            n_lines += 1
    return n_lines
        

def text_files_line_count(paths):
    """Return the total number of lines across all files in *paths*.

    Files are counted in parallel by a process pool, one task per file
    (``single_text_file_line_count``), and the per-file counts are summed.
    Two ``tqdm`` bars report the progress of dispatching and of collecting.
    """
    # The number of files is only known for sized inputs; generators and
    # other unsized iterables get a progress bar without a total.
    try:
        n_files = len(paths)
    except (AttributeError, TypeError):
        n_files = None

    with Pool() as pool:
        per_file_counts = pool.imap_unordered(
            single_text_file_line_count,
            tqdm(paths, '[map: files]', unit='file'),
        )
        progress = tqdm(
            per_file_counts,
            '[reduce: sum lines]',
            unit='file',
            total=n_files,
        )
        return sum(progress)


## 语料文件

### 列出输入文件

In [None]:
# Alternative input selection, kept for reference: collect every JSON-like
# file found by a glob pattern.
# NOTE(review): os.path.splitext() returns the extension with its leading
# dot, so the dot-less entries ('json', 'jsonline') in the tuple below can
# never match — add the dots before re-enabling this.
# INPUT_FILES = [
#     path
#     for path in tqdm(iglob(
#         "/nfs/server01_public/豆瓣/情感相关的小组/data.json/*",
#         recursive=True
#     ))
#     if os.path.isfile(path) and os.path.splitext(path)[1].lower() in ('json', '.jsonl', '.jsonlines', 'json', 'jsonline')
# ]

# Corpus files to process; proc_line parses each line as one JSON document.
INPUT_FILES = [
    '/nfs/server01_public/data/gpt2/output/xinli_20191016.jsonl'
]

# Report how many source files were selected.
print(f'源语料文件数：{len(INPUT_FILES):,d}')

### 统计输入文件总行数

In [None]:
%%time

# Count the lines of all input files; the processing cell passes this total
# to its progress bars.
total_lines = text_files_line_count(INPUT_FILES)
print(f'源语料行数：{total_lines:,d}')

## 处理

我们目前的评估目标

1. 进行人工评估，输出列表文件进行比对
1. 输入文字为回答数据的逗号前的半句

### 处理函数定义

从段落中拆出开头的几句作为预测上下文

In [None]:
# Matches the (possibly empty) run of whitespace that directly follows a
# comma or a colon; splitting on it leaves the punctuation attached to the
# end of the preceding part.
RE_SPLIT_PARAGRAPH = re.compile(r'(?<=[,:,])\s*')


def split_text(s):
    """Split *s* into parts, cutting right after every comma or colon.

    The delimiter stays at the end of the part it closes, and whitespace
    immediately following the delimiter is dropped. A string without any
    delimiter comes back as a one-element list.
    """
    return RE_SPLIT_PARAGRAPH.split(s)


def proc_line(line):
    """Turn one corpus line into evaluation samples.

    *line* is parsed as JSON and iterated as a sequence of paragraphs, each
    paragraph being a sequence of sentence strings. For every paragraph the
    stripped sentences are concatenated, cleaned with ``fix_text`` and split
    with ``split_text``. A cut point is then drawn at random among the part
    boundaries whose cumulative token count (``SP.encode_as_ids`` per part)
    is greater than ``MIN_LEN`` and at most ``MAX_LEN``, weighted by the
    base-2 logarithm of that count. The last part is never included in the
    context, so it always ends up in ``post_text``.

    Args:
        line: One line of the input corpus.

    Returns:
        A list of ``{'text': ..., 'post_text': ...}`` dicts, one for each
        paragraph that has at least one valid cut point. The list is empty
        for a blank line.

    Raises:
        json.JSONDecodeError: If a non-blank *line* is not valid JSON.
    """
    result = []
    line = line.strip()
    if not line:
        return result
    min_length = MIN_LEN
    max_length = MAX_LEN
    paragraphs = json.loads(line)
    for sentences in paragraphs:
        paragraph_text = ''.join(s.strip() for s in sentences)
        paragraph_text = fix_text(paragraph_text).strip()
        parts = split_text(paragraph_text)
        population = []
        weights = []
        # Running token total after each part, excluding the last part.
        token_totals = accumulate(
            len(SP.encode_as_ids(s)) for s in parts[:-1]
        )
        for i, v in enumerate(token_totals):
            if v > max_length:
                # The running total never decreases, so no later boundary
                # can qualify; stop tokenizing the rest of the paragraph.
                break
            if v > min_length:
                population.append(i)
                weights.append(math.log2(v))
        if population:
            # Longer contexts are favoured, but only logarithmically.
            cut = random.choices(population, weights=weights)[0]
            result.append({
                'text': ''.join(parts[:cut + 1]),
                'post_text': ''.join(parts[cut + 1:]),
            })

    return result


    

### 输出文件 tsv/json

In [None]:
# Destination of the generated samples, written one JSON object per line.
OUTPUT_FILE = '../data/eval.json'

### 执行

并发执行

In [None]:

# Running count of samples written to OUTPUT_FILE.
n_total_samples = 0

with Pool() as pool, open(OUTPUT_FILE, 'w') as fp:
    # Feed the corpus lines to the workers in chunks; results arrive in
    # arbitrary order, one list of samples per input line.
    per_line_samples = pool.imap_unordered(
        proc_line,
        tqdm(
            text_files_line_iterator(INPUT_FILES),
            '[map lines]',
            total=total_lines,
        ),
        chunksize=512,
    )
    for samples in tqdm(per_line_samples, '[reduce all]', total=total_lines):
        # One JSON object per output line, non-ASCII text kept as-is.
        fp.writelines(
            json.dumps(sample, ensure_ascii=False) + '\n'
            for sample in samples
        )
        n_total_samples += len(samples)

print(f'得到语料样本数：{n_total_samples:,d}')

In [None]:
# Show the line count and on-disk size of the generated file.
! wc -l {OUTPUT_FILE} && \
  du -h {OUTPUT_FILE}

## 采样

我们也许不需要这么多样本进行 evaluation，所以，进行采样

In [None]:
# Count the samples by counting the lines of the output file, with a
# progress bar.
n_total_samples = single_text_file_line_count(OUTPUT_FILE, show_progress_bar=True)

print(f'样本数：{n_total_samples:,d}.')

设定需要的采样数：

In [None]:
# Number of samples to keep for evaluation.
n_samples = 1000

# Echo the requested size and the sampling rate, as a percentage of all samples.
print(f'n_sample={n_samples:,d} sample_rate={n_samples/n_total_samples*100:.3}%')

计算随机采样 mask:

In [None]:
%%time

# Boolean mask with one entry per line of OUTPUT_FILE: set the first
# n_samples entries, then shuffle in place so the selected positions are random.
lines_mask = np.zeros(n_total_samples, dtype=bool)
lines_mask[:n_samples] = True
np.random.shuffle(lines_mask)

采样到新的文件

In [None]:
# Sanity check: the mask must have exactly one entry per line of OUTPUT_FILE.
assert lines_mask.shape[0]==n_total_samples

# Name the sampled file after the source file, tagged with the sample size,
# e.g. "eval.json" -> "eval-spl_1000.json".
root, ext = os.path.splitext(OUTPUT_FILE)
SPL_FILE = root + f'-spl_{n_samples}' + ext

with open(OUTPUT_FILE) as fp_src, open(SPL_FILE, 'w') as fp_dst:
    # compress() yields only the lines whose mask entry is True.
    for s in tqdm(compress(fp_src, lines_mask), 'sampling', total=n_samples):
        print(s.strip(), file=fp_dst)

# The mask has one entry per sample; drop it once it is no longer needed.
del lines_mask

In [None]:
# Show the line count and on-disk size of the sampled file.
! wc -l {SPL_FILE} && \
  du -h {SPL_FILE}