# 文段句格式语料转wikipedia格式jsonlines

原始语料的格式是 JSON Lines：每一行是一篇文章，其内容是一个二维数组，第一个维度是段落，第二个维度是句子。

## 代码准备

### Imports

In [1]:
import json
import math
import os
import sys
from contextlib import ExitStack
from datetime import timedelta
from functools import partial
from glob import glob, iglob
from itertools import chain, cycle, islice
from multiprocessing import Pool
from time import time

import numpy as np
import sentencepiece as spm
import tensorflow as tf
from tqdm.auto import tqdm


### Constants

In [2]:

# Sample length budget; proc_line compares it (plus a small tolerance)
# against the number of SentencePiece ids accumulated for a sample.
SEQ_LENGTH = 768

# SentencePiece tokenizer that proc_line uses to measure each sentence's
# length in ids. The model file is loaded once, at module level.
SP = spm.SentencePieceProcessor()
SP.load('../data/spm/gpt2_huamei_corpus_bpe_32k_v2.model')

True

### Functions

In [11]:
def text_files_line_iterator(paths):
    """Lazily yield every line of every file in ``paths``, file by file.

    Files are opened one at a time as UTF-8 text, and each one is closed as
    soon as its last line has been consumed, so no file handle is left for
    the garbage collector to clean up.

    Args:
        paths: Iterable of text file paths.

    Returns:
        An iterator over the lines (trailing newlines included) of all the
        files, in the order the paths are given. Progress is reported per
        file through a tqdm bar.
    """
    # Build the progress bar eagerly so it is created at call time rather
    # than on the first ``next()`` of the returned iterator.
    files = tqdm(paths, '[files     ]', unit='file')

    def _iter_lines():
        for path in files:
            # The context manager guarantees the handle is released before
            # the next file is opened (or when the generator is closed).
            with open(path, encoding='utf8') as fd:
                yield from fd

    return _iter_lines()


def single_text_file_line_count(path, pbar=False):
    """Count the lines of a single UTF-8 text file.

    Args:
        path: Path of the file to read.
        pbar: When true, wrap the file in a tqdm progress bar while counting.

    Returns:
        The number of lines in the file.
    """
    with open(path, encoding='utf8') as stream:
        lines = tqdm(stream) if pbar else stream
        count = 0
        for _ in lines:
            count += 1
    return count


def text_files_line_count(paths):
    """Return the total number of lines across all files in ``paths``.

    Each file is counted by ``single_text_file_line_count`` through a
    multiprocessing ``Pool``; the per-file counts arrive in arbitrary order
    and are summed.

    Args:
        paths: Iterable of text file paths. When it supports ``len()``, the
            length is used as the total of the completion progress bar.

    Returns:
        The sum of the line counts of all files.
    """
    expected = len(paths) if hasattr(paths, '__len__') else None
    with Pool() as workers:
        # First bar tracks paths handed to the pool, second bar tracks
        # per-file results coming back.
        submitted = tqdm(paths, '[files     ]', unit='file')
        counts = workers.imap_unordered(single_text_file_line_count, submitted)
        finished = tqdm(counts, '[files     ]', unit='file', total=expected)
        # Consume the results while the pool is still alive.
        return sum(finished)

In [4]:
def proc_line(line):
    """Split one article (one JSON line) into samples of bounded length.

    The line is parsed as JSON and iterated as a two-level nesting: an
    iterable of paragraphs, each an iterable of sentence strings. Non-blank
    sentences are stripped and concatenated, with no separator, into a
    running sample. When adding the next sentence would push the sample past
    the budget, the sample is flushed and a new one starts with that
    sentence.

    Length is measured in SentencePiece ids (``SP.encode_as_ids``), summed
    per sentence. The budget is ``SEQ_LENGTH + SEQ_LENGTH // 64``. A single
    sentence longer than the budget is not split; it becomes a sample of its
    own.

    Args:
        line: One raw line of the source corpus.

    Returns:
        ``None`` when the line is blank; otherwise a list of
        ``{'text': str, 'length': int}`` dicts, one per sample.
    """
    line = line.strip()
    if not line:
        return
    # Loop-invariant budget: SEQ_LENGTH plus a 1/64 tolerance.
    max_length = SEQ_LENGTH + SEQ_LENGTH // 64
    result = []
    text = ''
    n_text = 0
    paragraphs = json.loads(line)
    for sentence in chain.from_iterable(paragraphs):
        sentence = sentence.strip()
        if not sentence:
            continue
        n_sentence = len(SP.encode_as_ids(sentence))
        if n_text + n_sentence > max_length:
            # Flush only when something is buffered: if the very first
            # sentence already exceeds the budget there is nothing to emit
            # yet, and flushing would write an empty, zero-length sample.
            if n_text:
                result.append({'text': text, 'length': n_text})
            text = sentence
            n_text = n_sentence
        else:
            text += sentence
            n_text += n_sentence
    # Emit whatever is left in the buffer.
    if n_text:
        result.append({'text': text, 'length': n_text})
    return result
    

## 语料文件

In [5]:
# Collect every source corpus file below the corpus directory (recursive
# glob), keeping regular files whose extension marks them as JSON / JSON
# Lines. os.path.splitext() returns the extension WITH its leading dot, so
# every entry of the tuple must start with '.'; entries without the dot can
# never match.
INPUT_FILES = [
    path
    for path in tqdm(iglob(
        "/nfs/server01_public/data/gpt2/output/gpt2_huamei_corpus.json.8g/**/*.*",
        recursive=True
    ))
    if os.path.isfile(path) and os.path.splitext(path)[1].lower() in ('.jsonl', '.jsonlines', '.json', '.jsonline')
]
print(f'源语料文件数：{len(INPUT_FILES):,d}')

# Total number of source lines; used below as the progress-bar total.
total_lines = text_files_line_count(INPUT_FILES)
print(f'源语料行数：{total_lines:,d}')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


源语料文件数：1,734


HBox(children=(IntProgress(value=0, description='[files     ]', max=1734, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='[files     ]', max=1734, style=ProgressStyle(description_widt…



源语料行数：1,791,185


In [6]:
# Path of the JSON Lines file that the conversion step writes its samples to.
OUTPUT_FILE = '../data/gpt2_huamei_corpus_8g.jsonl'

## 执行

In [7]:
# Stream every line of every input file through a process pool running
# proc_line, and write each resulting sample as one JSON object per line.
lines_iterator = text_files_line_iterator(INPUT_FILES)

# The output is opened explicitly as UTF-8: samples are dumped with
# ensure_ascii=False, so the platform's default encoding must not be relied
# on, and the inputs are read as UTF-8 as well.
with Pool() as pool, \
     open(OUTPUT_FILE, 'w', encoding='utf8') as fp:
    it = pool.imap_unordered(
        proc_line,
        tqdm(lines_iterator, 'mapping', total=total_lines),
        chunksize=512
    )
    for result in tqdm(it, 'reducing', total=total_lines):
        # proc_line returns None for blank lines and may return an empty list.
        if result:
            for d in result:
                s = json.dumps(d, ensure_ascii=False)
                print(s, file=fp)

HBox(children=(IntProgress(value=0, description='[files     ]', max=1734, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='mapping', max=1791185, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='reducing', max=1791185, style=ProgressStyle(description_width…






In [14]:
# Count the lines of the generated file (one JSON sample per line), with a
# progress bar enabled via the second argument.
output_lines = single_text_file_line_count(OUTPUT_FILE, True)

print(f'输出语料样本数：{output_lines:,d}')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


输出语料样本数：3,550,426
