# 文段句格式语料转wikipedia格式jsonlines

原始语料的格式是 JSON Lines：每一行是一篇文章，其内容是一个二维数组，第一个维度是段落，第二个维度是句子。

## 代码准备

### Imports

In [1]:
import json
import math
import os
import sys
from contextlib import ExitStack
from datetime import timedelta
from functools import partial
from glob import glob, iglob
from itertools import chain, cycle, islice, count
from multiprocessing import Pool
from time import time

import numpy as np
import sentencepiece as spm
from tqdm.auto import tqdm


### Constants

In [2]:
# Base token budget of one output sample; proc_line derives its packing limit from it.
SEQ_LENGTH = 1024
# Minimum token length of a kept sample: shorter samples are dropped when the
# output file is written, and half of this value is the slack that proc_line
# allows on top of SEQ_LENGTH.
MIN_CTX_LEN = 128

# Path of the SentencePiece model file loaded below.
SPM_MODEL = '../data/spm/gpt2_huamei_corpus_bpe_32k_v2.model'


# Module-level tokenizer; proc_line reads it as a global to count token ids.
SP = spm.SentencePieceProcessor()
SP.load(SPM_MODEL)

True

### Functions

In [3]:
def text_files_line_iterator(paths):
    """Lazily iterate over every line of every text file in ``paths``.

    Files are visited in the order given and opened one at a time; each file
    is closed as soon as its last line has been produced (or when the
    returned iterator is closed / garbage collected), so no file handle
    outlives its use.

    Args:
        paths: Iterable of file paths. It is wrapped in a tqdm progress bar
            labelled ``[iter files]``.

    Returns:
        An iterator over the lines (including their line terminators) of all
        files, concatenated.
    """
    # Create the progress bar eagerly, at call time, so that it is displayed
    # before any bar created later by the caller.
    progress = tqdm(paths, '[iter files]', unit='file')

    def _iter_lines():
        for path in progress:
            # JSON text is UTF-8 by specification; do not depend on the locale.
            with open(path, encoding='utf-8') as fd:
                yield from fd

    return _iter_lines()


def single_text_file_line_count(path, show_progress_bar=False):
    """Count the lines of a single text file.

    Args:
        path: Path of the file to read.
        show_progress_bar: When true, the file iteration is wrapped in a tqdm
            progress bar.

    Returns:
        The number of lines in the file.
    """
    with open(path) as handle:
        lines = handle
        if show_progress_bar:
            lines = tqdm(handle)
        n_lines = 0
        for _ in lines:
            n_lines += 1
        return n_lines
        

def text_files_line_count(paths):
    """Sum the line counts of several text files using a process pool.

    Args:
        paths: Iterable of file paths. If it supports ``len()``, its length is
            used as the total of the ``[sum files ]`` progress bar; otherwise
            that bar has no total.

    Returns:
        The total number of lines over all files.
    """
    try:
        n_files = len(paths)
    except (AttributeError, TypeError):
        # e.g. a generator: the number of files is not known in advance.
        n_files = None

    with Pool() as workers:
        feed = tqdm(paths, '[map files ]', unit='file')
        per_file_counts = workers.imap_unordered(single_text_file_line_count, feed)
        progress = tqdm(per_file_counts, '[sum files ]', unit='file', total=n_files)
        grand_total = 0
        for file_count in progress:
            grand_total += file_count
        return grand_total


def proc_line(line):
    """Split one article (one JSON Lines record) into token-bounded samples.

    The record is a 2-D JSON array: a list of paragraphs, each a list of
    sentence strings. Sentences are stripped, empty ones are skipped, and the
    rest are concatenated (without separator) into a running text. When adding
    the next sentence would bring the running token count to the limit
    ``SEQ_LENGTH + MIN_CTX_LEN // 2`` or beyond, the running text is emitted
    and a new one is started with that sentence.

    A single sentence that by itself reaches the limit is not split; it is
    emitted whole as its own sample.

    Args:
        line: One line of the input file. Blank lines yield no samples.

    Returns:
        A list of ``{'text': str, 'length': int}`` dicts, where ``length`` is
        the sum of the per-sentence token counts given by the global
        SentencePiece processor ``SP``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    result = []
    line = line.strip()
    if not line:
        return result
    paragraphs = json.loads(line)
    # Half of MIN_CTX_LEN is tolerated as slack on top of SEQ_LENGTH.
    limit = SEQ_LENGTH + MIN_CTX_LEN // 2
    text = ''
    n_text = 0
    for sentence in chain.from_iterable(paragraphs):
        sentence = sentence.strip()
        if not sentence:
            continue
        n_sentence = len(SP.encode_as_ids(sentence))
        if n_text + n_sentence < limit:
            text += sentence
            n_text += n_sentence
        else:
            # Flush only when something has been accumulated; otherwise an
            # oversized first sentence would produce an empty
            # {'text': '', 'length': 0} sample.
            if n_text:
                result.append({'text': text, 'length': n_text})
            text = sentence
            n_text = n_sentence
    if n_text:
        result.append({'text': text, 'length': n_text})
    return result
    

## 语料文件

### 输入文件

### 列出输入文件

In [4]:
# Collect every regular file under the corpus directory whose extension marks
# it as JSON / JSON Lines. os.path.splitext() returns the extension WITH its
# leading dot, so every entry of the tuple must start with '.'; entries without
# the dot can never match.
INPUT_FILES = [
    path
    for path in tqdm(iglob(
        "/nfs/server01_public/data/gpt2/output/gpt2_huamei_corpus.json.8g/**/*.*",
        recursive=True
    ))
    if os.path.isfile(path) and os.path.splitext(path)[1].lower() in ('.jsonl', '.jsonlines', '.json', '.jsonline')
]
print(f'源语料文件数：{len(INPUT_FILES):,d}')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


源语料文件数：379


### 统计输入文件总行数

In [5]:
%%time

# Count the lines of all input files up front; the result is passed as the
# `total=` of the tqdm progress bars when the lines are processed.
total_lines = text_files_line_count(INPUT_FILES)
print(f'源语料行数：{total_lines:,d}')

HBox(children=(IntProgress(value=0, description='[map files ]', max=379, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='[sum files ]', max=379, style=ProgressStyle(description_width…



源语料行数：1,710,236
CPU times: user 142 ms, sys: 48 ms, total: 190 ms
Wall time: 2.5 s


### 输出文件

In [6]:
# Destination JSON Lines file. The processing step opens it with mode 'w',
# so an existing file at this path is overwritten.
OUTPUT_FILE = '../data/gpt2_huamei_corpus_emotion.jsonl'

## 执行

In [7]:
# Stream every input line through proc_line in a process pool and write the
# resulting samples to OUTPUT_FILE, one JSON object per line.
lines_iterator = text_files_line_iterator(INPUT_FILES)
n_samples = 0  # samples written to the output file
n_discard = 0  # samples dropped because they are shorter than MIN_CTX_LEN

# The records are dumped with ensure_ascii=False, so the file contains raw
# non-ASCII text: open it explicitly as UTF-8 instead of relying on the
# locale's default encoding.
with Pool() as pool, \
     open(OUTPUT_FILE, 'w', encoding='utf-8') as fp:
    # imap_unordered: results arrive in completion order, so the order of the
    # output lines does not follow the order of the input lines.
    it = pool.imap_unordered(
        proc_line,
        tqdm(lines_iterator, 'map lines', total=total_lines),
        chunksize=512
    )
    for result in tqdm(it, 'reduce all', total=total_lines):
        for d in result:
            if d['length'] < MIN_CTX_LEN:
                n_discard += 1
                continue
            s = json.dumps(d, ensure_ascii=False)
            n_samples += 1
            print(s, file=fp)

print(f'得到语料样本数：{n_samples:,d}')
print(f'抛弃语料样本数：{n_discard:,d}')

HBox(children=(IntProgress(value=0, description='[iter files]', max=379, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='map all lines', max=1710236, style=ProgressStyle(description_…

HBox(children=(IntProgress(value=0, description='reduce all', max=1710236, style=ProgressStyle(description_wid…




得到语料样本数：2,582,523
抛弃语料样本数：200,324


### 查看输出文件

In [8]:
# Count the lines of the output file with `wc -l`; IPython captures the
# command's output lines in x.
x = !wc -l {OUTPUT_FILE}
# Rebind n_samples to the count reported by wc (first whitespace-separated field).
n_samples = int(x[0].split()[0])

print(x[0])

# Show the on-disk size of the output file.
!du -h {OUTPUT_FILE}

2582523 ../data/gpt2_huamei_corpus_emotion.jsonl
8.0G	../data/gpt2_huamei_corpus_emotion.jsonl


## 长度统计

## 统计绘图软件包

采用 holoviz

见：  <http://holoviews.org/user_guide/Large_Data.html>

**如果没有安装，运行**：

In [None]:
# Install the holoviz package through the %conda magic; run only if it is
# missing. NOTE(review): no version is pinned here.
%conda install -y -c defaults -c conda-forge -c pyviz holoviz

In [None]:
import json

import pandas as pd
import holoviews as hv
import hvplot.pandas  # noqa

import datashader as ds
import datashader.transfer_functions as tf

from holoviews.operation.datashader import datashade, shade, spread, dynspread, rasterize, spread
from tqdm.auto import tqdm

# hv.extension('bokeh')

def iter_corpus_length():
    """Yield ``{'length': <value>}`` for each record of the output file.

    Reads ``OUTPUT_FILE`` line by line, parses each line as JSON and keeps
    only its ``'length'`` field. Progress is shown with a tqdm bar whose total
    is the global ``n_samples``.
    """
    with open(OUTPUT_FILE) as corpus:
        yield from (
            {'length': json.loads(raw)['length']}
            for raw in tqdm(corpus, total=n_samples)
        )


In [10]:
# Materialize the per-sample lengths into a DataFrame with a single 'length' column.
df = pd.DataFrame(iter_corpus_length())

HBox(children=(IntProgress(value=0, max=2582523), HTML(value='')))




In [11]:
# Summary statistics of the 'length' column (token counts of the written samples).
df.describe()

Unnamed: 0,length
count,2582523.0
mean,772.1102
std,326.2599
min,128.0
25%,482.0
50%,898.0
75%,1069.0
max,12984.0


In [12]:
# Plot sample length against 'index' as points, shaded with datashader and
# then passed through spread().
# NOTE(review): presumably 'index' resolves to the DataFrame index — confirm.
points = hv.Points(df, ['index', 'length'])
spread(datashade(points))