# 心理咨询论坛 Tokens 数分析

统计问题和回答的 tokens 数分布

## Imports

In [1]:
import os
import json
from copy import copy
from functools import partial
from itertools import chain
from multiprocessing import Pool

import pandas as pd
import sentencepiece as spm
from tqdm.auto import tqdm

from IPython.display import display

## Corpus

In [2]:
# Path to the crawled Q&A corpus. The cells below read this file line by line
# and parse each line with json.loads, i.e. it is treated as JSON Lines.
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this exact file exists.
corpus_json_path = '/home/Public/yiren-scrapy-crawlers/data/[xinli001+jiandanxinli]-qa[191010].jsonl'

In [3]:
# Count the corpus lines once up front; `total` is reused further down as the
# expected length of the progress bars. The file is opened in a `with` block so
# the handle is closed as soon as counting finishes instead of being leaked.
with open(corpus_json_path) as fp:
    total = sum(1 for _ in tqdm(fp, unit='line'))
print(f'总样本数: {total:,d}')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


总样本数: 200,526


### 预览语料数据格式

In [4]:
# Parse and display only the first line of the corpus to illustrate the record
# layout; nothing is displayed when the file has no lines.
with open(corpus_json_path) as fp:
    first_line = next(fp, None)
    if first_line is not None:
        data = json.loads(first_line)
        display(data)

{'id': 355355,
 'url': 'http://www.jiandanxinli.com/questions/355355',
 'crawl_time': '2019-03-20 09:56:01',
 'title': '想做真正的自己',
 'text': '感觉自己一直像个演员，演绎一个大家都喜欢至少不能讨厌的人，很痛苦，不能放开做自己。',
 'time': '2019年3月19日 19:41',
 'read_num': None,
 'tags': [],
 'answers': [{'text': '恭喜你，想做真正的自己，这是好事啊！完全赞同，每天像个演员，演一个不是你自己的自己该是多么辛苦啊！可能你本身就是个很好很善良的人，但是你感觉从来没有被接纳，欣赏过你的本我，所以觉得自己没有价值，不被人喜欢，就连你自己也不喜欢自己。从和自己做朋友开始，慢慢接纳，喜欢，爱护自己，那么你在人前就可以慢慢地做回自己。不容易，但可以尝试。真正的自己是独一无二的，演绎的自己是赝品，是吗？',
   'user_name': '蒋世平',
   'user_title': '心理咨询师',
   'like_num': 0,
   'reward': None,
   'time': '2019-03-19 23:58:11',
   'comments': [],
   'comment_num': 0}],
 'answer_num': 1}

## Tokenizer

In [5]:
# SentencePiece model file used to tokenize the corpus text. The path is
# relative, so it is resolved against the kernel's working directory.
spm_model_path = '../data/spm/gpt2_huamei_corpus_bpe_32k_v2.model'

# Module-level tokenizer instance; get_tokens() below reads it as a global.
tokenizer = spm.SentencePieceProcessor()
# Last expression of the cell, so its return value is shown as the cell output.
tokenizer.load(spm_model_path)

True

## 计算 Tokens 数量

In [6]:
def _count_tokens(text):
    """Return how many token ids the module-level ``tokenizer`` yields for ``text``.

    ``None`` (a JSON ``null``) is treated as an empty string, and surrounding
    whitespace is stripped before encoding.
    """
    return len(tokenizer.encode_as_ids((text or '').strip()))


def get_tokens(line):
    """Compute token-length records for one JSON line of the corpus.

    Args:
        line: One line of the corpus file, expected to hold a JSON object with
            optional ``title``, ``text`` and ``answers`` fields, where each
            answer is an object with an optional ``text`` field.

    Returns:
        A list of dicts with the keys ``question_title_length``,
        ``question_text_length`` and ``answer_text_length``:

        * an empty list when the line is blank;
        * one dict per answer when the question has answers;
        * a single dict with ``answer_text_length`` set to 0 otherwise.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    line = line.strip()
    if not line:
        return []
    data = json.loads(line)

    # Use .get(key) and let the helper map None to '': a `.get(key, '')`
    # default only covers a missing key, not a key that is present with a JSON
    # null, which would otherwise crash on `.strip()`.
    question_lengths = {
        'question_title_length': _count_tokens(data.get('title')),
        'question_text_length': _count_tokens(data.get('text')),
    }

    answers = data.get('answers') or []
    if not answers:
        return [{**question_lengths, 'answer_text_length': 0}]

    # The question lengths are repeated on every answer row so each row is
    # self-contained.
    return [
        {**question_lengths, 'answer_text_length': _count_tokens(answer.get('text'))}
        for answer in answers
    ]



def iget_tokens():
    """Yield the token-length records of every corpus line, tokenized in parallel.

    The corpus file is streamed line by line into a default-sized
    ``multiprocessing.Pool`` that runs ``get_tokens`` on each line. Results are
    consumed with ``imap_unordered``, so records are yielded in completion
    order rather than file order. Two progress bars are shown: ``map`` tracks
    lines handed to the pool, ``reduce`` tracks per-line results received; both
    use the module-level ``total`` as their expected length.
    """
    with open(corpus_json_path) as corpus_file, Pool() as workers:
        lines = tqdm(corpus_file, desc='map', total=total)
        per_line_records = tqdm(
            workers.imap_unordered(get_tokens, lines),
            desc='reduce',
            total=total,
        )
        # Each result is a (possibly empty) list of records; flatten it.
        for records in per_line_records:
            yield from records



# Drain the generator into a DataFrame: one row per record yielded by
# iget_tokens(). Row order follows the unordered pool results, so it is not
# the order of the corpus file.
df_tokens = pd.DataFrame(iget_tokens())

HBox(children=(IntProgress(value=0, description='map', max=200526, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='reduce', max=200526, style=ProgressStyle(description_width='i…





In [7]:
# Summary statistics of the three token-length columns.
# NOTE(review): in the recorded run the maximum answer length (40,833 tokens)
# is far above the 75th percentile (81) — worth inspecting as an outlier.
df_tokens.describe()

Unnamed: 0,answer_text_length,question_text_length,question_title_length
count,586431.0,586431.0,586431.0
mean,69.384792,112.099161,10.751984
std,125.887844,121.378417,4.304485
min,0.0,0.0,0.0
25%,14.0,44.0,7.0
50%,35.0,76.0,12.0
75%,81.0,145.0,14.0
max,40833.0,5043.0,42.0


## 绘图

In [None]:
import holoviews as hv
import hvplot.pandas  # noqa

import datashader as ds
import datashader.transfer_functions as tf

from holoviews.operation.datashader import datashade, shade, spread, dynspread, rasterize


In [None]:
# Point plot of question-title token counts, passed through datashade and
# dynspread.
# NOTE(review): 'index' is not a column produced by get_tokens(); presumably
# HoloViews resolves it to the DataFrame's row index — confirm, since this cell
# has no recorded execution. Row order comes from imap_unordered, so the x-axis
# is arbitrary arrival order, not corpus order.
points = hv.Points(df_tokens, ['index', 'question_title_length'])
dynspread(datashade(points))

In [None]:
# Point plot of question-body token counts, passed through datashade and
# dynspread.
# NOTE(review): 'index' is not a column produced by get_tokens(); presumably
# HoloViews resolves it to the DataFrame's row index — confirm, since this cell
# has no recorded execution. Row order comes from imap_unordered, so the x-axis
# is arbitrary arrival order, not corpus order.
points = hv.Points(df_tokens, ['index', 'question_text_length'])
dynspread(datashade(points))

In [None]:
# Point plot of answer token counts, passed through datashade and dynspread.
# NOTE(review): 'index' is not a column produced by get_tokens(); presumably
# HoloViews resolves it to the DataFrame's row index — confirm, since this cell
# has no recorded execution. Row order comes from imap_unordered, so the x-axis
# is arbitrary arrival order, not corpus order.
points = hv.Points(df_tokens, ['index', 'answer_text_length'])
dynspread(datashade(points))