In [27]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import json as js
import seaborn as sns
import multiprocessing as mp
import re
import os
import itertools
import jieba
import jieba.posseg as posseg
from jieba.analyse import extract_tags
from jieba.analyse import textrank
from collections import defaultdict
from zhon.hanzi import punctuation as chinese_punc
from string import punctuation as english_punc
from pandarallel import pandarallel
# Set up pandarallel; the `parallel_apply` calls later in the notebook rely on it.
pandarallel.initialize()
plt.rcParams['font.sans-serif'] = ['SimHei']  # use the SimHei font so that Chinese labels render correctly
plt.rcParams['axes.unicode_minus'] = False  # make the minus sign render correctly
# Silencing warnings might be a bad habit, so it is left disabled:
# import warnings
# warnings.filterwarnings('ignore')

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [10]:
# Locations of the raw train/test JSON-lines files and of a small sample of the training file.
train_data_path = './data/doc_quality_data_train.json'
test_data_path = './data/doc_quality_data_test.json'
train_sample_path = './data/doc_sample_data_train.json'
# Used as loop bounds when the per-document result files are merged.
test_size = 45286
train_size = 576454

Different types of articles seem to be written in quite different styles, for example in sentence length and in the number of paragraphs (some writers like to write paragraphs that contain only one sentence).

## Extract writing characteristics

In [3]:
# Write a 100-line sample of the training file once; skipped when the sample exists.
if not os.path.exists(train_sample_path):
    with open(train_data_path, 'r') as src:
        # readline() returns '' past end-of-file, so a short input just adds empty entries.
        s = [src.readline() for _ in range(100)]
    with open(train_sample_path, 'w') as dst:
        dst.writelines(s)

In [4]:
# what we need
# NOTE(review): these module-level lists are not referenced in the visible cells;
# extract_WC uses local variables with the same names instead.

# title
title_num_char = [] # title length in characters
title_num_word = [] # number of words in the title
title_num_keyword = [] # number of keywords in the title
title_ratio_keyword = [] # keywords in the title / words in the title

# character level
num_char = [] # total number of characters
num_char_del_stop = [] # number of characters after removing stop words
num_uni_char = [] # number of distinct characters
ratio_uni_char = [] # distinct characters / total characters
num_punc = [] # number of punctuation marks
ratio_punc = [] # punctuation marks / total characters

# word level, after word segmentation
num_word = [] # total number of words
num_uni_word = [] # number of distinct words
ratio_uni_word = [] # distinct words / total words

# part of speech
num_noun = [] # number of nouns
num_adj = [] # number of adjectives
num_verb = [] # number of verbs
num_adv = [] # number of adverbs
num_conj = [] # number of conjunctions
num_numeral = [] # number of numerals

ratio_noun = [] # nouns / total words
ratio_adj = [] # adjectives / total words
ratio_verb = [] # verbs / total words
ratio_adv = [] # adverbs / total words
ratio_conj = [] # conjunctions / total words
ratio_numeral = [] # numerals / total words

# paragraphs
num_para = [] # number of paragraphs
ratio_char_para = [] # total characters / number of paragraphs
ratio_word_para = [] # total words / number of paragraphs

In [6]:
# All punctuation characters (Chinese first, then ASCII) as one string.
punc = chinese_punc + english_punc
# One stop word per line. Only the line terminator is stripped: the earlier `c[:-1]`
# slice also cut the last character off a final line that has no trailing newline.
# The encoding is explicit so the result does not depend on the platform default.
with open('./stopwords.txt', 'r', encoding='utf-8') as f:
    stop_words = [line.rstrip('\n') for line in f]
# Every single punctuation character is treated as a stop word as well.
stop_words = stop_words + list(punc)
print(
    punc,
    stop_words[0:10],
    sep = '\n'
)

＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､　、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·！？｡。!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['!', '"', '#', '$', '%', '&', "'", '(', ')', '*']


In [7]:
# Settings passed to jieba's extract_tags in extract_WC: number of keywords to keep
# and the part-of-speech tags a keyword may have.
topK = 10
allowPOS = ['ns', 'n', 'vn', 'v','nr']
baseNum = 10000  # NOTE(review): not referenced in the visible cells
# The string below lists the POS-flag prefixes that are counted per document.
# Being the last expression of the cell, it is also echoed as the cell output.
'''
n* : noun,
v* : verb,
a* : adj,
d* : adv,
m* : numeral,
c* : conjucture
'''

'\nn* : noun,\nv* : verb,\na* : adj,\nd* : adv,\nm* : numeral,\nc* : conjucture\n'

In [8]:
# Field order of the record written by extract_WC. Records whose JSON has a
# 'doctype' key carry it as one extra trailing field.
cols = ['id', 'title', 'category', 'title_num_char', 'title_num_word',
       'title_num_keyword', 'num_char', 'num_char_del_stop', 'num_uni_char',
       'num_punc', 'num_word', 'num_uni_word', 'num_noun', 'num_adj',
       'num_verb', 'num_adv', 'num_conj', 'num_numeral', 'num_para'] # + ['doctype']

def extract_WC(cnt:int, l:str, template:str):
    """Compute writing-style counts for one JSON document and write them to a file.

    Args:
        cnt: Index of the document; formatted into ``template`` to get the output path.
        l: One line of the input file: a JSON object with the keys ``id``, ``title``,
            ``category`` and ``body``, and optionally ``doctype``.
        template: Output path pattern with one ``{}`` placeholder for ``cnt``.

    Returns:
        None. The record is written as a single comma-joined line, without a trailing
        newline and without quoting, so a title containing commas produces extra
        fields. Nothing is computed when the output file already exists.
    """
    out_path = template.format(cnt)
    if os.path.exists(out_path):
        return

    line = js.loads(l)
    title = line['title']
    body = line['body']

    title_num_char = len(title)
    num_char = len(body)
    num_uni_char = len(set(body))

    pos_body = list(posseg.cut(body))
    num_uni_word = len(set(pos_body))
    num_word = len(pos_body)

    # A set makes each membership test constant-time instead of scanning the
    # whole stop-word list once per token.
    stop_set = set(stop_words)
    pos_counts = defaultdict(int)  # keyed by the first letter of the POS flag
    num_non_stop = 0
    for (w, p) in pos_body:
        pos_counts[p[0]] += 1
        if w not in stop_set:
            num_non_stop += 1

    num_noun = pos_counts['n']
    num_adj = pos_counts['a']
    num_adv = pos_counts['d']
    num_verb = pos_counts['v']
    num_conj = pos_counts['c']
    num_numeral = pos_counts['m']
    # Tokens whose flag starts with 'x' are reported as punctuation.
    num_punc = pos_counts['x']
    # NOTE(review): despite its name this is the number of non-stop-word TOKENS,
    # not characters. Left unchanged so that it stays consistent with records
    # that were already written to disk.
    num_char_del_stop = num_non_stop

    kws = extract_tags(
        body,
        topK=topK,
        allowPOS=allowPOS
    )
    title_num_word = len(list(jieba.cut(title)))
    # Occurrences of every extracted body keyword inside the title.
    title_num_keyword = sum(title.count(kw) for kw in kws)

    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    # Paragraphs are separated by newlines, optionally surrounded by whitespace.
    num_para = len(re.findall(r'\s*\n\s*', body.strip())) + 1

    s = [
        line['id'],
        title,
        line['category'],
        title_num_char,
        title_num_word,
        title_num_keyword,
        num_char,
        num_char_del_stop,
        num_uni_char,
        num_punc,
        num_word,
        num_uni_word,
        num_noun,
        num_adj,
        num_verb,
        num_adv,
        num_conj,
        num_numeral,
        num_para
    ]
    if 'doctype' in line:
        s.append(line['doctype'])
    with open(out_path, 'w') as f:
        f.write(','.join(str(field) for field in s))

In [9]:
#test data
path = './data/raw_test_wc.csv'
if not os.path.exists(path):
    pool = mp.Pool(3)
    num = 0
    template = './data/test/test_{}.csv'
    with open(test_data_path, 'r') as f:
        for (cnt, l) in enumerate(f):
            pool.apply_async(func = extract_WC, args = (cnt, l, template))
            num += 1
            if num % 10000 == 0:
                print(f'done with {num} samples')
    pool.close()
    pool.join()

done with 10000 samples
done with 20000 samples
done with 30000 samples
done with 40000 samples


In [13]:
# train data
path = './data/raw_train_wc.csv'
if not os.path.exists(path):
    template = './data/train/train_{}.csv'
    # Workers write one file per document. Create the directory first, otherwise
    # every write fails inside a worker where nobody sees the error.
    os.makedirs(os.path.dirname(template), exist_ok=True)

    def _report_train_error(exc):
        # apply_async discards worker exceptions unless a callback collects them.
        print(f'extract_WC failed: {exc!r}')

    # Leaving the `with` block terminates the pool, also when the loop raises;
    # on the normal path close()/join() have already waited for all tasks.
    # (The unused shared counter `mp.Value('i', 0)` was dropped.)
    with mp.Pool(3) as pool:
        with open(train_data_path, 'r') as f:
            for (cnt, l) in enumerate(f):
                pool.apply_async(
                    func = extract_WC,
                    args = (cnt, l, template),
                    error_callback = _report_train_error
                )
        pool.close()
        pool.join()

Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache


## Merge files

In [76]:
# test files
path = './data/raw_test_wc.csv'
cols = ['id', 'title', 'category', 'title_num_char', 'title_num_word',
       'title_num_keyword', 'num_char', 'num_char_del_stop', 'num_uni_char',
       'num_punc', 'num_word', 'num_uni_word', 'num_noun', 'num_adj',
       'num_verb', 'num_adv', 'num_conj', 'num_numeral', 'num_para'] # + ['doctype']
ncols = len(cols)
df_list = []
ab_list = []
if not os.path.exists(path):
    for i in range(0, test_size):
        if not os.path.exists(f'./data/test/test_{i}.csv'):
            print(f'test_{i} not found')
            continue
        with open(f'./data/test/test_{i}.csv', 'r') as t:
            l = t.read().split(',')
            if len(l) == ncols:
                df_list.append(l)
            else:
                ab_list.append(l)
        if i % 10000 == 0:
            print(f'done with {i}/{test_size}')

print(len(ab_list), len(df_list))

#deal with abnormal samples
for l in ab_list:
    p = [l[0]] + [','.join(l[1 : len(l)-17])] + l[len(l)-17:]
    df_list.append(p)
print(len(df_list))

df = pd.DataFrame(
    df_list,
    columns = cols
)

df.to_csv(path, index = False, header = True)

done with 0/45286
done with 10000/45286
done with 20000/45286
done with 30000/45286
done with 40000/45286
test_45285 not found
4638 40647
45285


In [19]:
# Concatenate the per-document training records into one CSV file.
# NOTE(review): records are copied verbatim, so a title that contains commas
# still yields a row with extra fields.
path = './data/raw_train_wc.csv'
if not os.path.exists(path):
    # `with` closes the output file even when reading a record file fails.
    with open(path, 'w') as f:
        f.write(','.join(cols + ['doctype']) + '\n')
        # train_size replaces the hard-coded 576453, which left out the last record.
        for i in range(0, train_size):
            with open(f'./data/train/train_{i}.csv', 'r') as t:
                # Record files are written without a trailing newline; add one so
                # that every record ends up on its own row.
                f.write(t.read().rstrip('\n') + '\n')
            if i % 10000 == 0:
                print(f'done with {i}/{train_size}')

In [72]:
# Read the per-document training records, unless the merged file already exists.
path = './data/raw_train_wc.csv'
cols = [
    'id', 'title', 'category',
    'title_num_char', 'title_num_word', 'title_num_keyword',
    'num_char', 'num_char_del_stop', 'num_uni_char', 'num_punc',
    'num_word', 'num_uni_word',
    'num_noun', 'num_adj', 'num_verb', 'num_adv', 'num_conj', 'num_numeral',
    'num_para', 'doctype',
]
ncols = len(cols)
df_list = []  # records with exactly ncols fields
ab_list = []  # records with any other number of fields
if not os.path.exists(path):
    for i in range(train_size):
        record_path = f'./data/train/train_{i}.csv'
        if os.path.exists(record_path):
            with open(record_path, 'r') as t:
                l = t.read().split(',')
            # Route the record by its field count.
            (df_list if len(l) == ncols else ab_list).append(l)
            if i % 100000 == 0:
                print(f'done with {i}/{train_size}')
        else:
            print(f'train_{i} not found')

print(len(ab_list), len(df_list))


done with 0/576454
done with 100000/576454
done with 200000/576454
done with 300000/576454
done with 400000/576454
done with 500000/576454
26651 549803


In [73]:
#deal with abnormal samples
for l in ab_list:
    p = [l[0]] + [','.join(l[1 : len(l)-18])] + l[len(l)-18:]
    #print(l,p,sep = '\n',end = '\n\n')
    df_list.append(p)
print(len(df_list))

576454


In [74]:
# Build the merged training table and save it. When no records were loaded and the
# merged file is already on disk, reuse that file: writing here would otherwise
# replace it with an empty table.
if df_list or not os.path.exists(path):
    df = pd.DataFrame(
        df_list,
        columns = cols
    )

    df.to_csv(path, index = False, header = True)
else:
    # Every column is read as text, like the table built from the raw records.
    df = pd.read_csv(path, dtype = str, keep_default_na = False)

## decide doc length and sentence length

In [2]:
from utils.preprocessor import Preprocessor

In [4]:
# Name handed to the project's Preprocessor; presumably a pretrained BERT model
# identifier (TODO confirm against utils.preprocessor).
bert_version = 'bert-base-chinese'
# `prep.cut_doc` is used by the counting helpers further down.
prep = Preprocessor(bert_version)

In [22]:
# Load the training table and keep only the rows that have a body.
p = './data/training_set.csv'
ds = pd.read_csv(p)
ds = ds[ds['body'].notna()]

In [19]:
# Summarise the columns, non-null counts and dtypes of the loaded table.
ds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 576466 entries, 0 to 576508
Data columns (total 33 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   576456 non-null  object 
 1   title                576466 non-null  object 
 2   body                 576466 non-null  object 
 3   category             576454 non-null  float64
 4   title_num_char       576454 non-null  float64
 5   title_num_word       576454 non-null  float64
 6   title_num_keyword    576454 non-null  float64
 7   num_char             576454 non-null  float64
 8   num_char_del_stop    576454 non-null  float64
 9   num_uni_char         576454 non-null  float64
 10  num_punc             576454 non-null  float64
 11  num_word             576454 non-null  float64
 12  num_uni_word         576454 non-null  float64
 13  num_noun             576454 non-null  float64
 14  num_adj              576454 non-null  float64
 15  num_verb         

In [23]:
# Prefix every body with its title, separated by a Chinese full stop.
ds['body'] = ds['title'] + '。' + ds['body']

In [28]:
def count_sent_char(d):
    """Return the length of every piece that ``prep.cut_doc`` splits ``d`` into.

    Args:
        d: The document text handed to ``prep.cut_doc``.

    Returns:
        A list with ``len(piece)`` for each piece returned by ``prep.cut_doc``
        (presumably one entry per sentence; confirm against the Preprocessor).
    """
    # The former debug `print(i)` was removed: `i` is not defined inside this
    # function, so it either raised NameError or printed an unrelated global.
    return [len(piece) for piece in prep.cut_doc(d)]

def count_doc_sent(d):
    """Return how many pieces ``prep.cut_doc`` splits the document ``d`` into."""
    pieces = prep.cut_doc(d)
    return len(pieces)

def not_str(d):
    """Return True when ``d`` is not a ``str`` instance, False otherwise."""
    if isinstance(d, str):
        return False
    return True

# Bodies that are not strings.
bad_body = ds.body[ds.body.apply(not_str)]
# Show how many there are. `bad_body.sum()` added up the offending values
# themselves, which is not a count.
len(bad_body)

0

In [None]:
# Pieces per document, computed in parallel over the body column.
num_doc_sent = ds['body'].parallel_apply(count_doc_sent)
# One list of piece lengths per document ...
l = ds['body'].parallel_apply(count_sent_char)
# ... flattened into a single list covering all documents.
num_sent_char = [length for lengths in l for length in lengths]

In [None]:
# Show the per-document counts held in num_doc_sent.
print(num_doc_sent)