In [1]:
## This notebook pre-processes the raw data and stores the result as new input data for the model.

In [20]:
from glob import glob
import os
import pandas as pd
from collections import defaultdict
import numpy as np
import re

import xlrd

from xlrd import XLRDError
from xlrd.compdoc import CompDocError
from pandas.errors import ParserError

from tqdm import tqdm_notebook
import pickle

In [135]:
KEEP_LEN = 300 # Maximum number of characters kept per file during preprocessing
RANDOM_SAMPLE_NUM = 5 # Number of randomly shuffled augmentation samples generated for each input file
PUNCT_SET = set("#《》【】[]") # Predefined punctuation characters that are kept alongside Chinese characters

In [22]:
# Read the training answer table and the two test submission-example tables
# from ../input. `answer_df` and `test2_df` are later indexed by their
# `filename` column.
answer_df = pd.read_csv("../input/answer_train.csv")
test_df = pd.read_csv("../input/submit_example_test1.csv")
test2_df = pd.read_csv("../input/submit_example_test2.csv")

In [39]:
# Load the precomputed lookup structures; the pickle holds a 2-item sequence
# that is unpacked into `pinyin_chinese_map` and `add_tokens`.
# NOTE(review): pickle.load can execute arbitrary code, so this file must come
# from a trusted source.
with open('../tmp_input/pinyin_map_1206.pkl', 'rb') as f:
    pinyin_chinese_map, add_tokens = pickle.load(f)

In [24]:
# Sanity check: display the sizes of the two loaded lookup structures.
len(pinyin_chinese_map), len(add_tokens)

(2835, 563)

In [29]:
import random

In [42]:
# Remove the 'xb' entry from the mapping. The `None` default keeps this cell
# idempotent: re-running it no longer raises KeyError once the key is gone.
pinyin_chinese_map.pop('xb', None)

'新办'

In [44]:
# Remove the 'mfmc' entry from the mapping. The `None` default keeps this cell
# idempotent: re-running it no longer raises KeyError once the key is gone.
pinyin_chinese_map.pop('mfmc', None)

'民非名称'

In [62]:
def _find_substr_match(col, substr_len):
    """发现有些很长的纯拼音字符串，尝试substring匹配，来找到对应中文"""
    for i in range(len(col)-substr_len + 1):
        if col[i:i+substr_len] in pinyin_chinese_map and (i %2 != 1 or i+substr_len == len(col)):
            return pinyin_chinese_map[col[i:i+substr_len]]
    return None

In [80]:
def find_substr_match(col):
    """Try to map a long pure-pinyin string to Chinese via substring matching.

    The search only runs when ``col`` consists solely of ASCII letters and
    digits (case-insensitively), is longer than 4 characters and is not itself
    a key of ``pinyin_chinese_map``. Window lengths are tried from 10 down to
    4, so longer matches win.

    Returns the first truthy mapping found by ``_find_substr_match``, or
    ``None`` when nothing matches or the preconditions do not hold.
    """
    residue = re.sub(r'[a-z0-9]', '', col.lower())
    eligible = residue == "" and len(col) > 4 and col not in pinyin_chinese_map
    if not eligible:
        return None
    window_len = 10
    while window_len > 3:
        hit = _find_substr_match(col, window_len)
        if hit:
            return hit
        window_len -= 1
    return None

In [82]:
def is_chinese(uchar: str) -> bool:
    """Tell whether ``uchar`` should be kept as "Chinese" text.

    True for members of ``PUNCT_SET`` and for anything in the range
    U+4E00..U+9FA5 (inclusive); False otherwise.
    """
    # The punctuation in PUNCT_SET is kept for now to see whether CV improves.
    return uchar in PUNCT_SET or '\u4e00' <= uchar <= '\u9fa5'

def reserve_chinese(content: str, threshold: int = 512) -> str:
    """Keep only the characters of ``content`` accepted by ``is_chinese``.

    Scanning stops as soon as ``threshold`` characters have been kept. A
    negative ``threshold`` can never be reached, so it disables the limit.
    """
    kept = []
    for char in content:
        if len(kept) == threshold:
            break
        if is_chinese(char):
            kept.append(char)
    return ''.join(kept)

def filter_cols(cols):
    """Normalise raw column headers into a list of header text fragments.

    For each header, in order:
    * ``'index'`` and anything containing ``'Unnamed'`` is dropped;
    * a header made only of ASCII letters, digits and underscores (after
      lower-casing) is replaced by its ``pinyin_chinese_map`` entry, else by a
      substring match from ``find_substr_match``, else kept lower-cased if it
      is in ``add_tokens``, else dropped;
    * any other header (e.g. long HTML-like headers) is reduced to the
      characters kept by ``reserve_chinese`` with no length limit.
    """
    kept = []
    for raw in cols:
        if raw == 'index' or 'Unnamed' in raw:
            continue
        lowered = raw.lower()
        if re.sub(r'[a-z0-9_]', '', lowered) != "":
            # Mixed content: keep only the Chinese part of the header.
            kept.append(reserve_chinese(raw, -1))
            continue
        if lowered in pinyin_chinese_map:
            # Exact pinyin abbreviation -> Chinese.
            kept.append(pinyin_chinese_map[lowered])
            continue
        partial = find_substr_match(lowered)
        if partial:
            kept.append(partial)
        elif lowered in add_tokens:
            kept.append(lowered)
    return kept

def get_content_from_cell_values(cell_values):
    """Join a 1-D sequence of cell strings and keep the Chinese characters.

    ``cell_values`` is a 1d numpy array of strings. The concatenated text is
    passed through ``reserve_chinese`` with a limit of ``KEEP_LEN``.
    """
    return reserve_chinese("".join(cell_values), KEEP_LEN)

def process_header_and_content(cols, cell_values, how='first'):
    """Build the header and content strings for one table (data augmentation).

    Parameters
    ----------
    cols : sequence of str
        Raw column headers; they are normalised with ``filter_cols``.
    cell_values : numpy.ndarray
        Table body. It is never modified: the shuffling modes work on a copy,
        so the caller can safely reuse the same array for several calls.
    how : str
        Ordering strategy for the cells:
        * ``'first'``        - cells in their original order;
        * ``'reverse_rows'`` - rows in reverse order;
        * ``'random_rows'``  - rows shuffled, cells within a row kept together;
        * ``'random'``       - headers shuffled and all cells shuffled
          individually.

    Returns
    -------
    (header, content) : tuple of str
        The joined headers and the text returned by
        ``get_content_from_cell_values`` for the ordered cells.

    Raises
    ------
    ValueError
        If ``how`` is not one of the four supported strategies.
    """
    supported = ('first', 'reverse_rows', 'random_rows', 'random')
    if how not in supported:
        raise ValueError(f"unknown how={how!r}; expected one of {supported}")

    cols = filter_cols(cols)
    if how == 'first':
        flat_values = cell_values.ravel()
    elif how == 'reverse_rows':
        flat_values = cell_values[::-1].ravel()
    elif how == 'random_rows':
        # np.random.shuffle works in place, so shuffle a copy of the rows.
        shuffled_rows = cell_values.copy()
        np.random.shuffle(shuffled_rows)
        flat_values = shuffled_rows.ravel()
    else:  # how == 'random'
        np.random.shuffle(cols)
        # flatten() always copies, whereas ravel() may return a view on the
        # caller's array that an in-place shuffle would scramble.
        flat_values = cell_values.flatten()
        np.random.shuffle(flat_values)

    header = "".join(cols)
    content = get_content_from_cell_values(flat_values.astype(str))
    return header, content

def get_text_use_xlrd(xl_workbook, how='first'):
    """Extract up to ``KEEP_LEN`` characters of text from an xlrd workbook.

    Sheets are visited in ``sheet_names()`` order and empty sheets are
    skipped. For every other sheet the first row is treated as the header and
    the remaining rows as the body (both read from the sheet's private
    ``_cell_values`` list). The filtered sheet name, header and content are
    appended to the result, and the scan stops once ``KEEP_LEN`` characters
    have been collected.

    ``how`` is forwarded to ``process_header_and_content`` and can be
    'first', 'reverse_rows', 'random_rows' or 'random'.
    """
    collected = ""
    for name in xl_workbook.sheet_names():
        sheet = xl_workbook.sheet_by_name(name)
        if sheet.nrows <= 0:
            continue
        rows = sheet._cell_values
        header_cells = np.array(rows[0]).astype(str)
        body_cells = np.array(rows[1:])
        header, content = process_header_and_content(header_cells, body_cells, how)
        collected = collected + reserve_chinese(name) + header + content
        if len(collected) >= KEEP_LEN:
            return collected[:KEEP_LEN]
    return collected


def get_text_from_dataframe(df, how='first'):
    """Extract up to ``KEEP_LEN`` characters of text from a DataFrame.

    The column labels are used as the header and ``df.values`` as the body.
    ``how`` is forwarded to ``process_header_and_content`` and can be
    'first', 'reverse_rows', 'random_rows' or 'random'.
    """
    header, content = process_header_and_content(list(df), df.values, how)
    return (header + content)[:KEEP_LEN]

def get_xls_texts(filename):
    """Return the augmented texts for one Excel workbook.

    The workbook is opened with xlrd in on-demand mode and read once per
    strategy: 'first', 'reverse_rows', 'random_rows', then
    ``RANDOM_SAMPLE_NUM`` times 'random'. The result therefore has
    ``3 + RANDOM_SAMPLE_NUM`` entries.

    The workbook's resources are released even if text extraction raises, so
    a failing file does not leak its handle. Any exception from xlrd or from
    the extraction is propagated to the caller.
    """
    xl_workbook = xlrd.open_workbook(filename, on_demand = True)
    try:
        texts = [
            get_text_use_xlrd(xl_workbook, how=how)
            for how in ('first', 'reverse_rows', 'random_rows')
        ]
        texts.extend(
            get_text_use_xlrd(xl_workbook, how='random')
            for _ in range(RANDOM_SAMPLE_NUM)
        )
    finally:
        xl_workbook.release_resources()
    return texts

def get_csv_texts(filename):
    """Return the augmented texts for one CSV file.

    At most the first 100 rows are read and malformed lines are skipped with
    a warning. Two known files are restricted to their first four columns.
    The frame is then read once per strategy: 'first', 'reverse_rows',
    'random_rows', then ``RANDOM_SAMPLE_NUM`` times 'random', giving
    ``3 + RANDOM_SAMPLE_NUM`` entries.

    Exceptions raised by ``pd.read_csv`` (empty file, decode or parser
    errors) are propagated to the caller.
    """
    import inspect

    read_kwargs = {'nrows': 100}
    # These two files are only parsed from their first four columns.
    if filename in {'../input/train/社区服务中心.csv', '../input/test2/3a005470f08021e03ccd9317d23021b9.csv'}:
        read_kwargs['usecols'] = range(4)
    # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in
    # favour of `on_bad_lines`; 'warn' keeps the old skip-and-warn behaviour.
    if 'on_bad_lines' in inspect.signature(pd.read_csv).parameters:
        read_kwargs['on_bad_lines'] = 'warn'
    else:
        read_kwargs['error_bad_lines'] = False

    df = pd.read_csv(filename, **read_kwargs).reset_index().fillna("")
    texts = [
        get_text_from_dataframe(df, how=how)
        for how in ('first', 'reverse_rows', 'random_rows')
    ]
    texts.extend(
        get_text_from_dataframe(df, how='random')
        for _ in range(RANDOM_SAMPLE_NUM)
    )
    return texts

In [78]:
def preprocess(df):
    """Read every xls/csv file listed in ``df`` and build its augmented texts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``filename`` column holding paths relative to ``../input``.

    Returns
    -------
    dict
        ``{filename: list of texts}`` keyed by the original ``filename``
        value. Every list has ``3 + RANDOM_SAMPLE_NUM`` entries. A file that
        cannot be read, or whose name ends in neither 'xls' nor 'csv', gets a
        list of ``"ERROR"`` placeholders and its path is printed.
    """
    read_errors = (pd.errors.EmptyDataError, XLRDError, UnicodeDecodeError,
                   pd.errors.ParserError, CompDocError)
    texts_augment = {}
    for _, row in tqdm_notebook(df.iterrows(), total=df.shape[0]):
        filename = rf"../input/{row['filename']}"
        # Reset per file so a failure can never reuse the previous file's texts.
        texts = None
        if filename.endswith('xls'):
            try:
                texts = get_xls_texts(filename)
            except Exception:
                # Any xlrd failure falls back to the CSV reader. Catching
                # `Exception` rather than using a bare `except` lets
                # KeyboardInterrupt stop a long run.
                try:
                    texts = get_csv_texts(filename)
                except read_errors:
                    print(filename)
        elif filename.endswith('csv'):
            try:
                texts = get_csv_texts(filename)
            except read_errors:
                print(filename)
        else:
            # Unsupported extension: report it like any other unreadable file.
            print(filename)
        if texts is None:
            texts = ["ERROR"] * (3 + RANDOM_SAMPLE_NUM)
        texts_augment[row['filename']] = texts
    return texts_augment

In [85]:
%%time
# Build {filename: list of augmented texts} for every training file.
train_texts_augment = preprocess(answer_df)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=60000.0), HTML(value='')))

b'Skipping line 9: expected 9 fields, saw 10\nSkipping line 38: expected 9 fields, saw 10\nSkipping line 51: expected 9 fields, saw 10\nSkipping line 83: expected 9 fields, saw 10\nSkipping line 87: expected 9 fields, saw 10\nSkipping line 91: expected 9 fields, saw 10\nSkipping line 102: expected 9 fields, saw 10\nSkipping line 103: expected 9 fields, saw 10\nSkipping line 104: expected 9 fields, saw 10\n'
b'Skipping line 13: expected 18 fields, saw 19\nSkipping line 14: expected 18 fields, saw 19\nSkipping line 15: expected 18 fields, saw 19\nSkipping line 27: expected 18 fields, saw 19\nSkipping line 28: expected 18 fields, saw 19\nSkipping line 69: expected 18 fields, saw 19\nSkipping line 71: expected 18 fields, saw 19\n'
b'Skipping line 9: expected 9 fields, saw 10\nSkipping line 38: expected 9 fields, saw 10\nSkipping line 51: expected 9 fields, saw 10\nSkipping line 83: expected 9 fields, saw 10\nSkipping line 87: expected 9 fields, saw 10\nSkipping line 91: expected 9 fields, 

../input/train/关于公布黄山市年度餐饮服务食品安全监察量化分级管理A级公司评审成果的通知.xls


b'Skipping line 68: expected 10 fields, saw 11\n'
b'Skipping line 4: expected 7 fields, saw 8\nSkipping line 17: expected 7 fields, saw 8\nSkipping line 21: expected 7 fields, saw 8\nSkipping line 29: expected 7 fields, saw 8\nSkipping line 45: expected 7 fields, saw 8\nSkipping line 80: expected 7 fields, saw 8\n'
b'Skipping line 15: expected 8 fields, saw 9\n'
b'Skipping line 52: expected 8 fields, saw 9\n'
b'Skipping line 10: expected 9 fields, saw 10\nSkipping line 11: expected 9 fields, saw 10\nSkipping line 12: expected 9 fields, saw 10\nSkipping line 13: expected 9 fields, saw 10\n'
b'Skipping line 5: expected 11 fields, saw 13\nSkipping line 11: expected 11 fields, saw 13\nSkipping line 74: expected 11 fields, saw 12\nSkipping line 91: expected 11 fields, saw 13\nSkipping line 92: expected 11 fields, saw 13\n'
b'Skipping line 4: expected 12 fields, saw 13\nSkipping line 8: expected 12 fields, saw 13\n'
b'Skipping line 32: expected 14 fields, saw 15\nSkipping line 35: expected 1

../input/train/关于公布黄山市年度餐饮服务食品安全监管量化分级管理A级公司评审效果的通知.xls


b'Skipping line 15: expected 8 fields, saw 9\nSkipping line 36: expected 8 fields, saw 9\nSkipping line 46: expected 8 fields, saw 9\nSkipping line 57: expected 8 fields, saw 9\n'
b'Skipping line 26: expected 8 fields, saw 9\n'
b'Skipping line 4: expected 6 fields, saw 8\nSkipping line 18: expected 6 fields, saw 8\nSkipping line 22: expected 6 fields, saw 7\nSkipping line 36: expected 6 fields, saw 7\nSkipping line 41: expected 6 fields, saw 7\nSkipping line 42: expected 6 fields, saw 7\nSkipping line 53: expected 6 fields, saw 8\nSkipping line 73: expected 6 fields, saw 8\nSkipping line 80: expected 6 fields, saw 7\nSkipping line 88: expected 6 fields, saw 8\nSkipping line 92: expected 6 fields, saw 8\nSkipping line 105: expected 6 fields, saw 7\n'
b'Skipping line 41: expected 17 fields, saw 18\n'
b'Skipping line 10: expected 4 fields, saw 5\n'
b'Skipping line 58: expected 2 fields, saw 3\nSkipping line 82: expected 2 fields, saw 3\n'
b'Skipping line 5: expected 11 fields, saw 13\nSki

../input/train/关于公布黄山市年度餐饮服务食品安全监管量化分级管理A级企业评审成果的通知.xls


b'Skipping line 5: expected 7 fields, saw 8\nSkipping line 13: expected 7 fields, saw 8\nSkipping line 20: expected 7 fields, saw 9\nSkipping line 31: expected 7 fields, saw 8\nSkipping line 32: expected 7 fields, saw 8\nSkipping line 34: expected 7 fields, saw 8\nSkipping line 46: expected 7 fields, saw 8\nSkipping line 72: expected 7 fields, saw 8\n'
b'Skipping line 6: expected 23 fields, saw 24\nSkipping line 7: expected 23 fields, saw 25\nSkipping line 8: expected 23 fields, saw 24\n'
b'Skipping line 27: expected 8 fields, saw 10\n'
b'Skipping line 26: expected 8 fields, saw 9\n'
b'Skipping line 61: expected 9 fields, saw 10\nSkipping line 62: expected 9 fields, saw 10\nSkipping line 63: expected 9 fields, saw 10\nSkipping line 81: expected 9 fields, saw 10\n'
b'Skipping line 5: expected 4 fields, saw 5\nSkipping line 6: expected 4 fields, saw 5\n'
b'Skipping line 3: expected 5 fields, saw 6\n'
b'Skipping line 10: expected 9 fields, saw 10\nSkipping line 11: expected 9 fields, saw 

b'Skipping line 15: expected 8 fields, saw 9\n'
b'Skipping line 5: expected 2 fields, saw 4\n'
b'Skipping line 3: expected 10 fields, saw 11\nSkipping line 7: expected 10 fields, saw 11\nSkipping line 11: expected 10 fields, saw 11\nSkipping line 25: expected 10 fields, saw 12\nSkipping line 26: expected 10 fields, saw 11\nSkipping line 36: expected 10 fields, saw 11\nSkipping line 41: expected 10 fields, saw 11\nSkipping line 42: expected 10 fields, saw 11\nSkipping line 43: expected 10 fields, saw 11\nSkipping line 44: expected 10 fields, saw 11\nSkipping line 45: expected 10 fields, saw 11\nSkipping line 47: expected 10 fields, saw 11\n'


INFO: Trying to access sector 256 but only 188 available
../input/train/兰山区文化市场管理执法局年月月行政处置信息_.xls


b'Skipping line 26: expected 6 fields, saw 9\nSkipping line 30: expected 6 fields, saw 8\nSkipping line 32: expected 6 fields, saw 9\n'
b'Skipping line 29: expected 7 fields, saw 8\n'


../input/train/关于公布黄山市年度餐饮服务食品安全监管量化分级管理A级公司评审成果的通知.xls


b'Skipping line 35: expected 3 fields, saw 5\nSkipping line 39: expected 3 fields, saw 4\nSkipping line 42: expected 3 fields, saw 6\nSkipping line 43: expected 3 fields, saw 6\nSkipping line 49: expected 3 fields, saw 5\nSkipping line 54: expected 3 fields, saw 4\nSkipping line 56: expected 3 fields, saw 5\nSkipping line 57: expected 3 fields, saw 4\nSkipping line 59: expected 3 fields, saw 7\nSkipping line 62: expected 3 fields, saw 7\nSkipping line 64: expected 3 fields, saw 6\nSkipping line 68: expected 3 fields, saw 5\nSkipping line 69: expected 3 fields, saw 4\nSkipping line 73: expected 3 fields, saw 5\nSkipping line 75: expected 3 fields, saw 4\nSkipping line 78: expected 3 fields, saw 7\nSkipping line 83: expected 3 fields, saw 5\nSkipping line 86: expected 3 fields, saw 5\nSkipping line 90: expected 3 fields, saw 4\nSkipping line 92: expected 3 fields, saw 5\nSkipping line 97: expected 3 fields, saw 6\nSkipping line 101: expected 3 fields, saw 8\nSkipping line 102: expected 3

../input/train/长岛县年度估算执行和专项审计工程策划归纳表_.xls


b'Skipping line 9: expected 11 fields, saw 14\nSkipping line 20: expected 11 fields, saw 14\nSkipping line 25: expected 11 fields, saw 12\nSkipping line 45: expected 11 fields, saw 15\nSkipping line 73: expected 11 fields, saw 12\n'
b'Skipping line 42: expected 3 fields, saw 4\n'
b'Skipping line 49: expected 4 fields, saw 5\n'


../input/train/长岛县年度估算执行和专项审计工程筹划总结表_.xls
../input/train/关于公布黄山市年度餐饮服务食品安全监察量化分级管理A级企业评审效果的通知.xls


b'Skipping line 37: expected 9 fields, saw 12\nSkipping line 60: expected 9 fields, saw 12\nSkipping line 67: expected 9 fields, saw 12\n'


../input/train/武汉市安全生产监管管理局各区组织 .csv


b'Skipping line 4: expected 7 fields, saw 8\nSkipping line 17: expected 7 fields, saw 8\nSkipping line 21: expected 7 fields, saw 8\nSkipping line 29: expected 7 fields, saw 8\nSkipping line 45: expected 7 fields, saw 8\nSkipping line 80: expected 7 fields, saw 8\n'
b'Skipping line 5: expected 6 fields, saw 8\nSkipping line 12: expected 6 fields, saw 7\nSkipping line 51: expected 6 fields, saw 9\nSkipping line 54: expected 6 fields, saw 7\nSkipping line 55: expected 6 fields, saw 8\nSkipping line 66: expected 6 fields, saw 10\n'
b'Skipping line 3: expected 10 fields, saw 11\nSkipping line 7: expected 10 fields, saw 11\nSkipping line 11: expected 10 fields, saw 11\nSkipping line 25: expected 10 fields, saw 12\nSkipping line 26: expected 10 fields, saw 11\nSkipping line 36: expected 10 fields, saw 11\nSkipping line 41: expected 10 fields, saw 11\nSkipping line 42: expected 10 fields, saw 11\nSkipping line 43: expected 10 fields, saw 11\nSkipping line 44: expected 10 fields, saw 11\nSkipp


Wall time: 23min 23s


In [86]:
# Attach each training file's list of augmented texts via its filename.
answer_df['text'] = answer_df.filename.map(train_texts_augment)

In [87]:
# Persist the processed training table. The `text` column holds Python lists,
# which CSV stores as their string representation.
answer_df.to_csv("../tmp_input/train_df_processed_1207_aug_5_chinese.csv", index=False)

In [133]:
# Load an earlier prediction file; only its `filename` column is used below,
# to drive preprocessing of the test files.
df1206 = pd.read_csv("../output/content_only_prediction_1206.csv")

In [136]:
%%time
# Build {filename: list of augmented texts} for every file listed in df1206.
test_texts_augment = preprocess(df1206)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=7399.0), HTML(value='')))

b'Skipping line 96: expected 11 fields, saw 12\n'
  exec(code, glob, local_ns)
b'Skipping line 19: expected 7 fields, saw 8\nSkipping line 25: expected 7 fields, saw 9\nSkipping line 43: expected 7 fields, saw 8\nSkipping line 47: expected 7 fields, saw 10\nSkipping line 61: expected 7 fields, saw 8\nSkipping line 80: expected 7 fields, saw 9\nSkipping line 81: expected 7 fields, saw 10\nSkipping line 90: expected 7 fields, saw 8\nSkipping line 91: expected 7 fields, saw 9\n'
b'Skipping line 81: expected 6 fields, saw 7\n'
b'Skipping line 58: expected 7 fields, saw 8\nSkipping line 61: expected 7 fields, saw 9\nSkipping line 64: expected 7 fields, saw 9\nSkipping line 83: expected 7 fields, saw 8\nSkipping line 87: expected 7 fields, saw 8\nSkipping line 88: expected 7 fields, saw 8\nSkipping line 89: expected 7 fields, saw 8\nSkipping line 90: expected 7 fields, saw 8\n'
b'Skipping line 41: expected 11 fields, saw 12\n'
b'Skipping line 12: expected 10 fields, saw 13\nSkipping line 17:

b'Skipping line 16: expected 14 fields, saw 15\nSkipping line 50: expected 14 fields, saw 15\n'



Wall time: 6min 27s


In [137]:
# Attach the augmented texts to the test2 table and persist it.
# NOTE(review): `test_texts_augment` is keyed by the filenames in `df1206`,
# not `test2_df`; any test2 filename missing from df1206 maps to NaN.
# TODO confirm that df1206 covers every row of test2_df.
test2_df['text'] = test2_df.filename.map(test_texts_augment)
test2_df.to_csv("../tmp_input/test_df_processed_1207_chinese.csv", index=False)