In [1]:
## 文件分布：
# 14892 个数据可以通过直接匹配训练集中的数据来获取结果
# 2936 个“无访问权限”的数据可以通过简单的规则来获取结果
# 7388 个只有内容的数据，通过训练bert模型来预测
# 223 个带有文件名的模型，训练一个小的bert模型来预测

In [1]:
from glob import glob
import os
import pandas as pd
from collections import defaultdict
import numpy as np
import re

import xlrd
from xlrd import XLRDError
from xlrd.compdoc import CompDocError
from pandas.errors import ParserError

from tqdm import tqdm_notebook
import pickle

In [2]:
test_df = pd.read_csv("../input/submit_example_test2.csv")
test_df['filetype'] = test_df.filename.apply(lambda x: str(x)[-3:])
test_df['filetype'].value_counts()

csv    13395
xls    12044
Name: filetype, dtype: int64

In [3]:
test_df.shape

(25439, 3)

# 准备匹配训练集的规则

In [4]:
PUNCT_SET = set("#《》【】[]") # 保留这些预定义的标点
def is_chinese(uchar: str) -> bool:
    # 暂时保留以下字符，看看CV是否提高
    if uchar in PUNCT_SET:
        return True
    if uchar >= '\u4e00' and uchar <= '\u9fa5':
        return True
    else:
        return False

def reserve_chinese(content: str, threshold: int = 512) -> str:
    content_str = ''
    c = 0
    for i in content:
        if c == threshold:
            break
        if is_chinese(i):
            content_str += i
            c += 1
    return content_str

In [5]:
sub_df = pd.read_csv("../input/submit_example_test2.csv")

In [6]:
test2_df = pd.read_csv("../tmp_input/test_df_processed_1206_chinese.csv")
train_df = pd.read_csv("../tmp_input/train_df_processed_1206_aug_5_chinese.csv")

train_text_df = pd.DataFrame(train_df['text'].apply(eval).to_list(), columns=[f'text{i}' for i in range(1, 9)])
train_text_df['label'] = train_df['label']
test_text_df = pd.DataFrame(test2_df['text'].apply(eval).to_list(), columns=[f'text{i}' for i in range(1, 9)])

In [7]:
test_text_df['filename'] = test2_df['filename']

## 用content匹配训练集中的数据，如果发现content完全相同且训练集中label没有冲突，则直接匹配label

In [8]:
train_text_count = train_text_df.text1.value_counts()
test_text_count = test_text_df.text1.value_counts()

In [9]:
%%time
## 基于训练集的内容和标签进行统计, 将符合以下规则的保存到unique_map中
## 1.内容出现次数超过一次且标签没有冲突的
## 2. 内容出现次数仅为一次
## Key is content, value is label
unique_map = {}
for text in train_text_count[train_text_count > 1].index:
    subdf = train_text_df[train_text_df.text1 == text]
    if len(subdf['label'].value_counts()) == 1:
        label = subdf['label'].unique()[0]
        unique_map[text] = label
len(unique_map)

Wall time: 21.5 s


2552

In [10]:
freq1_text_set = set(train_text_count[train_text_count == 1].index)
freq1_df = train_text_df[train_text_df.text1.isin(freq1_text_set)]
unique_map.update(pd.Series(freq1_df.label.values, index=freq1_df.text1).to_dict())
len(unique_map)

20592

In [11]:
mapped_train_df = test_text_df[test_text_df.text1.isin(unique_map)]
mapped_train_df['label'] = mapped_train_df.text1.map(unique_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [12]:
mapped_train_df.shape

(14892, 10)

In [13]:
assert mapped_train_df.label.isnull().sum() == 0

In [14]:
## 因为12/04处理的时候，没有去匹配在train中只出现一次的content，所以漏掉了10个sample，这10个sample应该会加到LB上
subdf1 = pd.read_csv("../output/sub1205.csv")
check_df = mapped_train_df.merge(subdf1, on='filename', suffixes=("", "_sub"))
assert check_df[check_df.label != check_df.label_sub].shape[0] == 10

## 对于'无访问权限'，从文件名可以容易推断出label,用规则来匹配

In [17]:
permission_df = test_text_df[test_text_df.text1 == '无访问权限']

In [18]:
permission_df.shape

(2936, 9)

In [19]:
def permission_rules(x):
    x = str(x)
    if '残疾人证办理' in x:
        return '医疗卫生'
    if '参股企业名录' in x or '参股公司目录' in x:
        return '工业'
    if '执法证件信息' in x:
        return '政法监察'
    if '民声阅办记录信息' in x:
        return '文化休闲'

permission_df['label'] = permission_df.filename.apply(permission_rules)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [20]:
assert permission_df.label.isnull().sum() == 0

## 对于剩下的数据，根据有没有title进行划分

In [21]:
rest_df = test_text_df[(~test_text_df.text1.isin(unique_map)) & (test_text_df.text1 != '无访问权限')]

In [22]:
assert rest_df.shape[0] + permission_df.shape[0] + mapped_train_df.shape[0] == test_text_df.shape[0]

In [23]:
def is_masked_filename(x):
    if len(x) == 32 and re.sub(r'[a-z0-9]', '', x) == "":
        return True
    return False

In [24]:
rest_df['filename_str'] = rest_df.filename.apply(lambda x: x[6:-4])
rest_df['is_masked'] = rest_df.filename_str.apply(is_masked_filename)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [69]:
rest_df_with_title = rest_df[rest_df.is_masked == False]
rest_df_content_only = rest_df[rest_df.is_masked == True]

In [2]:
# rest_df_with_title.to_csv("../tmp_input/rest_df_with_title_1206_chinese.csv", index=False)

In [3]:
# rest_df_content_only.to_csv("../tmp_input/rest_df_content_only_1206_chinese.csv", index=False)

In [70]:
rest_df_with_title.shape, rest_df_content_only.shape

((223, 11), (7388, 11))

In [71]:
rest_df_with_title['text'] = rest_df_with_title['filename_str'].apply(reserve_chinese) + rest_df_with_title['text1']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


## 读取两个模型对于title和content的预测

In [72]:
title_df = pd.read_csv("../output/title_predictions.csv")
rest_df_with_title = rest_df_with_title.merge(title_df, how='left', on='filename')

In [74]:
content_df = pd.read_csv('../output/content_predictions.csv')
rest_df_content_only = rest_df_content_only.merge(content_df, on='filename', how='left')

In [77]:
assert rest_df_with_title.shape[0] + rest_df_content_only.shape[0] + permission_df.shape[0] + mapped_train_df.shape[0] == test_text_df.shape[0]

In [79]:
concat_df = pd.concat([rest_df_with_title[['filename', 'label']],
            rest_df_content_only[['filename', 'label']],
            permission_df[['filename', 'label']],
            mapped_train_df[['filename', 'label']]])

In [81]:
sub1207 = pd.read_csv("../input/submit_example_test2.csv")

In [82]:
sub1207 = sub1207.merge(concat_df, on='filename', suffixes=("_old", "")).drop('label_old', axis=1)

In [40]:
sub1207.to_csv("../output/sub1207.csv", index=False)