In [2]:
# Master switch for the test-set path: when True the test CSV is loaded,
# encoded, and predicted on inside every fold of the training loop.
RUN_TEST = True

In [3]:
from time import time
# Wall-clock reference point; the last cell reports the elapsed time from here.
start_time = time()

In [4]:
import tensorflow as tf
import transformers

# Record the library versions this run was produced with.
print(tf.__version__)
print(transformers.__version__)



2.2.0
2.11.0


In [5]:
# Detect hardware, return appropriate distribution strategy
# Resulting globals: `tpu` (resolver or None), `strategy`, and -- only on the
# non-TPU path -- `gpus`.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
    # A ValueError from the two lines above is treated as "no TPU available".
    tpu = None
    # NOTE: `gpus` is bound only on this fallback path. The `elif len(gpus)`
    # checks (here and in the training loop) are safe because they are never
    # evaluated when `tpu` is truthy.
    gpus = tf.config.experimental.list_logical_devices("GPU")

if tpu:
    # Connect to the TPU workers and initialise them before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
elif len(gpus) > 1: # multiple GPUs in one VM
    strategy = tf.distribute.MirroredStrategy(gpus)
else: # default strategy that works on CPU and single GPU
    strategy = tf.distribute.get_strategy()

# BATCH_SIZE in the configuration cell is scaled by this replica count.
print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  ['10.0.0.2:8470']
REPLICAS:  8


In [6]:
PUNCT_SET = set("#《》【】[]")  # predefined punctuation marks that are kept


def is_chinese(uchar: str) -> bool:
    """Tell whether ``uchar`` should be kept by the text filter.

    A character is kept when it is one of the marks in ``PUNCT_SET``
    (temporarily retained to see whether the CV score improves) or when it
    compares inside the range U+4E00..U+9FA5.

    Args:
        uchar: The string to test, normally a single character.

    Returns:
        True if the character is kept, False otherwise.
    """
    return uchar in PUNCT_SET or '\u4e00' <= uchar <= '\u9fa5'

def reserve_chinese(content: str, threshold: int = 512) -> str:
    """Keep only the characters accepted by ``is_chinese``, capped in number.

    Args:
        content: Text to filter, scanned character by character.
        threshold: Scanning stops once this many characters have been kept.

    Returns:
        The kept characters, in their original order, joined into one string.
    """
    kept = []
    for ch in content:
        # The cap is checked before looking at the next character.
        if len(kept) == threshold:
            break
        if is_chinese(ch):
            kept.append(ch)
    return ''.join(kept)

In [7]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import re
import os
import pickle
from tqdm.notebook import tqdm

import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import *
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# TENSORFLOW
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Input, GlobalAveragePooling1D, GlobalMaxPooling1D

# HUGGINGFACE
from tokenizers import BertWordPieceTokenizer
from transformers import TFAutoModel, AutoTokenizer, TFBertModel
from transformers import TFBertForSequenceClassification, TFTrainer, TFTrainingArguments
from transformers import AdamWeightDecay

In [8]:
# Passed to Dataset.prefetch() in the training loop.
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
EPOCHS = 2  # epochs given to model.fit for every fold
N_FOLDS = 10  # number of StratifiedKFold splits
BATCH_SIZE = 32 * strategy.num_replicas_in_sync  # 32 scaled by the replica count
MAX_LEN = 192  # max_length the tokenizer pads every sequence to
NUM_AUG = 8  # number of text columns per document (text1 .. text8)

# Checkpoint name used for both the tokenizer and the encoder in build_model.
MODEL_NAME = 'bert-base-chinese'

In [11]:
# Tokenizer matching MODEL_NAME; fetched from the hub on first use
# (see the download progress bars in this cell's output).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=624.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=109540.0, style=ProgressStyle(descripti…




In [12]:
# The 20 target categories; a label's position in this list is its class id.
labels = [
    '文化休闲', '医疗卫生', '教育科技', '城乡建设', '工业', '交通运输', '生态环境', '经济管理',
    '政法监察', '农业畜牧业', '文秘行政', '劳动人事', '信息产业', '民政社区', '旅游服务', '商业贸易',
    '气象水文测绘地震地理', '资源能源', '财税金融', '外交外事',
]

# label name -> class id, and the inverse lookup used to decode predictions.
label_map = dict(zip(labels, range(len(labels))))
inv_label_map = dict(enumerate(labels))

In [13]:
import ast  # cell-local: only needed to parse the serialized text lists below

train_df_aug = pd.read_csv("/kaggle/input/onecity/train_df_processed_1206_aug_5_chinese.csv")

# Names of the text columns: text1 .. text{NUM_AUG}.
text_cols = [f'text{i}' for i in range(1, NUM_AUG+1)]

# Each 'text' cell holds the string form of a Python list. Parse it with
# ast.literal_eval rather than eval so that file contents can never execute
# arbitrary code; only plain literals are accepted.
text_df = pd.DataFrame(train_df_aug['text'].apply(ast.literal_eval).to_list(), columns=text_cols)
text_df = text_df.fillna("").astype(str)

for col in text_cols:
    # 'ERROR' placeholders become empty strings; everything else is lower-cased.
    text_df[col] = text_df[col].apply(lambda x: "" if x == 'ERROR' else x.lower())
    # Keep only the characters accepted by is_chinese (capped by reserve_chinese).
    text_df[col] = text_df[col].apply(reserve_chinese)
    # Truncate to MAX_LEN-2 characters before tokenization.
    # NOTE(review): presumably this leaves room for two special tokens -- confirm.
    text_df[col] = text_df[col].apply(lambda x: x[:MAX_LEN-2])

# Both frames share the index created by read_csv, so these assignments align row by row.
text_df['filename'] = train_df_aug['filename']
text_df['label'] = train_df_aug['label']

# Drop rows whose primary text is exactly the "no access permission" placeholder.
text_df = text_df[text_df.text1 != '无访问权限']

In [16]:
# Down-sample heavily duplicated documents: any `text1` value that occurs more
# than FREQ_THRESHOLD times is cut down to its first MAX_COPIES rows.
FREQ_THRESHOLD = 200
MAX_COPIES = 20

df_text_counts = text_df['text1'].value_counts()

top_freq_texts = set(df_text_counts[df_text_counts > FREQ_THRESHOLD].index)

is_top_freq = text_df['text1'].isin(top_freq_texts)

df_sub = text_df[~is_top_freq]
print(df_sub.shape)

# groupby().head() keeps the rows in their original frame order, so the result
# does not depend on set iteration order (which varies between kernels because
# of string hash randomisation). That keeps the seeded shuffle below, and the
# folds built from it, reproducible. It also avoids concatenating in a loop.
df_capped = text_df[is_top_freq].groupby('text1').head(MAX_COPIES)
df_sub = pd.concat([df_sub, df_capped])
print(df_sub.shape)

# Seeded shuffle, then a fresh 0..n-1 index so that the positional fold indices
# used in the training loop are also valid .loc labels.
train_df = df_sub.reset_index(drop=True)
train_df = train_df.sample(frac=1., random_state=2020)
train_df = train_df.reset_index(drop=True)

(23821, 10)
(24481, 10)


In [17]:
# The test frame is only loaded when the test path is enabled, so `test_df`
# does not exist when RUN_TEST is False.
if RUN_TEST:
    test_df = pd.read_csv("/kaggle/input/onecity/rest_df_content_only_1206_chinese.csv")

In [18]:
%%time
# Tokenize each text column separately. x_train[k] holds the padded input ids
# of column text{k+1}, and its rows line up with train_df.
x_train = []
for aug_no in range(1, NUM_AUG + 1):
    print(f"Encode Train: Part {aug_no}...")
    encoded = tokenizer.batch_encode_plus(
        train_df[f"text{aug_no}"].values,
        pad_to_max_length=True,
        max_length=MAX_LEN,
    )
    x_train.append(np.array(encoded['input_ids']))

# Integer class ids, in the same row order as train_df.
y = train_df['label'].map(label_map).values

Encode Train: Part 1...
Encode Train: Part 2...
Encode Train: Part 3...
Encode Train: Part 4...
Encode Train: Part 5...
Encode Train: Part 6...
Encode Train: Part 7...
Encode Train: Part 8...
CPU times: user 7min 42s, sys: 853 ms, total: 7min 43s
Wall time: 7min 44s


In [20]:
# Clean the test text columns. `test_df` only exists when RUN_TEST is True, so
# the whole cell is guarded; running it unguarded with RUN_TEST = False would
# raise NameError.
# NOTE(review): unlike the training frame, reserve_chinese is not applied here --
# presumably the test CSV is already filtered; confirm.
if RUN_TEST:
    test_df = test_df.fillna("").astype(str)

    for col in [f'text{i}' for i in range(1, NUM_AUG+1)]:
        # 'ERROR' placeholders become empty strings; everything else is lower-cased.
        test_df[col] = test_df[col].apply(lambda x: "" if x == 'ERROR' else x.lower())
        # Same MAX_LEN-2 character truncation as the training text.
        test_df[col] = test_df[col].apply(lambda x: x[:MAX_LEN-2])

In [21]:
%%time
if RUN_TEST:
    # One batched, label-free tf.data pipeline per text column;
    # test_datasets[k] corresponds to column text{k+1}.
    test_datasets = []
    for aug_no in range(1, NUM_AUG + 1):
        print(f"Encode Test: Part {aug_no}...")
        encoded = tokenizer.batch_encode_plus(
            test_df[f"text{aug_no}"].values,
            pad_to_max_length=True,
            max_length=MAX_LEN,
        )
        test_ids = np.array(encoded['input_ids'])
        batched = tf.data.Dataset.from_tensor_slices(test_ids).batch(BATCH_SIZE)
        test_datasets.append(batched)

Encode Test: Part 1...
Encode Test: Part 2...
Encode Test: Part 3...
Encode Test: Part 4...
Encode Test: Part 5...
Encode Test: Part 6...
Encode Test: Part 7...
Encode Test: Part 8...
CPU times: user 2min 24s, sys: 242 ms, total: 2min 24s
Wall time: 2min 25s


In [22]:
# Sanity check: the number of encoded rows must equal the number of labels.
len(x_train[0]), len(y)

(24481, 24481)

In [23]:
def build_model(model_name, max_len, num_labels=20, learning_rate=3e-5):
    """Build and compile a transformer encoder with a softmax classification head.

    Args:
        model_name: Checkpoint name or path of the encoder to load.
        max_len: Fixed length of the ``input_ids`` sequences fed to the model.
        num_labels: Number of output classes (defaults to the 20 categories
            used in this notebook).
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Model`` mapping ``(batch, max_len)`` int32 token
        ids to ``(batch, num_labels)`` class probabilities. It is compiled with
        sparse categorical cross-entropy, so labels must be integer class ids.
    """
    # Branch on the argument rather than on the global MODEL_NAME, so the
    # function honours whatever checkpoint the caller asks for.
    if model_name == 'bert-base-chinese':
        transformer_encoder = TFAutoModel.from_pretrained(model_name)
    else:
        # Any other checkpoint is loaded as a BERT model from PyTorch weights.
        transformer_encoder = TFBertModel.from_pretrained(model_name, from_pt=True)

    # Token ids are the only model input (no attention mask or segment ids).
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")

    # First element of the encoder output: the per-token hidden states.
    sequence_output = transformer_encoder(input_ids)[0]

    # Use the hidden state at position 0 (the [CLS] token for BERT tokenizers)
    # as the representation of the whole sequence.
    cls_token = sequence_output[:, 0, :]

    # Softmax over the num_labels categories.
    out = Dense(num_labels, activation='softmax')(cls_token)

    model = Model(inputs=input_ids, outputs=out)
    model.compile(
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        Adam(learning_rate=learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

In [24]:
# Stratified splitter built without shuffle/random_state arguments; train_df
# was already shuffled with a fixed seed when it was built.
kfold = StratifiedKFold(n_splits=N_FOLDS)

In [25]:
# Placeholders for the out-of-fold results that the training loop fills in:
# the predicted label name and the probability of that predicted class.
train_df['y_pred'] = ""
train_df['proba'] = 0.0

In [26]:
# K-fold training / evaluation loop.
# For every fold: (re)create the distribution strategy, train a fresh model on
# all text columns of the training rows, score the held-out rows by averaging
# the predictions over the text columns, and (optionally) predict the test set.
all_accs = []  # validation accuracy of each fold
test_pred_results = []  # per-fold test probabilities, already averaged over text columns
for ii, (tr, tt) in enumerate(kfold.split(X=y, y=y)):
    # Only the labels matter for stratification, so y is passed as X as well.
    # The strategy is rebuilt for every fold.
    # NOTE(review): presumably the TPU re-initialisation is there to release the
    # previous fold's model from device memory -- confirm.
    if tpu:
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.experimental.TPUStrategy(tpu)
    elif len(gpus) > 1: # multiple GPUs in one VM
        strategy = tf.distribute.MirroredStrategy(gpus)
    else: # default strategy that works on CPU and single GPU
        strategy = tf.distribute.get_strategy()
    
    # Prepare KFold data
    y_train, y_valid = y[tr], y[tt]

    # Stack the training rows of every text column; the labels are repeated
    # once per column so they stay aligned with the stacked inputs.
    x_train_combined = np.concatenate([x[tr] for x in x_train])
    y_train_combined = np.concatenate([y_train] * len(x_train))
    
    # Shuffle augmented data
    # (one seeded permutation applied to inputs and labels alike)
    idxs = np.arange(len(y_train_combined))
    idxs = shuffle(idxs, random_state=2020)
    x_train_combined = x_train_combined[idxs]
    y_train_combined = y_train_combined[idxs]
    
    # Infinite training stream; the epoch length is fixed by steps_per_epoch below.
    train_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_train_combined, y_train_combined))
        .repeat()
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )
    # One validation dataset per text column, all over the same held-out rows.
    valid_datasets = []
    for x in x_train:        
        valid_dataset = (
            tf.data.Dataset
            .from_tensor_slices((x[tt], y_valid))
            .batch(BATCH_SIZE)
            .cache()
            .prefetch(AUTO)
        )
        valid_datasets.append(valid_dataset)

    # A new model is created for each fold inside the strategy scope.
    with strategy.scope():
        model = build_model(MODEL_NAME, MAX_LEN)

    # Number of whole batches in the stacked training data.
    n_steps = len(x_train_combined) // BATCH_SIZE
    # During fitting only the first text column is used for validation metrics.
    train_history = model.fit(
        train_dataset,
        steps_per_epoch=n_steps,
        validation_data=valid_datasets[0],
        epochs=EPOCHS
    )
    
    # Predict on validation set
    # (probabilities are averaged over the text columns before the argmax)
    valid_aug_probs = [model.predict(valid_dataset, verbose=1) for valid_dataset in valid_datasets]
    valid_probs = np.mean(valid_aug_probs, axis=0)
    y_valid_preds = np.argmax(valid_probs, axis=1)
    acc = accuracy_score(y_valid, y_valid_preds)
    print(f"Accuracy for KFold {ii}: {acc}")
    all_accs.append(acc)
    
    # Store out-of-fold results. train_df carries a fresh 0..n-1 index, so the
    # positional fold indices in `tt` are also valid .loc labels.
    train_df.loc[tt, 'y_pred'] = np.vectorize(inv_label_map.get)(y_valid_preds)
    train_df.loc[tt, 'proba'] = valid_probs.max(axis=1)
    
    if RUN_TEST:
        # Prediction on test set
        # (same averaging over text columns as for validation)
        test_aug_probs = [model.predict(test_dataset, verbose=1) for test_dataset in test_datasets]
        test_probs = np.mean(test_aug_probs, axis=0)
        test_pred_results.append(test_probs)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=478309336.0, style=ProgressStyle(descri…


Epoch 1/2
Epoch 2/2
Accuracy for KFold 0: 0.8603511637403022
Epoch 1/2
Epoch 2/2
Accuracy for KFold 1: 0.8660130718954249
Epoch 1/2
Epoch 2/2
Accuracy for KFold 2: 0.8496732026143791
Epoch 1/2
Epoch 2/2
Accuracy for KFold 3: 0.8594771241830066
Epoch 1/2
Epoch 2/2
Accuracy for KFold 4: 0.8517156862745098
Epoch 1/2
Epoch 2/2
Accuracy for KFold 5: 0.8537581699346405
Epoch 1/2
Epoch 2/2
Accuracy for KFold 6: 0.8627450980392157
Epoch 1/2
Epoch 2/2
Accuracy for KFold 7: 0.8570261437908496
Epoch 1/2
Epoch 2/2
Accuracy for KFold 8: 0.8643790849673203
Epoch 1/2
Epoch 2/2
Accuracy for KFold 9: 0.8594771241830066


In [27]:
# test_aug_encoded = tokenizer.batch_encode_plus(
# #     ['工作单位新办序号广饶县环卫处延华文男华泰集团有限公司杜滨男华泰集团有限公司倪鹤女广饶县丰源纺织有限公司燕荣凤女原广饶县供销贸易公司宋福志男广饶县山水水泥有限公司王光诚男花官镇洛程幼儿园王芬女科达集团高孟海男广饶科力达石化科技有限公司田美岗男山东华星石油化工集团有限公司谢文杰男华泰集团有限公司傅建武男原服装厂崔向亮男'],
# #     ['出生年月性别青岛市市北区台东三路号单元户刘宽男青岛市市北区顺兴路号户臧丽娜女青岛市市北区东光路号单元宋降龙男青岛市市北区华阳路号大成公馆号楼户王新男青岛市埕口一路三单元户王英光男青岛市市北区瑞海北路号号楼户瑞海馨园肖中权男青岛市市北区台东八路号户周嵩智男山东省青岛市市北区东仲小区号单元户李龙男青岛市市北区长春路东兴市场号号楼单元乔安钢男青岛市市北区台东六路号户鲍习平男山东省莱阳市穴坊镇西富山村孙辉女黄岛路号户蒲英玲女青岛市市北区康宁路号北舍号楼单元户张瑛女青岛市徐州路号号楼单元室苏兆楷男青岛市市北区台东三路号单元户关永斌男无棣县金羚华府号楼一单元室程兵男青岛市市北区标山路号户王玉台男青岛市市北区威海路号户段京兵男山东省青岛市市北区台东七路号楼户梁尧庆男青岛市市北区芙蓉路号号楼单元户连红男通化路号单元户姜腾飞男'],
#     ['出生年月民族参加工作时间性别汉大本男张宪印党组成员副局长山东政法干部管理学院汉大专男高峰科员河北燕山大学汉大本女张双双主任山东省委党校汉大本男陈尚平党组副书记局长莱阳农学院汉硕士男胡金庆科员中国海洋大学汉大本男刘勇党组书记副局长山东工业大学汉大本女段琪琪科员长江大学汉初中男顾兆强科员北长山联中汉大专男高峰主任河北燕山大学汉初中男董仁科员蓬莱市大季家中学汉大本男李强副主任山东省委党校汉中专男娄兆军科员烟台水校汉大专男刘玉伟党组成员副局长烟台师范学院汉初中男于庆科员龙口市大王中学汉大本男刘勇党组书记副局长山东工业大学汉大本男李强科员山东省委党校汉大本女隋婷婷科员山东农业大学汉大本女刘宗云科员山东科技大学汉中专男娄兆军科员烟台水校汉大本女刘宗云科员山东科技大学汉大本男吴忠进科员山东函授大学汉初中男董仁科员蓬莱市大季家中学汉初中男于庆副主任龙口市大王中学汉硕士男胡金庆科员中国海洋大学汉大本女张双双主任山东省委党校汉大本女于咏文科员山东省委党校汉大本女隋婷婷科员山东农业大学汉大本男霍延虎科员山东理工大学汉大本女魏童童科员济宁学院汉大本女段琪琪科员长江大学汉初中男顾兆强科员北长山联中汉大本男王海亮党组成员山东省委党校汉大专男刘玉伟党组成员副局长烟台师范学院汉大本女于咏文科员山东省委党校汉大本女王美丁副主任聊城大学汉大本女乔婕科员鲁东大学汉大本男王黎明副主任长岛县委党校汉大本女乔婕科员鲁东大学汉大本男霍延虎科员山东理工大学汉大本女王美丁副主任聊城大学汉大本男王海亮党组成员山东省委党校汉大本男张宪印党组成员副局长山东政法干部管理学院汉大本男王黎明科员长岛县委党校汉大本女魏童童科员济宁学院汉大本男吴忠进科员山东函授大学汉大本男陈尚平党组副书记局长莱阳农学院'],
#     pad_to_max_length=True,
#     max_length=MAX_LEN
# )
# x_test = np.array(test_aug_encoded['input_ids'])
# test_dataset = (
#     tf.data.Dataset
#     .from_tensor_slices(x_test)
#     .batch(BATCH_SIZE)
# )
# probs = model.predict(test_dataset, verbose=1)

In [28]:
# plt.plot(probs[0])

In [29]:
# inv_label_map[np.argmax(probs[0])]

In [30]:
# for probs in valid_aug_probs:
#     _pred = np.argmax(probs, axis=1)
#     print(accuracy_score(y_valid, _pred))

In [31]:
# for idx in range(2, len(valid_aug_probs)):
#     _probs = np.mean(valid_aug_probs[:idx], axis=0)
#     _pred = np.argmax(_probs, axis=1)
#     print(accuracy_score(y_valid, _pred))

In [32]:
# for idx in range(2, len(valid_aug_probs)):
#     _probs = np.mean(valid_aug_probs[-idx:], axis=0)
#     _pred = np.argmax(_probs, axis=1)
#     print(accuracy_score(y_valid, _pred))

In [33]:
# np.mean(valid_aug_probs[:3], axis=0)

In [34]:
# Per-fold validation accuracies followed by their mean.
print(all_accs)
print(np.mean(all_accs))

[0.8603511637403022, 0.8660130718954249, 0.8496732026143791, 0.8594771241830066, 0.8517156862745098, 0.8537581699346405, 0.8627450980392157, 0.8570261437908496, 0.8643790849673203, 0.8594771241830066]
0.8584615869622656


In [35]:
# Ensemble the folds and write the submission file. Guarded by RUN_TEST because
# `test_df` does not exist and `test_pred_results` is empty when the test path
# is disabled, so the unguarded cell would fail.
if RUN_TEST:
    # Sum the per-fold probability matrices, then take the most probable class per row.
    test_preds = np.argmax(np.sum(test_pred_results, axis=0), axis=-1)
    # Decode class ids back to label names.
    test_df['label'] = np.vectorize(inv_label_map.get)(test_preds)
    test_df[['filename', 'label']].to_csv("content_only_prediction.csv", index=False, encoding='utf-8')

In [36]:
# Persist the raw per-fold test probabilities (a list with one array per fold).
# The list is empty when RUN_TEST is False.
with open("test_probs.pkl", 'wb') as f:
    pickle.dump(test_pred_results, f)

In [37]:
# Training frame including the out-of-fold 'y_pred' and 'proba' columns filled by the loop.
train_df.to_csv("train_error_analysis.csv", index=False)

In [38]:
# Elapsed wall-clock time since `start_time` was taken at the top of the notebook.
print(f"Total Running Time: {time() - start_time:.3f} seconds")

Total Running Time: 6360.778 seconds
