In [1]:
# Record the wall-clock start so the last cell can report the total runtime.
from time import time
start_time = time()

In [2]:
# Run-level switches.
SMALL_TEST = False  # NOTE(review): not referenced by any visible cell
N_FOLDS = 5  # number of StratifiedKFold splits used for cross-validation
WITH_CONTENT = True  # NOTE(review): not referenced by any visible cell

In [3]:
import tensorflow as tf
import transformers

# Log the library versions this run was produced with (one per line).
print(tf.__version__, transformers.__version__, sep="\n")



2.2.0
2.11.0


In [4]:
# Probe for an accelerator and choose the matching tf.distribute strategy.
# `gpus` is only bound when no TPU is found; the TPU path never reads it.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # raises ValueError when no TPU is reachable
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
    tpu = None
    gpus = tf.config.experimental.list_logical_devices("GPU")

if not tpu:
    if len(gpus) > 1:
        # Several GPUs on one VM: mirror the model across them.
        strategy = tf.distribute.MirroredStrategy(gpus)
    else:
        # CPU or a single GPU: the default (no-op) strategy is enough.
        strategy = tf.distribute.get_strategy()
else:
    # The TPU must be connected and initialised before the strategy is created.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  ['10.0.0.2:8470']
REPLICAS:  8


In [5]:
# Let tf.data pick prefetch buffer sizes at runtime.
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
EPOCHS = 5
# Batch size passed to Dataset.batch: 32 examples per replica times the replica count.
BATCH_SIZE = 32 * strategy.num_replicas_in_sync
# Fixed token length every example is padded to before entering the model.
MAX_LEN = 64

# Hugging Face checkpoint name, used for both the tokenizer and the encoder.
MODEL_NAME = 'bert-base-chinese'

In [6]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import re
import os
from tqdm.notebook import tqdm

import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import *
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

# TENSORFLOW
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Input, GlobalAveragePooling1D, GlobalMaxPooling1D

# HUGGINGFACE
from tokenizers import BertWordPieceTokenizer
from transformers import TFAutoModel, AutoTokenizer, TFBertModel
from transformers import TFBertForSequenceClassification, TFTrainer, TFTrainingArguments
from transformers import AdamWeightDecay

In [7]:
# First load the tokenizer that belongs to the MODEL_NAME checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=624.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=109540.0, style=ProgressStyle(descripti…




In [8]:
import pickle
# Load the extra tokens that are later registered with the tokenizer
# (presumably a list of strings -- TODO confirm against the pickle's producer).
# NOTE(review): pickle.load can execute arbitrary code from the file; this is
# only safe as long as the dataset file is trusted.
with open("/kaggle/input/onecity/add_tokens_1201.pkl", 'rb') as f:
    add_tokens = pickle.load(f)

In [9]:
# Register the extra tokens with the tokenizer; the displayed value is the
# number of tokens actually added.
# NOTE(review): no visible cell resizes the encoder's embedding matrix to the
# enlarged vocabulary -- confirm the new token ids are valid model inputs.
tokenizer.add_tokens(add_tokens)

373

In [10]:
# The 20 target categories; a label's position in this list is its class id.
labels = [
    '文化休闲', '医疗卫生', '教育科技', '城乡建设',
    '工业', '交通运输', '生态环境', '经济管理',
    '政法监察', '农业畜牧业', '文秘行政', '劳动人事',
    '信息产业', '民政社区', '旅游服务', '商业贸易',
    '气象水文测绘地震地理', '资源能源', '财税金融', '外交外事',
]

# Forward (name -> id) and reverse (id -> name) lookups.
label_map = {name: position for position, name in enumerate(labels)}
inv_label_map = {position: name for name, position in label_map.items()}

In [11]:
# Punctuation marks that are kept alongside Chinese characters.
PUNCT_SET = set("#《》【】[]")

def is_chinese(uchar: str) -> bool:
    """Tell whether ``uchar`` should be kept by the Chinese-only filter.

    Returns True when ``uchar`` is one of the marks in ``PUNCT_SET`` or lies in
    the range U+4E00..U+9FA5 (inclusive); False otherwise.
    """
    return uchar in PUNCT_SET or '\u4e00' <= uchar <= '\u9fa5'

def reserve_chinese(content: str, threshold: int = 512) -> str:
    """Keep only the characters of ``content`` accepted by ``is_chinese``.

    Scanning stops as soon as ``threshold`` characters have been kept, so the
    result holds at most ``threshold`` characters when ``threshold`` is
    non-negative. A negative ``threshold`` is never reached, so nothing is
    cut off in that case.
    """
    kept = []
    for ch in content:
        if len(kept) == threshold:
            break
        if is_chinese(ch):
            kept.append(ch)
    return ''.join(kept)

In [12]:
# Load the preprocessed training set.
train_df = pd.read_csv("/kaggle/input/onecity/train_df_processed_1204_punctuations_aug.csv")
# Each 'text' cell is evaluated as a Python expression and its first element is
# kept (presumably the cell is a stringified list -- TODO confirm).
# NOTE(review): eval runs arbitrary expressions read from the CSV; if the cells
# are plain literals, ast.literal_eval would be the safer parser.
train_df['text'] = train_df['text'].apply(eval).apply(lambda x: x[0])
# Filter the filename with reserve_chinese (default cap on kept characters).
train_df['filename_chinese'] = train_df.filename.apply(reserve_chinese)
# Model input = filtered filename immediately followed by the text.
train_df['text'] = train_df['filename_chinese'] + train_df['text']

In [13]:
# Cut every text to MAX_LEN - 2 characters (presumably leaving room for the two
# special tokens the tokenizer adds -- TODO confirm). str() also turns
# non-string cells such as NaN into text.
train_df['text'] = train_df['text'].apply(lambda x: str(x)[:MAX_LEN - 2])

In [14]:
%%time
# Tokenize every training text, padding each sequence to MAX_LEN.
train_encoded = tokenizer.batch_encode_plus(
    train_df.text.values,
    pad_to_max_length=True,
    max_length=MAX_LEN
)

CPU times: user 1min 34s, sys: 185 ms, total: 1min 34s
Wall time: 1min 34s


In [15]:
# Model inputs: token ids only (no attention mask or token type ids are used).
x = np.array(train_encoded['input_ids'])
# Targets: class ids looked up from the label names.
# NOTE(review): a label missing from label_map would map to NaN -- verify the
# 'label' column only contains the names listed in `labels`.
y = train_df['label'].map(label_map).values

In [16]:
# Sanity check: inputs and targets must have the same number of rows.
len(x), len(y)

(60000, 60000)

In [17]:
def build_model(model_name, max_len, num_labels=20, learning_rate=3e-5):
    """Build and compile a transformer-based text classifier.

    The model feeds token ids through a pretrained encoder, takes the hidden
    state at sequence position 0 and applies a softmax layer over
    ``num_labels`` classes.

    Args:
        model_name: Checkpoint name or path given to ``from_pretrained``.
            ``'bert-base-chinese'`` is loaded via ``TFAutoModel``; any other
            value is loaded via ``TFBertModel`` with ``from_pt=True``.
        max_len: Fixed length of the ``input_ids`` sequences.
        num_labels: Number of output classes (defaults to 20, the previously
            hard-coded value).
        learning_rate: Adam learning rate (defaults to 3e-5, the previously
            hard-coded value).

    Returns:
        A compiled ``tf.keras`` ``Model`` mapping ``input_ids`` of shape
        ``(batch, max_len)`` to class probabilities of shape
        ``(batch, num_labels)``. It expects integer class ids as targets
        (``sparse_categorical_crossentropy``).
    """
    # Decide on the loader from the argument, not from the notebook-level
    # MODEL_NAME constant, so the function honours what the caller passes in.
    if model_name == 'bert-base-chinese':
        transformer_encoder = TFAutoModel.from_pretrained(model_name)
    else:
        transformer_encoder = TFBertModel.from_pretrained(model_name, from_pt=True)

    # NOTE(review): this notebook adds tokens to the tokenizer, but the
    # encoder's embedding matrix is not resized here -- confirm that ids of
    # added tokens are valid for the loaded checkpoint.

    # Token ids are the model's only input.
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")

    # Element 0 of the encoder output is the per-token hidden-state sequence.
    sequence_output = transformer_encoder(input_ids)[0]

    # Hidden state at position 0 (the [CLS] token for BERT-style inputs)
    # serves as the representation of the whole sequence.
    cls_token = sequence_output[:, 0, :]

    # Classification head: softmax over the label set.
    out = Dense(num_labels, activation='softmax')(cls_token)

    model = Model(inputs=input_ids, outputs=out)
    model.compile(
        Adam(learning_rate=learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

In [18]:
# Load the rows to be predicted and cut their text to MAX_LEN - 2 characters,
# the same cut applied to the training text.
test_df = pd.read_csv("/kaggle/input/onecity/rest_df_with_title.csv")
test_df["text"] = test_df["text"].apply(lambda x: str(x)[:MAX_LEN-2])

In [19]:
%%time
# Tokenize the prediction texts, padded to MAX_LEN.
test_encoded = tokenizer.batch_encode_plus(
    test_df["text"].values,
    max_length=MAX_LEN,
    pad_to_max_length=True,
)
x_test = np.array(test_encoded['input_ids'])

# Unlabelled, unshuffled batches: prediction order must match test_df rows.
test_dataset = tf.data.Dataset.from_tensor_slices(x_test).batch(BATCH_SIZE)

CPU times: user 371 ms, sys: 2.03 ms, total: 373 ms
Wall time: 371 ms


In [20]:
# Stratified splitter for the N_FOLDS cross-validation runs (no shuffle or
# seed is passed, so library defaults apply).
kfold = StratifiedKFold(n_splits=N_FOLDS)

# Placeholder columns.
# NOTE(review): no visible cell ever fills 'y_pred' or 'proba'.
train_df['y_pred'] = ""
train_df['proba'] = 0.0

In [21]:
# Cross-validated training: one freshly built model per fold. Each fold records
# its final validation accuracy and its class probabilities on the test set.
test_pred_results = []
all_accs = []
# X is given as `y` here -- presumably only its length matters to the splitter
# (TODO confirm); stratification uses the `y` argument.
for ii, (tr, tt) in enumerate(kfold.split(X=y, y=y)):
    # Re-create the distribution strategy for every fold (on TPU this
    # re-initialises the TPU system; presumably to start each fold from a
    # clean device state -- TODO confirm).
    if tpu:
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.experimental.TPUStrategy(tpu)
    elif len(gpus) > 1: # multiple GPUs in one VM
        strategy = tf.distribute.MirroredStrategy(gpus)
    else: # default strategy that works on CPU and single GPU
        strategy = tf.distribute.get_strategy()
    
    # Prepare KFold data
    y_train, y_valid = y[tr], y[tt]
    
    x_train = x[tr]
    x_valid = x[tt]

    # The training stream repeats forever, so fit() needs steps_per_epoch
    # (set below) to know where an epoch ends.
    train_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_train, y_train))
        .repeat()
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )
    # The validation stream is finite and unshuffled; its batches are cached.
    valid_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_valid, y_valid))
        .batch(BATCH_SIZE)
        .cache()
        .prefetch(AUTO)
    )

    # Build the model inside the strategy scope so it is created under the
    # fold's distribution strategy.
    with strategy.scope():
        model = build_model(MODEL_NAME, MAX_LEN)

    # One epoch = one pass over the fold's training rows (last partial batch dropped).
    n_steps = len(x_train) // BATCH_SIZE
    train_history = model.fit(
        train_dataset,
        steps_per_epoch=n_steps,
        validation_data=valid_dataset,
        epochs=EPOCHS
    )
    # Keep the validation accuracy of the last epoch (not the best epoch).
    acc = train_history.history['val_accuracy'][-1]
    all_accs.append(acc)
    
    # Prediction on test set
    test_probs = model.predict(test_dataset, verbose=1)
    test_pred_results.append(test_probs)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=478309336.0, style=ProgressStyle(descri…


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
# Final-epoch validation accuracy for each fold, then their mean.
print(all_accs)
print(np.mean(all_accs))

[0.9796666502952576, 0.9764166474342346, 0.9758332967758179, 0.9766666293144226, 0.9782499670982361]
0.9773666381835937


In [23]:
# Ensemble the folds: sum their class probabilities and take the arg-max class.
test_preds = np.argmax(np.sum(test_pred_results, axis=0), axis=-1)
# Translate class ids back to label names.
test_df['label'] = np.vectorize(inv_label_map.get)(test_preds)
# Write the filename/label pairs as the prediction file.
test_df[['filename', 'label']].to_csv("title_223_prediction.csv", index=False, encoding='utf-8')

In [24]:
# Wall-clock time elapsed since start_time was recorded in the first cell.
print(f"Total Running Time: {time() - start_time:.3f} seconds")

Total Running Time: 1448.462 seconds
