In [1]:
# Update to transformers 2.8.0
!pip install -q transformers --upgrade
!pip show transformers

Name: transformers
Version: 2.8.0
Summary: State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch
Home-page: https://github.com/huggingface/transformers
Author: Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors
Author-email: thomas@huggingface.co
License: Apache
Location: /opt/conda/lib/python3.6/site-packages
Requires: sacremoses, sentencepiece, dataclasses, numpy, tokenizers, requests, tqdm, filelock, boto3, regex
Required-by: 


In [2]:
import os
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.layers as L
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, average_precision_score, roc_auc_score
import matplotlib.pyplot as plt
import transformers as trfm
from tqdm.notebook import tqdm
from tokenizers import BertWordPieceTokenizer

In [3]:
# Register tqdm with pandas so that `progress_apply`-style methods show a
# notebook progress bar. NOTE(review): no cell visible in this notebook uses it.
tqdm.pandas()

## Helper functions

In [4]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512, enable_padding=False):
    """
    Tokenize `texts` chunk by chunk and collect the token IDs of every text.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    ---

    Inputs:
        texts: sliceable sequence of strings (numpy array, pandas Series or
            plain list).
        tokenizer: the `fast_tokenizer` that we imported from the tokenizers library.
            Its truncation setting (and padding setting, if requested) is
            changed in place and stays changed after this call. When
            `enable_padding` is False the tokenizer's existing padding
            configuration is left untouched.
        chunk_size: number of texts handed to `encode_batch` at once.
        maxlen: `max_length` used for truncation (and padding).
        enable_padding: if True, enable padding with `max_length=maxlen`.

    Returns:
        A 2-D array (n_texts, n_tokens) when every encoding has the same
        length; otherwise a 1-D object array with one list of IDs per text.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    if enable_padding:
        tokenizer.enable_padding(max_length=maxlen)

    all_ids = []

    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i+chunk_size]
        # Arrays / Series are converted to a plain list of Python strings;
        # anything else (e.g. a list) is used as-is.
        if hasattr(text_chunk, 'tolist'):
            text_chunk = text_chunk.tolist()
        else:
            text_chunk = list(text_chunk)
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend(enc.ids for enc in encs)

    # Rectangular (or empty) result: a regular array, exactly as before.
    if len({len(ids) for ids in all_ids}) <= 1:
        return np.array(all_ids)

    # Ragged result (typical without padding): build the object array
    # explicitly. Relying on np.array() to do this implicitly is deprecated
    # and raises a ValueError on recent NumPy releases.
    ragged = np.empty(len(all_ids), dtype=object)
    for row, ids in enumerate(all_ids):
        ragged[row] = ids
    return ragged

In [5]:
def combine_qa_ids(q_ids, a_ids, tokenizer, maxlen=512):
    """
    Given two arrays of IDs (questions and answers) created by
    `fast_encode`, we combine and pad them.

    Each output row is laid out as
        [CLS] question [SEP] answer [SEP] [PAD] ...
    and has exactly `maxlen` entries. If a pair does not fit, the question is
    cut to at most `maxlen - 3` tokens first and the answer is cut to whatever
    room is left, so the row never exceeds `maxlen` (previously such rows were
    silently left too long, producing a ragged result).

    NOTE(review): this assumes `q_ids` / `a_ids` do not already contain
    [CLS]/[SEP] tokens -- confirm against how they were encoded.

    Inputs:
        q_ids: array with one sequence of question token IDs per row
            (must expose `.shape[0]`).
        a_ids: matching array of answer token IDs, indexed like `q_ids`.
        tokenizer: The original tokenizer (not the fast_tokenizer); only its
            `cls_token_id`, `sep_token_id` and `pad_token_id` are read.
        maxlen: length of every combined row.

    Returns:
        Array of shape (n_pairs, maxlen).

    Raises:
        ValueError: if `maxlen` is too small to hold the 3 special tokens.
    """
    if maxlen < 3:
        raise ValueError("maxlen must be at least 3 to hold [CLS] and two [SEP] tokens")

    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id
    pad_id = tokenizer.pad_token_id

    # Room left for real tokens once [CLS] and the two [SEP] are accounted for.
    budget = maxlen - 3
    combined_ids = []

    for i in tqdm(range(q_ids.shape[0])):
        question = list(q_ids[i])[:budget]
        answer = list(a_ids[i])[:budget - len(question)]

        ids = [cls_id, *question, sep_id, *answer, sep_id]
        ids.extend([pad_id] * (maxlen - len(ids)))

        combined_ids.append(ids)

    return np.array(combined_ids)

In [6]:
def encode_qa(questions, answers, tokenizer, chunk_size=256, maxlen=512):
    """
    Encode (question, answer) pairs jointly, chunk by chunk.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Truncation and padding are both enabled on `tokenizer` with
    `max_length=maxlen` (the tokenizer is modified in place). `questions` and
    `answers` must support slicing and `.tolist()`; they are paired up
    position by position.

    Returns:
        Array built from the `ids` of every encoded pair.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)

    pair_ids = []
    for start in tqdm(range(0, len(questions), chunk_size)):
        stop = start + chunk_size
        pairs = list(zip(questions[start:stop].tolist(),
                         answers[start:stop].tolist()))
        for encoding in tokenizer.encode_batch(pairs):
            pair_ids.append(encoding.ids)

    return np.array(pair_ids)

In [7]:
def truncate_text(text, tokenizer, chunk_size=256, maxlen=256):
    """
    Run `text` through the tokenizer with truncation set to `maxlen` tokens
    and return, for every entry, the encoding's `normalized_str` as a `str`.

    `text` must support slicing and `.tolist()`. The tokenizer's truncation
    setting is changed in place.
    """
    tokenizer.enable_truncation(max_length=maxlen)

    normalized = []
    for start in tqdm(range(0, len(text), chunk_size)):
        batch = text[start:start + chunk_size].tolist()
        normalized += [str(enc.normalized_str) for enc in tokenizer.encode_batch(batch)]

    return normalized

In [8]:
def build_model(transformer, max_len=None, learning_rate=1e-5):
    """
    Wrap a transformer in a Keras binary classifier.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Inputs:
        transformer: callable transformer layer; the first element of its
            output is indexed as (batch, position, hidden).
        max_len: length of the int32 token-ID input (None for variable length).
        learning_rate: step size for the Adam optimizer.

    Returns:
        A compiled Keras `Model` mapping token IDs to one sigmoid score,
        trained with binary cross-entropy and reporting accuracy.
    """
    input_ids = L.Input(shape=(max_len, ), dtype=tf.int32)

    sequence_output = transformer(input_ids)[0]
    # Keep only the hidden state at the first position
    # (presumably the [CLS] token for BERT-style models).
    first_token = sequence_output[:, 0, :]
    # The layer name 'sigmoid' is relied upon by save_model / load_model.
    score = L.Dense(1, activation='sigmoid', name='sigmoid')(first_token)

    # BUILD AND COMPILE MODEL
    model = Model(inputs=input_ids, outputs=score)
    model.compile(
        loss='binary_crossentropy',
        metrics=['accuracy'],
        # `learning_rate` is the supported argument name; `lr` is a
        # deprecated alias.
        optimizer=Adam(learning_rate=learning_rate)
    )

    return model

In [9]:
def save_model(model, sigmoid_dir='transformer', transformer_dir='transformer'):
    """
    Special function to save a keras model that uses a transformer layer.

    The transformer (taken from `model.layers[1]`) is written with its own
    `save_pretrained`, and the weights of the layer named 'sigmoid' are
    pickled to `<sigmoid_dir>/sigmoid.pickle`.

    Inputs:
        model: Keras model as produced by `build_model`.
        sigmoid_dir: directory receiving `sigmoid.pickle` (created if missing).
        transformer_dir: directory passed to `save_pretrained` (created if missing).
    """
    os.makedirs(transformer_dir, exist_ok=True)
    os.makedirs(sigmoid_dir, exist_ok=True)

    # layers[0] is the input layer, layers[1] the transformer.
    transformer = model.layers[1]
    transformer.save_pretrained(transformer_dir)

    sigmoid_path = os.path.join(sigmoid_dir, 'sigmoid.pickle')
    sigmoid = model.get_layer('sigmoid').get_weights()
    # Context manager guarantees the file is flushed and closed.
    with open(sigmoid_path, 'wb') as f:
        pickle.dump(sigmoid, f)

    
def load_model(sigmoid_dir='transformer', transformer_dir='transformer', 
               architecture="electra", max_len=None):
    """
    Special function to load a keras model that uses a transformer layer.

    Inputs:
        sigmoid_dir: directory containing `sigmoid.pickle`.
        transformer_dir: directory passed to `from_pretrained`.
        architecture: 'electra' loads with `TFElectraModel`; any other value
            loads with `TFAutoModel`.
        max_len: forwarded to `build_model`.

    Returns:
        The model returned by `build_model`, with the pickled weights set on
        its 'sigmoid' layer.
    """
    sigmoid_path = os.path.join(sigmoid_dir, 'sigmoid.pickle')

    if architecture == 'electra':
        transformer = trfm.TFElectraModel.from_pretrained(transformer_dir)
    else:
        transformer = trfm.TFAutoModel.from_pretrained(transformer_dir)
    model = build_model(transformer, max_len=max_len)

    # SECURITY: unpickling executes arbitrary code -- only load files that
    # this notebook (or another trusted source) produced.
    with open(sigmoid_path, 'rb') as f:
        sigmoid = pickle.load(f)
    model.get_layer('sigmoid').set_weights(sigmoid)

    return model

## TPU Configs

In [10]:
# Pick a distribution strategy that matches the available hardware.
try:
    # On Kaggle the TPU_NAME environment variable is always set, so the
    # resolver needs no arguments.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError here is treated as "no TPU available".
    tpu = None

if not tpu:
    # TensorFlow's default strategy; works on CPU and a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster, initialise it, then build the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [11]:
# Passed to `.prefetch()` so tf.data tunes the prefetch buffer size itself.
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
# Number of epochs passed to `model.fit`.
EPOCHS = 8
# Global batch size: 16 times the number of replicas in the strategy.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
# Length of the token-ID input the model is built with.
MAX_LEN = 512
# Name of the pretrained checkpoint handed to `TFAutoModel.from_pretrained`.
MODEL = 'distilbert-base-multilingual-cased'

## Convert text to matrices

Caveat: Since many of the questions on StackExchange exceed 256 tokens, encoding each question–answer pair jointly would truncate a large part (if not all) of the answer. Thus, we need to "pre"-truncate them by encoding the questions and answers separately, and then use a function to combine them again.

Note: We do not actually encode the text here. Instead, we load the encoded Q&A pairs produced by another notebook, in order to limit memory consumption.

In [12]:
# Pre-encoded token-ID matrices produced by a separate notebook:
# `correct_ids` become the positive examples and `wrong_ids` the negative
# examples when the labels are built below.
correct_ids = np.load('/kaggle/input/encode-stackexchange-for-mdistilbert/correct_ids.npy')
wrong_ids = np.load('/kaggle/input/encode-stackexchange-for-mdistilbert/wrong_ids.npy')

In [13]:
# Positives first, negatives second -- the labels below follow the same order.
input_ids = np.concatenate([correct_ids, wrong_ids])

# 1 for every row of `correct_ids`, 0 for every row of `wrong_ids`.
positive_labels = np.ones(correct_ids.shape[0])
negative_labels = np.zeros(wrong_ids.shape[0])
labels = np.concatenate([positive_labels, negative_labels]).astype(np.int32)

## Train test split

In [14]:
# Split row indices (not the data itself): 70% train, then the remaining 30%
# is halved into validation and test (15% each). Fixed seeds keep the split
# reproducible.
all_idx = np.arange(input_ids.shape[0])

train_idx, test_idx = train_test_split(all_idx, test_size=0.3, random_state=0)
valid_idx, test_idx = train_test_split(test_idx, test_size=0.5, random_state=1)

In [15]:
# Materialise each split by fancy-indexing with the index arrays from above.
train_ids, valid_ids, test_ids = (
    input_ids[idx] for idx in (train_idx, valid_idx, test_idx)
)

train_labels, valid_labels, test_labels = (
    labels[idx] for idx in (train_idx, valid_idx, test_idx)
)

## Build dataset objects

In [16]:
# Training: repeats indefinitely (so `steps_per_epoch` must be given to fit),
# shuffled through a 2048-element buffer, batched and prefetched.
train_dataset = tf.data.Dataset.from_tensor_slices((train_ids, train_labels))
train_dataset = train_dataset.repeat().shuffle(2048).batch(BATCH_SIZE).prefetch(AUTO)

# Validation: fixed order; batches are cached after the first pass.
valid_dataset = tf.data.Dataset.from_tensor_slices((valid_ids, valid_labels))
valid_dataset = valid_dataset.batch(BATCH_SIZE).cache().prefetch(AUTO)

# Test: inputs only, fixed order, so predictions line up with `test_labels`.
test_dataset = tf.data.Dataset.from_tensor_slices(test_ids).batch(BATCH_SIZE)

## Modeling

In [17]:
%%time
# Load the pretrained transformer and build/compile the classifier inside the
# distribution strategy's scope, then print the layer summary.
with strategy.scope():
    transformer_layer = trfm.TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=618.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=910749124.0, style=ProgressStyle(descri…


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 512)]             0         
_________________________________________________________________
tf_distil_bert_model (TFDist ((None, 512, 768),)       134734080 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 768)]             0         
_________________________________________________________________
sigmoid (Dense)              (None, 1)                 769       
Total params: 134,734,849
Trainable params: 134,734,849
Non-trainable params: 0
_________________________________________________________________
CPU times: user 31.7 s, sys: 10 s, total: 41.8 s
Wall time: 51.3 s


### Train model

In [18]:
# `train_dataset` repeats forever, so one "epoch" is defined explicitly as the
# number of full batches in the training split.
n_steps = train_labels.shape[0] // BATCH_SIZE

# NOTE(review): in the recorded history below, val_loss is lowest at the third
# epoch (0.2935) and rises to 0.3935 by the last one while training loss keeps
# falling. The weights saved afterwards are the final-epoch weights, not the
# best-validation ones -- consider fewer epochs or early stopping.
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

Train for 3209 steps, validate for 688 steps
Epoch 1/8
Epoch 2/8
Epoch 4/8
Epoch 5/8
Epoch 6/8


In [19]:
# Persist the transformer and the sigmoid head using the function's default
# directories (both 'transformer').
save_model(model)

In [20]:
# One row per epoch, one column per tracked metric; saved to CSV and displayed.
hist_df = pd.DataFrame(train_history.history)
hist_df.to_csv('train_history.csv')
hist_df

Unnamed: 0,loss,accuracy,val_loss,val_accuracy
0,0.359914,0.842623,0.302517,0.878112
1,0.289072,0.883197,0.295759,0.882781
2,0.263132,0.894148,0.293478,0.88537
3,0.240778,0.903538,0.305684,0.884564
4,0.218949,0.91284,0.320583,0.882781
5,0.197384,0.921941,0.345204,0.878703
6,0.176445,0.930593,0.372406,0.880611
7,0.156003,0.939221,0.393461,0.878612


## Eval

In [21]:
with strategy.scope():
    # Reload the model that `save_model` wrote to the default directories.
    # Any `architecture` other than 'electra' makes `load_model` use
    # TFAutoModel, so the value only needs to be descriptive: MODEL is a
    # DistilBERT checkpoint, not XLM-RoBERTa.
    model = load_model(max_len=MAX_LEN, architecture='distilbert')

In [22]:
# Sigmoid scores for the test split; squeeze() drops the size-1 output axis.
y_score = model.predict(test_dataset, verbose=1).squeeze()
# Hard 0/1 predictions obtained by rounding the scores.
y_pred = y_score.round().astype(int)
# Threshold-free metrics use the raw scores; the report uses the hard labels.
print("AP:", average_precision_score(test_labels, y_score))
print("ROC AUC:", roc_auc_score(test_labels, y_score))
print(classification_report(test_labels, y_pred))

AP: 0.9170765379994744
ROC AUC: 0.9382573931497918
              precision    recall  f1-score   support

           0       0.91      0.84      0.88     44307
           1       0.85      0.91      0.88     43734

    accuracy                           0.88     88041
   macro avg       0.88      0.88      0.88     88041
weighted avg       0.88      0.88      0.88     88041

