In [1]:
# Update to transformers 2.8.0
!pip install -q transformers --upgrade
!pip show transformers
!pip install -q pandas --upgrade

Name: transformers
Version: 2.8.0
Summary: State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch
Home-page: https://github.com/huggingface/transformers
Author: Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors
Author-email: thomas@huggingface.co
License: Apache
Location: /opt/conda/lib/python3.6/site-packages
Requires: filelock, sentencepiece, requests, boto3, regex, tokenizers, numpy, dataclasses, sacremoses, tqdm
Required-by: 
[31mERROR: pandas-profiling 2.5.0 has requirement pandas==0.25.3, but you'll have pandas 1.0.3 which is incompatible.[0m
[31mERROR: hypertools 0.6.2 has requirement scikit-learn<0.22,>=0.19.1, but you'll have scikit-learn 0.22.2.post1 which is incompatible.[0m


In [2]:
import os
import pickle
import json

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.layers as L
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, average_precision_score, roc_auc_score, f1_score, accuracy_score
import matplotlib.pyplot as plt
import transformers as trfm
from transformers import AutoTokenizer, TFAutoModel, TFElectraModel, ElectraTokenizer
from tqdm.notebook import tqdm
from tokenizers import BertWordPieceTokenizer

## Helper functions

In [3]:
def build_reranker(tokenizer, model):
    """
    Build a closure that scores candidate answers against one question.

    ---
    Inputs:
        tokenizer: fast tokenizer exposing `enable_padding()` and
            `encode_batch()`. Padding is switched on here (a side effect on
            the tokenizer) so every pair in a batch has the same length.
        model: object whose `predict(input_ids)` returns one score per row.

    Returns:
        `rerank(question, answers)`, which returns a 1-D array holding one
        score per answer, in the order of `answers`.
    """
    tokenizer.enable_padding()

    def rerank(question, answers):
        # Nothing to score: skip the tokenizer/model round trip entirely.
        if len(answers) == 0:
            return np.zeros(0, dtype=np.float32)

        pairs = [(question, answer) for answer in answers]

        encs = tokenizer.encode_batch(pairs)
        input_ids = np.array([enc.ids for enc in encs])
        scores = np.asarray(model.predict(input_ids)).squeeze()

        # squeeze() collapses a single-answer batch to a 0-d scalar; callers
        # always get an indexable 1-D array back.
        return np.atleast_1d(scores)

    return rerank

In [4]:
def touch_dir(dirname):
    """Create `dirname` (including parents) unless the path exists, and print which case applied."""
    if os.path.exists(dirname):
        print(f"Directory {dirname} already exists.")
        return

    os.makedirs(dirname)
    print(f"Created directory {dirname}.")

In [5]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512, enable_padding=False):
    """
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Tokenize `texts` in chunks and collect the token ids of every text.

    ---
    Inputs:
        texts: sliceable sequence of strings; a numpy array (as passed by
            this notebook) or a plain list.
        tokenizer: the `fast_tokenizer` that we imported from the tokenizers library
        chunk_size: number of texts handed to `encode_batch` at once
        maxlen: truncation length; also the padding length when
            `enable_padding` is True
        enable_padding: when True, padding to `maxlen` is switched on. When
            False the tokenizer's padding setting is left as it was.

    Returns:
        A 2-D array when every row has the same length; otherwise a 1-D
        object array with one list of ids per text.

    Note: truncation (and optionally padding) is configured on the passed
    tokenizer and stays configured after the call.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    if enable_padding:
        tokenizer.enable_padding(max_length=maxlen)

    all_ids = []

    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i+chunk_size]
        # numpy arrays / Series expose tolist(); plain lists are used as-is.
        text_chunk = text_chunk.tolist() if hasattr(text_chunk, 'tolist') else list(text_chunk)
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend(enc.ids for enc in encs)

    # Without padding the rows are ragged. Recent NumPy refuses to build a
    # ragged array implicitly, so request the object array explicitly.
    if len({len(ids) for ids in all_ids}) > 1:
        return np.array(all_ids, dtype=object)

    return np.array(all_ids)

In [6]:
def combine_qa_ids(q_ids, a_ids, tokenizer, maxlen=512):
    """
    Given two arrays of IDs (questions and answers) created by
    `fast_encode`, we combine and pad them.

    Every row is laid out as `[CLS] question [SEP] answer [SEP]` followed by
    padding up to `maxlen`.

    Inputs:
        q_ids: per-example question token ids
        a_ids: per-example answer token ids, aligned with `q_ids`
        tokenizer: The original tokenizer (not the fast_tokenizer); only its
            `cls_token_id`, `sep_token_id` and `pad_token_id` are read
        maxlen: length of every output row (must be >= 3 to hold the three
            special tokens)

    Returns:
        Array with one row of `maxlen` ids per example.

    If a pair does not fit, the answer tail is trimmed first and then the
    question, so the closing [SEP] is kept and every row has the same length.
    """
    n_special = 3  # [CLS] + two [SEP]
    budget = max(maxlen - n_special, 0)
    combined_ids = []

    for i in tqdm(range(len(q_ids))):
        question = list(q_ids[i])
        answer = list(a_ids[i])

        overflow = len(question) + len(answer) - budget
        if overflow > 0:
            answer = answer[:max(len(answer) - overflow, 0)]
            question = question[:budget]

        ids = [
            tokenizer.cls_token_id,
            *question,
            tokenizer.sep_token_id,
            *answer,
            tokenizer.sep_token_id,
        ]
        ids.extend([tokenizer.pad_token_id] * (maxlen - len(ids)))

        combined_ids.append(ids)

    return np.array(combined_ids)

In [7]:
def encode_qa(questions, answers, tokenizer, maxlen=512):
    """
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Encode every (question, answer) pair as one sequence pair.

    ---
    Inputs:
        questions: indexable sequence of question strings
        answers: indexable sequence of answer strings, aligned with `questions`
        tokenizer: fast tokenizer; truncation and padding to `maxlen` are
            switched on here and stay on after the call
        maxlen: truncation and padding length

    Returns:
        Array with one row of token ids per pair.

    Raises:
        ValueError: if the tokenizer returns more than `maxlen` ids for a pair.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)
    all_ids = []

    for i in tqdm(range(0, len(questions))):
        q = questions[i]
        a = answers[i]

        encs = tokenizer.encode(q, a)
        # Sanity check against the requested length. The previous version
        # compared against a hard-coded 512 and returned the offending
        # (q, a) tuple in place of the array.
        if len(encs.ids) > maxlen:
            raise ValueError(
                f"Pair {i} encoded to {len(encs.ids)} ids, more than maxlen={maxlen}."
            )
        all_ids.append(encs.ids)

    return np.array(all_ids)

In [8]:
def build_model(transformer, max_len=None):
    """
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Wrap a transformer encoder in a compiled Keras binary classifier.

    ---
    Inputs:
        transformer: callable Keras-compatible encoder; element [0] of its
            output is used as the per-token hidden states
        max_len: fixed sequence length of the input, or None for
            variable-length input

    Returns:
        Compiled `Model` mapping int32 token ids to one sigmoid score.
    """
    input_ids = L.Input(shape=(max_len, ), dtype=tf.int32)

    # Keep only the hidden state at position 0 of every sequence as the
    # pooled representation fed to the classification head.
    x = transformer(input_ids)[0]
    x = x[:, 0, :]
    # The layer name is load-bearing: `load_model` looks the head up by 'sigmoid'.
    x = L.Dense(1, activation='sigmoid', name='sigmoid')(x)

    # BUILD AND COMPILE MODEL
    model = Model(inputs=input_ids, outputs=x)
    model.compile(
        loss='binary_crossentropy',
        metrics=['accuracy'],
        # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
        optimizer=Adam(learning_rate=1e-5)
    )

    return model

In [9]:
def load_model(sigmoid_dir, transformer_dir='transformer', architecture="electra", max_len=None):
    """
    Special function to load a keras model that uses a transformer layer

    ---
    Inputs:
        sigmoid_dir: directory containing `sigmoid.pickle`, the pickled
            weights of the final 'sigmoid' Dense layer
        transformer_dir: directory (or model id) passed to `from_pretrained`
        architecture: 'electra' loads with `TFElectraModel`; any other value
            falls back to `TFAutoModel`
        max_len: forwarded to `build_model`

    Returns:
        The model built by `build_model`, with the pickled head weights applied.
    """
    sigmoid_path = os.path.join(sigmoid_dir,'sigmoid.pickle')

    if architecture == 'electra':
        transformer = TFElectraModel.from_pretrained(transformer_dir)
    else:
        transformer = TFAutoModel.from_pretrained(transformer_dir)
    model = build_model(transformer, max_len=max_len)

    # NOTE(security): unpickling executes arbitrary code; only point this at
    # weight files you produced yourself or otherwise trust.
    # The context manager closes the handle even if unpickling fails.
    with open(sigmoid_path, 'rb') as f:
        sigmoid = pickle.load(f)
    model.get_layer('sigmoid').set_weights(sigmoid)

    return model

In [10]:
# transformers tokenizer: supplies the special-token ids used when combining question/answer ids.
tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")

# tokenizers-library WordPiece tokenizer built from the dataset vocabulary;
# it lowercases input and is told not to insert special tokens itself.
fast_tokenizer = BertWordPieceTokenizer(
    '/kaggle/input/healthtap-joint-electra-small/vocab.txt',
    lowercase=True,
    add_special_tokens=False,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




## Load Models

In [11]:
# Registry of loaded rerankers, keyed by a short model name.
models = dict()

In [12]:
# ELECTRA-small from the healthtap-joint dataset: `sigmoid.pickle` is read from
# the dataset root, the encoder from its `transformer` sub-folder.
models.update(
    electra_ht_small=load_model(
        sigmoid_dir='/kaggle/input/healthtap-joint-electra-small',
        transformer_dir='/kaggle/input/healthtap-joint-electra-small/transformer',
        architecture='electra',
        max_len=None,
    )
)

models['electra_ht_small'].summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
tf_electra_model (TFElectraM ((None, None, 256),)      13483008  
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 256)]             0         
_________________________________________________________________
sigmoid (Dense)              (None, 1)                 257       
Total params: 13,483,265
Trainable params: 13,483,265
Non-trainable params: 0
_________________________________________________________________


In [13]:
# ELECTRA-base from the healthtap-joint dataset: `sigmoid.pickle` is read from
# the dataset root, the encoder from its `transformer` sub-folder.
models.update(
    electra_ht_base=load_model(
        sigmoid_dir='/kaggle/input/healthtap-joint-electra-base',
        transformer_dir='/kaggle/input/healthtap-joint-electra-base/transformer',
        architecture='electra',
        max_len=None,
    )
)

models['electra_ht_base'].summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
tf_electra_model_1 (TFElectr ((None, None, 768),)      108891648 
_________________________________________________________________
tf_op_layer_strided_slice_1  [(None, 768)]             0         
_________________________________________________________________
sigmoid (Dense)              (None, 1)                 769       
Total params: 108,892,417
Trainable params: 108,892,417
Non-trainable params: 0
_________________________________________________________________


In [14]:
# ELECTRA-small from the stackexchange-finetune dataset: here `sigmoid.pickle`
# and the encoder are both read from the same `transformer` folder.
models.update(
    electra_se_small=load_model(
        sigmoid_dir='/kaggle/input/stackexchange-finetune-electra-small/transformer',
        transformer_dir='/kaggle/input/stackexchange-finetune-electra-small/transformer',
        architecture='electra',
        max_len=None,
    )
)

models['electra_se_small'].summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
tf_electra_model_2 (TFElectr ((None, None, 256),)      13483008  
_________________________________________________________________
tf_op_layer_strided_slice_2  [(None, 256)]             0         
_________________________________________________________________
sigmoid (Dense)              (None, 1)                 257       
Total params: 13,483,265
Trainable params: 13,483,265
Non-trainable params: 0
_________________________________________________________________


In [15]:
# ELECTRA-base from the stackexchange-finetune dataset: here `sigmoid.pickle`
# and the encoder are both read from the same `transformer` folder.
models.update(
    electra_se_base=load_model(
        sigmoid_dir='/kaggle/input/stackexchange-finetune-electra-base/transformer',
        transformer_dir='/kaggle/input/stackexchange-finetune-electra-base/transformer',
        architecture='electra',
        max_len=None,
    )
)

models['electra_se_base'].summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
tf_electra_model_3 (TFElectr ((None, None, 768),)      108891648 
_________________________________________________________________
tf_op_layer_strided_slice_3  [(None, 768)]             0         
_________________________________________________________________
sigmoid (Dense)              (None, 1)                 769       
Total params: 108,892,417
Trainable params: 108,892,417
Non-trainable params: 0
_________________________________________________________________


## Load Data

In [16]:
# Length, in tokens, of every combined question + answer sequence.
MAX_LEN = 512

# Stack the three group files into one frame with a fresh 0..n-1 index.
df = pd.concat(
    [
        pd.read_csv(f'/kaggle/input/stackexchange-qa-pairs/covid/{group}.csv')
        for group in ['biomedical', 'expert', 'general']
    ],
    ignore_index=True,
)

# Question text = title and body joined by a literal ' [SEP] ' marker.
questions = df['title'] + ' [SEP] ' + df['question']

In [17]:
# Each side is truncated to MAX_LEN//2 - 2 tokens, so question + answer plus
# the three special tokens added when combining never exceeds MAX_LEN.
q_ids, a_ids, wa_ids = (
    fast_encode(texts.values, fast_tokenizer, maxlen=MAX_LEN//2 - 2)
    for texts in (questions, df.answer, df.wrong_answer)
)

# The same questions are paired once with the correct and once with the wrong answer.
correct_ids, wrong_ids = (
    combine_qa_ids(q_ids, answer_ids, tokenizer, maxlen=MAX_LEN)
    for answer_ids in (a_ids, wa_ids)
)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=642.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=642.0), HTML(value='')))




In [18]:
# Correct pairs first, wrong pairs second.
input_ids = np.concatenate((correct_ids, wrong_ids))

# Matching int32 targets: a block of ones, then an equally long block of zeros.
labels = np.repeat(np.array([1, 0], dtype=np.int32), correct_ids.shape[0])

## Compute Scores

In [19]:
score_df = pd.concat([df[['site', 'group']]]*2)

for model_name, model in models.items():
    %time score_df[model_name] = model.predict(input_ids, batch_size=64)

CPU times: user 4.5 s, sys: 350 ms, total: 4.85 s
Wall time: 9.23 s
CPU times: user 6.45 s, sys: 1.5 s, total: 7.95 s
Wall time: 29 s
CPU times: user 4.88 s, sys: 380 ms, total: 5.26 s
Wall time: 9.61 s
CPU times: user 6.28 s, sys: 1.48 s, total: 7.76 s
Wall time: 28.8 s


In [20]:
# Attach the ground-truth column next to the model scores.
score_df['labels'] = labels

In [21]:
# Persist scores and labels; the row index is not written to the file.
score_df.to_csv("community.csv", index=False)

## Compute Prediction Results

### Overall

In [22]:
overall = {}

# Ground truth is identical for every model, so read it once. It gets its own
# name so the `labels` array built earlier is not replaced by a Series.
y_true = score_df['labels']

for model_name in models.keys():
    score = score_df[model_name]
    # Rounding the sigmoid score to the nearest integer gives the 0/1 prediction.
    pred = score.round().astype(int)
    result = {
        # Ranking metrics use the raw scores ...
        'ap': average_precision_score(y_true, score),
        'roc_auc': roc_auc_score(y_true, score),
        # ... thresholded metrics use the 0/1 predictions.
        'f1_score': f1_score(y_true, pred),
        'accuracy': accuracy_score(y_true, pred),
    }
    overall[model_name] = result

# Rows are metrics, columns are models.
overall_df = pd.DataFrame(overall).round(4)
overall_df.to_csv("overall_results.csv")
overall_df

Unnamed: 0,electra_ht_small,electra_ht_base,electra_se_small,electra_se_base
ap,0.5609,0.6792,0.9429,0.9396
roc_auc,0.5898,0.7097,0.9559,0.9586
f1_score,0.6744,0.6817,0.8946,0.915
accuracy,0.5218,0.5374,0.891,0.912


In [23]:
# Emit the overall metrics table as LaTeX source.
print(overall_df.to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &  electra\_ht\_small &  electra\_ht\_base &  electra\_se\_small &  electra\_se\_base \\
\midrule
ap       &            0.5609 &           0.6792 &            0.9429 &           0.9396 \\
roc\_auc  &            0.5898 &           0.7097 &            0.9559 &           0.9586 \\
f1\_score &            0.6744 &           0.6817 &            0.8946 &           0.9150 \\
accuracy &            0.5218 &           0.5374 &            0.8910 &           0.9120 \\
\bottomrule
\end{tabular}



In [24]:
# Emit the overall metrics table as a Markdown table.
print(overall_df.to_markdown())

|          |   electra_ht_small |   electra_ht_base |   electra_se_small |   electra_se_base |
|:---------|-------------------:|------------------:|-------------------:|------------------:|
| ap       |             0.5609 |            0.6792 |             0.9429 |            0.9396 |
| roc_auc  |             0.5898 |            0.7097 |             0.9559 |            0.9586 |
| f1_score |             0.6744 |            0.6817 |             0.8946 |            0.915  |
| accuracy |             0.5218 |            0.5374 |             0.891  |            0.912  |


In [25]:
all_groups = {}

for group in df.group.unique():
    group_results = {}
    score_group_df = score_df[score_df.group == group]
    # Ground truth for this group, shared by every model. It gets its own name
    # so the global `labels` is not overwritten with a group subset.
    y_true = score_group_df['labels']

    for model_name in models.keys():
        score = score_group_df[model_name]
        # Rounding the sigmoid score to the nearest integer gives the 0/1 prediction.
        pred = score.round().astype(int)
        result = {
            'ap': average_precision_score(y_true, score),
            'roc_auc': roc_auc_score(y_true, score),
            'f1_score': f1_score(y_true, pred),
            'accuracy': accuracy_score(y_true, pred),
        }
        group_results[model_name] = result

    # One metrics-by-model table per group.
    all_groups[group] = pd.DataFrame(group_results).round(4)

In [26]:
# Metrics-by-model table restricted to the 'biomedical' group.
all_groups['biomedical']

Unnamed: 0,electra_ht_small,electra_ht_base,electra_se_small,electra_se_base
ap,0.5851,0.6902,0.9508,0.947
roc_auc,0.6189,0.7244,0.9679,0.9642
f1_score,0.6761,0.6806,0.9303,0.9254
accuracy,0.5306,0.5332,0.9286,0.9235


In [27]:
# Metrics-by-model table restricted to the 'general' group.
all_groups['general']

Unnamed: 0,electra_ht_small,electra_ht_base,electra_se_small,electra_se_base
ap,0.571,0.7097,0.9538,0.956
roc_auc,0.591,0.7276,0.962,0.9689
f1_score,0.6742,0.6843,0.8971,0.9258
accuracy,0.5183,0.5433,0.8933,0.9233


In [28]:
# Metrics-by-model table restricted to the 'expert' group.
all_groups['expert']

Unnamed: 0,electra_ht_small,electra_ht_base,electra_se_small,electra_se_base
ap,0.5265,0.6233,0.8994,0.8858
roc_auc,0.5521,0.6584,0.9159,0.9237
f1_score,0.6729,0.6776,0.8421,0.8795
accuracy,0.5171,0.5308,0.8356,0.8733


In [29]:
# One LaTeX table per group: group name, a dashed rule, the table, then a
# closing rule of '=' characters.
print("\n".join(
    "\n".join([name, "-"*40, all_groups[name].to_latex(), "="*40])
    for name in ('biomedical', 'general', 'expert')
))

biomedical
----------------------------------------
\begin{tabular}{lrrrr}
\toprule
{} &  electra\_ht\_small &  electra\_ht\_base &  electra\_se\_small &  electra\_se\_base \\
\midrule
ap       &            0.5851 &           0.6902 &            0.9508 &           0.9470 \\
roc\_auc  &            0.6189 &           0.7244 &            0.9679 &           0.9642 \\
f1\_score &            0.6761 &           0.6806 &            0.9303 &           0.9254 \\
accuracy &            0.5306 &           0.5332 &            0.9286 &           0.9235 \\
\bottomrule
\end{tabular}

general
----------------------------------------
\begin{tabular}{lrrrr}
\toprule
{} &  electra\_ht\_small &  electra\_ht\_base &  electra\_se\_small &  electra\_se\_base \\
\midrule
ap       &            0.5710 &           0.7097 &            0.9538 &           0.9560 \\
roc\_auc  &            0.5910 &           0.7276 &            0.9620 &           0.9689 \\
f1\_score &            0.6742 &           0.6843 &            0

In [30]:
# One Markdown table per group: group name, a dashed rule, the table, then a
# closing rule of '=' characters.
print("\n".join(
    "\n".join([name, "-"*40, all_groups[name].to_markdown(), "="*40])
    for name in ('biomedical', 'general', 'expert')
))

biomedical
----------------------------------------
|          |   electra_ht_small |   electra_ht_base |   electra_se_small |   electra_se_base |
|:---------|-------------------:|------------------:|-------------------:|------------------:|
| ap       |             0.5851 |            0.6902 |             0.9508 |            0.947  |
| roc_auc  |             0.6189 |            0.7244 |             0.9679 |            0.9642 |
| f1_score |             0.6761 |            0.6806 |             0.9303 |            0.9254 |
| accuracy |             0.5306 |            0.5332 |             0.9286 |            0.9235 |
general
----------------------------------------
|          |   electra_ht_small |   electra_ht_base |   electra_se_small |   electra_se_base |
|:---------|-------------------:|------------------:|-------------------:|------------------:|
| ap       |             0.571  |            0.7097 |             0.9538 |            0.956  |
| roc_auc  |             0.591  |           