In [1]:
# Update to transformers 2.8.0
!pip install -q transformers --upgrade
!pip install -q pandas --upgrade
!pip show transformers

[31mERROR: pandas-profiling 2.5.0 has requirement pandas==0.25.3, but you'll have pandas 1.0.3 which is incompatible.[0m
[31mERROR: hypertools 0.6.2 has requirement scikit-learn<0.22,>=0.19.1, but you'll have scikit-learn 0.22.2.post1 which is incompatible.[0m
Name: transformers
Version: 2.8.0
Summary: State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch
Home-page: https://github.com/huggingface/transformers
Author: Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors
Author-email: thomas@huggingface.co
License: Apache
Location: /opt/conda/lib/python3.6/site-packages
Requires: sentencepiece, requests, tokenizers, sacremoses, tqdm, numpy, regex, boto3, dataclasses, filelock
Required-by: 


In [2]:
import os
import pickle
import json

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.layers as L
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, average_precision_score, roc_auc_score, f1_score, accuracy_score
import matplotlib.pyplot as plt
import transformers as trfm
from transformers import AutoTokenizer, TFAutoModel, TFElectraModel, ElectraTokenizer
from tqdm.notebook import tqdm
from tokenizers import BertWordPieceTokenizer

## Helper functions

In [3]:
def build_reranker(tokenizer, model):
    """
    Build a function that scores candidate answers against a question.

    Inputs:
        tokenizer: a fast tokenizer exposing `enable_padding()` and
            `encode_batch(pairs)`. Padding is switched on here (this mutates
            the tokenizer) so every pair in a batch gets the same length.
        model: object whose `predict(input_ids)` returns the scores
            (assumed to be one score per input row -- TODO confirm).

    Returns:
        `rerank(question, answers)`, which returns a numpy array with the
        scores of `answers`, in the same order. The array is always at least
        1-D, including when a single answer is passed.
    """
    tokenizer.enable_padding()

    def rerank(question, answers):
        answers = list(answers)
        if not answers:
            # Nothing to score: do not send an empty batch to the model.
            return np.array([])

        pairs = [(question, answer) for answer in answers]

        encs = tokenizer.encode_batch(pairs)
        input_ids = np.array([enc.ids for enc in encs])
        scores = model.predict(input_ids).squeeze()

        # squeeze() collapses a single-answer result to a 0-d array, which
        # callers cannot index or iterate; restore the answer axis.
        if scores.ndim == 0:
            scores = scores.reshape(1)

        return scores

    return rerank

In [4]:
def touch_dir(dirname):
    """
    Ensure that `dirname` exists.

    If the path is already present nothing is created; otherwise the
    directory (and any missing parents) is made. Either way a one-line
    message reporting what happened is printed.
    """
    if os.path.exists(dirname):
        print(f"Directory {dirname} already exists.")
        return

    os.makedirs(dirname)
    print(f"Created directory {dirname}.")

In [5]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512, enable_padding=False):
    """
    Tokenize `texts` chunk by chunk and collect the token ids.

    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras
    
    ---
    Inputs:
        texts: the strings to encode; a numpy array, a pandas Series or a
            plain Python list/tuple.
        tokenizer: the `fast_tokenizer` that we imported from the tokenizers library
        chunk_size: number of texts handed to each `encode_batch` call.
        maxlen: truncation length, also used as the padding length when
            `enable_padding` is True.
        enable_padding: when True, padding to `maxlen` is switched on.

    Returns:
        A numpy array with one entry per text. When every encoding has the
        same length this is a regular 2-D array; otherwise it is a 1-D object
        array whose elements are the per-text lists of ids.

    Note: truncation (and optionally padding) is enabled on `tokenizer`
    itself; this function does not switch it back off.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    if enable_padding:
        tokenizer.enable_padding(max_length=maxlen)
    
    all_ids = []
    
    for start in tqdm(range(0, len(texts), chunk_size)):
        chunk = texts[start:start + chunk_size]
        # Arrays/Series expose .tolist(); plain sequences are copied with list().
        text_chunk = chunk.tolist() if hasattr(chunk, 'tolist') else list(chunk)
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend(enc.ids for enc in encs)
    
    # Without padding the rows usually differ in length. Recent NumPy refuses
    # to build an array from ragged lists unless dtype=object is explicit.
    if len({len(ids) for ids in all_ids}) > 1:
        return np.array(all_ids, dtype=object)

    return np.array(all_ids)

In [6]:
def combine_qa_ids(q_ids, a_ids, tokenizer, maxlen=512):
    """
    Given two arrays of IDs (questions and answers) created by
    `fast_encode`, we combine and pad them.

    Each output row is laid out as
        [CLS] question ids [SEP] answer ids [SEP] [PAD] ...
    and has exactly `maxlen` entries. A combined sequence longer than
    `maxlen` is cut from the right and its last kept position is set to the
    separator id, so the returned array is always rectangular.

    Inputs:
        q_ids: array of question ids; row i pairs with row i of `a_ids`.
        a_ids: array of answer ids.
        tokenizer: The original tokenizer (not the fast_tokenizer); only its
            `cls_token_id`, `sep_token_id` and `pad_token_id` are read.
        maxlen: length of every output row.

    Returns:
        np.ndarray of shape (q_ids.shape[0], maxlen).
    """
    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id
    pad_id = tokenizer.pad_token_id

    combined_ids = []

    for i in tqdm(range(q_ids.shape[0])):
        ids = [cls_id, *q_ids[i], sep_id, *a_ids[i], sep_id]

        if len(ids) > maxlen:
            # Too long: truncate but keep a closing separator. Previously the
            # row was left over-long, which made the result ragged.
            ids = ids[:maxlen - 1] + [sep_id]
        else:
            ids.extend([pad_id] * (maxlen - len(ids)))

        combined_ids.append(ids)
    
    return np.array(combined_ids)

In [7]:
def encode_qa(questions, answers, tokenizer, maxlen=512):
    """
    Encode each (question, answer) pair as a single sequence of token ids.

    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Inputs:
        questions: indexable sequence of question strings.
        answers: indexable sequence of answer strings; answers[i] is paired
            with questions[i].
        tokenizer: fast tokenizer exposing `enable_truncation`,
            `enable_padding` and `encode(q, a)`. Truncation and padding to
            `maxlen` are switched on here and are not switched back off.
        maxlen: truncation and padding length.

    Returns:
        np.ndarray with one row of ids per pair.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)
    all_ids = []
    
    for i in tqdm(range(0, len(questions))):
        # A leftover debugging branch used to `return q, a` as soon as an
        # encoding exceeded a hard-coded 512 ids, so callers using
        # maxlen > 512 received a tuple of strings instead of the id array.
        encs = tokenizer.encode(questions[i], answers[i])
        all_ids.append(encs.ids)
    
    return np.array(all_ids)

In [8]:
def build_model(transformer, max_len=None):
    """
    Wrap a transformer in a Keras model with a single sigmoid output.

    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Inputs:
        transformer: callable layer/model; element 0 of its output is
            indexed as (batch, position, hidden).
        max_len: length of the input id sequence; None leaves it variable.

    Returns:
        A compiled Keras model mapping int32 token ids to one sigmoid score
        per row (binary cross-entropy loss, accuracy metric, Adam at 1e-5).
    """
    input_ids = L.Input(shape=(max_len, ), dtype=tf.int32)
    
    x = transformer(input_ids)[0]
    # Keep only the vector at the first token position
    # (presumably the [CLS] token for these inputs -- confirm).
    x = x[:, 0, :]
    # The layer name 'sigmoid' is looked up by load_model to restore weights.
    x = L.Dense(1, activation='sigmoid', name='sigmoid')(x)
    
    # BUILD AND COMPILE MODEL
    model = Model(inputs=input_ids, outputs=x)
    model.compile(
        loss='binary_crossentropy', 
        metrics=['accuracy'], 
        # `lr` is a deprecated alias; `learning_rate` is the supported name.
        optimizer=Adam(learning_rate=1e-5)
    )
    
    return model

In [9]:
def load_model(sigmoid_dir, transformer_dir='transformer', architecture="electra", max_len=None):
    """
    Special function to load a keras model that uses a transformer layer

    Inputs:
        sigmoid_dir: directory containing `sigmoid.pickle`, the pickled
            weights of the final 'sigmoid' Dense layer.
        transformer_dir: path or name handed to `from_pretrained`.
        architecture: 'electra' loads with TFElectraModel; any other value
            falls back to TFAutoModel.
        max_len: forwarded to `build_model` as the input sequence length.

    Returns:
        The model produced by `build_model`, with the saved weights set on
        its 'sigmoid' layer.
    """
    sigmoid_path = os.path.join(sigmoid_dir, 'sigmoid.pickle')
    
    if architecture == 'electra':
        transformer = TFElectraModel.from_pretrained(transformer_dir)
    else:
        transformer = TFAutoModel.from_pretrained(transformer_dir)
    model = build_model(transformer, max_len=max_len)
    
    # NOTE(security): pickle.load can execute arbitrary code, so only point
    # `sigmoid_dir` at files you trust. The `with` block closes the handle,
    # which the previous bare open() never did.
    with open(sigmoid_path, 'rb') as sigmoid_file:
        sigmoid = pickle.load(sigmoid_file)
    model.get_layer('sigmoid').set_weights(sigmoid)
    
    return model

In [10]:
# Reference (slow) tokenizer, fetched by model name.
tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")

# Fast WordPiece tokenizer built from a local vocabulary file, lower-casing input.
fast_tokenizer = BertWordPieceTokenizer(
    '/kaggle/input/healthtap-joint-electra-small/vocab.txt',
    lowercase=True,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




## Load Models

In [11]:
# Registry of loaded rerankers, keyed by a short model name; the scoring and
# metric cells iterate over it, so its insertion order fixes the column order.
models = {}

In [12]:
# ELECTRA reranker stored under the "healthtap-joint-electra-small" dataset:
# sigmoid.pickle is read from `sigmoid_dir`, the transformer weights from
# its `transformer` sub-folder.
models['electra_ht_small'] = load_model(
    sigmoid_dir='/kaggle/input/healthtap-joint-electra-small',
    transformer_dir='/kaggle/input/healthtap-joint-electra-small/transformer',
    architecture='electra',
    max_len=None
)

# Print the layer/parameter summary as a sanity check of the loaded model.
models['electra_ht_small'].summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
tf_electra_model (TFElectraM ((None, None, 256),)      13483008  
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 256)]             0         
_________________________________________________________________
sigmoid (Dense)              (None, 1)                 257       
Total params: 13,483,265
Trainable params: 13,483,265
Non-trainable params: 0
_________________________________________________________________


In [13]:
# ELECTRA reranker stored under the "healthtap-joint-electra-base" dataset:
# sigmoid.pickle is read from `sigmoid_dir`, the transformer weights from
# its `transformer` sub-folder.
models['electra_ht_base'] = load_model(
    sigmoid_dir='/kaggle/input/healthtap-joint-electra-base',
    transformer_dir='/kaggle/input/healthtap-joint-electra-base/transformer',
    architecture='electra',
    max_len=None
)

# Print the layer/parameter summary as a sanity check of the loaded model.
models['electra_ht_base'].summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
tf_electra_model_1 (TFElectr ((None, None, 768),)      108891648 
_________________________________________________________________
tf_op_layer_strided_slice_1  [(None, 768)]             0         
_________________________________________________________________
sigmoid (Dense)              (None, 1)                 769       
Total params: 108,892,417
Trainable params: 108,892,417
Non-trainable params: 0
_________________________________________________________________


In [14]:
# ELECTRA reranker stored under the "stackexchange-finetune-electra-small"
# dataset. Here `sigmoid_dir` and `transformer_dir` are the same folder, so
# sigmoid.pickle is read from inside the transformer directory
# (NOTE(review): differs from the healthtap layout -- confirm it is intended).
models['electra_se_small'] = load_model(
    sigmoid_dir='/kaggle/input/stackexchange-finetune-electra-small/transformer',
    transformer_dir='/kaggle/input/stackexchange-finetune-electra-small/transformer',
    architecture='electra',
    max_len=None
)

# Print the layer/parameter summary as a sanity check of the loaded model.
models['electra_se_small'].summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
tf_electra_model_2 (TFElectr ((None, None, 256),)      13483008  
_________________________________________________________________
tf_op_layer_strided_slice_2  [(None, 256)]             0         
_________________________________________________________________
sigmoid (Dense)              (None, 1)                 257       
Total params: 13,483,265
Trainable params: 13,483,265
Non-trainable params: 0
_________________________________________________________________


In [15]:
# ELECTRA reranker stored under the "stackexchange-finetune-electra-base"
# dataset. Here `sigmoid_dir` and `transformer_dir` are the same folder, so
# sigmoid.pickle is read from inside the transformer directory
# (NOTE(review): differs from the healthtap layout -- confirm it is intended).
models['electra_se_base'] = load_model(
    sigmoid_dir='/kaggle/input/stackexchange-finetune-electra-base/transformer',
    transformer_dir='/kaggle/input/stackexchange-finetune-electra-base/transformer',
    architecture='electra',
    max_len=None
)

# Print the layer/parameter summary as a sanity check of the loaded model.
models['electra_se_base'].summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
tf_electra_model_3 (TFElectr ((None, None, 768),)      108891648 
_________________________________________________________________
tf_op_layer_strided_slice_3  [(None, 768)]             0         
_________________________________________________________________
sigmoid (Dense)              (None, 1)                 769       
Total params: 108,892,417
Trainable params: 108,892,417
Non-trainable params: 0
_________________________________________________________________


## Load Data

In [16]:
# Token length every encoded question/answer pair is truncated and padded to
# (passed as `maxlen` to encode_qa).
MAX_LEN = 512

# Evaluation set; the cells below read its `question`, `answer`,
# `wrong_answer` and `source` columns.
df = pd.read_csv('/kaggle/input/covidqa/news.csv')

In [17]:
# Encode every question twice: once paired with its correct answer and once
# with its wrong answer. astype(str) forces plain strings (a missing value
# would become the text 'nan').
correct_ids = encode_qa(df.question.values.astype(str), df.answer.values.astype(str), fast_tokenizer, maxlen=MAX_LEN)
wrong_ids = encode_qa(df.question.values.astype(str), df.wrong_answer.values.astype(str), fast_tokenizer, maxlen=MAX_LEN)

HBox(children=(FloatProgress(value=0.0, max=481.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=481.0), HTML(value='')))




In [18]:
# Stack the two encodings: rows for the correct answers first, then the rows
# for the wrong answers.
input_ids = np.concatenate([correct_ids, wrong_ids])

# 1 for every correct pair, 0 for every wrong pair, in the same row order as
# `input_ids`. The zero block is sized from `wrong_ids` (it used to reuse the
# length of `correct_ids`) so labels and inputs line up even if the two
# encodings ever differ in length.
labels = np.concatenate([
    np.ones(correct_ids.shape[0], dtype=np.int32),
    np.zeros(wrong_ids.shape[0], dtype=np.int32)
])

## Compute Scores

In [19]:
score_df = pd.concat([df[['source']]]*2)

for model_name, model in models.items():
    %time score_df[model_name] = model.predict(input_ids, batch_size=64)

CPU times: user 3.95 s, sys: 275 ms, total: 4.23 s
Wall time: 7.55 s
CPU times: user 5.36 s, sys: 1.09 s, total: 6.46 s
Wall time: 22.4 s
CPU times: user 4.15 s, sys: 234 ms, total: 4.39 s
Wall time: 7.64 s
CPU times: user 5.47 s, sys: 1.05 s, total: 6.52 s
Wall time: 22.4 s


In [20]:
# `labels` is the 0/1 array built together with `input_ids`, so its order
# (correct pairs first, then wrong pairs) matches the rows of score_df.
score_df['labels'] = labels

In [21]:
# Save the per-pair scores and labels to the working directory, without the
# index column. Note the file shares its base name with the input CSV read
# from /kaggle/input/covidqa/.
score_df.to_csv('news.csv', index=False)

## Compute Prediction Results

### Overall scores (all question–answer pairs pooled across sources)

In [22]:
overall = {}

# Ground truth is identical for every model, so read it once. It is bound to
# `y_true` rather than `labels` so the `labels` array built with `input_ids`
# is not replaced by a Series, which would make earlier cells unsafe to re-run.
y_true = score_df['labels']

for model_name in models.keys():
    score = score_df[model_name]
    # Hard 0/1 prediction: round the sigmoid score to the nearest integer.
    pred = score.round().astype(int)
    overall[model_name] = {
        'ap': average_precision_score(y_true, score).round(4),
        'roc_auc': roc_auc_score(y_true, score).round(4),
        'f1_score': f1_score(y_true, pred).round(4),
        'accuracy': accuracy_score(y_true, pred).round(4),
    }

# Rows are metrics, columns are models.
overall_df = pd.DataFrame(overall)
overall_df.to_csv("overall_results.csv")
overall_df

Unnamed: 0,electra_ht_small,electra_ht_base,electra_se_small,electra_se_base
ap,0.9038,0.9273,0.6691,0.7553
roc_auc,0.9186,0.9327,0.7164,0.8053
f1_score,0.8433,0.8527,0.7113,0.7762
accuracy,0.842,0.8524,0.659,0.7266


In [23]:
# Emit the overall metrics table as a LaTeX tabular.
print(overall_df.to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &  electra\_ht\_small &  electra\_ht\_base &  electra\_se\_small &  electra\_se\_base \\
\midrule
ap       &            0.9038 &           0.9273 &            0.6691 &           0.7553 \\
roc\_auc  &            0.9186 &           0.9327 &            0.7164 &           0.8053 \\
f1\_score &            0.8433 &           0.8527 &            0.7113 &           0.7762 \\
accuracy &            0.8420 &           0.8524 &            0.6590 &           0.7266 \\
\bottomrule
\end{tabular}



In [24]:
# Emit the overall metrics table as a pipe-delimited Markdown table.
print(overall_df.to_markdown())

|          |   electra_ht_small |   electra_ht_base |   electra_se_small |   electra_se_base |
|:---------|-------------------:|------------------:|-------------------:|------------------:|
| ap       |             0.9038 |            0.9273 |             0.6691 |            0.7553 |
| roc_auc  |             0.9186 |            0.9327 |             0.7164 |            0.8053 |
| f1_score |             0.8433 |            0.8527 |             0.7113 |            0.7762 |
| accuracy |             0.842  |            0.8524 |             0.659  |            0.7266 |


## By source

In [25]:
# Maps each news source to a DataFrame of metrics (rows) by model (columns).
all_sources = {}

for source in df.source.unique():
    score_source_df = score_df[score_df.source == source]
    # Ground truth for this source only. It is bound to `y_true` rather than
    # `labels`: rebinding `labels` to a per-source subset would corrupt
    # `score_df['labels'] = labels` if that cell were run again.
    y_true = score_source_df['labels']

    source_results = {}
    for model_name in models.keys():
        score = score_source_df[model_name]
        # Hard 0/1 prediction: round the sigmoid score to the nearest integer.
        pred = score.round().astype(int)
        source_results[model_name] = {
            'ap': average_precision_score(y_true, score).round(4),
            'roc_auc': roc_auc_score(y_true, score).round(4),
            'f1_score': f1_score(y_true, pred).round(4),
            'accuracy': accuracy_score(y_true, pred).round(4),
        }

    all_sources[source] = pd.DataFrame(source_results)

### Regular output

In [26]:
# Plain-text dump per source: name, dashed rule, metrics table, closing rule.
for source, sdf in all_sources.items():
    print(source, '-'*40, sdf, '='*40, sep='\n')

ABC Australia
----------------------------------------
          electra_ht_small  electra_ht_base  electra_se_small  electra_se_base
ap                  0.8968           0.8860            0.6931           0.7400
roc_auc             0.9062           0.8867            0.6914           0.7422
f1_score            0.8125           0.7333            0.7059           0.7442
accuracy            0.8125           0.7500            0.6875           0.6562
ABC News
----------------------------------------
          electra_ht_small  electra_ht_base  electra_se_small  electra_se_base
ap                  0.8825           0.9334            0.6492           0.6274
roc_auc             0.9259           0.9344            0.5995           0.6258
f1_score            0.8800           0.8649            0.6207           0.7200
accuracy            0.8750           0.8611            0.5417           0.6111
BBC
----------------------------------------
          electra_ht_small  electra_ht_base  electra_se_smal

### Latex output

In [27]:
# LaTeX dump per source: name, dashed rule, tabular source, closing rule.
for source, sdf in all_sources.items():
    print(source, '-'*40, sdf.to_latex(), '='*40, sep='\n')

ABC Australia
----------------------------------------
\begin{tabular}{lrrrr}
\toprule
{} &  electra\_ht\_small &  electra\_ht\_base &  electra\_se\_small &  electra\_se\_base \\
\midrule
ap       &            0.8968 &           0.8860 &            0.6931 &           0.7400 \\
roc\_auc  &            0.9062 &           0.8867 &            0.6914 &           0.7422 \\
f1\_score &            0.8125 &           0.7333 &            0.7059 &           0.7442 \\
accuracy &            0.8125 &           0.7500 &            0.6875 &           0.6562 \\
\bottomrule
\end{tabular}

ABC News
----------------------------------------
\begin{tabular}{lrrrr}
\toprule
{} &  electra\_ht\_small &  electra\_ht\_base &  electra\_se\_small &  electra\_se\_base \\
\midrule
ap       &            0.8825 &           0.9334 &            0.6492 &           0.6274 \\
roc\_auc  &            0.9259 &           0.9344 &            0.5995 &           0.6258 \\
f1\_score &            0.8800 &           0.8649 &         

### Markdown output

In [28]:
# Markdown dump per source: name, dashed rule, pipe table, closing rule.
for source, sdf in all_sources.items():
    print(source, '-'*40, sdf.to_markdown(), '='*40, sep='\n')

ABC Australia
----------------------------------------
|          |   electra_ht_small |   electra_ht_base |   electra_se_small |   electra_se_base |
|:---------|-------------------:|------------------:|-------------------:|------------------:|
| ap       |             0.8968 |            0.886  |             0.6931 |            0.74   |
| roc_auc  |             0.9062 |            0.8867 |             0.6914 |            0.7422 |
| f1_score |             0.8125 |            0.7333 |             0.7059 |            0.7442 |
| accuracy |             0.8125 |            0.75   |             0.6875 |            0.6562 |
ABC News
----------------------------------------
|          |   electra_ht_small |   electra_ht_base |   electra_se_small |   electra_se_base |
|:---------|-------------------:|------------------:|-------------------:|------------------:|
| ap       |             0.8825 |            0.9334 |             0.6492 |            0.6274 |
| roc_auc  |             0.9259 |       

## AP Score by source

In [29]:
# Take the 'ap' row of every per-source metrics table and stack them; after
# the transpose, rows are sources and columns are models.
ap_df = pd.DataFrame({source: sdf.loc['ap'] for source, sdf in all_sources.items()}).T
ap_df

Unnamed: 0,electra_ht_small,electra_ht_base,electra_se_small,electra_se_base
ABC Australia,0.8968,0.886,0.6931,0.74
ABC News,0.8825,0.9334,0.6492,0.6274
BBC,0.8977,0.9259,0.7382,0.8679
CNN,0.9525,0.9436,0.7052,0.8598
CTV,0.8225,0.9339,0.7062,0.8579
Forbes,0.7534,0.8302,0.7077,0.7361
LA Times,0.875,0.95,0.7095,0.6458
NDTV,0.8675,0.8915,0.679,0.7449
NPR,0.972,0.9637,0.6752,0.8085
NY Times,0.9604,0.9455,0.6489,0.8077


In [30]:
# Emit the per-source average-precision table as a LaTeX tabular.
print(ap_df.to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &  electra\_ht\_small &  electra\_ht\_base &  electra\_se\_small &  electra\_se\_base \\
\midrule
ABC Australia  &            0.8968 &           0.8860 &            0.6931 &           0.7400 \\
ABC News       &            0.8825 &           0.9334 &            0.6492 &           0.6274 \\
BBC            &            0.8977 &           0.9259 &            0.7382 &           0.8679 \\
CNN            &            0.9525 &           0.9436 &            0.7052 &           0.8598 \\
CTV            &            0.8225 &           0.9339 &            0.7062 &           0.8579 \\
Forbes         &            0.7534 &           0.8302 &            0.7077 &           0.7361 \\
LA Times       &            0.8750 &           0.9500 &            0.7095 &           0.6458 \\
NDTV           &            0.8675 &           0.8915 &            0.6790 &           0.7449 \\
NPR            &            0.9720 &           0.9637 &            0.6752 &           0.8085 \\
NY 

In [31]:
# Emit the per-source average-precision table as a pipe-delimited Markdown table.
print(ap_df.to_markdown())

|                |   electra_ht_small |   electra_ht_base |   electra_se_small |   electra_se_base |
|:---------------|-------------------:|------------------:|-------------------:|------------------:|
| ABC Australia  |             0.8968 |            0.886  |             0.6931 |            0.74   |
| ABC News       |             0.8825 |            0.9334 |             0.6492 |            0.6274 |
| BBC            |             0.8977 |            0.9259 |             0.7382 |            0.8679 |
| CNN            |             0.9525 |            0.9436 |             0.7052 |            0.8598 |
| CTV            |             0.8225 |            0.9339 |             0.7062 |            0.8579 |
| Forbes         |             0.7534 |            0.8302 |             0.7077 |            0.7361 |
| LA Times       |             0.875  |            0.95   |             0.7095 |            0.6458 |
| NDTV           |             0.8675 |            0.8915 |             0.679  |           

## Scores averaged across sources (unweighted mean of the per-source metrics)

In [32]:
# Element-wise mean of the per-source metric tables. Every source counts
# equally regardless of how many pairs it has, so despite the `micro_df`
# name this is an unweighted (macro-style) average over sources.
micro_df = (sum(all_sources.values()) / len(all_sources)).round(4)
micro_df

Unnamed: 0,electra_ht_small,electra_ht_base,electra_se_small,electra_se_base
ap,0.8976,0.923,0.6986,0.7776
roc_auc,0.906,0.9232,0.698,0.7994
f1_score,0.8142,0.8336,0.706,0.7716
accuracy,0.8189,0.8365,0.6579,0.7099


In [33]:
# Emit the source-averaged metrics table as a LaTeX tabular.
print(micro_df.to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &  electra\_ht\_small &  electra\_ht\_base &  electra\_se\_small &  electra\_se\_base \\
\midrule
ap       &            0.8976 &           0.9230 &            0.6986 &           0.7776 \\
roc\_auc  &            0.9060 &           0.9232 &            0.6980 &           0.7994 \\
f1\_score &            0.8142 &           0.8336 &            0.7060 &           0.7716 \\
accuracy &            0.8189 &           0.8365 &            0.6579 &           0.7099 \\
\bottomrule
\end{tabular}



In [34]:
# Emit the source-averaged metrics table as a pipe-delimited Markdown table.
print(micro_df.to_markdown())

|          |   electra_ht_small |   electra_ht_base |   electra_se_small |   electra_se_base |
|:---------|-------------------:|------------------:|-------------------:|------------------:|
| ap       |             0.8976 |            0.923  |             0.6986 |            0.7776 |
| roc_auc  |             0.906  |            0.9232 |             0.698  |            0.7994 |
| f1_score |             0.8142 |            0.8336 |             0.706  |            0.7716 |
| accuracy |             0.8189 |            0.8365 |             0.6579 |            0.7099 |
