In [1]:
!pip -q install pandas --upgrade

[31mERROR: pandas-profiling 2.5.0 has requirement pandas==0.25.3, but you'll have pandas 1.0.3 which is incompatible.[0m
[31mERROR: hypertools 0.6.2 has requirement scikit-learn<0.22,>=0.19.1, but you'll have scikit-learn 0.22.2.post1 which is incompatible.[0m


In [2]:
import os
import pickle
import json

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.layers as L
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, average_precision_score, roc_auc_score, f1_score, accuracy_score
import matplotlib.pyplot as plt
import transformers as trfm
from transformers import AutoTokenizer, TFAutoModel
from tqdm.notebook import tqdm
from tokenizers import BertWordPieceTokenizer

## Helper functions

In [3]:
def build_reranker(tokenizer, model):
    """
    Build a function that scores candidate answers against a question.

    Padding is enabled on `tokenizer` (this mutates the object passed in) so
    that all encoded pairs of one call can be stacked into a single array.
    ---
    Inputs:
        tokenizer: object exposing `enable_padding()` and `encode_batch(pairs)`
        model: object exposing `predict(input_ids)`
    Returns:
        rerank(question, answers): returns an array with one score per
        answer, in the order of `answers` (empty array for no answers).
    """
    tokenizer.enable_padding()

    def rerank(question, answers):
        # Nothing to score: do not send an empty batch to the model.
        if len(answers) == 0:
            return np.array([])

        pairs = [(question, answer) for answer in answers]

        encs = tokenizer.encode_batch(pairs)
        input_ids = np.array([enc.ids for enc in encs])
        # squeeze() alone turns a single-answer result into a 0-d array that
        # cannot be indexed or iterated; atleast_1d keeps it one-dimensional.
        scores = np.atleast_1d(model.predict(input_ids).squeeze())

        return scores

    return rerank

In [4]:
def touch_dir(dirname):
    """
    Make sure `dirname` exists, creating it (including parents) when it is
    missing, and print which of the two cases applied.
    """
    if os.path.exists(dirname):
        print(f"Directory {dirname} already exists.")
        return

    os.makedirs(dirname)
    print(f"Created directory {dirname}.")

In [5]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512, enable_padding=False):
    """
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Encode `texts` in chunks of `chunk_size` and collect the token ids.
    Truncation (and optionally padding) is enabled on `tokenizer`, which
    mutates the object passed in.
    ---
    Inputs:
        texts: sliceable sequence of strings (numpy array, pandas Series,
            list or tuple)
        tokenizer: the `fast_tokenizer` that we imported from the tokenizers library
        chunk_size: number of texts handed to each `encode_batch` call
        maxlen: truncation length; also the padding length when
            `enable_padding` is set
        enable_padding: if True, enable padding to `maxlen` on the tokenizer
    Returns:
        np.array built from the per-text lists of ids
    """
    tokenizer.enable_truncation(max_length=maxlen)
    if enable_padding:
        tokenizer.enable_padding(max_length=maxlen)

    all_ids = []

    for i in tqdm(range(0, len(texts), chunk_size)):
        chunk = texts[i:i+chunk_size]
        # Arrays and Series expose .tolist(); plain lists and tuples do not.
        text_chunk = chunk.tolist() if hasattr(chunk, "tolist") else list(chunk)
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend(enc.ids for enc in encs)

    return np.array(all_ids)

In [6]:
def combine_qa_ids(q_ids, a_ids, tokenizer, maxlen=512):
    """
    Join question ids and answer ids (two arrays created by `fast_encode`)
    row by row as: cls, question ids, sep, answer ids, sep, then pad ids
    up to `maxlen`. Rows already longer than `maxlen` receive no padding
    and are not truncated.
    Inputs:
        tokenizer: The original tokenizer (not the fast_tokenizer)
    """
    def _join_row(question_ids, answer_ids):
        row = [tokenizer.cls_token_id, *question_ids, tokenizer.sep_token_id]
        row += [*answer_ids, tokenizer.sep_token_id]
        # A negative count yields an empty list, so over-long rows are left as is.
        row += [tokenizer.pad_token_id] * (maxlen - len(row))
        return row

    n_rows = q_ids.shape[0]
    rows = [_join_row(q_ids[idx], a_ids[idx]) for idx in tqdm(range(n_rows))]

    return np.array(rows)

In [7]:
def encode_qa(questions, answers, tokenizer, maxlen=512):
    """
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Encode each (question, answer) pair as one sequence. Truncation and
    padding to `maxlen` are enabled on `tokenizer`, which mutates the
    object passed in.
    ---
    Inputs:
        questions, answers: index-aligned sequences of strings; one pair is
            encoded per element of `questions`
        tokenizer: the `fast_tokenizer` (tokenizers library)
        maxlen: truncation and padding length
    Returns:
        np.array with one row of ids per pair
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)
    all_ids = []

    for i in tqdm(range(0, len(questions))):
        # The former debug branch that returned the raw (q, a) strings when an
        # encoding exceeded 512 ids is gone: it changed the return type and
        # ignored `maxlen`.
        encs = tokenizer.encode(questions[i], answers[i])
        all_ids.append(encs.ids)

    return np.array(all_ids)

In [8]:
def build_model(transformer, max_len=None):
    """
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Wrap `transformer` in a Keras binary classifier: the position-0 vector
    of the transformer's first output feeds a single sigmoid unit.
    ---
    Inputs:
        transformer: callable applied to the int32 id tensor; its first
            output is indexed as [:, 0, :]
        max_len: length of the input id sequence (None leaves it unspecified)
    Returns:
        Keras `Model` compiled with binary cross-entropy, accuracy and
        Adam at a learning rate of 1e-5
    """
    input_ids = L.Input(shape=(max_len, ), dtype=tf.int32)

    sequence_output = transformer(input_ids)[0]
    first_position = sequence_output[:, 0, :]
    # The layer name 'sigmoid' is looked up by load_model to restore weights.
    probability = L.Dense(1, activation='sigmoid', name='sigmoid')(first_position)

    # BUILD AND COMPILE MODEL
    model = Model(inputs=input_ids, outputs=probability)
    model.compile(
        loss='binary_crossentropy',
        metrics=['accuracy'],
        # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
        optimizer=Adam(learning_rate=1e-5)
    )

    return model

In [9]:
def load_model(sigmoid_dir, transformer_dir='transformer', architecture="electra", max_len=None):
    """
    Special function to load a keras model that uses a transformer layer
    ---
    Inputs:
        sigmoid_dir: directory containing `sigmoid.pickle`, the pickled
            weights of the layer named 'sigmoid'
        transformer_dir: directory handed to `from_pretrained`
        architecture: 'electra' loads through TFElectraModel; any other
            value loads through TFAutoModel
        max_len: forwarded to `build_model`
    Returns:
        the model from `build_model` with the 'sigmoid' layer weights restored
    """
    sigmoid_path = os.path.join(sigmoid_dir, 'sigmoid.pickle')

    if architecture == 'electra':
        # TFElectraModel is not imported by name in this notebook; going
        # through the `trfm` module alias avoids a NameError on this branch.
        transformer = trfm.TFElectraModel.from_pretrained(transformer_dir)
    else:
        transformer = TFAutoModel.from_pretrained(transformer_dir)
    model = build_model(transformer, max_len=max_len)

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(sigmoid_path, 'rb') as sigmoid_file:
        sigmoid = pickle.load(sigmoid_file)
    model.get_layer('sigmoid').set_weights(sigmoid)

    return model

In [10]:
# transformers tokenizer for the multilingual DistilBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")

# WordPiece tokenizer from the `tokenizers` library, built from the vocab file
# under the finetune-mdistilbert-on-healthtap input; casing is preserved.
fast_tokenizer = BertWordPieceTokenizer(
    '/kaggle/input/finetune-mdistilbert-on-healthtap/vocab.txt',
    lowercase=False,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=618.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




## Load Models

In [11]:
# Loaded models, keyed by a short model name.
models = {}

In [12]:
# sigmoid.pickle is read from the same directory as the transformer weights.
# `architecture` is not 'electra', so load_model takes its TFAutoModel branch.
models['mdistilbert_ht'] = load_model(
    sigmoid_dir='/kaggle/input/finetune-mdistilbert-on-healthtap/transformer',
    transformer_dir='/kaggle/input/finetune-mdistilbert-on-healthtap/transformer',
    architecture='distilbert-multilingual',
    max_len=None
)

models['mdistilbert_ht'].summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
tf_distil_bert_model (TFDist ((None, None, 768),)      134734080 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 768)]             0         
_________________________________________________________________
sigmoid (Dense)              (None, 1)                 769       
Total params: 134,734,849
Trainable params: 134,734,849
Non-trainable params: 0
_________________________________________________________________


In [13]:
# sigmoid.pickle is read from the same directory as the transformer weights.
# `architecture` is not 'electra', so load_model takes its TFAutoModel branch.
models['mdistilbert_se'] = load_model(
    sigmoid_dir='/kaggle/input/finetune-mdistilbert-on-stackexchange/transformer',
    transformer_dir='/kaggle/input/finetune-mdistilbert-on-stackexchange/transformer',
    architecture='distilbert-multilingual',
    max_len=None
)

models['mdistilbert_se'].summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
tf_distil_bert_model_1 (TFDi ((None, None, 768),)      134734080 
_________________________________________________________________
tf_op_layer_strided_slice_1  [(None, 768)]             0         
_________________________________________________________________
sigmoid (Dense)              (None, 1)                 769       
Total params: 134,734,849
Trainable params: 134,734,849
Non-trainable params: 0
_________________________________________________________________


## Load Data

In [14]:
# Truncation/padding length passed to encode_qa below.
MAX_LEN = 512

# The cells below read the columns question, answer, wrong_answer and language.
df = pd.read_csv('/kaggle/input/covidqa/multilingual.csv')

In [15]:
# Each question is encoded twice: once with its correct answer and once with a wrong answer.
# NOTE(review): astype(str) would turn missing values into the literal text 'nan' — confirm the CSV has none.
correct_ids = encode_qa(df.question.values.astype(str), df.answer.values.astype(str), fast_tokenizer, maxlen=MAX_LEN)
wrong_ids = encode_qa(df.question.values.astype(str), df.wrong_answer.values.astype(str), fast_tokenizer, maxlen=MAX_LEN)

HBox(children=(FloatProgress(value=0.0, max=888.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=888.0), HTML(value='')))




In [16]:
# Correct (question, answer) pairs first, wrong pairs after them.
input_ids = np.concatenate([correct_ids, wrong_ids])

# 1 for every correct pair, 0 for every wrong pair. Each count comes from its
# own array so the labels always line up with the rows of input_ids.
labels = np.concatenate([
    np.ones(correct_ids.shape[0]),
    np.zeros(wrong_ids.shape[0])
]).astype(np.int32)

## Compute Scores

In [17]:
# Two stacked copies of the language column, matching the row order of
# input_ids (correct pairs first, then wrong pairs).
# NOTE(review): no ignore_index is passed, so the index labels presumably repeat — confirm before aligning on index.
score_df = pd.concat([df[['language']]]*2)

# One score column per model, named after its key in `models`.
for model_name, model in models.items():
    %time score_df[model_name] = model.predict(input_ids, batch_size=64)

CPU times: user 3.81 s, sys: 1.1 s, total: 4.91 s
Wall time: 19 s
CPU times: user 3.71 s, sys: 1.07 s, total: 4.78 s
Wall time: 18.9 s


In [18]:
# Ground truth per row: 1 = correct answer pair, 0 = wrong answer pair.
score_df['labels'] = labels

In [19]:
# Save language, per-model scores and labels; the index is not written.
score_df.to_csv("multilingual.csv", index=False)

## Compute Prediction Results

### Overall scores (all languages pooled, i.e. micro-average)

In [20]:
# Metrics over all rows of score_df at once (every language pooled together).
overall = {}

# Kept under its own name: rebinding the notebook-level `labels` array here
# would make re-running the `score_df['labels'] = labels` cell depend on
# which cells ran before it.
y_true = score_df['labels']

for model_name in models.keys():
    y_score = score_df[model_name]
    # Rounding the sigmoid score gives the hard 0/1 prediction.
    y_pred = y_score.round().astype(int)
    overall[model_name] = {
        'ap': average_precision_score(y_true, y_score).round(4),
        'roc_auc': roc_auc_score(y_true, y_score).round(4),
        'f1_score': f1_score(y_true, y_pred).round(4),
        'accuracy': accuracy_score(y_true, y_pred).round(4),
    }

overall_df = pd.DataFrame(overall)
overall_df.to_csv("overall_results.csv")
overall_df

Unnamed: 0,mdistilbert_ht,mdistilbert_se
ap,0.7635,0.5611
roc_auc,0.7709,0.5963
f1_score,0.7219,0.688
accuracy,0.6222,0.5501


In [21]:
# Overall results table as LaTeX source.
print(overall_df.to_latex())

\begin{tabular}{lrr}
\toprule
{} &  mdistilbert\_ht &  mdistilbert\_se \\
\midrule
ap       &          0.7635 &          0.5611 \\
roc\_auc  &          0.7709 &          0.5963 \\
f1\_score &          0.7219 &          0.6880 \\
accuracy &          0.6222 &          0.5501 \\
\bottomrule
\end{tabular}



In [22]:
# Overall results table as a Markdown table.
print(overall_df.to_markdown())

|          |   mdistilbert_ht |   mdistilbert_se |
|:---------|-----------------:|-----------------:|
| ap       |           0.7635 |           0.5611 |
| roc_auc  |           0.7709 |           0.5963 |
| f1_score |           0.7219 |           0.688  |
| accuracy |           0.6222 |           0.5501 |


## By language

In [23]:
# One metrics table (rows: metric, columns: model) per language.
all_languages = {}

for language in df.language.unique():
    language_results = {}
    score_language_df = score_df[score_df.language == language]
    # Kept under its own name: rebinding the notebook-level `labels` array to
    # a per-language subset would corrupt a later re-run of the
    # `score_df['labels'] = labels` cell.
    y_true = score_language_df['labels']

    for model_name in models.keys():
        y_score = score_language_df[model_name]
        # Rounding the sigmoid score gives the hard 0/1 prediction.
        y_pred = y_score.round().astype(int)

        language_results[model_name] = {
            'ap': average_precision_score(y_true, y_score).round(4),
            'roc_auc': roc_auc_score(y_true, y_score).round(4),
            'f1_score': f1_score(y_true, y_pred).round(4),
            'accuracy': accuracy_score(y_true, y_pred).round(4),
        }

    all_languages[language] = pd.DataFrame(language_results)

### Regular output

In [24]:
# Per-language tables as plain text, each framed by a dashed and a double rule.
for language, sdf in all_languages.items():
    print(language, '-'*40, sdf, '='*40, sep='\n')

chinese
----------------------------------------
          mdistilbert_ht  mdistilbert_se
ap                0.8075          0.5281
roc_auc           0.8015          0.5402
f1_score          0.7352          0.6736
accuracy          0.6442          0.5215
english
----------------------------------------
          mdistilbert_ht  mdistilbert_se
ap                0.8191          0.6495
roc_auc           0.8249          0.6598
f1_score          0.7619          0.7081
accuracy          0.6948          0.5901
korean
----------------------------------------
          mdistilbert_ht  mdistilbert_se
ap                0.5926          0.5091
roc_auc           0.5371          0.4900
f1_score          0.6607          0.6667
accuracy          0.5065          0.5000
spanish
----------------------------------------
          mdistilbert_ht  mdistilbert_se
ap                0.7892          0.5546
roc_auc           0.8089          0.6023
f1_score          0.7494          0.7000
accuracy          0.6698  

### Latex output

In [25]:
# Per-language tables as LaTeX source, each framed by a dashed and a double rule.
for language, sdf in all_languages.items():
    print(language, '-'*40, sdf.to_latex(), '='*40, sep='\n')

chinese
----------------------------------------
\begin{tabular}{lrr}
\toprule
{} &  mdistilbert\_ht &  mdistilbert\_se \\
\midrule
ap       &          0.8075 &          0.5281 \\
roc\_auc  &          0.8015 &          0.5402 \\
f1\_score &          0.7352 &          0.6736 \\
accuracy &          0.6442 &          0.5215 \\
\bottomrule
\end{tabular}

english
----------------------------------------
\begin{tabular}{lrr}
\toprule
{} &  mdistilbert\_ht &  mdistilbert\_se \\
\midrule
ap       &          0.8191 &          0.6495 \\
roc\_auc  &          0.8249 &          0.6598 \\
f1\_score &          0.7619 &          0.7081 \\
accuracy &          0.6948 &          0.5901 \\
\bottomrule
\end{tabular}

korean
----------------------------------------
\begin{tabular}{lrr}
\toprule
{} &  mdistilbert\_ht &  mdistilbert\_se \\
\midrule
ap       &          0.5926 &          0.5091 \\
roc\_auc  &          0.5371 &          0.4900 \\
f1\_score &          0.6607 &          0.6667 \\
accuracy &       

### Markdown output

In [26]:
# Per-language tables as Markdown, each framed by a dashed and a double rule.
for language, sdf in all_languages.items():
    print(language, '-'*40, sdf.to_markdown(), '='*40, sep='\n')

chinese
----------------------------------------
|          |   mdistilbert_ht |   mdistilbert_se |
|:---------|-----------------:|-----------------:|
| ap       |           0.8075 |           0.5281 |
| roc_auc  |           0.8015 |           0.5402 |
| f1_score |           0.7352 |           0.6736 |
| accuracy |           0.6442 |           0.5215 |
english
----------------------------------------
|          |   mdistilbert_ht |   mdistilbert_se |
|:---------|-----------------:|-----------------:|
| ap       |           0.8191 |           0.6495 |
| roc_auc  |           0.8249 |           0.6598 |
| f1_score |           0.7619 |           0.7081 |
| accuracy |           0.6948 |           0.5901 |
korean
----------------------------------------
|          |   mdistilbert_ht |   mdistilbert_se |
|:---------|-----------------:|-----------------:|
| ap       |           0.5926 |           0.5091 |
| roc_auc  |           0.5371 |           0.49   |
| f1_score |           0.6607 |       

## AP score by language

In [27]:
# Take the 'ap' row of every per-language table, then flip the frame so that
# languages are rows and models are columns.
ap_df = pd.DataFrame(
    {lang: metrics.loc['ap'] for lang, metrics in all_languages.items()}
).transpose()
ap_df

Unnamed: 0,mdistilbert_ht,mdistilbert_se
chinese,0.8075,0.5281
english,0.8191,0.6495
korean,0.5926,0.5091
spanish,0.7892,0.5546
vietnamese,0.6264,0.5994
arabic,0.7339,0.5669
french,0.8605,0.5876
russian,0.7951,0.4844


In [28]:
# Per-language AP table as LaTeX source.
print(ap_df.to_latex())

\begin{tabular}{lrr}
\toprule
{} &  mdistilbert\_ht &  mdistilbert\_se \\
\midrule
chinese    &          0.8075 &          0.5281 \\
english    &          0.8191 &          0.6495 \\
korean     &          0.5926 &          0.5091 \\
spanish    &          0.7892 &          0.5546 \\
vietnamese &          0.6264 &          0.5994 \\
arabic     &          0.7339 &          0.5669 \\
french     &          0.8605 &          0.5876 \\
russian    &          0.7951 &          0.4844 \\
\bottomrule
\end{tabular}



In [29]:
# Per-language AP table as a Markdown table.
print(ap_df.to_markdown())

|            |   mdistilbert_ht |   mdistilbert_se |
|:-----------|-----------------:|-----------------:|
| chinese    |           0.8075 |           0.5281 |
| english    |           0.8191 |           0.6495 |
| korean     |           0.5926 |           0.5091 |
| spanish    |           0.7892 |           0.5546 |
| vietnamese |           0.6264 |           0.5994 |
| arabic     |           0.7339 |           0.5669 |
| french     |           0.8605 |           0.5876 |
| russian    |           0.7951 |           0.4844 |


## Macro scores (unweighted mean of the per-language scores)

In [30]:
# Unweighted mean of the per-language metric tables: every language counts
# equally, whatever its number of examples (a macro-average over languages,
# despite the variable name).
micro_df = (sum(all_languages.values()) / len(all_languages)).round(4)
micro_df

Unnamed: 0,mdistilbert_ht,mdistilbert_se
ap,0.753,0.56
roc_auc,0.7526,0.5807
f1_score,0.7119,0.6856
accuracy,0.6012,0.544


In [31]:
# Mean-over-languages table as LaTeX source.
print(micro_df.to_latex())

\begin{tabular}{lrr}
\toprule
{} &  mdistilbert\_ht &  mdistilbert\_se \\
\midrule
ap       &          0.7530 &          0.5600 \\
roc\_auc  &          0.7526 &          0.5807 \\
f1\_score &          0.7119 &          0.6856 \\
accuracy &          0.6012 &          0.5440 \\
\bottomrule
\end{tabular}



In [32]:
# Mean-over-languages table as a Markdown table.
print(micro_df.to_markdown())

|          |   mdistilbert_ht |   mdistilbert_se |
|:---------|-----------------:|-----------------:|
| ap       |           0.753  |           0.56   |
| roc_auc  |           0.7526 |           0.5807 |
| f1_score |           0.7119 |           0.6856 |
| accuracy |           0.6012 |           0.544  |
