## **Setup**

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)


# Common imports
import numpy as np
import os
import time
import memory_profiler
import pandas as pd
import random
import time
from IPython.display import HTML

# scikit-learn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_validate, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier

# NLP imports
import re
import transformers
from datasets import Dataset, ClassLabel, load_dataset, Features, Value
import torch

# Transformers
from transformers import AutoTokenizer, AutoModel, AutoAdapterModel


# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline 
import matplotlib as mpl 

import matplotlib.pyplot as plt
import matplotlib.image as mpimg

from plotly.offline import iplot
import plotly.express as px

import seaborn as sns
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Output locations, all resolved relative to the parent of the working directory
ROOT_DIR = "../"
DATA_PATH = os.path.join(ROOT_DIR, "data")
RESULTS_PATH = os.path.join(ROOT_DIR, "results")

# Create both folders up front; exist_ok=True makes re-running this cell harmless
os.makedirs(DATA_PATH, exist_ok=True)
os.makedirs(RESULTS_PATH, exist_ok=True)

## **Helper Functions**

In [6]:
@torch.no_grad()
def generate_embeddings(abstracts, tokenizer, model, device):
    """
    Generate [CLS], [SEP] and averaged-token embeddings with a BERT-style model.

    The abstracts are tokenized together (padded to the longest one and
    truncated to 512 tokens), run through the model once, and three
    representations are extracted. The tokenizer's attention mask is used to
    locate the last real token of every sequence, so padding never leaks into
    the [SEP] or averaged embeddings when abstracts of different lengths are
    batched.

    Args:
    abstracts : str or list of str
        Abstract text(s).
    tokenizer : callable
        Tokenizer (e.g. a Hugging Face ``BertTokenizerFast``). Its output must
        support ``.to(device)``, ``**`` unpacking and contain an
        ``"attention_mask"`` entry. Right-side padding is assumed.
    model : callable
        BERT-based model returning an object with ``hidden_states`` when
        called with ``output_hidden_states=True``.
    device : str, {"cuda", "cpu"}
        Device the tokenized inputs are moved to; the model is expected to
        already live there.

    Returns:
    embedding_cls : ndarray, shape (n_abstracts, hidden_size)
        Last-layer vector at position 0 of every abstract.
    embedding_sep : ndarray, shape (n_abstracts, hidden_size)
        Last-layer vector at the last non-padding position of every abstract.
    embedding_av : ndarray, shape (hidden_size,)
        Mean of the second-to-last-layer vectors over all content tokens
        (first token, last real token and padding excluded). The mean is
        pooled over the whole batch, so pass one abstract per call to obtain
        a per-abstract average.
    """

    # preprocess the input
    inputs = tokenizer(
        abstracts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=512,
    ).to(device)

    # inference
    outputs = model(**inputs, output_hidden_states=True)
    last_hidden = outputs.hidden_states[-1].cpu().detach()
    second_last = outputs.hidden_states[-2].cpu().detach()

    # With padding=True shorter sequences end in [PAD] tokens, so position -1
    # is only the final real token for the longest sequence in the batch.
    # The attention mask gives the true length of every sequence.
    attention_mask = inputs["attention_mask"].cpu()
    batch_index = torch.arange(last_hidden.shape[0])
    sep_positions = attention_mask.sum(dim=1) - 1

    embedding_cls = last_hidden[:, 0, :].numpy()
    embedding_sep = last_hidden[batch_index, sep_positions, :].numpy()

    # Content tokens = attended tokens minus the first and the last real one.
    content_mask = attention_mask.bool().clone()
    content_mask[:, 0] = False
    content_mask[batch_index, sep_positions] = False
    embedding_av = second_last[content_mask].mean(dim=0).numpy()

    return embedding_cls, embedding_sep, embedding_av

In [7]:
def knn_accuracy(selected_embeddings, true_labels, k=10, rs=42, **kwargs):
    """
    Calculate k-NN classification accuracy using stratified cross-validation.

    A ``KNeighborsClassifier`` (10 neighbors, cosine metric, distance
    weighting by default) is evaluated on ``selected_embeddings`` with a
    shuffled ``StratifiedKFold`` split.

    Parameters
    ----------
    selected_embeddings : array-like
        The input embeddings to be used for classification, one record per
        entry. Objects exposing ``.tolist()`` (ndarray, pd.Series of vectors)
        as well as plain sequences of vectors are accepted.
    true_labels : array-like
        Array of labels corresponding to the selected_embeddings records.
    k : int, default=10
        Number of folds/splits to use in k-fold cross-validation. This is NOT
        the number of neighbors; override that with ``n_neighbors=...``.
    rs : int, default=42
        Random seed used for the fold shuffling (and to seed NumPy's global
        random number generator).
    **kwargs : key=value, optional
        Optional keyword arguments applied to the kNN classifier through
        ``set_params`` (e.g. ``n_neighbors``, ``metric``).

    Returns
    -------
    dict
        A dictionary with the following keys:
            - 'accuracy': mean accuracy across all folds.
            - 'balanced_accuracy': mean balanced accuracy across all folds.
            - 'accuracy_std': two times the standard deviation of the
              per-fold accuracy.
            - 'balanced_accuracy_std': two times the standard deviation of
              the per-fold balanced accuracy.
    """

    # Seed NumPy's global RNG. np.random.seed returns None, so there is no
    # value worth keeping; the folds themselves are seeded via random_state.
    np.random.seed(rs)

    # Convert selected_embeddings to a NumPy array with proper shape.
    # Going through a list stacks a Series/array of per-record vectors into
    # one 2-D matrix; plain sequences have no .tolist() and are used as-is.
    if hasattr(selected_embeddings, "tolist"):
        records = selected_embeddings.tolist()
    else:
        records = list(selected_embeddings)
    feature_mat = np.array(records)

    # Create StratifiedKFold object
    kf = StratifiedKFold(n_splits=k, random_state=rs, shuffle=True)

    # Instantiate a k-NN model, then let the caller override any default
    knn = KNeighborsClassifier(n_neighbors=10, algorithm='auto', weights='distance', metric='cosine', n_jobs=-1)
    knn = knn.set_params(**kwargs)

    # Define the scoring metrics
    scoring_metrics = ('accuracy', 'balanced_accuracy')

    # Perform cross-validation and calculate the scores
    scores = cross_validate(knn, X=feature_mat, y=true_labels, scoring=scoring_metrics, cv=kf, n_jobs=-1)

    # Summarise the per-fold scores; the spread is reported as 2 * std
    fold_accuracy = scores['test_accuracy']
    fold_balanced = scores['test_balanced_accuracy']
    accuracy_dict = {
        'accuracy': fold_accuracy.mean(),
        'balanced_accuracy': fold_balanced.mean(),
        'accuracy_std': fold_accuracy.std() * 2,
        'balanced_accuracy_std': fold_balanced.std() * 2,
    }

    return accuracy_dict



In [8]:
def compare_embeddings(embeddings_all, y, k, strategy, tables=None, save=False, **kwargs):
    """
    Score several embedding sets with cross-validated kNN and tabulate the results.

    Every embedding set in ``embeddings_all`` is evaluated with
    ``knn_accuracy`` and its mean accuracy / balanced accuracy is written
    into the ``strategy`` column of two result tables. Rows are labelled with
    the entries of the module-level ``models_norm`` list (underscores
    replaced by spaces); the i-th embedding set is assigned to the i-th name.

    Parameters
    ----------
    embeddings_all : iterable
        The embedding sets to compare, in the same order as ``models_norm``.
    y : pd.Series
        Labels; ``y.name`` is used as prefix of the CSV file names.
    k : int
        Number of splits in the k-fold cross-validation.
    strategy : str
        Column to fill in: '[CLS]', '[SEP]' or 'AVG'.
    tables : tuple or list of pd.DataFrame, optional
        Existing ``(accuracy, balanced_accuracy)`` tables to fill in. They
        are modified in place. If None, new empty tables are created.
    save : bool, default=False
        If True, both tables are written as CSV files into ``RESULTS_PATH``.
    **kwargs : dict, optional
        Forwarded to ``knn_accuracy``.

    Returns
    -------
    pd.DataFrame, pd.DataFrame
        The accuracy table and the balanced accuracy table.
    """

    row_labels = [name.replace('_', ' ') for name in models_norm]

    if tables is not None:
        # Continue filling tables from a previous call
        accuracy, balanced_acc = tables[0], tables[1]
    else:
        # Start from empty tables: one row per model, one column per strategy
        accuracy = pd.DataFrame(index=row_labels, columns=['[CLS]', '[SEP]', 'AVG'])
        balanced_acc = pd.DataFrame(index=row_labels, columns=['[CLS]', '[SEP]', 'AVG'])

    # Evaluate each embedding set and store its scores in the strategy column
    for position, embedding_set in enumerate(embeddings_all):
        cv_scores = knn_accuracy(embedding_set, y, k=k, **kwargs)
        row = row_labels[position]
        accuracy.at[row, strategy] = cv_scores['accuracy']
        balanced_acc.at[row, strategy] = cv_scores['balanced_accuracy']

    label = y.name
    if save:
        # Persist both tables next to the other results
        accuracy_file = os.path.join(RESULTS_PATH, f'{label}_accuracy.csv')
        balanced_file = os.path.join(RESULTS_PATH, f'{label}_balanced_acc.csv')
        accuracy.to_csv(accuracy_file)
        balanced_acc.to_csv(balanced_file)

    return accuracy, balanced_acc


In [9]:
def chance_knn_accuracy(Zs, true_labels, k=10, rs=42, **kwargs):
    """Chance-level accuracy baseline.

    Cross-validates a ``DummyClassifier`` (strategy "stratified" by default)
    on every dataset in ``Zs`` and averages the resulting scores over the
    datasets.

    Parameters
    ----------
    Zs : list of array-like
        Datasets for which to calculate the chance accuracy. Each one must
        provide ``.tolist()``.
    true_labels : array-like
        Array with labels.
    k : int, default=10
        The number of splits in the stratified k-fold cross-validation.
    rs : int, default=42
        Random seed used for NumPy, the fold shuffling and the dummy classifier.
    **kwargs : dict, optional
        Parameters applied to the DummyClassifier through ``set_params``.

    Returns
    -------
    chance_dict : dict
        Scores averaged over the datasets in ``Zs``:
            - 'accuracy': mean accuracy across folds.
            - 'ba': mean balanced accuracy across folds.
            - 'accuracy_std': two times the per-fold accuracy standard deviation.
            - 'ba_std': two times the per-fold balanced accuracy standard deviation.
    """

    # One list of per-dataset values for every key of the returned dictionary
    collected = {'accuracy': [], 'ba': [], 'accuracy_std': [], 'ba_std': []}
    metric_names = ('accuracy', 'balanced_accuracy')

    for embeddings in Zs:
        # Stack the records into a NumPy feature matrix
        features = np.array(embeddings.tolist())

        # Re-seed before building the folds so each dataset gets the same setup
        np.random.seed(rs)
        folds = StratifiedKFold(n_splits=k, random_state=rs, shuffle=True)

        # Baseline classifier; caller-supplied parameters override the defaults
        baseline = DummyClassifier(strategy="stratified", random_state=rs)
        baseline.set_params(**kwargs)

        cv_scores = cross_validate(baseline, X=features, y=true_labels, scoring=metric_names, cv=folds, n_jobs=-1)
        fold_acc = cv_scores['test_accuracy']
        fold_bacc = cv_scores['test_balanced_accuracy']

        collected['accuracy'].append(fold_acc.mean())
        collected['ba'].append(fold_bacc.mean())
        collected['accuracy_std'].append(fold_acc.std() * 2)
        collected['ba_std'].append(fold_bacc.std() * 2)

    # Average every statistic over the datasets
    chance_dict = {name: np.mean(values) for name, values in collected.items()}

    return chance_dict

In [10]:
def df_display(df, title, decimals = 3, highlight = False):
    """
    Display a pandas DataFrame with specified formatting options and optionally highlight max and min values.

    Parameters:
        df (pandas.DataFrame): The DataFrame to display.
        title (str): The title to be displayed as the caption for the table.
        decimals (int, optional): The number of decimal places to display. Default is 3.
        highlight (bool, optional): If True, highlight the maximum and minimum column-wise values in the DataFrame. Default is False.

    Returns:
        pandas.io.formats.style.Styler: A Styler object with the specified formatting and, optionally, highlighted values.
    """

    # Create a style object with the specified formatting options
    style = (
        df.style
        .set_caption(title)                                                             # Set the caption for the table
        .set_precision(decimals)                                                        # Set the number of decimal places to display
        .set_properties(**{'font-size': '13pt'})                                        # Set the font size for the cells
        .set_table_styles([{'selector': 'caption', 'props': [('font-size', '14pt')]}])  # Set the font size for the caption
    )
    
    # If highlight is True, apply the highlight_max and highlight_min methods to the style object
    if highlight:
        style = style.highlight_max(axis=0, color='#90EE90').highlight_min(axis=0, color='#FFB6C1')
        
    return style