In [None]:
# Environment check: the bare `sys.version` expression on the last line is what
# the notebook echoes as this cell's output (the interpreter version string).
import sys
import os
sys.version

'3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]'

Mounted at /content/drive
Pytorch version: 2.2.1+cu121
Device name: Tesla T4


In [None]:
# import libraries
# -- standard library
import random
import time

# -- third-party
import torch
import torch.nn as nn
import torchtext
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler
import torch.optim as optim
import torch.nn.functional as F
from transformers import AdamW, DistilBertTokenizerFast, DistilBertModel
import optuna
from optuna.trial import TrialState
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.random_projection import SparseRandomProjection

# useful .py
from settings import * # settings
from dataset import * # data pre-processing
from model import * # models
from optimization import * # model selection, training, evaluation
from uncertainty import * # uncertainty quantification and xai

import warnings
# Install a blanket "ignore" filter: every warning category is silenced for the
# rest of the session (this includes deprecation / future warnings from libraries).
warnings.simplefilter('ignore')
# pandas rendering: show up to 500 columns and wrap text output at 100 characters,
# so the wide result tables built below are displayed without column truncation.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 100)

### Experiments for TTT, TTT-SRP, non-pretrained baselines (EarlyConcat, LateFuse, MulT, TFN), and ablation studies


*   performance
*   uncertainty quantification (SD, LAC, MCD)



In [None]:
# load settings
DATASET = "cloth" # choose in {"airbnb", "cloth", "jigsaw", "kick", "pet", "salary", "wine_10", "wine_100"}
FILENAME, categorical_var, numerical_var, text_var, MAX_LEN_QUANTILE, N_CLASSES, WEIGHT_DECAY, FACTOR, N_EPOCHS, split_val, CRITERION, N_SEED, DROPOUT = load_settings(dataset = DATASET)

# Model families that share a code path below (declared once instead of repeating the lists).
# - DISTILBERT_INIT_MODELS: use the DistilBERT tokenizer / text model and hp_optimization_large
# - BASELINE_MODELS: performance() returns a single score, no per-branch predictions
# - ONE_HOT_MODELS: categorical variables are one-hot encoded instead of ordinal encoded
DISTILBERT_INIT_MODELS = ["TTT-SRP", "TTT-PCA", "TTT-Kaiming"]
BASELINE_MODELS = ["MulT", "EarlyConcat", "LateFuse", "TFN"]
ONE_HOT_MODELS = ["LateFuse", "TFN"]
# value of the `init_weights` argument handed to hp_optimization_large for each model type
INIT_WEIGHTS = {"TTT-SRP": "random_proj", "TTT-PCA": "pca", "TTT-Kaiming": "kaiming"}

# GPU or CPU (does not depend on seed / model type, so it is resolved once)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# perf_results: one row per (seed, model type) run, filled column by column.
# uncertainty_results / val_uncertainty_results: per-sample labels and predictions on the
# test (target) and validation sets; columns are keyed by seed only, so within a seed each
# model type overwrites the columns written by the previous one.
perf_results = pd.DataFrame()
uncertainty_results = pd.DataFrame()
val_uncertainty_results = pd.DataFrame()
i = 0

for SEED in range(N_SEED):

    for MODEL_TYPE in ["MulT", "EarlyConcat", "LateFuse", "TFN", "TTT", "TTT-SRP", "TTT_ablation1", "TTT_ablation2", "TTT_ablation3", "TTT-PCA", "TTT-Kaiming"]:

        start = time.time()
        perf_results.loc[i,"seed"] = SEED
        perf_results.loc[i,"model type"] = MODEL_TYPE

        uses_distilbert = MODEL_TYPE in DISTILBERT_INIT_MODELS

        # load and prepare dataset
        df = preprocess_dataset(DATASET, MODEL_TYPE)

        # Train/Test split (from here on `df` is the training+validation part, `target` the test part)
        df, target = train_test_split(df, test_size = split_val, random_state = SEED)

        if uses_distilbert:
            PATIENCE = 1
            # text cleaning (keep only words and numbers)
            df['clean_text'] = df[text_var].apply(clean_text)
            target['clean_text'] = target[text_var].apply(clean_text)

            # Load the specific tokenizer and text model
            tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased', do_lower_case=True)
            text_model = DistilBertModel.from_pretrained("distilbert-base-uncased").to(device)

            # text max length: MAX_LEN_QUANTILE quantile of the tokenized lengths on the training part
            token_counts = df['clean_text'].apply(lambda text: len(tokenizer(text).input_ids))
            MAX_LEN = int(np.quantile(token_counts.values, q = [MAX_LEN_QUANTILE]).item())
            MAX_LEN = min(MAX_LEN, 512) # maximum sequence length is 512 for DistilBERT

            # vocabulary size is the one of the pretrained word-embedding table
            VOCAB_SIZE = text_model.embeddings.word_embeddings.num_embeddings

        else:
            PATIENCE = 4
            # text max length: MAX_LEN_QUANTILE quantile of the whitespace-split word counts
            word_counts = df[text_var].apply(lambda text: len(text.split()))
            MAX_LEN = int(np.quantile(word_counts.values, q = [MAX_LEN_QUANTILE]).item())

            # vocabulary construction (built on the training part only)
            vocab2index, VOCAB_SIZE, words = vocabulary(df, text_field = text_var)

            # encode text on Source and Target
            df['encoded_var'] = df[text_var].apply(lambda x: encode_sentence(x,vocab2index,max_len=MAX_LEN))
            target['encoded_var'] = target[text_var].apply(lambda x: encode_sentence(x,vocab2index,max_len=MAX_LEN))

        perf_results.loc[i,"max text length"] = MAX_LEN
        perf_results.loc[i,"vocab size"] = VOCAB_SIZE

        # Numerical variables pre-processing
        numerical_var_scaled = standardScaling(df, target, numerical_var)
        # per-variable quintile boundaries (0%, 20%, ..., 100%) computed on the training part
        QUANTILES = [np.quantile(df[var].values, q = [0., 0.2, 0.4, 0.6, 0.8, 1.]) for var in numerical_var_scaled]
        NUM_NUMERICAL_VAR = len(numerical_var)

        # Categorical variables pre-processing
        if MODEL_TYPE in ONE_HOT_MODELS:
            categorical_var_oe = oneHotEncoding(df, target, categorical_var)
            NUM_CAT_VAR = len(categorical_var_oe)
            CAT_VOCAB_SIZES = None
        else:
            categorical_var_oe, CAT_VOCAB_SIZES = ordinalEncoding(df, target, categorical_var)
            NUM_CAT_VAR = len(categorical_var)

        # train / validation split
        df_train, df_validation = train_test_split(df, test_size = split_val, random_state = SEED)
        perf_results.loc[i,"training size"] = df_train.shape[0]
        perf_results.loc[i,"test size"] = target.shape[0]

        if uses_distilbert:
            # prepare the Tensor Datasets, including tokenization
            dataset_train = prepareTensorDatasetWithTokenizer(df_train, "clean_text", categorical_var_oe, numerical_var_scaled, 'Y', tokenizer, MAX_LEN, special_tokens=False, model_type = MODEL_TYPE)
            dataset_validation = prepareTensorDatasetWithTokenizer(df_validation, "clean_text", categorical_var_oe, numerical_var_scaled, 'Y', tokenizer, MAX_LEN, special_tokens=False, model_type = MODEL_TYPE)
            dataset_target = prepareTensorDatasetWithTokenizer(target, "clean_text", categorical_var_oe, numerical_var_scaled, 'Y', tokenizer, MAX_LEN, special_tokens=False, model_type = MODEL_TYPE)

        else:
            # prepare custom datasets
            dataset_train, dataset_validation, dataset_target = prepareCustomDatasets(df_train, df_validation, target,
                                                                                  encoded_text_var = 'encoded_var',
                                                                                  categorical_var = categorical_var_oe,
                                                                                  numerical_var = numerical_var_scaled,
                                                                                  label = 'Y')

        # HPO
        if uses_distilbert:
            init_weights = INIT_WEIGHTS[MODEL_TYPE]
            best_params = hp_optimization_large(MODEL_TYPE,dataset_train, dataset_validation,
                                              text_model, init_weights,
                                              MAX_LEN, VOCAB_SIZE, CAT_VOCAB_SIZES, NUM_CAT_VAR, NUM_NUMERICAL_VAR, QUANTILES,
                                              criterion=CRITERION, seed=SEED, device = device, dataset = DATASET)

        else:
            best_params = hp_optimization(MODEL_TYPE,dataset_train, dataset_validation,
                                          MAX_LEN, VOCAB_SIZE, CAT_VOCAB_SIZES, NUM_CAT_VAR, NUM_NUMERICAL_VAR, QUANTILES,
                                          criterion=CRITERION, seed=SEED, device = device, dataset = DATASET)

        LR, BATCH_SIZE, D_MODEL, N_LAYERS, N_HEADS = best_params['LR'],best_params['BATCH_SIZE'],best_params['D_MODEL'],best_params['N_LAYERS'],best_params['N_HEADS']

        # same dimension for Feed Forward and Fully Connected
        D_FF = D_MODEL
        D_FC = D_MODEL

        perf_results.loc[i,"LR"] = LR
        perf_results.loc[i,"BATCH_SIZE"] = BATCH_SIZE
        perf_results.loc[i,"D_MODEL"] = D_MODEL
        perf_results.loc[i,"N_LAYERS"] = N_LAYERS
        perf_results.loc[i,"N_HEADS"] = N_HEADS

        # data loaders
        if uses_distilbert:
            loader_train = DataLoader(dataset_train, sampler = RandomSampler(dataset_train), batch_size = BATCH_SIZE)
            loader_validation = DataLoader(dataset_validation, sampler = SequentialSampler(dataset_validation),batch_size = BATCH_SIZE)
            loader_target = DataLoader(dataset_target, sampler = SequentialSampler(dataset_target),batch_size = BATCH_SIZE)
        else:
            loader_train = DataLoader(dataset_train, batch_size = BATCH_SIZE, shuffle = True)
            # NOTE(review): the validation loader is shuffled on this path (unlike the
            # DistilBERT path above); kept as-is so existing results are not altered.
            loader_validation = DataLoader(dataset_validation, batch_size = BATCH_SIZE, shuffle = True)
            loader_target = DataLoader(dataset_target, batch_size = BATCH_SIZE)

        # model initialization
        model = init_model(model_type = MODEL_TYPE,
                           d_model = D_MODEL,
                           max_len = MAX_LEN,
                           vocab_size = VOCAB_SIZE,
                           cat_vocab_sizes = CAT_VOCAB_SIZES,
                           num_cat_var = NUM_CAT_VAR,
                           num_numerical_var = NUM_NUMERICAL_VAR,
                           quantiles = QUANTILES,
                           n_heads = N_HEADS,
                           d_ff = D_FF,
                           n_layers = N_LAYERS,
                           dropout = DROPOUT,
                           d_fc = D_FC,
                           n_classes = N_CLASSES,
                           seed = SEED,
                           device=device).to(device)

        # specific initialization for input embeddings:
        # project the DistilBERT word embeddings down to D_MODEL columns and load the
        # result into the model's 'text_embedding.weight' entry
        if MODEL_TYPE in ["TTT-SRP", "TTT-PCA"]:
            distil_bert_embeddings = text_model.state_dict()['embeddings.word_embeddings.weight']

            # dimension reduction
            if MODEL_TYPE == "TTT-PCA":
                reduction_technique = PCA(n_components=D_MODEL, random_state = SEED)
            else:
                reduction_technique = SparseRandomProjection(n_components=D_MODEL, random_state = SEED)
            reduced_embeddings = torch.tensor(reduction_technique.fit_transform(distil_bert_embeddings.cpu().numpy())).to(device)

            # update embeddings (all other entries of the state dict are left untouched)
            model_dict = model.state_dict()
            model_dict['text_embedding.weight'] = reduced_embeddings
            model.load_state_dict(model_dict)

        # number of trainable parameters
        perf_results.loc[i,"trainable parameters"] = sum(p.numel() for p in model.parameters() if p.requires_grad)

        # optimizer
        optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

        # training
        model = training(model, loader_train,  N_EPOCHS, loader_validation, CRITERION, optimizer, PATIENCE, FACTOR, MODEL_TYPE, SEED, verbose=False, device = device, early_stopping = True)

        # model evaluation and uncertainty quantification
        model.eval()
        if MODEL_TYPE in BASELINE_MODELS:
            # baselines: a single score; the uncertainty columns get the "no" placeholder
            perf = performance(model, loader_target, MODEL_TYPE, SEED, device)
            perf_results.loc[i,"performance"] = perf
            perf_results.loc[i,"coverage"] = "no"
            perf_results.loc[i,"interval_width"] = "no"
            perf_results.loc[i,"validation coverage"] = "no"
        else:
            # column names for this seed in the per-sample result frames
            col_labels = f"labels-{SEED}"
            col_preds = f"preds-{SEED}"
            col_text = f"text_preds-{SEED}"
            col_tab = f"tabular_preds-{SEED}"

            # target: performance, plus labels and overall / text / tabular predictions
            perf, labels, preds, text_preds, tabular_preds = performance(model, loader_target, MODEL_TYPE, SEED, device)
            perf_results.loc[i,"performance"] = perf
            uncertainty_results[col_labels] = labels
            uncertainty_results[col_preds] = preds
            uncertainty_results[col_text] = text_preds
            uncertainty_results[col_tab] = tabular_preds

            # (dis)agreement accuracy: accuracy of the overall prediction on the samples where
            # the text and tabular predictions differ / coincide
            diff = uncertainty_results[col_text] != uncertainty_results[col_tab]
            disagr_df = uncertainty_results[diff]
            agr_df = uncertainty_results[~diff]
            disagr_acc = (disagr_df[col_labels]==disagr_df[col_preds]).sum()/len(disagr_df)
            agr_acc = (agr_df[col_labels]==agr_df[col_preds]).sum()/len(agr_df)
            perf_results.loc[i,"disagreement accuracy"] = disagr_acc
            perf_results.loc[i,"agreement accuracy"] = agr_acc

            # validation: performance
            _, val_labels, val_preds, val_text_preds, val_tabular_preds = performance(model, loader_validation, MODEL_TYPE, SEED, device)
            val_uncertainty_results[col_labels] = val_labels
            val_uncertainty_results[col_preds] = val_preds
            val_uncertainty_results[col_text] = val_text_preds
            val_uncertainty_results[col_tab] = val_tabular_preds

            # uncertainty quantification: TTT coverage and prediction set size
            # - interval_width: mean size of the set {text prediction, tabular prediction} (1 or 2)
            # - coverage: share of samples whose label equals at least one of the two predictions
            interval_width = np.mean(diff+1)
            cov = ((uncertainty_results[col_labels] == uncertainty_results[col_text]) | (uncertainty_results[col_labels] == uncertainty_results[col_tab]))
            coverage = np.mean(cov)
            val_cov = ((val_uncertainty_results[col_labels] == val_uncertainty_results[col_text]) | (val_uncertainty_results[col_labels] == val_uncertainty_results[col_tab]))
            val_coverage = np.mean(val_cov)
            perf_results.loc[i,"validation coverage"] = val_coverage
            perf_results.loc[i,"coverage"] = coverage
            perf_results.loc[i,"interval_width"] = interval_width

        if MODEL_TYPE in ["TTT"]:
            # uncertainty quantification: conformal prediction (LAC baseline for uncertainty quantification)
            # the validation coverage measured above is used as the target coverage
            cp_coverage, cp_interval_width = conformal_prediction(loader_validation,
                                                        loader_target,
                                                        model,
                                                        model_type = MODEL_TYPE,
                                                        target_coverage = val_coverage,
                                                        seed = SEED,
                                                        device = device)
            perf_results.loc[i,"CP coverage"] = cp_coverage
            perf_results.loc[i,"CP interval_width"] = cp_interval_width

            # compute Shannon entropy with Bayesian MCD technique
            total_entropy, aleatoric_entropy = compute_MCD(model, loader_target, n_simu = 50, seed = SEED, device = device)
            perf_results.loc[i,"total_entropy"] = total_entropy
            perf_results.loc[i,"aleatoric_entropy"] = aleatoric_entropy

        else:
            perf_results.loc[i,"CP coverage"] = "no"
            perf_results.loc[i,"CP interval_width"] = "no"
            perf_results.loc[i,"total_entropy"] = "no"
            perf_results.loc[i,"aleatoric_entropy"] = "no"

        elapsed_time = time.time()-start
        perf_results.loc[i,"time"] = elapsed_time

        i+=1

    # show the accumulated results after every completed seed
    display(perf_results)



### Experiments for pretrained models (AllTextBERT and LateFuseBERT)

In [None]:
# select DATASET
DATASET = "cloth" # choose in {"airbnb", "cloth", "jigsaw", "kick", "pet", "salary", "wine_10", "wine_100"}
FILENAME, categorical_var, numerical_var, text_var, MAX_LEN_QUANTILE, N_CLASSES, WEIGHT_DECAY, FACTOR, N_EPOCHS, split_val, CRITERION, N_SEED, DROPOUT = load_settings(dataset = DATASET)

# GPU or CPU (does not depend on seed / model type, so it is resolved once)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# performance records: one row per (seed, model type) run
# NOTE(review): this rebinds `perf_results`, replacing the table built by the previous experiment cell.
perf_results = pd.DataFrame()
i = 0

for SEED in range(N_SEED):

    for MODEL_TYPE in ["LateFuseBERT", "AllTextBERT"]:

        start = time.time()

        # load and prepare dataset
        df = preprocess_dataset(DATASET, MODEL_TYPE)

        # control randomness (Python, NumPy and PyTorch CPU / CUDA generators)
        random.seed(SEED)
        np.random.seed(SEED)
        torch.manual_seed(SEED)
        torch.cuda.manual_seed(SEED)

        # temporary dataframes to compute uncertainty metrics
        # NOTE(review): neither frame is read anywhere in this cell, and rebinding them
        # discards the frames of the same name filled by the previous experiment cell.
        uncertainty_results = pd.DataFrame()
        val_uncertainty_results = pd.DataFrame()

        perf_results.loc[i,"model type"] = MODEL_TYPE
        perf_results.loc[i,"seed"] = SEED

        # Train/Test split (from here on `df` is the training+validation part, `target` the test part)
        df, target = train_test_split(df, test_size = split_val, random_state = SEED)

        # text cleaning (keep only words and numbers)
        df['clean_text'] = df[text_var].apply(clean_text)
        target['clean_text'] = target[text_var].apply(clean_text)

        # Load the specific tokenizer
        tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased', do_lower_case=True)

        # text max length: MAX_LEN_QUANTILE quantile of the tokenized lengths on the training part
        token_counts = df['clean_text'].apply(lambda text: len(tokenizer(text).input_ids))
        MAX_LEN = int(np.quantile(token_counts.values, q = [MAX_LEN_QUANTILE]).item())
        MAX_LEN = min(MAX_LEN, 512) # maximum sequence length is 512 for DistilBERT
        perf_results.loc[i,"max text length"] = MAX_LEN

        # Numerical variables pre-processing
        numerical_var_scaled = standardScaling(df, target, numerical_var)
        NUM_NUMERICAL_VAR = len(numerical_var)
        # per-variable quintile boundaries (0%, 20%, ..., 100%) computed on the training part
        QUANTILES = [np.quantile(df[var].values, q = [0., 0.2, 0.4, 0.6, 0.8, 1.]) for var in numerical_var_scaled]

        # Categorical variables pre-processing
        categorical_var_oe, CAT_VOCAB_SIZES = ordinalEncoding(df, target, categorical_var)
        NUM_CAT_VAR = len(categorical_var)

        # train / validation split
        df_train, df_validation = train_test_split(df, test_size = split_val, random_state = SEED)
        perf_results.loc[i,"training size"] = df_train.shape[0]
        perf_results.loc[i,"test size"] = target.shape[0]

        # hyper-parameters (fixed settings, no HPO here); this also replaces the
        # N_EPOCHS value returned by load_settings above
        LR, BATCH_SIZE, D_FC, N_EPOCHS, N_HEADS, N_LAYERS = load_pretrained_settings()
        perf_results.loc[i,"LR"] = LR
        perf_results.loc[i,"BATCH_SIZE"] = BATCH_SIZE
        perf_results.loc[i,"N_HEADS"] = N_HEADS
        perf_results.loc[i,"N_LAYERS"] = N_LAYERS

        # prepare the Tensor Datasets, including tokenization
        dataset_train = prepareTensorDatasetWithTokenizer(df_train, "clean_text", categorical_var_oe, numerical_var_scaled, 'Y', tokenizer, MAX_LEN, special_tokens=True, model_type = MODEL_TYPE)
        dataset_validation = prepareTensorDatasetWithTokenizer(df_validation, "clean_text", categorical_var_oe, numerical_var_scaled, 'Y', tokenizer, MAX_LEN, special_tokens=True, model_type = MODEL_TYPE)
        dataset_target = prepareTensorDatasetWithTokenizer(target, "clean_text", categorical_var_oe, numerical_var_scaled, 'Y', tokenizer, MAX_LEN, special_tokens=True, model_type = MODEL_TYPE)

        # data loaders (only the training loader samples in random order)
        loader_train = DataLoader(dataset_train, sampler = RandomSampler(dataset_train), batch_size = BATCH_SIZE)
        loader_validation = DataLoader(dataset_validation, sampler = SequentialSampler(dataset_validation),batch_size = BATCH_SIZE)
        loader_target = DataLoader(dataset_target, sampler = SequentialSampler(dataset_target),batch_size = BATCH_SIZE)

        # Load the pretrained DistilBERT encoder (handed to init_model as `text_model`)
        BERT_model = DistilBertModel.from_pretrained("distilbert-base-uncased").to(device)

        # model initialization (re-seed right before so the new layers are initialized reproducibly)
        torch.manual_seed(SEED)
        model = init_model(model_type = MODEL_TYPE,
                           d_model = BERT_model.embeddings.word_embeddings.embedding_dim, # dimension = 768 for BERT family
                           max_len = "", # not used here
                           vocab_size = "", # not used here
                           cat_vocab_sizes = CAT_VOCAB_SIZES,
                           num_cat_var = NUM_CAT_VAR,
                           num_numerical_var = NUM_NUMERICAL_VAR,
                           quantiles = "", # not used here
                           n_heads = N_HEADS,
                           d_ff = "", # not used here
                           n_layers = N_LAYERS,
                           dropout = DROPOUT,
                           d_fc = D_FC,
                           n_classes = N_CLASSES,
                           seed = SEED,
                           device=device,
                           text_model = BERT_model).to(device)

        # number of trainable parameters
        perf_results.loc[i,"trainable parameters"] = sum(p.numel() for p in model.parameters() if p.requires_grad)

        # optimizer
        optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

        # training (returns the trained model and the value recorded in the "epochs" column)
        model, epochs = training_pretrained(model, MODEL_TYPE, loader_train,  N_EPOCHS, loader_validation, CRITERION, optimizer, FACTOR, SEED, verbose=True, device = device)
        perf_results.loc[i,"epochs"] = epochs

        # model evaluation
        model.eval()

        target_perf = performance_pretrained(model, loader_target, MODEL_TYPE, SEED, device)
        perf_results.loc[i,"performance (Target)"] = target_perf

        elapsed_time = time.time()-start
        perf_results.loc[i,"time"] = elapsed_time

        i+=1

    # show the accumulated results after every completed seed
    display(perf_results)

