In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


In [3]:
import math
from typing import *

import pandas as pd
import numpy as np
import scipy.special as sc_special
import sklearn.metrics as sk_metrics

import lass.datasets
import lass.metrics
import lass.metrics.brier
import lass.test
from lass.train import DataArgs

## Get Predictions

In [4]:
# model_loc = "../artifacts/assessors/deberta-nt-bs16*2-0sh-instance-split-07120027/checkpoint-8000"
# model_loc = "./deberta-nt-better-split-bs16*2-0sh-instance-split-07141059/checkpoint-9000"
model_loc = "../artifacts/assessors/deberta-nt-bs16*2-0sh-task-split-07121735/checkpoint-2000"

results = lass.test.test(
    data_args=DataArgs(
        logdir="../artifacts/logs",
        tasks="paper-full",
        model_families=["BIG-G T=0"],
        model_sizes=["128b"],
        shots=[0],
        query_types=["multiple_choice"],
    ),
    # split = 'instance',
    split = 'task',
    model_loc=model_loc,
    model_name = "microsoft/deberta-v3-base",
    max_sequence_length = 512,
)

{'metrics': {'conf-absolute': {'acc': {'test': 0.709478021978022,
                                       'train': 0.7687121918921359},
                               'balanced_acc': {'test': 0.5016260777979034,
                                                'train': 0.5104928543475661},
                               'bs': {'test': 0.2584431138604377,
                                      'train': 0.20531002639072962},
                               'bs_dcr': {'test': 0.0010234770416069072,
                                          'train': 0.022187596321620484},
                               'bs_mcb': {'test': 0.05341959820981243,
                                          'train': 0.05130582405729314},
                               'bs_unc': {'test': 0.2060469926922322,
                                          'train': 0.17619179865505696},
                               'roc_auc': {'test': 0.491734003176178,
                                           'train': 0.6941526913276941}}

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/67 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

The following columns in the test set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 11648
  Batch size = 8


In [5]:
data = results["data"]
train = results["train"]
test = results["test"]
logits = results["logits"]
labels = results["labels"]
metrics = results["metrics"]

In [6]:
data['n_targets'] = data['targets'].map(lambda x: len(x))
data['conf_normalized'] = data['normalized_scores'].map(lambda s: math.exp(np.max(s)))
data['conf_absolute']= data['absolute_scores'].map(lambda s: math.exp(np.max(s)))
test['n_targets'] = test['targets'].map(lambda x: len(x))
test['conf_normalized'] = test['normalized_scores'].map(lambda s: math.exp(np.max(s)))
test['conf_absolute']= test['absolute_scores'].map(lambda s: math.exp(np.max(s)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['n_targets'] = test['targets'].map(lambda x: len(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['conf_normalized'] = test['normalized_scores'].map(lambda s: math.exp(np.max(s)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['conf_absolute']= test['absolute_scores'].map(lambda s:

## Analyse

In [7]:
print(metrics)
predictions = np.argmax(logits, axis=-1)
confs = sc_special.softmax(logits, axis=-1)[:, -1]
confs

{'test_loss': 0.5579606890678406, 'test_accuracy': 0.7206387362637363, 'test_precision': 0.7025316455696202, 'test_recall': 0.06564163217031342, 'test_f1': 0.12006489994591671, 'test_roc_auc': 0.712437953424164, 'test_bs': 0.18547732043134132, 'test_bs_mcb': 0.005516429596843314, 'test_bs_dsc': 0.02608610185773419, 'test_bs_unc': 0.2060469926922322, 'test_balanced_accuracy': 0.5271348736704459, 'test_runtime': 165.6678, 'test_samples_per_second': 70.309, 'test_steps_per_second': 8.789}


array([0.3852105 , 0.41795602, 0.41615838, ..., 0.3538577 , 0.3846273 ,
       0.22022294], dtype=float32)

In [8]:
test = test[test['correct'].isin([0.0, 1.0])].copy()
test['assr'] = confs
test.to_csv(f"ilr_{model_loc.split('/')[-2]}.csv")

In [9]:
# test = pandas.read_csv(f"ilr_{model_loc.split('/')[-2]}.csv")

In [10]:
import ipywidgets as widgets
from ipywidgets import Button, Layout, HBox

def scrollwrap(df):
    b=widgets.HTML(
        value=df.to_html(escape=False),
        disabled=True
    )
    return HBox([b], layout=Layout(height='300px', overflow_y='auto'))

In [11]:
def aggr(group):
    if len(group) == 1:
        return None
        # return pd.Series({'count': 1})

    bs, mcb, dsc, unc = lass.metrics.brier.brier_score(group['correct'], group['assr'])
    accuracy = sk_metrics.accuracy_score(group['correct'], group['assr'] > 0.5)
    try:
        roc_auc = sk_metrics.roc_auc_score(group['correct'], group['assr']),
        lm_roc_auc = sk_metrics.roc_auc_score(group['correct'], group['conf_normalized']),
    except ValueError:
        roc_auc = None
        lm_roc_auc = None
    return pd.Series({
        'bs': bs, 
        'mcb': mcb,
        'dsc': dsc, 
        'unc': unc,
        'roc_auc': roc_auc[0] if roc_auc is not None else None,
        'lm_roc_auc': lm_roc_auc[0] if lm_roc_auc is not None else None,
        'task_accuracy': group['correct'].mean(),
        'accuracy': accuracy,
        'count': len(group),
    })

task_perf = test.groupby('task').apply(aggr).reset_index()
task_perf['acc_improvement'] = task_perf.apply(lambda r: r['accuracy'] - max(r['task_accuracy'], 1-r['task_accuracy']), axis=1)
task_perf.to_csv(f"ilr_{model_loc.split('/')[-2]}_task_perf.csv")

scrollwrap(task_perf
    # .query('count > 10')
    .sort_values(by='acc_improvement', ascending=False))

# test

HBox(children=(HTML(value='<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;"…