In [1]:
# Enable IPython's autoreload extension. Mode 2 re-imports modules before each
# cell executes, so edits to imported source files take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
# Number GPUs by PCI bus position and expose only device index 1 to this kernel.
# NOTE(review): presumably these must be set before any CUDA-using library
# initialises the driver — confirm nothing imports torch earlier.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


In [6]:
import math
from typing import *
from pathlib import Path

import pandas as pd
import numpy as np
import scipy.special as sc_special
import sklearn.metrics as sk_metrics

import lass.datasets
import lass.metrics
import lass.metrics.brier
import lass.test
from lass.log_handling import LoaderArgs

## Get Predictions

In [8]:
# Checkpoint of the assessor to evaluate. Previously used checkpoints, kept for reference:
#   ../artifacts/assessors/deberta-nt-bs16*2-0sh-instance-split-07120027/checkpoint-8000
#   ./deberta-nt-better-split-bs16*2-0sh-instance-split-07141059/checkpoint-9000
#   ../artifacts/assessors/deberta-nt-bs16*2-0sh-task-split-07121735/checkpoint-2000
model_loc = "../artifacts/assessors/deberta-test-bs16*2-0sh-instance-split-07201715/checkpoint-2000"

# Fail fast if the checkpoint path is missing, before the evaluation starts.
assert Path(model_loc).exists()

# Run the evaluation; the result is indexed by string keys in the next cell.
results = lass.test.test(
    model_name="microsoft/deberta-v3-base",
    model_loc=model_loc,
    max_sequence_length=512,
    split="instance",  # alternative: "task"
    data_args=LoaderArgs(
        logdir="../artifacts/logs",
        tasks="paper-lite",
        model_families=["BIG-G T=0"],
        model_sizes=["128b"],
        shots=[0],
        query_types=["multiple_choice"],
    ),
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'correct'] = df['correct'].astype(int)


{'metrics': {'conf-absolute': {'acc': {'test': 0.6407185628742516,
                                       'train': 0.6404701544134593},
                               'balanced_acc': {'test': 0.5038359329220272,
                                                'train': 0.5048353580910776},
                               'bs': {'test': 0.30333516928709925,
                                      'train': 0.299872121851683},
                               'bs_dcr': {'test': 0.002377095459227818,
                                          'train': 0.004273547898812985},
                               'bs_mcb': {'test': 0.07832987543077966,
                                          'train': 0.07549573543569221},
                               'bs_unc': {'test': 0.2273823893155474,
                                          'train': 0.22864993431480377},
                               'roc_auc': {'test': 0.5170027656787111,
                                           'train': 0.5518847474986639}}

loading configuration file https://huggingface.co/microsoft/deberta-v3-base/resolve/main/config.json from cache at /home/wout/.cache/huggingface/transformers/e6f9db57345f0f60c9f837fa97bcb27b1ed31e99feb33d732d7d8c80cb8f8459.de97182a9f32a68819030ba8f3f6ff2ba47276be3864425925523202f54cc79c
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

loading configuration file ../artifacts/assessors/deberta-test-bs16*2-0sh-instance-split-07201715/checkpoint-2000/config.json
Model config DebertaV2Config {
  "_name_or_path": "../artifacts/assessors/deberta-test-bs16*2-0sh-instance-split-07201715/checkpoint-2000",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dtype": 

In [9]:
# Bind each entry of the evaluation result to a notebook-level name of the same key.
data, train, test, logits, labels, metrics = (
    results[key]
    for key in ("data", "train", "test", "logits", "labels", "metrics")
)

## Analyse

In [15]:
# print(metrics)
# Predicted class per row: index of the largest logit along the last axis.
predictions = np.argmax(logits, axis=-1)
# Softmax over the last axis, keeping only the last column as the confidence score.
# NOTE(review): presumably the last column is the "correct" class — confirm against lass.test.
confs = sc_special.softmax(logits, axis=-1)[:, -1]
# Bare expression so the notebook displays the resulting array.
confs

array([0.01135792, 0.01190645, 0.01067761, ..., 0.03131928, 0.75913805,
       0.73769104], dtype=float32)

In [11]:
# Attach the assessor confidence to every test row and persist the frame.
# NOTE(review): assumes `confs` is row-aligned with `test` — confirm against lass.test.
test['assr'] = confs
# Name the CSV after the run directory (the checkpoint's parent). pathlib is used
# instead of splitting on '/' so a trailing slash in model_loc cannot shift the
# selected component onto the checkpoint folder.
test.to_csv(f"ilr_{Path(model_loc).parent.name}.csv")

In [12]:
# test = pd.read_csv(f"ilr_{model_loc.split('/')[-2]}.csv", index_col=0)

In [13]:
import ipywidgets as widgets
from ipywidgets import Button, Layout, HBox

def scrollwrap(df):
    """Wrap ``df`` rendered as an HTML table in a 300px-high box that scrolls vertically.

    The table is produced with ``escape=False``, so markup inside cells is
    passed through to the HTML widget as-is.
    """
    table_html = df.to_html(escape=False)
    html_widget = widgets.HTML(value=table_html, disabled=True)
    box_layout = Layout(height='300px', overflow_y='auto')
    return HBox([html_widget], layout=box_layout)

In [14]:
def aggr(group):
    """Summarise assessor quality for one group of test rows.

    Args:
        group: DataFrame slice providing the columns ``correct``, ``assr``
            and ``conf_normalized``.

    Returns:
        ``None`` when the group has exactly one row. Otherwise a ``pd.Series``
        with the keys ``bs``, ``mcb``, ``dsc``, ``unc`` (the four values
        returned by ``lass.metrics.brier.brier_score``, in that order),
        ``roc_auc`` (AUC of ``assr``), ``lm_roc_auc`` (AUC of
        ``conf_normalized``), ``task_accuracy`` (mean of ``correct``),
        ``accuracy`` (``assr`` thresholded at 0.5 against ``correct``) and
        ``count`` (number of rows).
    """
    if len(group) == 1:
        return None

    def _auc_or_none(scores):
        # roc_auc_score can raise ValueError (e.g. only one class present in
        # `correct`); report that as a missing value instead of failing the
        # whole groupby. Each AUC is guarded on its own so a failure for one
        # score column no longer discards the other.
        try:
            return sk_metrics.roc_auc_score(group['correct'], scores)
        except ValueError:
            return None

    bs, mcb, dsc, unc = lass.metrics.brier.brier_score(group['correct'], group['assr'])
    accuracy = sk_metrics.accuracy_score(group['correct'], group['assr'] > 0.5)
    roc_auc = _auc_or_none(group['assr'])
    lm_roc_auc = _auc_or_none(group['conf_normalized'])

    return pd.Series({
        'bs': bs,
        'mcb': mcb,
        'dsc': dsc,
        'unc': unc,
        'roc_auc': roc_auc,
        'lm_roc_auc': lm_roc_auc,
        'task_accuracy': group['correct'].mean(),
        'accuracy': accuracy,
        'count': len(group),
    })

# One summary row per task; `aggr` returns None for tasks with a single row.
task_perf = test.groupby('task').apply(aggr).reset_index()
# Accuracy gain over the larger of task_accuracy and 1 - task_accuracy, i.e. the
# score of always predicting the more frequent outcome (assuming `correct` is 0/1).
task_perf['acc_improvement'] = task_perf.apply(
    lambda row: row['accuracy'] - max(row['task_accuracy'], 1 - row['task_accuracy']),
    axis=1,
)
# Name the CSV after the run directory (the checkpoint's parent). pathlib is used
# instead of splitting on '/' so a trailing slash in model_loc cannot shift the
# selected component onto the checkpoint folder.
task_perf.to_csv(f"ilr_{Path(model_loc).parent.name}_task_perf.csv")

# Show the tasks with the largest improvement first, in a scrollable table.
scrollwrap(task_perf
    # .query('count > 10')
    .sort_values(by='acc_improvement', ascending=False))

# test

HBox(children=(HTML(value='<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;"…