In [4]:
import math

import pandas as pd
import numpy as np

import lass.datasets
from lass.log_handling import LogLoader

## Load data

Load all data from logs into a single dataframe.

Warning: this is very memory-hungry.

For one model family, e.g. BIG-G T=0, with all sizes, shots, and tasks, this requires about 20GB of memory and takes about 5 minutes to load on the server we used for experiments.

In [5]:
# Describe which evaluation logs to read. The two commented-out keyword
# arguments are optional filters (one model size / one shot count) that
# presumably narrow the selection when enabled -- confirm against LogLoader.
loader = LogLoader(
    logdir="../artifacts/logs",
    tasks="paper-full",
    model_families=["BIG-G T=0"],
    # model_sizes=["128b"],
    query_types=["multiple_choice"],
    # shots=[0],
    include_unknown_shots=True,
    exclude_faulty_tasks=True,
)

In [6]:
# Materialize every record selected by `loader` into one DataFrame. This is
# the slow, memory-heavy step of the notebook.
bigdf = lass.datasets.to_dataframe(loader)

Some extra columns that we will use a lot

In [7]:
def _exp_of_best_score(scores):
    """Return ``exp(max(scores))`` for one instance's sequence of scores."""
    return math.exp(np.max(scores))


# Number of entries in `targets` for each instance.
bigdf['n_targets'] = bigdf['targets'].map(len)
# Confidence = exponentiated highest score, once per score variant.
# NOTE(review): the scores are presumably log-probabilities -- confirm.
bigdf['conf_normalized'] = bigdf['normalized_scores'].map(_exp_of_best_score)
bigdf['conf_absolute'] = bigdf['absolute_scores'].map(_exp_of_best_score)

Results for a single model and single number of shots.
This should contain all the testing instances exactly once.

In [8]:
# Filter expression selecting one model family, one model size and one shot count.
single_run_filter = 'model_family == "BIG-G T=0" & model_name == "128b" & shots == 3'
example = bigdf.query(single_run_filter)
# Row count of the selection, shown as the cell output.
len(example)

55431

## Basic Stats

In [9]:
# Number of records for every (model, shots) combination.
df_n_instances = bigdf.groupby(['model_name', 'shots']).size()

# Number of records for every (task, model, shots) combination.
task_run_sizes = bigdf.groupby(['task', 'model_name', 'shots']).size()
# Per task: how many *different* record counts occur across models and shots.
# The counts are equal exactly when this is 1 for every task. Comparing
# `.nunique().nunique() == 1` instead would only verify that all tasks share
# the same number of distinct counts, and would also pass if every task had,
# say, two different counts.
distinct_sizes_per_task = task_run_sizes.groupby(level='task').nunique()
equal_instances = bool(distinct_sizes_per_task.eq(1).all())

print(f"Total #tasks: {bigdf['task'].nunique()}")
# Largest record count found for any single (model, shots) combination.
print(f"Total #unique instances: {df_n_instances.max()}")
print(f"Total #records: {len(bigdf)}")
print(f"Equal #unique instances per shot and model: {equal_instances}")
# Index level 0 is `model_name`, level 1 is `shots` (the groupby key order).
print(f"Different shots: {df_n_instances.index.levels[1].values}") # type: ignore
print(f"Different models: {df_n_instances.index.levels[0].values}") # type: ignore




Total #tasks: 119
Total #unique instances: 78475
Total #records: 2937216
Equal #unique instances per shot and model: True
Different shots: [0 1 2 3]
Different models: ['125m' '128b' '16m' '1b' '244m' '27b' '2b' '2m' '422m' '4b' '53m' '8b']


## Trivia

There are multiple-choice questions with only one option. This is likely a bug though, as all the scores are 0, and this is the only task with a mismatch between the `targets` and `target_values` columns.

All answers are wrong.

In [10]:
# Instances with exactly one entry in `targets`, counted per task by
# (n_targets, correct).
single_option = example.loc[example['n_targets'] == 1]
single_option.groupby('task')[["n_targets", "correct"]].value_counts()


task        n_targets  correct
arithmetic  1          0.0        992
dtype: int64

In [11]:
# Highest `conf_absolute` among the instances with a single target.
example.loc[example['n_targets'] == 1, 'conf_absolute'].max()

0.9984526101682027

In [12]:
# Tasks in which the length of `targets` differs from the length of
# `target_values` for at least one instance.
n_target_values = example['target_values'].map(len)
example.loc[example['n_targets'] != n_target_values, 'task'].unique()

array(['arithmetic'], dtype=object)

There are multiple choice questions with 100 options. This one seems fine.

In [13]:
# Correct / incorrect counts for the instances that have 100 targets.
example.loc[example['n_targets'] == 100, ["n_targets", "correct"]].value_counts()

n_targets  correct
100        0.0        271
           1.0         70
dtype: int64

## Varia

In [14]:
# Slice exported as the "best" configuration. The CSV file name is derived
# from BEST_SHOTS so the file cannot be labelled with a different shot count
# than the one it actually contains.
BEST_MODEL = '128b'
BEST_SHOTS = 0

# Per (task, model, shots): accuracy, random-guess baseline and mean n_targets.
df_accs = (bigdf
    .groupby(['task', 'model_name', 'shots'])
    .agg(
        acc=('correct', 'mean'),
        # Expected accuracy of uniform random guessing: mean of 1 / n_targets.
        random_acc=('n_targets', lambda x: (1/x).mean()),
        n_targets=('n_targets', 'mean'),
    )  # type: ignore
)
# Improvement over random guessing, rescaled by the largest possible
# improvement, e.g. acc=0.75, random_acc=0.25 -> (0.75 - 0.25) / (1 - 0.25) = 2/3.
df_accs['rel_acc_improvement'] = (df_accs['acc'] - df_accs['random_acc']) / (1 - df_accs['random_acc'])
# Plain difference between accuracy and the random-guess baseline.
df_accs['acc_improvement'] = (df_accs['acc'] - df_accs['random_acc'])
# Keep only the chosen shot count and model (index levels are retained),
# ordered from highest to lowest accuracy.
df_accs_best = (df_accs
    .xs(BEST_SHOTS, level='shots', drop_level=False)
    .xs(BEST_MODEL, level='model_name', drop_level=False)
    .sort_values('acc', ascending=False))
df_accs_best.to_csv(f'accs_{BEST_SHOTS}.csv')



# acc: 0.75, random acc: 0.25 -> rel_acc_improvement = (0.75 - 0.25) / (1 - 0.25) = 2/3

In [15]:
# Optional: load a stored table instead of using the one computed above.
# NOTE(review): this path and shot count (accs_3.csv) differ from the
# `accs_0.csv` written by the previous cell -- confirm which file is intended.
# df_accs_best = pd.read_csv('../artifacts/csv-datasets/accs_3.csv')
# Summary statistics over the rows of df_accs_best.
df_accs_best.describe()

Unnamed: 0,acc,random_acc,n_targets,rel_acc_improvement,acc_improvement
count,119.0,119.0,119.0,119.0,119.0
mean,0.373269,0.303065,5.941128,0.08983,0.070204
std,0.163385,0.148857,9.604588,0.209287,0.134941
min,0.0,0.011905,2.0,-1.136492,-0.48395
25%,0.244825,0.2,2.0,-0.010001,-0.005001
50%,0.38,0.25,4.0,0.068111,0.05
75%,0.5,0.5,5.0,0.211623,0.137137
max,0.864865,0.574172,84.0,0.72973,0.426852
