This notebook computes basic statistics about the distribution of the suggestions generated by the models.

In [None]:
import collections
from collections import Counter
from pprint import pprint

from evaluation.utils import (
    get_all_suggestions,
    get_annotations,
    get_experiment_infos,
)
from experiments.models import ALL_MODELS
from experiments.utils import REPO_ROOT

# Fetch the experiment records; the cells below iterate over them and read
# each record's `name`, `id` and `pcm_fp`.
experiments = get_experiment_infos()
# Bare name as the last expression so the notebook displays the result.
experiments

## count suggestion types by experiment and model

In [None]:
def count_suggestions_by_model(experiment_id):
    """Tally how many suggestions each model produced in one experiment.

    Args:
        experiment_id: Identifier forwarded unchanged to ``get_all_suggestions``.

    Returns:
        A ``Counter`` mapping each suggestion's ``model_key`` to the number of
        suggestions carrying that key.
    """
    return Counter(entry.model_key for entry in get_all_suggestions(experiment_id))


def count_suggestions_by_type(experiment_id):
    """Tally the suggestions of one experiment by their ``suggest_type``.

    Args:
        experiment_id: Identifier forwarded unchanged to ``get_all_suggestions``.

    Returns:
        A ``Counter`` mapping each ``suggestion["suggest_type"]`` value to the
        number of suggestions with that value, across all models.
    """
    return Counter(
        entry.suggestion["suggest_type"] for entry in get_all_suggestions(experiment_id)
    )


def count_suggestions_by_model_and_type(experiment_id):
    """Tally the suggestions of one experiment per model and per type.

    Suggestions whose ``suggest_type`` is ``"foundry"`` are split further by
    appending ``.<action type>`` (read from ``suggestion["action"]["type"]``)
    to the type label. Other types are counted under their label as-is.

    Args:
        experiment_id: Identifier forwarded unchanged to ``get_all_suggestions``.

    Returns:
        A plain ``dict`` mapping ``model_key`` to a ``Counter`` of type labels.
    """
    per_model = {}
    for entry in get_all_suggestions(experiment_id):
        payload = entry.suggestion
        label = payload["suggest_type"]
        if label == "foundry":
            # Only foundry suggestions are required to carry an action.
            label += f".{payload['action']['type']}"
        per_model.setdefault(entry.model_key, Counter())[label] += 1
    return per_model

In [None]:
# For every experiment, print its name followed by the nested
# model -> suggestion-type tallies.
for experiment in experiments:
    print(experiment.name)
    pprint(count_suggestions_by_model_and_type(experiment.id))
    # For a flat per-type tally (not split by model), call
    # count_suggestions_by_type(experiment.id) here instead.

## get experiment durations

In [None]:
# The PCM file size (bytes) divided by this rate is reported in seconds below,
# so the value is used as the audio byte rate.
# NOTE(review): 48000 B/s would match e.g. 24 kHz 16-bit mono — confirm
# against the actual recording format.
PCM_BYTES_PER_SECOND = 48000

# Print each experiment's audio duration next to its number of suggestions.
for experiment in experiments:
    pcm_fp = REPO_ROOT / experiment.pcm_fp
    duration = pcm_fp.stat().st_size / PCM_BYTES_PER_SECOND
    all_suggestions = get_all_suggestions(experiment.id)
    print(f"{experiment.name}: {duration:.2f}s, {len(all_suggestions)} suggestions")

## identify models without many unique positive annotations

This is used to reduce the number of annotations needed for the human evaluation.

In [None]:
# All tallies are rebuilt from scratch each time this cell runs.
positive_annotations_by_model = Counter()  # score > 0, any `why`
all_annotations_by_model = Counter()  # every annotation
unique_positive_annotations = 0  # score > 0 and why == "manual"
unique_positive_annotations_by_model = Counter()  # manual positives no `why` refers to
unique_model_sets = Counter()  # frozenset of model keys -> occurrences

# Materialised as a list because it is re-scanned inside the loop below.
all_annotations = list(get_annotations("starless-lands-s17"))

for annotation in all_annotations:
    model_key = annotation.entry.model_key
    all_annotations_by_model[model_key] += 1

    if not annotation.score > 0:
        continue
    positive_annotations_by_model[model_key] += 1

    if annotation.why != "manual":
        continue
    unique_positive_annotations += 1

    # Annotations whose `why` contains this annotation's suggestion id.
    model_set = [other for other in all_annotations if annotation.suggestion_id in other.why]

    if not model_set:
        # Nothing refers to this suggestion, so only this model has it.
        unique_positive_annotations_by_model[model_key] += 1

    # Record which group of models shares this suggestion (this model plus
    # the models of every annotation that refers to it).
    unique_model_sets[frozenset([model_key, *(other.entry.model_key for other in model_set)])] += 1

In [None]:
# Per-model count of annotations with score > 0, whatever their `why`.
positive_annotations_by_model

In [None]:
# Per-model count of every annotation, regardless of score.
all_annotations_by_model

In [None]:
# Per-model count of positive "manual" annotations whose suggestion id does
# not appear in the `why` of any annotation.
unique_positive_annotations_by_model

In [None]:
# Number of annotations with score > 0 whose `why` is "manual".
unique_positive_annotations

In [None]:
# Models in ALL_MODELS that have no entry in the unique-positive tally.
[model_key for model_key in ALL_MODELS if model_key not in unique_positive_annotations_by_model]

In [None]:
# For each positive "manual" annotation: the set made of its own model key
# plus the model keys of annotations whose `why` contains its suggestion id,
# mapped to how many times that exact set occurred.
unique_model_sets