In [None]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Enumerate CUDA devices by PCI bus ID and expose only device 0 to this kernel.
# (Keep comments on their own lines: text after an `%env` value becomes part of it.)
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

In [None]:
from typing import *
from dataclasses import dataclass
import shutil

import pandas as pd

import lass.train
import lass.test
import lass.datasets
from lass.log_handling import PaperTasks, LogIssues, LogLoaderArgs, LogLoader

import transformers

# Raise the transformers log threshold to WARNING so info-level messages are hidden.
transformers.logging.set_verbosity_warning() # type: ignore

## Find non-empty tasks


In [None]:
# Load the 0-shot multiple-choice logs of the 128b BIG-G model once, in order to
# discover which of the paper tasks actually contain instances.
loader_args = LogLoaderArgs(
    logdir="../artifacts/logs",
    tasks="paper-full",
    model_families=["BIG-G T=0"],
    model_sizes=["128b"],
    shots=[0],  # CHANGE per experiment
    query_types=["multiple_choice"],
)
loader = LogLoader(loader_args)
data = lass.datasets.to_dataframe(loader)

# Task names that occur at least once in the loaded data.
nonempty_tasks = data.task.unique().tolist()

# The same names, but in the order in which PaperTasks.full() lists them.
non_empty_tasks = list(
    filter(lambda name: name in nonempty_tasks, PaperTasks.full())
)

# Guard: this task is expected to be filtered out under the settings above.
# NOTE(review): the reason is not visible here - presumably it has no
# multiple-choice instances; confirm against the logs.
assert "ascii_word_recognition" not in non_empty_tasks

In [None]:
@dataclass
class Architecture:
    """Settings describing the assessor backbone used by the runs in this notebook."""

    # Model identifier, forwarded as `model_name` to lass.train / lass.test.
    name: str
    # Short label, used to build `model_name_short` for each training run.
    name_short: str
    # Forwarded as `batch_size` to lass.train.train.
    batch_size: int
    # Forwarded as `gradient_accumulation_steps` to lass.train.train.
    gradient_accumulation_steps: int


# CHANGE all four values together when switching to another backbone.
architecture = Architecture(
    name="microsoft/deberta-v3-large",
    name_short="deberta-large",
    batch_size=4,
    gradient_accumulation_steps=8,
)

# BIG-G size label -> precise size (presumably the parameter count; confirm).
# The keys are what the training loop passes as `model_sizes`. Entries that are
# commented out are skipped; uncomment them to include the smaller models.
MODEL_SIZES = {
    # "2m": 2_098_048,
    # "16m": 16_780_288,
    # "53m": 56_629_632,
    # "125m": 134_228_992,
    # "244m": 262_161_280,
    # "422m": 453_009_408,
    # "1b": 1_073_784_832,
    "2b": 2_097_218_560,
    "4b": 3_623_973_888,
    "8b": 8_590_102_528,
    "27b": 28_991_404_032,
    "128b": 137_440_272_384,
}

## Train an Assessor Model for Each BIG-G Size in `MODEL_SIZES`

In [None]:
# For every enabled BIG-G size: train an assessor on that size's 0-shot
# multiple-choice logs, then evaluate it on all tasks together and on each
# non-empty task separately. Metrics go to "<output_dir>/<size>.csv", with one
# row per task plus a "_total" row.
output_dir = "scaling-0shot-large"  # CHANGE per experiment (also used for the CSVs)

for size, size_precise in MODEL_SIZES.items():
    # CHANGE per experiment: the logs this assessor is trained and tested on.
    data_args = LogLoaderArgs(
        logdir="../artifacts/logs",
        tasks='paper-full',
        model_families=["BIG-G T=0"],
        model_sizes=[size],
        shots=[0],
        query_types=["multiple_choice"],
    )

    model = lass.train.train(
        data_args=data_args,
        group="scale-relation-0shot-large",  # CHANGE per experiment
        split="instance",
        model_name=architecture.name,
        model_name_short=f"{architecture.name_short}-for-{size}",
        batch_size=architecture.batch_size,
        gradient_accumulation_steps=architecture.gradient_accumulation_steps,
        include_model_in_input=False,
        include_n_targets_in_input=False,
        output_dir=output_dir,
        n_epochs=6,
        extra_training_args={
            "warmup_steps": 3000,
            "learning_rate": 2e-5,
        },
        # is_test_run=True,
    )

    # Evaluation settings shared by the overall run and every per-task run.
    test_kwargs = dict(
        split='instance',
        model_loc=model,
        model_name=architecture.name,
        max_sequence_length=512,
    )

    results = {}

    # Metrics over all tasks together.
    results_ = lass.test.test(data_args=data_args, **test_kwargs)
    results['_total'] = results_['metrics']
    results['_total']['count'] = len(results_['test'])

    print("Tested on everything")

    # Metrics per task.
    for task in non_empty_tasks:
        # Derive the per-task arguments from this iteration's `data_args`, not
        # from `loader_args` (which is pinned to the 128b logs), so the per-task
        # rows describe the same model size as the "_total" row.
        # NOTE(review): `non_empty_tasks` was computed from the 128b logs; this
        # assumes every size has instances for the same tasks - confirm.
        task_data_args = LogLoaderArgs(**(data_args.__dict__ | {'tasks': [task]}))
        results_ = lass.test.test(data_args=task_data_args, **test_kwargs)
        results[task] = results_['metrics']
        results[task]['count'] = len(results_['test'])

        # The CSV is rewritten after every task, so a partial file already
        # exists if the run is interrupted.
        df = pd.DataFrame.from_dict(results, orient='index')
        df.to_csv(f"{output_dir}/{size}.csv")
        print(f"Tested on {task}")

    print("Tested on all tasks")
