https://optuna.readthedocs.io/en/stable/tutorial/10_key_features/005_visualization.html#sphx-glr-download-tutorial-10-key-features-005-visualization-py

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import os
# Set here, before `import torch` in the next cell — presumably so the CUDA allocator
# sees the setting when it is first initialised (TODO confirm against the PyTorch docs).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] ="expandable_segments:True" # seems to stop gpu mem from filling up despite clearing

In [3]:
import torch
import pandas as pd
from pathlib import Path
import optuna
from reprpo.hp.helpers import optuna_df

In [4]:
from reprpo.training import train
from reprpo.experiments import experiment_configs
from reprpo.hp.space import search_spaces

[2024-10-06 00:53:29,043] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


## Objective

In [5]:
# Global settings shared by the cells below.
SEED = 42
# Key of the training result that the Optuna objective returns.
key_metric = "acc_gain_vs_ref/oos"

# Seeds torch's global RNG only.
torch.manual_seed(SEED)
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
# silence please
import os
import sys
from loguru import logger

# Drop all existing loguru sinks (one call without arguments is enough), then
# re-add stderr so only WARNING and above is shown.
logger.remove()
logger.add(sys.stderr, level="WARNING")

# Environment switches for wandb, Hugging Face datasets and tqdm.
os.environ["WANDB_MODE"] = "disabled"
os.environ["HF_DATASETS_OFFLINE"] = "1"
os.environ["TQDM_DISABLE"] = "true"

In [7]:
# SQLAlchemy-style URL of the SQLite database that stores every Optuna study.
_SQLITE_PREFIX = "sqlite:///"
f_db = _SQLITE_PREFIX + "optuna.db"

# Same location expressed as a relative filesystem path.
f = "./" + f_db[len(_SQLITE_PREFIX):]
print(f)

# Make sure the directory that will hold the database file exists.
Path(f).parent.mkdir(parents=True, exist_ok=True)
f_db

./optuna.db


'sqlite:///optuna.db'

In [8]:
# print(f'to visualise run in cli\ncd nbs\noptuna-dashboard {f_db}')

In [9]:
from reprpo.hp.target import override, default_tuner_kwargs
from reprpo.experiments import experiment_configs
import copy
import wandb

import optuna.pruners
from optuna_integration.wandb import WeightsAndBiasesCallback

In [10]:


import functools

def list2tuples(d):
    """Replace every top-level list value of ``d`` with a tuple, in place.

    Only direct values are converted; lists nested inside other containers are
    left untouched. The same dict object is returned for convenience.
    """
    list_keys = [key for key, value in d.items() if isinstance(value, list)]
    for key in list_keys:
        d[key] = tuple(d[key])
    return d

def objective_func(kwargs, trial, starter_experiment_name):
    """Train one sampled configuration and return whatever ``train`` returns.

    A deep copy of the starter experiment's config (element ``[1]`` of its
    ``experiment_configs`` entry) is overridden first with
    ``default_tuner_kwargs`` and then with ``kwargs``.

    Args:
        kwargs: overrides sampled for this trial. Top-level list values are
            converted to tuples in place before they are applied.
        trial: the Optuna trial, forwarded to ``train``.
        starter_experiment_name: key into ``experiment_configs``.

    Returns:
        The result object produced by ``train(cfg, trial=trial)``.
    """
    cfg = copy.deepcopy(experiment_configs[starter_experiment_name][1])
    override(cfg, default_tuner_kwargs)
    # Convert lists to tuples *before* applying the overrides. Previously this ran
    # after `override` and its result was never used, so cfg still received lists.
    kwargs = list2tuples(kwargs)
    override(cfg, kwargs)
    return train(cfg, trial=trial)

def objective(trial: optuna.Trial, starter_experiment_name, trial2args, key_metric=key_metric) -> float:
    """Optuna objective: sample overrides, train, and return one metric.

    Args:
        trial: trial used both for sampling and forwarded to training.
        starter_experiment_name: name of the base experiment config to start from.
        trial2args: callable mapping a trial to a dict of config overrides.
        key_metric: key of the training result to return as the objective value.
    """
    sampled_overrides = trial2args(trial)
    results = objective_func(sampled_overrides, trial, starter_experiment_name)
    score = results[key_metric]
    return score



## Opt

Note on pruning. It's only really useful with validation metrics and for long jobs over many epochs. I've got a small proxy job so there is no need.

In [11]:
# from reprpo.experiments import experiment_configs
from reprpo.hp.space import experiment_configs
experiment_configs.keys()

dict_keys(['hs-svd-mse', 'hs-hra-rank', 'hs-ortho-prefvec', 'ether-prefvec', 'dpo', 'projbp', 'projgrad2'])

In [12]:
import warnings
warnings.filterwarnings("ignore", category=optuna.exceptions.ExperimentalWarning) 

In [13]:
from optuna.study.study import storages, get_all_study_names
# Show the summary table of every study currently stored in the database.
study_names = get_all_study_names(storage=f_db)

for study_name in study_names:
    print(study_name)
    study = optuna.load_study(study_name=study_name, storage=f_db)
    try:
        df_res = optuna_df(study, key_metric)
    except ValueError as e:
        # Best effort: keep listing the remaining studies, but say why this one
        # could not be summarised instead of silently printing a dash.
        print(f'- ({e})')
    else:
        display(df_res)
        print()

projgrad2


[W 2024-10-06 00:54:26,331] Study instance does not contain completed trials.


Unnamed: 0_level_0,importance,best
"projgrad2 N=✓0/✖307, best=nan",Unnamed: 1_level_1,Unnamed: 2_level_1



side-ether-prefvec


Unnamed: 0_level_0,importance,best
"side-ether-prefvec N=✓208/✖209, best=1.169",Unnamed: 1_level_1,Unnamed: 2_level_1
lr,0.368,0.000615
nb,0.219,30
β,0.218,0.403787
reduction,0.146,25
flip_side,0.014,True
use_dpo_loss,0.013,False
collect_hs,0.011,False
Htype,0.01,oft
use_nll_loss,0.0,False
weight_tokens,0.0,False


[W 2024-10-06 00:54:26,974] Study instance does not contain completed trials.



projgrad


Unnamed: 0_level_0,importance,best
"projgrad N=✓0/✖4, best=nan",Unnamed: 1_level_1,Unnamed: 2_level_1



side-svd-mse


Unnamed: 0_level_0,importance,best
"side-svd-mse N=✓28/✖316, best=1.010",Unnamed: 1_level_1,Unnamed: 2_level_1
α,0.844,0.635584
lr,0.126,0.001195
quantile,0.016,float
collect_hs,0.005,True
collect_input,0.005,False
dual_svd,0.005,True
quantile_value,,0.3



side-hra-rank


Unnamed: 0_level_0,importance,best
"side-hra-rank N=✓182/✖183, best=1.229",Unnamed: 1_level_1,Unnamed: 2_level_1
β,0.441,0.110393
lr,0.417,0.000188
α,0.095,5.920778
r,0.03,2
apply_GS,0.017,False
collect_hs,0.0,False
collect_input,0.0,False



hs-ortho-prefvec


Unnamed: 0_level_0,importance,best
"hs-ortho-prefvec N=✓20/✖20, best=1.118",Unnamed: 1_level_1,Unnamed: 2_level_1
lr,0.782,0.000125
β,0.161,0.341233
use_nll_loss,0.019,False
use_proj_rel,0.019,True
use_angle_loss,0.005,True
use_dpo_loss,0.005,True
weight_tokens,0.005,True
orthogonal_map,0.004,matrix_exp
use_orth_loss,0.001,False



projbp


Unnamed: 0_level_0,importance,best
"projbp N=✓10/✖24, best=1.033",Unnamed: 1_level_1,Unnamed: 2_level_1
β,0.512,0.366362
lr,0.311,0.000003
scale_orth,0.128,False
mag_clip,0.021,
neg_slope,0.014,0
reverse_pref,0.014,False



dpo


Unnamed: 0_level_0,importance,best
"dpo N=✓8/✖10, best=1.087",Unnamed: 1_level_1,Unnamed: 2_level_1
lr,1.0,9.8e-05



hs-svd-mse


Unnamed: 0_level_0,importance,best
"hs-svd-mse N=✓14/✖332, best=1.017",Unnamed: 1_level_1,Unnamed: 2_level_1
lr,0.752,0.001195
α,0.189,0.635584
collect_input,0.056,False
collect_hs,0.003,True
dual_svd,0.0,True
quantile,0.0,float
quantile_value,,0.3



hs-hra-rank


Unnamed: 0_level_0,importance,best
"hs-hra-rank N=✓43/✖45, best=1.087",Unnamed: 1_level_1,Unnamed: 2_level_1
lr,0.907,0.000158
r,0.048,96
β,0.039,18.156422
α,0.005,0.123722
apply_GS,0.002,False



ether-prefvec


Unnamed: 0_level_0,importance,best
ether-prefvec N=3,Unnamed: 1_level_1,Unnamed: 2_level_1





In [15]:
# unit test: sample each search space a few times and check that the sampled
# overrides can be applied to that experiment's config. Nothing is trained here.
for exp_name, (_max_trials, trial2args) in search_spaces.items():
    study = optuna.create_study(direction="maximize")
    print('exp_name', exp_name)
    for _ in range(10):
        trial = study.ask()
        # Convert lists to tuples before applying, so cfg receives the converted values.
        kwargs = list2tuples(trial2args(trial))
        # Start from a fresh config for every sample; reusing one cfg would let
        # overrides from earlier samples leak into later ones.
        cfg = copy.deepcopy(experiment_configs[exp_name][1])
        override(cfg, default_tuner_kwargs)
        override(cfg, kwargs)
    # Show the last sample as an example of what the space produces.
    print('kwargs', kwargs)
    print('='*100)
    print()

    # TODO get float * 50 + categories * 25

    # try:
    #     df_res = optuna_df(study, key_metric)
    #     print(df_res.to_markdown())
    # except Exception as e:
    #     print(e)

[I 2024-10-06 00:54:53,760] A new study created in memory with name: no-name-34dae79d-d0e6-4f47-b94d-61fbacd30677
[I 2024-10-06 00:54:53,774] A new study created in memory with name: no-name-03940010-97ac-4136-b4e1-17ddfa657803
[I 2024-10-06 00:54:53,784] A new study created in memory with name: no-name-737d0f6c-4539-461f-99c2-1b11bbd028ad
[I 2024-10-06 00:54:53,796] A new study created in memory with name: no-name-6dcda2eb-c085-43bf-bca4-4c2a297c6bc9
[I 2024-10-06 00:54:53,816] A new study created in memory with name: no-name-966d6188-a1b6-4075-a088-bd42eb6177cd
[I 2024-10-06 00:54:53,824] A new study created in memory with name: no-name-811f712a-bc11-4397-9287-334814c2b657
[I 2024-10-06 00:54:53,831] A new study created in memory with name: no-name-578668dd-46e8-43c1-bf2d-6ae8761b5a19


exp_name hs-svd-mse
kwargs {'lr': 2.4008970673824455e-05, 'collect_input': True, 'collect_hs': True, 'transform.quantile': 1, 'transform.dual_svd': True, 'loss.α': 4.659412764403834}

exp_name hs-hra-rank
kwargs {'lr': 1.0639442660721646e-06, 'transform.r': 37, 'transform.apply_GS': False, 'loss.α': 2606.3940587280713, 'loss.β': 49.21176557089369}

exp_name hs-ortho-prefvec
kwargs {'lr': 0.006940820649722095, 'transform.orthogonal_map': 'cayley', 'loss.β': 0.00036668618914569695, 'loss.use_orth_loss': False, 'loss.use_angle_loss': False, 'loss.use_dpo_loss': False, 'loss.use_nll_loss': True, 'loss.weight_tokens': False, 'loss.use_proj_rel': True}

exp_name ether-prefvec
kwargs {'lr': 0.005718612175996021, 'collect_input': False, 'collect_hs': True, 'transform.nb': 4, 'transform.Htype': 'oft', 'transform.flip_side': False, 'transform.reduction': 10, 'loss.β': 0.015625783471675713, 'loss.use_orth_loss': False, 'loss.use_angle_loss': True, 'loss.use_dpo_loss': True, 'loss.use_nll_loss': F

In [16]:
# from optuna import trial
# t = trial.create_trial(value=1)
# t.suggest_categorical("a", [1, 2, 3])

In [17]:
# NOTE(review): MAX_TRIALS is not referenced by the visible cells; each experiment's
# trial budget comes from its `search_spaces` entry instead.
MAX_TRIALS = 250
TRIALS_PER_ROUND = 20  # do 20 at a time, round robin, until done
import numpy as np

spaces = list(search_spaces.items())
interrupted = False
while not interrupted:
    np.random.shuffle(spaces)
    # Set as soon as any study in this round is still below its budget.
    work_remaining = False
    for exp_name, (max_trials, trial2args) in spaces:
        try:
            study_name = f"{exp_name}"
            study = optuna.create_study(
                study_name=study_name,
                direction="maximize",
                load_if_exists=True,
                storage=f_db,
                sampler=optuna.samplers.TPESampler(seed=SEED),
                # pruner=optuna.pruners.NopPruner(),
            )

            # Count every recorded trial, whatever its state. Counting directly
            # avoids sorting a dataframe that has no 'value' column yet when the
            # study is brand new.
            n = len(study.trials)
            remaining = max_trials - n
            if remaining > 0:
                work_remaining = True

            if n > 0:
                print(f"loaded {n} {study_name} trials")

                df_res = optuna_df(study, key_metric)
                print(df_res.to_markdown())

            if remaining > 0:
                _objective = functools.partial(objective, key_metric=key_metric, starter_experiment_name=exp_name, trial2args=trial2args)

                study.optimize(
                    _objective,
                    # never run past this study's budget
                    n_trials=min(TRIALS_PER_ROUND, remaining),
                    gc_after_trial=True,
                    catch=(AssertionError, OSError, RuntimeError, KeyError, torch.OutOfMemoryError),
                )

            print('='*80)
        except KeyboardInterrupt:
            # `break` alone only leaves the inner for-loop; the flag also stops
            # the outer while-loop so an interrupt really ends the cell.
            interrupted = True
            break
        except Exception as e:
            logger.exception(e)

    if not work_remaining:
        # Every study that could be inspected has reached its budget.
        break

[I 2024-10-06 00:55:01,659] Using an existing study with name 'ether-prefvec' instead of creating a new one.


loaded 3 ether-prefvec trials
| ether-prefvec N=3   | importance   | best   |
|---------------------|--------------|--------|


[I 2024-10-06 00:58:49,858] Trial 3 finished with value: 1.0096339113680155 and parameters: {'lr': 7.45934328572655e-06, 'collect_input': True, 'collect_hs': True, 'nb': 1, 'Htype': 'etherplus', 'flip_side': False, 'reduction': 160, 'β': 2.177484667394932e-05, 'use_orth_loss': False, 'use_angle_loss': False, 'use_dpo_loss': True, 'use_nll_loss': True, 'weight_tokens': False, 'use_proj_rel': False}. Best is trial 0 with value: 1.0096339113680155.



| acc_inc/eval_ds [pp]                                                                                                                                                                                                  |   train |   test |   oos |   rnd |
|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------:|-------:|------:|------:|
| ReprPO collect_hs=True collect_input=True prefvec.use_angle_prefvec=False prefvec.use_dpo_prefvec=True prefvec.use_nll_prefvec=True prefvec.β=2.2e-05 lr=7.5e-06 ether.Htype=etherplus ether.nb=1 ether.reduction=160 |   0.826 |      0 | 0.963 | 3.509 |


[I 2024-10-06 01:02:30,339] Trial 4 finished with value: 1.0 and parameters: {'lr': 9.96251322205511e-07, 'collect_input': False, 'collect_hs': False, 'nb': 1, 'Htype': 'oft', 'flip_side': True, 'reduction': 57, 'β': 0.000593490901937937, 'use_orth_loss': False, 'use_angle_loss': False, 'use_dpo_loss': False, 'use_nll_loss': False, 'weight_tokens': True, 'use_proj_rel': True}. Best is trial 0 with value: 1.0096339113680155.



| acc_inc/eval_ds [pp]                                                                                                                                                                      |   train |   test |   oos |    rnd |
|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------:|-------:|------:|-------:|
| ReprPO prefvec.use_angle_prefvec=False prefvec.use_proj_rel=True prefvec.weight_tokens=True prefvec.β=0.00059 lr=1e-06 ether.Htype=oft ether.flip_side=True ether.nb=1 ether.reduction=57 |       0 |      0 |     0 | -3.509 |


In [None]:
# Manual top-up: runs 20 more trials on whichever `study` and `_objective` were
# last assigned by the round-robin loop cell; this cell defines neither itself,
# so it only works after that cell has run.
study.optimize(_objective, 
            n_trials=20, 
            gc_after_trial=True, 
            catch=(AssertionError, OSError, RuntimeError, KeyError, torch.OutOfMemoryError)
)

In [None]:
# `wandb.run` is None when no run is active, so guard before asking for its URL
# instead of failing with an AttributeError.
if wandb.run is not None:
    print(wandb.run.get_url())
else:
    print("no active wandb run")

## plot

In [None]:
# You can use Matplotlib instead of Plotly for visualization by simply replacing `optuna.visualization` with
# `optuna.visualization.matplotlib` in the following examples.
from optuna.visualization.matplotlib import plot_contour
from optuna.visualization.matplotlib import plot_edf
from optuna.visualization.matplotlib import plot_intermediate_values
from optuna.visualization.matplotlib import plot_optimization_history
from optuna.visualization.matplotlib import plot_parallel_coordinate
from optuna.visualization.matplotlib import plot_param_importances
from optuna.visualization.matplotlib import plot_rank
from optuna.visualization.matplotlib import plot_slice
from optuna.visualization.matplotlib import plot_timeline

In [None]:
search_spaces.keys()

In [None]:
# NOTE(review): the experiment_configs keys printed earlier list 'projgrad2' but not
# 'projgrad' — confirm this name is still a key of search_spaces.
exp_name = 'projgrad'
# Each search_spaces value is a (max_trials, trial2args) pair (see the loops above),
# so unpack it instead of binding the whole tuple to `trial2args`.
_max_trials, trial2args = search_spaces[exp_name]

study_name = f"{exp_name}"
study = optuna.create_study(
    study_name=study_name,
    direction="maximize",
    load_if_exists=True,
    storage=f_db,
    sampler=optuna.samplers.TPESampler(seed=SEED),
    pruner=optuna.pruners.NopPruner(),
)

# Keep only completed trials, best first. An entirely empty study yields a frame
# without 'state'/'value' columns, so filter only when there are rows.
df = study.trials_dataframe()
if len(df) > 0:
    df = df.query('state == "COMPLETE"').sort_values('value', ascending=False)

# `study.best_trial` raises when no trial has completed; print None in that case.
print('study.best_trial', study.best_trial if len(df) > 0 else None)
print(len(df))
plot_optimization_history(study)

In [13]:
# plot_timeline(study)

In [29]:
# plot_intermediate_values(study)

In [30]:
# plot_contour(study)


In [None]:
plot_slice(study)


In [None]:
plot_param_importances(study)

In [None]:
plot_intermediate_values(study)

In [None]:
plot_parallel_coordinate(study)

### Appendix 1: dataclass 2 optuna

In [20]:
# import inspect
# import typing
# from typing import Literal

# def optuna_suggest_from_dataclass(t):
#     n = t.__name__
#     print(f'## {n}')
#     sig = inspect.signature(t)
#     for name, param in sig.parameters.items():
#         if param.annotation== bool:
#             print(f'"{name}": trial.suggest_categorical("{name}", [True, False]),')
#         elif param.annotation==int:
#             print(f'"{name}": trial.suggest_int("{name}", 1, 10),')
#         elif param.annotation ==float:
#             print(f'"{name}": trial.suggest_float("{name}", 0.1, 10.0),')
#         elif param.annotation == str:
#             print(f'"{name}": trial.suggest_categorical("{name}", ["a", "b", "c"]),')
#         elif param.annotation == tuple:
#             print(f'"{name}": trial.suggest_categorical("{name}", [(1, 2), (3, 4), (5, 6)]),')
#         elif typing.get_origin(param.annotation) == Literal:
#             print(f'"{name}": trial.suggest_categorical("{name}", {param.annotation.__args__}),')
#         else:
#             print(f"!!Unknown type {param}")
#             # print(name, param.default, param.annotation)

# optuna_suggest_from_dataclass(ReprPOConfig)
# for t in Transforms:
#     print(f'## {t}')
#     optuna_suggest_from_dataclass(t.value)
# for l in Losses:
#     print(f'## {l}')
#     optuna_suggest_from_dataclass(l.value)


# optuna_suggest_from_dataclass(DPOProjGradConfig)