Reference (Optuna visualization tutorial): https://optuna.readthedocs.io/en/stable/tutorial/10_key_features/005_visualization.html#sphx-glr-download-tutorial-10-key-features-005-visualization-py

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] ="expandable_segments:True" # seems to stop gpu mem from filling up despite clearing

In [3]:
import torch
import pandas as pd
from pathlib import Path
import optuna
from reprpo.hp.helpers import optuna_df

In [4]:
from reprpo.training import train
from reprpo.experiments import experiment_configs
from reprpo.hp.space import search_spaces

[2024-10-06 10:56:29,745] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


## Objective

In [5]:
# Run-wide settings: RNG seed, the metric the studies optimise, and the torch device.
SEED = 42
key_metric = "acc_gain_vs_ref/oos"  # key read from the result of each training run
torch.manual_seed(SEED)
# Use CUDA when torch can see a GPU, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
# Silence noisy output: only WARNING and above from loguru reaches stderr.
import os
import sys

from loguru import logger

# Called without a handler id, remove() drops every handler registered so far,
# so a single call is enough.
logger.remove()
# Use sys.stderr directly instead of reaching for it through `os.sys`.
logger.add(sys.stderr, level="WARNING")

# Environment switches for wandb, HF datasets and tqdm.
os.environ["WANDB_MODE"] = "disabled"
os.environ["HF_DATASETS_OFFLINE"] = "1"
os.environ["TQDM_DISABLE"] = "true"

In [7]:
# Optuna storage URL, and the relative path of the SQLite file it refers to.
f_db = "sqlite:///optuna.db"
f = f_db.replace("sqlite:///", "./")
print(f)
# Make sure the directory that will hold the database file exists.
Path(f).parent.mkdir(exist_ok=True, parents=True)
f_db

./optuna.db


'sqlite:///optuna.db'

In [8]:
# print(f'to visualise run in cli\ncd nbs\noptuna-dashboard {f_db}')

In [9]:
from reprpo.hp.target import override, default_tuner_kwargs
from reprpo.experiments import experiment_configs
import copy
import wandb

import optuna.pruners
from optuna_integration.wandb import WeightsAndBiasesCallback

In [10]:


import functools

def list2tuples(d):
    """Turn every top-level list value of ``d`` into a tuple, in place.

    Non-list values are left untouched and nested lists are not converted.
    The same dict object that was passed in is returned.
    """
    list_keys = [key for key, value in d.items() if isinstance(value, list)]
    for key in list_keys:
        d[key] = tuple(d[key])
    return d

def objective_func(kwargs, trial, starter_experiment_name):
    """Train one configuration: a starter experiment config plus overrides.

    Args:
        kwargs: Overrides for this run, applied on top of ``default_tuner_kwargs``.
            Its list values are converted to tuples in place before training.
        trial: Forwarded unchanged to ``train``.
        starter_experiment_name: Key into ``experiment_configs``; element ``[1]``
            of that entry is used as the base config.

    Returns:
        Whatever ``train(cfg, trial=trial)`` returns.
    """
    base_cfg = experiment_configs[starter_experiment_name][1]
    cfg = copy.deepcopy(base_cfg)  # work on a copy so the stored config is not modified
    for overrides in (default_tuner_kwargs, kwargs):
        override(cfg, overrides)
    # NOTE(review): this runs after the overrides were applied, so it appears to
    # change only the caller's dict, not cfg — confirm that is intended.
    list2tuples(kwargs)
    return train(cfg, trial=trial)

def objective(trial: optuna.Trial, starter_experiment_name, trial2args, key_metric=key_metric) -> float:
    """Optuna objective: sample arguments for ``trial``, train, return one metric.

    Args:
        trial: The Optuna trial being evaluated.
        starter_experiment_name: Passed on to ``objective_func`` to pick the base config.
        trial2args: Callable that maps ``trial`` to the override kwargs.
        key_metric: Key of the value to return from the training result.

    Returns:
        ``result[key_metric]`` where ``result`` is what ``objective_func`` returned.
    """
    sampled_kwargs = trial2args(trial)
    result = objective_func(sampled_kwargs, trial, starter_experiment_name)
    return result[key_metric]



## Opt

Note on pruning: it's only really useful with validation metrics and for long jobs over many epochs. I've got a small proxy job, so there is no need for it here.

In [11]:
# from reprpo.experiments import experiment_configs
# NOTE(review): this rebinds `experiment_configs`, which earlier cells imported
# from `reprpo.experiments` — confirm both modules expose the same mapping.
from reprpo.hp.space import experiment_configs
# Show which experiment names are available.
experiment_configs.keys()

dict_keys(['hs-svd-mse', 'hs-hra-rank', 'hs-ortho-prefvec', 'ether-prefvec', 'dpo', 'projbp', 'projgrad2'])

In [12]:
import warnings
warnings.filterwarnings("ignore", category=optuna.exceptions.ExperimentalWarning) 

In [13]:
# Summarise every study already stored in the Optuna database.
# Import through the public packages rather than the `optuna.study.study` module.
from optuna import storages
from optuna.study import get_all_study_names

study_names = get_all_study_names(storage=f_db)

for study_name in study_names:
    print(study_name)
    study = optuna.load_study(study_name=study_name, storage=f_db)
    try:
        df_res = optuna_df(study, key_metric)
        display(df_res)
        print()
    except ValueError as e:
        # Keep going with the remaining studies, but say why this one has no summary.
        print(f'- ({e})')

projgrad2


[W 2024-10-06 10:57:32,241] Study instance does not contain completed trials.


Unnamed: 0_level_0,importance,best
"projgrad2 N=✓0/309, best=nan",Unnamed: 1_level_1,Unnamed: 2_level_1



side-ether-prefvec


Unnamed: 0_level_0,importance,best
"side-ether-prefvec N=✓208/209, best=1.169",Unnamed: 1_level_1,Unnamed: 2_level_1
Htype,0.228,oft
β,0.184,0.403787
use_angle_loss,0.155,True
use_dpo_loss,0.11,False
lr,0.108,0.000615
weight_tokens,0.099,False
use_nll_loss,0.029,False
use_orth_loss,0.028,False
reduction,0.024,25
collect_input,0.019,False



projgrad


[W 2024-10-06 10:57:34,652] Study instance does not contain completed trials.


Unnamed: 0_level_0,importance,best
"projgrad N=✓0/4, best=nan",Unnamed: 1_level_1,Unnamed: 2_level_1



side-svd-mse


Unnamed: 0_level_0,importance,best
"side-svd-mse N=✓28/316, best=1.010",Unnamed: 1_level_1,Unnamed: 2_level_1
lr,0.998,0.001195
α,0.001,0.635584
collect_hs,0.0,True
quantile,0.0,float
dual_svd,0.0,True
collect_input,0.0,False
quantile_value,,0.3



side-hra-rank


Unnamed: 0_level_0,importance,best
"side-hra-rank N=✓182/183, best=1.229",Unnamed: 1_level_1,Unnamed: 2_level_1
lr,0.907,0.000188
collect_hs,0.041,False
apply_GS,0.024,False
collect_input,0.023,False
β,0.003,0.110393
r,0.001,2
α,0.001,5.920778



hs-ortho-prefvec


Unnamed: 0_level_0,importance,best
"hs-ortho-prefvec N=✓60/60, best=1.131",Unnamed: 1_level_1,Unnamed: 2_level_1
lr,0.966,0.000239
orthogonal_map,0.008,matrix_exp
β,0.008,0.218987
use_angle_loss,0.006,False
use_proj_rel,0.005,False
weight_tokens,0.003,True
use_nll_loss,0.003,False
use_orth_loss,0.002,False
use_dpo_loss,0.0,True



projbp


Unnamed: 0_level_0,importance,best
"projbp N=✓10/64, best=1.033",Unnamed: 1_level_1,Unnamed: 2_level_1
lr,0.784,0.000003
β,0.111,0.366362
scale_orth,0.094,False
reverse_pref,0.006,False
neg_slope,0.005,0
mag_clip,0.0,



dpo


Unnamed: 0_level_0,importance,best
"dpo N=✓48/50, best=1.262",Unnamed: 1_level_1,Unnamed: 2_level_1
lr,1.0,0.000255



hs-svd-mse


Unnamed: 0_level_0,importance,best
"hs-svd-mse N=✓14/332, best=1.017",Unnamed: 1_level_1,Unnamed: 2_level_1
lr,0.93,0.001195
α,0.047,0.635584
quantile,0.012,float
dual_svd,0.011,True
collect_input,0.0,False
collect_hs,0.0,True
quantile_value,,0.3



hs-hra-rank


Unnamed: 0_level_0,importance,best
"hs-hra-rank N=✓83/85, best=1.118",Unnamed: 1_level_1,Unnamed: 2_level_1
lr,0.935,0.000295
β,0.039,0.441272
apply_GS,0.016,True
r,0.011,87
α,0.0,2.474218



ether-prefvec


Unnamed: 0_level_0,importance,best
"ether-prefvec N=✓57/60, best=1.183",Unnamed: 1_level_1,Unnamed: 2_level_1
lr,0.715,0.000378
nb,0.211,20
β,0.023,1.984854
flip_side,0.01,True
use_dpo_loss,0.01,False
Htype,0.008,ether
use_orth_loss,0.005,True
collect_input,0.005,True
reduction,0.005,1
use_proj_rel,0.005,True





In [14]:
# unit test: every search space must produce kwargs that can be applied to its config
for exp_name, (_max_trials, trial2args) in search_spaces.items():
    study = optuna.create_study(direction="maximize")
    print('exp_name', exp_name)
    for _ in range(10):
        trial = study.ask()
        kwargs = trial2args(trial)
        # Start from a fresh copy for every sample, as objective_func does, so the
        # overrides of one sample cannot leak into the next one.
        cfg = copy.deepcopy(experiment_configs[exp_name][1])
        override(cfg, default_tuner_kwargs)
        override(cfg, kwargs)
        kwargs = list2tuples(kwargs)
    # Only the last sample of each space is shown.
    print('kwargs', kwargs)
    print('='*100)
    print()

    # TODO get float * 50 + categories * 25

    # try:
    #     df_res = optuna_df(study, key_metric)
    #     print(df_res.to_markdown())
    # except Exception as e:
    #     print(e)

[I 2024-10-06 10:57:46,417] A new study created in memory with name: no-name-9d5a4015-b32d-47f9-9279-e1c692e9ebc2
[I 2024-10-06 10:57:46,428] A new study created in memory with name: no-name-cc3721c4-6392-406b-8214-ae56170cad1b
[I 2024-10-06 10:57:46,433] A new study created in memory with name: no-name-dec0cef1-1988-4861-89e0-6ba9de4defe4
[I 2024-10-06 10:57:46,441] A new study created in memory with name: no-name-cebcb108-9912-4c03-bb1e-a4c5bc96409e
[I 2024-10-06 10:57:46,450] A new study created in memory with name: no-name-56b592f1-1186-456f-a4f8-d426ffe5b71a
[I 2024-10-06 10:57:46,455] A new study created in memory with name: no-name-f871c272-e02f-4d29-9b9b-c8310e81865c
[I 2024-10-06 10:57:46,459] A new study created in memory with name: no-name-e529be00-f593-4a11-aae1-c66b4926419a


exp_name hs-svd-mse
kwargs {'lr': 5.219336618406165e-07, 'collect_input': True, 'collect_hs': True, 'transform.quantile': 0.7000000000000001, 'transform.dual_svd': False, 'loss.α': 1.0773710852052232}

exp_name hs-hra-rank
kwargs {'lr': 5.070775771636803e-07, 'transform.r': 7, 'transform.apply_GS': True, 'loss.α': 0.19488202953119355, 'loss.β': 0.6607606435959001}

exp_name hs-ortho-prefvec
kwargs {'lr': 2.238816238680683e-05, 'transform.orthogonal_map': 'householder', 'loss.β': 0.10579611078226271, 'loss.use_orth_loss': False, 'loss.use_angle_loss': True, 'loss.use_dpo_loss': True, 'loss.use_nll_loss': True, 'loss.weight_tokens': False, 'loss.use_proj_rel': True}

exp_name ether-prefvec
kwargs {'lr': 9.484849416559945e-07, 'collect_input': False, 'collect_hs': True, 'transform.nb': 1, 'transform.Htype': 'etherplus', 'transform.flip_side': False, 'transform.reduction': 9, 'loss.β': 2.2448768371311506e-05, 'loss.use_orth_loss': True, 'loss.use_angle_loss': False, 'loss.use_dpo_loss': Fa

In [15]:
# from optuna import trial
# t = trial.create_trial(value=1)
# t.suggest_categorical("a", [1, 2, 3])

In [16]:
MAX_TRIALS = 250  # NOTE(review): not read below; each space brings its own budget from search_spaces
TRIALS_PER_ROUND = 20  # do this many trials per study, round robin, until done
import numpy as np

spaces = list(search_spaces.items())
stop_requested = False
while not stop_requested:
    np.random.shuffle(spaces)
    # Set when a study is still under budget, or when it errored and should be retried.
    work_left = False
    for exp_name, (max_trials, trial2args) in spaces:
        try:
            study_name = f"{exp_name}"
            study = optuna.create_study(
                study_name=study_name,
                direction="maximize",
                load_if_exists=True,
                storage=f_db,
                sampler=optuna.samplers.TPESampler(seed=SEED),
                # pruner=optuna.pruners.NopPruner(),
            )

            # Count trials in any state; an empty study simply gives 0.
            n = len(study.trials)
            if n > 0:
                print(f"loaded {n} {study_name} trials")
                try:
                    df_res = optuna_df(study, key_metric)
                    print(df_res.to_markdown())
                except ValueError as e:
                    # A summary that cannot be built must not stop the study
                    # from being optimised further.
                    print(f"could not summarise {study_name}: {e}")

            if n < max_trials:
                work_left = True
                _objective = functools.partial(
                    objective,
                    key_metric=key_metric,
                    starter_experiment_name=exp_name,
                    trial2args=trial2args,
                )

                study.optimize(
                    _objective,
                    n_trials=TRIALS_PER_ROUND,
                    gc_after_trial=True,
                    # Trials raising one of these are recorded as failed instead of aborting the study.
                    catch=(AssertionError, OSError, RuntimeError, KeyError, torch.OutOfMemoryError),
                )

            print('='*80)
        except KeyboardInterrupt:
            # `break` alone only leaves the inner loop; flag the outer loop as well
            # so an interrupt really stops the search.
            stop_requested = True
            break
        except Exception as e:
            # Best effort: log, move on to the next study, and retry this one next round.
            work_left = True
            logger.exception(e)
    if not work_left:
        # Every study has reached its trial budget: nothing more to do.
        break

[I 2024-10-06 10:57:54,328] Using an existing study with name 'ether-prefvec' instead of creating a new one.


loaded 60 ether-prefvec trials
| ether-prefvec N=✓57/60, best=1.183   |   importance | best                   |
|:-------------------------------------|-------------:|:-----------------------|
| lr                                   |        0.773 | 0.00037772770210724844 |
| nb                                   |        0.171 | 20                     |
| β                                    |        0.016 | 1.9848539330526844     |
| use_proj_rel                         |        0.014 | True                   |
| use_dpo_loss                         |        0.006 | False                  |
| Htype                                |        0.006 | ether                  |
| reduction                            |        0.005 | 1                      |
| flip_side                            |        0.004 | True                   |
| use_nll_loss                         |        0.003 | True                   |
| use_orth_loss                        |        0.002 | True                  

[I 2024-10-06 11:01:20,835] Trial 60 finished with value: 1.0289017341040463 and parameters: {'lr': 5.406125447886827e-05, 'collect_input': True, 'collect_hs': False, 'nb': 16, 'Htype': 'oft', 'flip_side': True, 'reduction': 2, 'β': 1.1929281086634211, 'use_orth_loss': True, 'use_angle_loss': True, 'use_dpo_loss': False, 'use_nll_loss': True, 'weight_tokens': True, 'use_proj_rel': True}. Best is trial 24 with value: 1.183044315992293.



| acc_inc/eval_ds [pp]                                                                                                                                                                                                                  |   train |   test |   oos |   rnd |
|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------:|-------:|------:|------:|
| ReprPO collect_input=True prefvec.use_nll_prefvec=True prefvec.use_orth_prefvec=True prefvec.use_proj_rel=True prefvec.weight_tokens=True prefvec.β=1.2 lr=5.4e-05 ether.Htype=oft ether.flip_side=True ether.nb=16 ether.reduction=2 |       0 |      0 |  2.89 | 1.754 |


[I 2024-10-06 11:05:07,417] Trial 61 finished with value: 1.0077071290944124 and parameters: {'lr': 5.159171545826072e-06, 'collect_input': False, 'collect_hs': False, 'nb': 23, 'Htype': 'ether', 'flip_side': True, 'reduction': 3, 'β': 0.2742640490197389, 'use_orth_loss': True, 'use_angle_loss': True, 'use_dpo_loss': True, 'use_nll_loss': True, 'weight_tokens': True, 'use_proj_rel': True}. Best is trial 24 with value: 1.183044315992293.



| acc_inc/eval_ds [pp]                                                                                                                                                                                                                               |   train |   test |   oos |   rnd |
|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------:|-------:|------:|------:|
| ReprPO prefvec.use_dpo_prefvec=True prefvec.use_nll_prefvec=True prefvec.use_orth_prefvec=True prefvec.use_proj_rel=True prefvec.weight_tokens=True prefvec.β=0.27 lr=5.2e-06 ether.Htype=ether ether.flip_side=True ether.nb=23 ether.reduction=3 |   0.826 |      0 | 0.771 | 1.754 |


[I 2024-10-06 11:08:31,162] Trial 62 finished with value: 1.0732177263969174 and parameters: {'lr': 0.0002749403786470217, 'collect_input': True, 'collect_hs': False, 'nb': 28, 'Htype': 'ether', 'flip_side': True, 'reduction': 4, 'β': 0.03463337884079901, 'use_orth_loss': True, 'use_angle_loss': True, 'use_dpo_loss': False, 'use_nll_loss': True, 'weight_tokens': True, 'use_proj_rel': False}. Best is trial 24 with value: 1.183044315992293.



| acc_inc/eval_ds [pp]                                                                                                                                                                                            |   train |   test |   oos |    rnd |
|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------:|-------:|------:|-------:|
| ReprPO collect_input=True prefvec.use_nll_prefvec=True prefvec.use_orth_prefvec=True prefvec.weight_tokens=True prefvec.β=0.035 lr=0.00027 ether.Htype=ether ether.flip_side=True ether.nb=28 ether.reduction=4 |   0.826 |  0.787 | 7.322 | -3.509 |


[I 2024-10-06 11:11:57,071] Trial 63 finished with value: 1.1445086705202314 and parameters: {'lr': 0.0008969785497959995, 'collect_input': True, 'collect_hs': False, 'nb': 13, 'Htype': 'etherplus', 'flip_side': True, 'reduction': 7, 'β': 1.2567696475571681, 'use_orth_loss': True, 'use_angle_loss': True, 'use_dpo_loss': False, 'use_nll_loss': False, 'weight_tokens': False, 'use_proj_rel': True}. Best is trial 24 with value: 1.183044315992293.



| acc_inc/eval_ds [pp]                                                                                                                                                               |   train |   test |    oos |   rnd |
|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------:|-------:|-------:|------:|
| ReprPO collect_input=True prefvec.use_orth_prefvec=True prefvec.use_proj_rel=True prefvec.β=1.3 lr=0.0009 ether.Htype=etherplus ether.flip_side=True ether.nb=13 ether.reduction=7 |       0 |      0 | 14.451 | 5.263 |


[I 2024-10-06 11:15:20,230] Trial 64 finished with value: 1.1734104046242777 and parameters: {'lr': 0.0003563800120661128, 'collect_input': True, 'collect_hs': False, 'nb': 20, 'Htype': 'ether', 'flip_side': True, 'reduction': 1, 'β': 1.805807579925217, 'use_orth_loss': True, 'use_angle_loss': True, 'use_dpo_loss': False, 'use_nll_loss': True, 'weight_tokens': True, 'use_proj_rel': True}. Best is trial 24 with value: 1.183044315992293.



| acc_inc/eval_ds [pp]                                                                                                                                                                                                                    |   train |   test |    oos |    rnd |
|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------:|-------:|-------:|-------:|
| ReprPO collect_input=True prefvec.use_nll_prefvec=True prefvec.use_orth_prefvec=True prefvec.use_proj_rel=True prefvec.weight_tokens=True prefvec.β=1.8 lr=0.00036 ether.Htype=ether ether.flip_side=True ether.nb=20 ether.reduction=1 |   2.479 |      0 | 17.341 | -7.018 |


## plot

In [None]:
# You can use Matplotlib instead of Plotly for visualization by simply replacing `optuna.visualization` with
# `optuna.visualization.matplotlib` in the following examples.
from optuna.visualization.matplotlib import plot_contour
from optuna.visualization.matplotlib import plot_edf
from optuna.visualization.matplotlib import plot_intermediate_values
from optuna.visualization.matplotlib import plot_optimization_history
from optuna.visualization.matplotlib import plot_parallel_coordinate
from optuna.visualization.matplotlib import plot_param_importances
from optuna.visualization.matplotlib import plot_rank
from optuna.visualization.matplotlib import plot_slice
from optuna.visualization.matplotlib import plot_timeline

In [None]:
search_spaces.keys()

In [None]:
# Load the study to plot.
exp_name = 'projgrad'
# search_spaces values are (max_trials, trial2args) pairs, so unpack the pair
# instead of binding the whole tuple to `trial2args`.
_max_trials, trial2args = search_spaces[exp_name]

study_name = f"{exp_name}"
study = optuna.create_study(
    study_name=study_name,
    direction="maximize",
    load_if_exists=True,
    storage=f_db,
    sampler=optuna.samplers.TPESampler(seed=SEED),
    pruner=optuna.pruners.NopPruner(),
)
# Completed trials only, best value first.
df = study.trials_dataframe().query('state == "COMPLETE"').sort_values('value', ascending=False)
if len(df) > 0:
    print('study.best_trial', study.best_trial)
else:
    # study.best_trial raises ValueError when no trial has completed yet.
    print('study.best_trial', None, '(no completed trials)')
print(len(df))
plot_optimization_history(study)

In [13]:
# plot_timeline(study)

In [29]:
# plot_intermediate_values(study)

In [30]:
# plot_contour(study)


In [None]:
plot_slice(study)


In [None]:
plot_param_importances(study)

In [None]:
plot_intermediate_values(study)

In [None]:
plot_parallel_coordinate(study)

### Appendix 1: dataclass to Optuna

In [20]:
# import inspect
# import typing
# from typing import Literal

# def optuna_suggest_from_dataclass(t):
#     n = t.__name__
#     print(f'## {n}')
#     sig = inspect.signature(t)
#     for name, param in sig.parameters.items():
#         if param.annotation== bool:
#             print(f'"{name}": trial.suggest_categorical("{name}", [True, False]),')
#         elif param.annotation==int:
#             print(f'"{name}": trial.suggest_int("{name}", 1, 10),')
#         elif param.annotation ==float:
#             print(f'"{name}": trial.suggest_float("{name}", 0.1, 10.0),')
#         elif param.annotation == str:
#             print(f'"{name}": trial.suggest_categorical("{name}", ["a", "b", "c"]),')
#         elif param.annotation == tuple:
#             print(f'"{name}": trial.suggest_categorical("{name}", [(1, 2), (3, 4), (5, 6)]),')
#         elif typing.get_origin(param.annotation) == Literal:
#             print(f'"{name}": trial.suggest_categorical("{name}", {param.annotation.__args__}),')
#         else:
#             print(f"!!Unknown type {param}")
#             # print(name, param.default, param.annotation)

# optuna_suggest_from_dataclass(ReprPOConfig)
# for t in Transforms:
#     print(f'## {t}')
#     optuna_suggest_from_dataclass(t.value)
# for l in Losses:
#     print(f'## {l}')
#     optuna_suggest_from_dataclass(l.value)


# optuna_suggest_from_dataclass(DPOProjGradConfig)