In [1]:
import glob
import itertools
import os
import yaml
from copy import deepcopy

import numpy as np


In [2]:
# Model whose configs are generated; also used as the sub-directory name
# under CONFIG_DIR where every file in this notebook is written.
MODEL_NAME = "DANN"
# Root directory (relative to the working directory) for generated configs.
CONFIG_DIR = "configs/generated_pdac"
# File-name stem: the base config is "<prefix>.yml" and each sampled config
# is "<prefix>-<index>.yml".
CONFIG_FNAME_PREFIX = "gen_pdac"

In [3]:
# Base configuration shared by every generated config file.  Later cells layer
# the sampled hyperparameters on top of a deep copy of this dict, so only the
# settings that stay fixed across the sweep are listed here.
# Key insertion order is kept as-is so the displayed repr does not change;
# the YAML on disk is key-sorted by yaml.safe_dump regardless.
config = {
    "lib_params": {},
    "data_params": {
        "all_genes": False,
        "data_dir": "data",
        "dset": "pdac",
        "n_spots": 100000,
        "one_model": True,
        "samp_split": False,
        "sc_id": "CA001063",
        "scaler_name": "standard",
        "st_id": "GSE111672",
        "st_split": False,
    },
    "train_params": {
        "adv_opt_kwargs": {
            # Kept as a tuple in memory; yaml.safe_dump emits it as a plain
            # YAML sequence.
            "betas": (0.5, 0.999),
            "eps": 1.0e-07,
        },
        "early_stop_crit": 100,
        "epochs": 500,
        "early_stop_crit_adv": 500,
        "min_epochs": 40,
        "min_epochs_adv": 100,
        "pretraining": False,
        "reverse_val": False,
        "plateau_kwargs": {
            "factor": 0.5,
            "min_lr": 2.0e-06,
            "patience": 50,
            "verbose": True,
        },
    },
    "model_params": {
        "dann_kwargs": {"enc_out_act": None},
    },
}

# exist_ok=True replaces the check-then-create pair: it is idempotent on
# re-runs and cannot fail if the directory appears between check and create.
config_out_dir = os.path.join(CONFIG_DIR, MODEL_NAME)
os.makedirs(config_out_dir, exist_ok=True)

# Explicit encoding so the file does not depend on the platform default
# (matches how a_list.txt is written at the end of the notebook).
with open(
    os.path.join(config_out_dir, f"{CONFIG_FNAME_PREFIX}.yml"),
    "w",
    encoding="utf-8",
) as f:
    yaml.safe_dump(config, f)

In [4]:
## Hyperparameter grids
# Each mapping goes from a setting name to its list of candidate values.
# Key order and value order both matter: later cells enumerate the Cartesian
# product in this order and identify each combination by its position.

# data_params (n_spots is not swept; an earlier candidate list was
# [1000, 2000, 5000, 10000])
data_params_lists = {
    "n_markers": [20, 40, 80],
    "n_mix": [30, 50, 70],
}

# model_params (merged into model_params["dann_kwargs"] when configs are built)
model_params_lists = {
    "alpha_": [1, 2, 3],
    "dropout": [0.1, 0.2, 0.5],
    "emb_dim": [32, 64],
    "enc_hidden_layer_sizes": [
        (512,),
        (1024,),
        (512, 512),
        (512, 256),
        (256, 128),
    ],
    "dis_dropout_factor": [0.5, 1],
}

# train_params
train_params_lists = {
    "batch_size": [128, 256, 512],
    "lambda": [0.5, 1, 2],
    "two_step": [True, False],
    "source_first": [True, False],
}

# train_params["adv_opt_kwargs"]
adv_opt_kwargs_lists = {
    "lr": [2e-3, 1e-3, 2e-4, 1e-4, 1e-5, 2e-5],
    "weight_decay": [0.01, 0.1],
}

In [5]:
# Size of the full Cartesian product over all four grids: the product of the
# number of candidate values of every swept setting.
total_configs = 1
for grid in (
    data_params_lists,
    model_params_lists,
    train_params_lists,
    adv_opt_kwargs_lists,
):
    for candidates in grid.values():
        total_configs *= len(candidates)
total_configs

699840

In [6]:
# Display the base config dict; the generation loop works on deep copies, so
# this is the state shared by every generated file.
config

{'lib_params': {},
 'data_params': {'all_genes': False,
  'data_dir': 'data',
  'dset': 'pdac',
  'n_spots': 100000,
  'one_model': True,
  'samp_split': False,
  'sc_id': 'CA001063',
  'scaler_name': 'standard',
  'st_id': 'GSE111672',
  'st_split': False},
 'train_params': {'adv_opt_kwargs': {'betas': (0.5, 0.999), 'eps': 1e-07},
  'early_stop_crit': 100,
  'epochs': 500,
  'early_stop_crit_adv': 500,
  'min_epochs': 40,
  'min_epochs_adv': 100,
  'pretraining': False,
  'reverse_val': False,
  'plateau_kwargs': {'factor': 0.5,
   'min_lr': 2e-06,
   'patience': 50,
   'verbose': True}},
 'model_params': {'dann_kwargs': {'enc_out_act': None}}}

In [7]:
# Number of hyperparameter combinations to draw from the full grid.
N_SAMPLED_CONFIGS = 1000


def _expand_grid(param_lists):
    """Expand a ``{name: [candidate, ...]}`` mapping into one dict per combination.

    Combinations come out in ``itertools.product`` order: names keep the
    mapping's insertion order and the last name varies fastest.

    Args:
        param_lists: Mapping from setting name to its list of candidate values.

    Returns:
        A list of ``{name: value}`` dicts, one per combination.  An empty
        mapping yields a list holding a single empty dict.
    """
    names = list(param_lists)
    return [
        dict(zip(names, combo))
        for combo in itertools.product(*(param_lists[name] for name in names))
    ]


rng = np.random.default_rng(357935)

# Positions (in the enumeration below) of the combinations to keep.  The size
# is capped at total_configs because choice(..., replace=False) raises when
# asked for more items than exist.  tolist() yields native ints for the
# membership test against the enumerate() counter.
yes_samples = set(
    rng.choice(
        total_configs,
        size=min(N_SAMPLED_CONFIGS, total_configs),
        replace=False,
    ).tolist()
)

data_params_l = _expand_grid(data_params_lists)
model_params_l = _expand_grid(model_params_lists)
train_params_l = _expand_grid(train_params_lists)
adv_opt_kwargs_l = _expand_grid(adv_opt_kwargs_lists)

# Make sure the target directory exists even if this cell is run on its own.
sampled_out_dir = os.path.join(CONFIG_DIR, MODEL_NAME)
os.makedirs(sampled_out_dir, exist_ok=True)

for i, (data_params, model_params, train_params, adv_opt_kwargs) in enumerate(
    itertools.product(data_params_l, model_params_l, train_params_l, adv_opt_kwargs_l)
):
    if i not in yes_samples:
        continue

    # Deep copy so the nested dicts of the base config are never mutated.
    new_config = deepcopy(config)

    new_config["data_params"].update(data_params)
    new_config["model_params"]["dann_kwargs"].update(model_params)
    new_config["train_params"].update(train_params)
    new_config["train_params"]["adv_opt_kwargs"].update(adv_opt_kwargs)

    # Per-config seed drawn from the same generator, in ascending index order.
    # int() converts the NumPy integer so yaml.safe_dump can serialise it.
    new_config["lib_params"]["manual_seed"] = int(rng.integers(0, 2**32))

    # The position in the full grid doubles as the version tag and file name.
    version = f"{CONFIG_FNAME_PREFIX}-{i}"
    new_config["model_params"]["model_version"] = version

    with open(
        os.path.join(sampled_out_dir, f"{version}.yml"), "w", encoding="utf-8"
    ) as f:
        yaml.safe_dump(new_config, f)

In [8]:
# Print the YAML for `new_config` as left behind by the generation loop,
# i.e. the last config that was written.
print(yaml.safe_dump(new_config))


data_params:
  all_genes: false
  data_dir: data
  dset: pdac
  n_markers: 80
  n_mix: 70
  n_spots: 100000
  one_model: true
  samp_split: false
  sc_id: CA001063
  scaler_name: standard
  st_id: GSE111672
  st_split: false
lib_params:
  manual_seed: 1834367225
model_params:
  dann_kwargs:
    alpha_: 3
    dis_dropout_factor: 1
    dropout: 0.5
    emb_dim: 64
    enc_hidden_layer_sizes:
    - 256
    - 128
    enc_out_act: null
  model_version: gen_pdac-699417
train_params:
  adv_opt_kwargs:
    betas:
    - 0.5
    - 0.999
    eps: 1.0e-07
    lr: 1.0e-05
    weight_decay: 0.1
  batch_size: 128
  early_stop_crit: 100
  early_stop_crit_adv: 500
  epochs: 500
  lambda: 0.5
  min_epochs: 40
  min_epochs_adv: 100
  plateau_kwargs:
    factor: 0.5
    min_lr: 2.0e-06
    patience: 50
    verbose: true
  pretraining: false
  reverse_val: false
  source_first: true
  two_step: true



In [9]:
# Write the file names (not full paths) of every "<prefix>-*.yml" config found
# on disk, one per line, to a_list.txt in the same directory.
# NOTE(review): the glob matches whatever is in the directory, so files left
# over from an earlier run would be listed too; sorting is lexicographic on
# the path string, not numeric on the index.
sampled_config_pattern = os.path.join(
    CONFIG_DIR, MODEL_NAME, f"{CONFIG_FNAME_PREFIX}-*.yml"
)
lines = list(map(os.path.basename, sorted(glob.glob(sampled_config_pattern))))

list_file_path = os.path.join(CONFIG_DIR, MODEL_NAME, "a_list.txt")
with open(list_file_path, mode="wt", encoding="utf-8") as myfile:
    # Newline-separated names followed by a single trailing newline.
    myfile.write("\n".join(lines) + "\n")