# Run Hyperparameter Optimization on a Miniaturized MIND dataset using the HolE KGEM

* The miniaturized MIND dataset contains 3 node types and 4 edge types
* Hits at 10: 0.3387
* Best parameters:
    * embedding dimensions: 512
    * loss (margin): 3
    * loss (adversarial temperature): 0.8280480803858479
    * optimizer learning rate: 0.05731112619656342
    * negative sampler negative to positive ratio: 15
    * batch size: 256

In [None]:
import gc
import wandb
import os
import pykeen

import optuna
from pykeen.hpo import hpo_pipeline
from pykeen.triples import TriplesFactory
import pykeen.nn.compositions as compositions
import pykeen.nn.modules as modules

# Log in to Weights & Biases up front; the HPO run below reports through the
# "wandb" result tracker.
wandb.login()

## Setup dataset split and Optuna storage server

In [2]:
# Optuna keeps the study in a relational database so it can be reloaded later
# by study name.
# The connection URL may be supplied through the OPTUNA_STORAGE_URL environment
# variable, which keeps credentials out of the notebook. The original local URL
# remains the fallback so existing runs behave exactly as before.
# NOTE: the original author remarked that the study ended up on the wrong
# server; confirm the target database before reusing this URL.
storage = optuna.storages.RDBStorage(
    url=os.environ.get(
        "OPTUNA_STORAGE_URL",
        "postgresql+psycopg2://rogertu:admin@localhost:5432/mind",
    )
)

In [3]:
# Read the tab-separated mini-MIND graph into a triples factory; inverse
# triples are requested at load time.
tf = TriplesFactory.from_path(
    delimiter="\t",
    create_inverse_triples=True,
    path="/home/rogertu/projects/Consilience-Drug-Repurposing/data/mini_MIND/graph.tsv",
)

# 80/10/10 split with a fixed seed so the partition is reproducible.
# The three factories are unpacked as train, test, valid (in that order).
train, test, valid = tf.split(random_state=42, ratios=[0.8, 0.1, 0.1])

# Hyperparameter Optimization

* `batch size`: should be set as a fraction of the total train size. The `hpo_train` size is 1,017,388. Round to 1E6 for convenience.

In [None]:
# Launch the Optuna-backed hyperparameter search for HolE on the pre-split
# mini-MIND triples. The call returns the HPO result object, kept in
# `hpo_result`.
hpo_result = hpo_pipeline(
    # ---- Data: the factories produced by the split above ----
    training=train,
    testing=test,
    validation=valid,
    # ---- Model ----
    model="HolE",
    model_kwargs_ranges={
        # "power_two" scale: low/high are given as exponents (7..9)
        "embedding_dim": {"type": int, "low": 7, "high": 9, "scale": "power_two"},
    },
    # ---- Loss ----
    loss="NSSALoss",
    # ---- Regularization ----
    # None configured; an "LpRegularizer" was considered but is left disabled.
    # ---- Training ----
    training_kwargs={
        "num_epochs": 500,
        "checkpoint_frequency": 0,
    },
    training_kwargs_ranges={
        # "power_two" scale: low/high are given as exponents (8..11)
        "batch_size": {"type": int, "low": 8, "high": 11, "scale": "power_two"},
    },
    # ---- Negative sampling ----
    negative_sampler="basic",
    negative_sampler_kwargs={
        # No explicit corruption_scheme (which part of the triple to corrupt)
        # is passed here.
        # filtered=True uses a default 'Bloom' filter to minimize false negatives.
        "filtered": True,
    },
    # No negative_sampler_kwargs_ranges are passed: the search range for
    # num_negs_per_pos is left to the library default.
    # ---- Optimizer ----
    optimizer="adagrad",
    optimizer_kwargs={"weight_decay": 0.0},
    # Learning rate is searched on a log scale, although it has been suggested
    # not to optimize the optimizer.
    optimizer_kwargs_ranges={
        "lr": {"type": float, "low": 0.0001, "high": 1.0, "scale": "log"},
    },
    # ---- Evaluation: only score these relations ----
    evaluation_relation_whitelist=["indication", "treats"],
    # ---- Early stopping ----
    stopper="early",
    # `frequency` is deliberately not set; setting it forces an evaluation at
    # the specified epoch.
    stopper_kwargs={
        "patience": 3,
        "relative_delta": 0.002,
    },
    # ---- Experiment tracking ----
    result_tracker="wandb",
    result_tracker_kwargs={"project": "MIND-KGE", "group": "mini-hole-hpo"},
    # ---- Optuna study settings ----
    study_name="mini_hole_hpo",
    storage=storage,
    load_if_exists=True,
    n_trials=30,
    metric="both.realistic.hits_at_10",
    # Direction is stated explicitly: an earlier run ended up minimizing the
    # objective when this was left to the default.
    direction="maximize",
    # ---- Misc ----
    device="cuda:1",  # GPU at index 1
    gc_after_trial=True,  # garbage collect after each trial
)

## Best HPO Parameters on the miniaturized dataset

In [3]:
# Reload the persisted study from the shared storage by its name.
# NOTE: despite the variable name, this is the value returned by
# optuna.load_study (the study itself); the winning trial is accessed through
# its attributes.
best_trial = optuna.load_study(
    storage=storage,
    study_name="mini_hole_hpo",
)

In [4]:
# Show the best trial recorded in the loaded study.
best_trial.best_trial

FrozenTrial(number=22, state=1, values=[0.3386619718309859], datetime_start=datetime.datetime(2024, 12, 15, 17, 14, 50, 103580), datetime_complete=datetime.datetime(2024, 12, 16, 20, 44, 16, 224410), params={'model.embedding_dim': 512, 'loss.margin': 3, 'loss.adversarial_temperature': 0.8280480803858479, 'optimizer.lr': 0.05731112619656342, 'negative_sampler.num_negs_per_pos': 15, 'training.batch_size': 256}, user_attrs={'stopped_epoch': 70, 'random_seed': 1476899248, 'head.optimistic.variance': 90713190.48254068, 'tail.optimistic.variance': 28733051.820145585, 'both.optimistic.variance': 60765161.555188574, 'head.realistic.variance': 90713224.0, 'tail.realistic.variance': 28733060.0, 'both.realistic.variance': 60765184.0, 'head.pessimistic.variance': 90713267.718378, 'tail.pessimistic.variance': 28733067.038417358, 'both.pessimistic.variance': 60765208.21356873, 'head.optimistic.adjusted_arithmetic_mean_rank_index': 0.8870297941394345, 'tail.optimistic.adjusted_arithmetic_mean_rank_in

In [5]:
# Show only the hyperparameter values of the best trial.
best_trial.best_params

{'model.embedding_dim': 512,
 'loss.margin': 3,
 'loss.adversarial_temperature': 0.8280480803858479,
 'optimizer.lr': 0.05731112619656342,
 'negative_sampler.num_negs_per_pos': 15,
 'training.batch_size': 256}