# Run Hyperparameter Optimization on a Miniaturized MIND dataset using TransE KGEM
* Miniaturized MIND dataset contains 3 node types and 4 edge types
* Best Hits@10 (`both.realistic.hits_at_10`): 0.48697 (trial 14)
* Best parameters:
    * embedding dimensions: 256
    * scoring function norm: 1 (fixed, not tuned)
    * loss (margin): 9
    * loss (adversarial temperature): 0.7552306044743602
    * optimizer learning rate: 0.0988476089246415
    * negative sampler negative to positive ratio: 72
    * batch size: 128
    

In [1]:
import gc
import wandb
import os
import pykeen
import optuna
import polars as pl
import numpy as np
from pykeen.hpo import hpo_pipeline
import pykeen.nn.compositions as compositions
import pykeen.nn.modules as modules
from pykeen.triples import TriplesFactory

# Authenticate with Weights & Biases up front; the HPO cell below logs every
# trial through result_tracker="wandb".
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mrogertu[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
# Optuna study storage backed by a relational database.
# The connection URL embeds the database user and password, so it is read from
# the environment instead of being committed with the notebook, e.g.:
#   export OPTUNA_STORAGE_URL="postgresql+psycopg2://<user>:<password>@localhost:5432/mind"
storage_url = os.environ.get("OPTUNA_STORAGE_URL")
if not storage_url:
    # Fail early with an actionable message rather than a driver-level error.
    raise RuntimeError(
        "Set the OPTUNA_STORAGE_URL environment variable to the Optuna RDB URL, "
        "e.g. postgresql+psycopg2://<user>:<password>@localhost:5432/mind"
    )
storage = optuna.storages.RDBStorage(url=storage_url)

In [3]:
# Load the mini MIND knowledge graph (tab-delimited triples) into a PyKEEN
# TriplesFactory, adding inverse triples.
# The location can be overridden with the MINI_MIND_GRAPH_PATH environment
# variable so the notebook is not tied to one machine's directory layout; the
# original absolute path remains the default.
graph_path = os.environ.get(
    "MINI_MIND_GRAPH_PATH",
    "/home/rogertu/projects/Consilience-Drug-Repurposing/data/mini_MIND/graph.tsv",
)
tf = TriplesFactory.from_path(
    path=graph_path,
    create_inverse_triples=True,
    delimiter="\t",
)

In [4]:
# 80/10/10 split of the triples; the three parts are unpacked as
# training, testing, validation. The fixed random_state keeps the split
# reproducible across runs.
train, test, valid = tf.split([0.8, 0.1, 0.1], random_state=42)

# Hyperparameter Optimization

In [None]:
# Optuna-driven PyKEEN hyperparameter search for TransE on the mini MIND splits.
# Fixed settings are passed via *_kwargs; searched settings via *_kwargs_ranges.
# Anything without an explicit range (loss, negative sampler ratio) falls back
# to PyKEEN's default search space.
hpo_result = hpo_pipeline(
    # ---- Data ----
    training=train,
    testing=test,
    validation=valid,
    # ---- Model ----
    model="TransE",
    model_kwargs={"scoring_fct_norm": 1},
    model_kwargs_ranges={
        # power-of-two scale with exponents 7..9
        "embedding_dim": {"type": int, "low": 7, "high": 9, "scale": "power_two"},
    },
    # ---- Loss ----
    loss="NSSALoss",
    # ---- Training ----
    training_kwargs={"num_epochs": 500},
    training_kwargs_ranges={
        # power-of-two scale with exponents 7..9
        "batch_size": {"type": int, "low": 7, "high": 9, "scale": "power_two"},
    },
    # ---- Negative sampler ----
    negative_sampler="basic",
    negative_sampler_kwargs={
        # corruption_scheme=("h","r","t",) would define which part of the triple
        # to corrupt; left unset here.
        # filtered=True uses a default 'Bloom' filter to minimize false negatives.
        "filtered": True,
    },
    # ---- Optimizer ----
    optimizer="adagrad",
    optimizer_kwargs={"weight_decay": 0.0},
    optimizer_kwargs_ranges={
        "lr": {"type": "float", "low": 0.001, "high": 0.1, "scale": "log"},
    },
    # ---- Evaluation: only score the drug-indication relations ----
    evaluation_relation_whitelist=["indication", "treats"],
    # ---- Early stopping ----
    # No `frequency` is given on purpose: setting it forces an evaluation at the
    # specified epoch.
    stopper="early",
    stopper_kwargs={"patience": 3, "relative_delta": 0.002},
    # ---- Experiment tracking ----
    result_tracker="wandb",
    result_tracker_kwargs={"project": "MIND-KGE", "group": "mini-transe-hpo"},
    # ---- Optuna study ----
    study_name="minimind_transe_hpo",
    storage=storage,
    load_if_exists=True,  # resume the stored study instead of failing if it exists
    n_trials=30,
    # Default metric is MRR ("both.realistic.inverse_harmonic_mean_rank").
    metric="both.realistic.hits_at_10",
    # "maximize" is the documented default, but earlier runs were observed to
    # minimize, so it is set explicitly.
    direction="maximize",
    # ---- Misc ----
    device="cuda:1",  # use gpu position 1
    gc_after_trial=True,  # garbage collect after each trial
)

## Best HPO Parameters on the miniaturized dataset

In [9]:
# Re-open the persisted HPO study from the Optuna storage so results can be
# inspected without re-running the search.
# NOTE: despite its name, `best_trial` holds the whole Study object; the best
# trial itself is `best_trial.best_trial`.
best_trial = optuna.load_study(
    storage=storage,
    study_name="minimind_transe_hpo",
)

In [10]:
# Full record of the best trial: objective value, sampled params, logged metrics.
best_trial.best_trial

FrozenTrial(number=14, state=1, values=[0.4869718309859155], datetime_start=datetime.datetime(2024, 12, 13, 11, 0, 9, 752153), datetime_complete=datetime.datetime(2024, 12, 17, 0, 56, 33, 552980), params={'model.embedding_dim': 256, 'loss.margin': 9, 'loss.adversarial_temperature': 0.7552306044743602, 'optimizer.lr': 0.0988476089246415, 'negative_sampler.num_negs_per_pos': 72, 'training.batch_size': 128}, user_attrs={'both.optimistic.adjusted_arithmetic_mean_rank': 0.024466076257830982, 'both.optimistic.adjusted_arithmetic_mean_rank_index': 0.9755643932988852, 'both.optimistic.adjusted_geometric_mean_rank_index': 0.9990730183057073, 'both.optimistic.adjusted_hits_at_k': 0.4868917005833104, 'both.optimistic.adjusted_inverse_harmonic_mean_rank': 0.2616901412165109, 'both.optimistic.arithmetic_mean_rank': 783.3469014084507, 'both.optimistic.count': 14200.0, 'both.optimistic.geometric_mean_rank': 22.83884406363946, 'both.optimistic.harmonic_mean_rank': 3.819353729676863, 'both.optimistic.h

In [11]:
# Only the sampled hyperparameters of the best trial.
best_trial.best_params

{'model.embedding_dim': 256,
 'loss.margin': 9,
 'loss.adversarial_temperature': 0.7552306044743602,
 'optimizer.lr': 0.0988476089246415,
 'negative_sampler.num_negs_per_pos': 72,
 'training.batch_size': 128}

In [12]:
# Study-level attributes (model, loss, optimizer, metric, PyKEEN version).
best_trial.user_attrs

{'evaluator': 'rankbased',
 'filter_validation_when_testing': True,
 'loss': 'nssa',
 'metric': 'both.realistic.hits_at_10',
 'model': 'transe',
 'negative_sampler': 'basic',
 'optimizer': 'adagrad',
 'pykeen_git_hash': 'UNHASHED',
 'pykeen_version': '1.10.3-dev',
 'training_loop': 'slcwa'}