# Example of using PyKEEN to Optimize compGCN-TransE Hyperparameters
* Previously, I optimized TransE using PyKEEN's Optuna-based HPO pipeline
* Using the optimized TransE parameters, I now want to optimize the CompGCN (GNN) implementation with the TransE scoring function

In [1]:
import gc
import os

import optuna
import pykeen
import torch
import wandb
from pykeen.hpo import hpo_pipeline

# Authenticate with Weights & Biases up front; the HPO run below uses result_tracker="wandb".
wandb.login()

  from .autonotebook import tqdm as notebook_tqdm
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrogertu[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Default Knowledge Graph Embedding Hyperparameter Optimization settings
* These settings are the defaults in PyKEEN

In [2]:
# Display the default hyperparameter search space PyKEEN declares for TransE.
pykeen.models.TransE.hpo_default

{'embedding_dim': {'type': int, 'low': 16, 'high': 256, 'q': 16},
 'scoring_fct_norm': {'type': int, 'low': 1, 'high': 2}}

In [3]:
# Display the default hyperparameter search space PyKEEN declares for CompGCN.
pykeen.models.CompGCN.hpo_default

{'embedding_dim': {'type': int, 'low': 32, 'high': 512, 'q': 32}}

## Initialize Optuna

In [None]:
# Set up the Optuna PostgreSQL storage backend (create the database first if you haven't yet).
# The connection URL embeds the database user and password, so it is read from the
# OPTUNA_STORAGE_URL environment variable rather than being committed with the notebook,
# e.g. postgresql+psycopg2://<user>:<password>@localhost/optuna_test
storage_url = os.environ.get("OPTUNA_STORAGE_URL")
if not storage_url:
    raise RuntimeError(
        "OPTUNA_STORAGE_URL is not set; expected something like "
        "postgresql+psycopg2://<user>:<password>@localhost/optuna_test"
    )

storage = optuna.storages.RDBStorage(
    url=storage_url,
    # Heartbeat settings (seconds) let Optuna detect trials whose process died;
    # see the RDBStorage documentation for the exact semantics.
    heartbeat_interval=60,
    grace_period=120,
)

# optuna.delete_study(storage = storage, study_name = 'cgcn-transe_hpo_time') # old studyname

In [None]:
# Create (or re-open) the Optuna study that stores the HPO trials: set the
# optimization direction, the study name, and where run results are stored.
# load_if_exists=True re-opens an existing study of the same name instead of raising.
optuna.study.create_study(
    storage=storage,  # fixed: the missing trailing comma here made the cell a SyntaxError
    study_name="cgcn_corr_transe_hpo_time",  # the same name is passed to hpo_pipeline below
    direction="maximize",
    load_if_exists=True,
)

## Run HPO Pipeline

In [None]:
# Launch the Optuna-backed PyKEEN HPO study for CompGCN with a TransE interaction.
# Fixed settings are passed as *_kwargs; searched settings are passed as *_kwargs_ranges.
hpo_result = hpo_pipeline(
    # --- Dataset: pre-split triple files --------------------------------------
    # NOTE(review): "hpo_trainno_notime" does not follow the test/valid naming pattern — confirm the file name.
    training="../data/time_networks-6_metanode/1987/hpo_trainno_notime.txt",
    testing="../data/time_networks-6_metanode/1987/hpo_test_notime.txt",
    validation="../data/time_networks-6_metanode/1987/hpo_valid_notime.txt",
    dataset_kwargs={"create_inverse_triples": True},
    # --- Model: CompGCN encoder, TransE scoring -------------------------------
    model="CompGCN",
    model_kwargs={
        "embedding_dim": 100,
        # encoder options, see
        # https://pykeen.readthedocs.io/en/stable/_modules/pykeen/nn/representation.html#CombinedCompGCNRepresentations
        "encoder_kwargs": {
            "num_layers": 2,
            # per-layer options, see
            # https://pykeen.readthedocs.io/en/stable/_modules/pykeen/nn/representation.html#CompGCNLayer
            "layer_kwargs": {
                "composition": pykeen.nn.compositions.CircularCorrelationCompositionModule,
            },
        },
        "interaction": pykeen.nn.modules.TransEInteraction,
        "interaction_kwargs": {"p": 2},
    },
    # --- Loss -----------------------------------------------------------------
    loss="InfoNCELoss",
    # --- Regularization (disabled) --------------------------------------------
    # regularizer="LpRegularizer", # Unexpected kwargs?
    # --- Training: fixed epoch count, searched batch size ---------------------
    training_kwargs={
        "num_epochs": 10,
        "checkpoint_frequency": 0,
    },
    training_kwargs_ranges={
        "batch_size": {"type": int, "low": 144, "high": 288},
    },
    # --- Negative sampling ----------------------------------------------------
    negative_sampler="basic",
    negative_sampler_kwargs={
        # corruption_scheme=("h","r","t",),  # defines which part of the triple to corrupt
        # Author's note: uses a default 'Bloom' filter to minimize false negatives.
        "filtered": True,
    },
    # --- Optimizer: searched learning rate ------------------------------------
    optimizer="Adam",
    optimizer_kwargs_ranges={
        "lr": {"type": float, "low": 0.0001, "high": 0.001},
    },
    # --- LR scheduler: searched decay factor ----------------------------------
    lr_scheduler="ExponentialLR",
    lr_scheduler_kwargs_ranges={
        "gamma": {"type": float, "low": 0.89, "high": 0.99, "step": 0.02},
    },
    # --- Early stopping -------------------------------------------------------
    # No `frequency` is given on purpose: per the author, setting it forces an
    # evaluation at the specified epoch.
    stopper="early",
    stopper_kwargs={
        "patience": 1,
        "relative_delta": 0.0005,
    },
    # --- Experiment tracking --------------------------------------------------
    result_tracker="wandb",
    result_tracker_kwargs={
        "project": "KGE-on-time",
        "group": "cGCN-corr-TransE-hpo",
    },
    # --- Optuna study ---------------------------------------------------------
    study_name="cgcn_corr_transe_hpo_time",
    storage=storage,
    load_if_exists=True,
    n_trials=100,
    # Tail-side MRR; per the author the default is "both.realistic.inverse_harmonic_mean_rank".
    metric="tail.realistic.inverse_harmonic_mean_rank",
    # Set explicitly: the author observed earlier runs minimizing when this was left out.
    direction="maximize",
    # --- Misc -----------------------------------------------------------------
    device="cuda:0",  # GPU at index 0
)

# Clear memory caches once the study has finished: drop the reference to the
# result object, force a garbage-collection pass, then release cached GPU memory.
# Requires `gc` and `torch` to be imported in the notebook's import cell;
# without those imports this cleanup raises NameError after the HPO run.
del hpo_result
gc.collect()
torch.cuda.empty_cache()

# batch stats
* 1.08 batches/second, 9078 batches per epoch for run 'fearless-fire-314'.
* ~2 hours 20 minutes per epoch × 10 epochs ≈ 23 hours for 1 run. If we did 100 rounds of HPO (which we can't, because it would take too long), it would take about 96 days to optimize.
* 16.3GB RAM Usage