# Train DistMult Models on standardized datasets using published hyperparameters
Given the published hyperparameters, reproduce them in PyKEEN for WN18, WN18RR, FB15k, and FB15k-237.

## DistMult Hyperparameters

The published parameters can be found [here](https://github.com/DeepGraphLearning/KnowledgeGraphEmbedding/blob/master/best_config.sh) and are shown below:
```bash
#             1     2     3     4 5   6   7   8    9    10  11    12     13  14
bash run.sh train DistMult FB15k 0 0 1024 256 2000 500.0 1.0 0.001 150000 16 -r 0.000002
bash run.sh train DistMult FB15k-237 0 0 1024 256 2000 200.0 1.0 0.001 100000 16 -r 0.00001
bash run.sh train DistMult wn18 0 0 512 1024 1000 200.0 1.0 0.001 80000 8 -r 0.00001
bash run.sh train DistMult wn18rr 0 0 512 1024 1000 200.0 1.0 0.002 80000 8 -r 0.000005
```


The format is as follows:
```bash
MODE=$1
MODEL=$2
DATASET=$3
GPU_DEVICE=$4
SAVE_ID=$5

FULL_DATA_PATH=$DATA_PATH/$DATASET
SAVE=$SAVE_PATH/"$MODEL"_"$DATASET"_"$SAVE_ID"

#Only used in training
BATCH_SIZE=$6
NEGATIVE_SAMPLE_SIZE=$7 # keep in mind, this is not a sampling ratio
HIDDEN_DIM=$8
GAMMA=$9
ALPHA=${10}
LEARNING_RATE=${11}
MAX_STEPS=${12}
TEST_BATCH_SIZE=${13}
```

# Train a DistMult model using PyKEEN

## Train a DistMult model on FB15k

```bash
#             1     2        3   4 5   6   7   8    9    10   11     12   13  14
bash run.sh train DistMult FB15k 0 0 1024 256 2000 500.0 1.0 0.001 150000 16 -r 0.000002
```

In [1]:
import pykeen
import pykeen.datasets as pkd
import pykeen.pipeline as pkp
import sys
import os
import torch
import torch.nn
import numpy as np
import torch.nn.functional as F

In [2]:
# PyKEEN pipeline configuration reproducing the published DistMult/FB15k run:
#   bash run.sh train DistMult FB15k 0 0 1024 256 2000 500.0 1.0 0.001 150000 16 -r 0.000002
#
# Epoch budget: MAX_STEPS=150000 at BATCH_SIZE=1024 is ~317.9 epochs, assuming
# FB15k has 483,142 training triples (TODO confirm with
# dataset.training.num_triples). It is rounded to 318, consistent with the
# rounding used for the other datasets in this notebook.
#
# LR schedule: step_size is exactly half of num_epochs so that StepLR decays the
# learning rate once, at the midpoint of training. A 317/158 pair would trigger
# a second decay at epoch 316 and run the final epoch at 1/100 of the base rate.
model_kwargs = dict(
    # Model
    model="DistMult",
    model_kwargs=dict(embedding_dim=2000),  # HIDDEN_DIM
    # Loss
    loss="NSSALoss",
    loss_kwargs=dict(
        margin=500.0,  # gamma
        adversarial_temperature=1.0,  # alpha
    ),
    # Regularization
    regularizer="LpRegularizer",
    regularizer_kwargs=dict(weight=0.000002, p=3),  # L3 regularization, weight from `-r`
    # Training
    training_kwargs=dict(
        num_epochs=318,  # round(150000 steps / steps-per-epoch), see header comment
        batch_size=1024,  # BATCH_SIZE
        checkpoint_frequency=0,  # NOTE(review): per PyKEEN docs 0 means "checkpoint after every epoch" - confirm
        checkpoint_name="DistMult_FB15k.pt",
    ),
    # Negative Sampler
    negative_sampler="basic",
    negative_sampler_kwargs=dict(
        num_negs_per_pos=256,  # NEGATIVE_SAMPLE_SIZE; corruption_scheme=("h","r","t",) would define which part of the triple to corrupt
        filtered=True,  # Uses a default 'Bloom' filter to minimize false negatives
    ),
    # optimizer
    optimizer="Adam",
    optimizer_kwargs=dict(lr=0.001),  # LEARNING_RATE
    # lr scheduler
    lr_scheduler="StepLR",
    lr_scheduler_kwargs=dict(
        gamma=0.1, step_size=159
    ),  # 1/10 the learning rate every 159 epochs (= num_epochs / 2)
    # Tracking
    result_tracker="wandb",
    result_tracker_kwargs=dict(
        project="KGE-on-time-results", group="FB15k", id="DistMult-FB15k"
    ),
    # Misc
    device="cuda:0",  # first CUDA device (index 0)
)

In [None]:
# Fetch the FB15k dataset via PyKEEN, then launch the PyKEEN pipeline on it
# with the configuration held in `model_kwargs`. `res` keeps the pipeline's
# return value until the cleanup cell deletes it.
dataset = pkd.get_dataset(dataset="FB15k")
res = pkp.pipeline(dataset=dataset, **model_kwargs)

In [4]:
# Drop the notebook's reference to the FB15k pipeline result, then ask PyTorch
# to release its cached CUDA memory before the next configuration is run.
del res
torch.cuda.empty_cache()

## Train DistMult Model on FB15k-237
```bash
#             1     2           3    4 5   6   7   8    9    10   11    12    13 
bash run.sh train DistMult FB15k-237 0 0 1024 256 2000 200.0 1.0 0.001 100000 16 -r 0.00001
```

In [5]:
# Fetch FB15k-237 via PyKEEN; `dataset` is reused by the epoch estimate and the
# pipeline call in the following cells.
dataset = pkd.get_dataset(dataset="FB15k-237")

INFO:pykeen.datasets.utils:Loading cached preprocessed dataset from file:///home/rogertu/.data/pykeen/datasets/fb15k237/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
INFO:pykeen.triples.triples_factory:Loading from file:///home/rogertu/.data/pykeen/datasets/fb15k237/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/training
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
INFO:pykeen.triples.triples_factory:Loading from file:///home/rogertu/.data/pykeen/datasets/fb15k237/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/testing
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
INFO:pykeen.triples.triples_factory:Loading from file:///home/rogertu/.data/pykeen/datasets/fb15k237/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/validation
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
  metadata = torch.load(metadata_path) if metadata_path.is_file() else None


In [6]:
# Convert the published step budget into epochs: MAX_STEPS (100000) divided by
# the number of batches per epoch (training triples / BATCH_SIZE of 1024).
# NOTE(review): despite the "Epoch Size" label, the displayed value is an epoch
# count; it is what `num_epochs` in the next cell is derived from.
f"Epoch Size: {100000/(dataset.training.num_triples/1024)}"

'Epoch Size: 376.31148595263033'

In [7]:
# PyKEEN pipeline configuration for DistMult on FB15k-237, mirroring
#   bash run.sh train DistMult FB15k-237 0 0 1024 256 2000 200.0 1.0 0.001 100000 16 -r 0.00001
model_kwargs = {
    # --- model ---
    "model": "DistMult",
    "model_kwargs": {"embedding_dim": 2000},  # HIDDEN_DIM
    # --- loss ---
    "loss": "NSSALoss",
    "loss_kwargs": {
        "margin": 200.0,  # GAMMA
        "adversarial_temperature": 1.0,  # ALPHA
    },
    # --- regularization (L3 norm, weight taken from `-r`) ---
    "regularizer": "LpRegularizer",
    "regularizer_kwargs": {"weight": 0.00001, "p": 3},
    # --- training loop ---
    "training_kwargs": {
        "num_epochs": 376,  # ~100000 steps, see the epoch estimate computed above
        "batch_size": 1024,  # BATCH_SIZE
        "checkpoint_frequency": 0,
        "checkpoint_name": "DistMult_FB15k237.pt",
    },
    # --- negative sampling ---
    "negative_sampler": "basic",
    "negative_sampler_kwargs": {
        # NEGATIVE_SAMPLE_SIZE; a corruption_scheme=("h","r","t",) entry could be
        # added here to choose which part of the triple is corrupted.
        "num_negs_per_pos": 256,
        # Uses a default 'Bloom' filter to minimize false negatives
        "filtered": True,
    },
    # --- optimizer ---
    "optimizer": "Adam",
    "optimizer_kwargs": {"lr": 0.001},  # LEARNING_RATE
    # --- learning-rate schedule: multiply the rate by 0.1 every 188 epochs ---
    "lr_scheduler": "StepLR",
    "lr_scheduler_kwargs": {"gamma": 0.1, "step_size": 188},
    # --- experiment tracking ---
    "result_tracker": "wandb",
    "result_tracker_kwargs": {
        "project": "KGE-on-time-results",
        "group": "FB15k-237",
        "id": "DistMult-FB15k237",
    },
    # --- misc ---
    "device": "cuda:0",  # first CUDA device (index 0)
}

In [8]:
# Launch the PyKEEN pipeline on the FB15k-237 dataset loaded above, using the
# configuration held in `model_kwargs`.
res = pkp.pipeline(dataset=dataset, **model_kwargs)

In [14]:
# Drop the notebook's reference to the FB15k-237 pipeline result, then ask
# PyTorch to release its cached CUDA memory before the next configuration is run.
del res
torch.cuda.empty_cache()

## Train DistMult Model on WN18
```bash
#             1     2     3   4 5   6   7   8    9   10   11    12   13           14
bash run.sh train DistMult wn18 0 0 512 1024 1000 200.0 1.0 0.001 80000 8 -r 0.00001
```

In [11]:
# Fetch WN18 via PyKEEN, then convert the published step budget into epochs:
# MAX_STEPS (80000) divided by the number of batches per epoch
# (training triples / BATCH_SIZE of 512).
# NOTE(review): despite the "Epoch Size" label, the displayed value is an epoch
# count; it is what `num_epochs` in the next cell is derived from.
dataset = pkd.get_dataset(dataset="WN18")
f"Epoch Size: {80000/(dataset.training.num_triples/512)}"

INFO:pykeen.datasets.utils:Loading cached preprocessed dataset from file:///home/rogertu/.data/pykeen/datasets/wn18/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
INFO:pykeen.triples.triples_factory:Loading from file:///home/rogertu/.data/pykeen/datasets/wn18/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/training
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
INFO:pykeen.triples.triples_factory:Loading from file:///home/rogertu/.data/pykeen/datasets/wn18/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/testing
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
INFO:pykeen.triples.triples_factory:Loading from file:///home/rogertu/.data/pykeen/datasets/wn18/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/validation
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
  metadata = torch.load(metadata_path) if metadata_path.is_file() else None


'Epoch Size: 289.5886653186465'

In [15]:
# PyKEEN pipeline configuration for DistMult on WN18, mirroring
#   bash run.sh train DistMult wn18 0 0 512 1024 1000 200.0 1.0 0.001 80000 8 -r 0.00001
model_kwargs = {
    # --- model ---
    "model": "DistMult",
    "model_kwargs": {"embedding_dim": 1000},  # HIDDEN_DIM
    # --- loss ---
    "loss": "NSSALoss",
    "loss_kwargs": {
        "margin": 200.0,  # GAMMA
        "adversarial_temperature": 1.0,  # ALPHA
    },
    # --- regularization (L3 norm, weight taken from `-r`) ---
    "regularizer": "LpRegularizer",
    "regularizer_kwargs": {"weight": 0.00001, "p": 3},
    # --- training loop ---
    "training_kwargs": {
        "num_epochs": 290,  # ~80000 steps, see the epoch estimate computed above
        "batch_size": 512,  # BATCH_SIZE
        "checkpoint_frequency": 0,
        "checkpoint_name": "DistMult_WN18.pt",
    },
    # --- negative sampling ---
    "negative_sampler": "basic",
    "negative_sampler_kwargs": {
        # NEGATIVE_SAMPLE_SIZE; a corruption_scheme=("h","r","t",) entry could be
        # added here to choose which part of the triple is corrupted.
        "num_negs_per_pos": 1024,
        # Uses a default 'Bloom' filter to minimize false negatives
        "filtered": True,
    },
    # --- optimizer ---
    "optimizer": "Adam",
    "optimizer_kwargs": {"lr": 0.001},  # LEARNING_RATE
    # --- learning-rate schedule: 1/10 the learning rate every 145 epochs ---
    "lr_scheduler": "StepLR",
    "lr_scheduler_kwargs": {"gamma": 0.1, "step_size": 145},
    # --- experiment tracking ---
    "result_tracker": "wandb",
    "result_tracker_kwargs": {
        "project": "KGE-on-time-results",
        "group": "WN18",
        "id": "DistMult-WN18",
    },
    # --- misc ---
    "device": "cuda:0",  # first CUDA device (index 0)
}

In [13]:
# Launch the PyKEEN pipeline on the WN18 dataset loaded above, using the
# configuration held in `model_kwargs`.
res = pkp.pipeline(dataset=dataset, **model_kwargs)

In [17]:
# Drop the notebook's reference to the WN18 pipeline result, then ask PyTorch
# to release its cached CUDA memory before the next configuration is run.
del res
torch.cuda.empty_cache()

## Train DistMult Model on WN18RR
```bash
#             1     2       3     4 5   6   7   8    9    10  11    12   13 
bash run.sh train DistMult wn18rr 0 0 512 1024 1000 200.0 1.0 0.002 80000 8 -r 0.000005
```

In [18]:
# Fetch WN18RR via PyKEEN, then convert the published step budget into epochs:
# MAX_STEPS (80000) divided by the number of batches per epoch
# (training triples / BATCH_SIZE of 512).
# NOTE(review): despite the "Epoch Size" label, the displayed value is an epoch
# count; it is what `num_epochs` in the next cell is derived from.
dataset = pkd.get_dataset(dataset="WN18RR")
f"Epoch Size: {80000/(dataset.training.num_triples/512)}"

INFO:pykeen.datasets.utils:Loading cached preprocessed dataset from file:///home/rogertu/.data/pykeen/datasets/wn18rr/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
INFO:pykeen.triples.triples_factory:Loading from file:///home/rogertu/.data/pykeen/datasets/wn18rr/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/training
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
INFO:pykeen.triples.triples_factory:Loading from file:///home/rogertu/.data/pykeen/datasets/wn18rr/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/testing
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
INFO:pykeen.triples.triples_factory:Loading from file:///home/rogertu/.data/pykeen/datasets/wn18rr/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/validation
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
  metadata = torch.load(metadata_path) if metadata_path.is_file() else None


'Epoch Size: 471.699199631485'

In [19]:
# PyKEEN pipeline configuration for DistMult on WN18RR, mirroring
#   bash run.sh train DistMult wn18rr 0 0 512 1024 1000 200.0 1.0 0.002 80000 8 -r 0.000005
model_kwargs = {
    # --- model ---
    "model": "DistMult",
    "model_kwargs": {"embedding_dim": 1000},  # HIDDEN_DIM
    # --- loss ---
    "loss": "NSSALoss",
    "loss_kwargs": {
        "margin": 200.0,  # GAMMA
        "adversarial_temperature": 1.0,  # ALPHA
    },
    # --- regularization (L3 norm, weight taken from `-r`) ---
    "regularizer": "LpRegularizer",
    "regularizer_kwargs": {"weight": 0.000005, "p": 3},
    # --- training loop ---
    "training_kwargs": {
        "num_epochs": 472,  # ~80000 steps, see the epoch estimate computed above
        "batch_size": 512,  # BATCH_SIZE
        "checkpoint_frequency": 0,
        "checkpoint_name": "DistMult_WN18RR.pt",
    },
    # --- negative sampling ---
    "negative_sampler": "basic",
    "negative_sampler_kwargs": {
        # NEGATIVE_SAMPLE_SIZE; a corruption_scheme=("h","r","t",) entry could be
        # added here to choose which part of the triple is corrupted.
        "num_negs_per_pos": 1024,
        # Uses a default 'Bloom' filter to minimize false negatives
        "filtered": True,
    },
    # --- optimizer ---
    "optimizer": "Adam",
    "optimizer_kwargs": {"lr": 0.002},  # LEARNING_RATE
    # --- learning-rate schedule: multiply the rate by 0.1 every 236 epochs ---
    "lr_scheduler": "StepLR",
    "lr_scheduler_kwargs": {"gamma": 0.1, "step_size": 236},
    # --- experiment tracking ---
    "result_tracker": "wandb",
    "result_tracker_kwargs": {
        "project": "KGE-on-time-results",
        "group": "WN18RR",
        "id": "DistMult-WN18RR",
    },
    # --- misc ---
    "device": "cuda:0",  # use gpu position 0
}

In [None]:
# Launch the PyKEEN pipeline on the WN18RR dataset loaded above, using the
# configuration held in `model_kwargs`.
res = pkp.pipeline(dataset=dataset, **model_kwargs)