# Train TransE Model on standardized datasets using published hyperparameters
Given previously published hyperparameters, implement the chosen parameters in PyKEEN for WN18, WN18RR, FB15k, and FB15k-237.

## TransE Hyperparameters

The published parameters can be found [here](https://github.com/DeepGraphLearning/KnowledgeGraphEmbedding/blob/master/best_config.sh) and are shown below:
```bash

#           1     2      3     4 5 6    7   8    9    10  11     12     13
bash run.sh train TransE FB15k 0 0 1024 256 1000 24.0 1.0 0.0001 150000 16
bash run.sh train TransE FB15k-237 0 0 1024 256 1000 9.0 1.0 0.00005 100000 16
bash run.sh train TransE wn18 0 0 512 1024 500 12.0 0.5 0.0001 80000 8
bash run.sh train TransE wn18rr 0 0 512 1024 500 6.0 0.5 0.00005 80000 8
```


The format is as follows:
```bash
MODE=$1
MODEL=$2
DATASET=$3
GPU_DEVICE=$4
SAVE_ID=$5

FULL_DATA_PATH=$DATA_PATH/$DATASET
SAVE=$SAVE_PATH/"$MODEL"_"$DATASET"_"$SAVE_ID"

#Only used in training
BATCH_SIZE=$6
NEGATIVE_SAMPLE_SIZE=$7 # keep in mind, this is not a sampling ratio
HIDDEN_DIM=$8
GAMMA=$9
ALPHA=${10}
LEARNING_RATE=${11}
MAX_STEPS=${12}
TEST_BATCH_SIZE=${13}
```

# Train TransE models using PyKEEN

## Train a TransE model on FB15k

```bash
#           1     2      3     4 5 6    7   8    9    10  11     12     13
bash run.sh train TransE FB15k 0 0 1024 256 1000 24.0 1.0 0.0001 150000 16
```

In [1]:
import pykeen
import pykeen.datasets as pkd
import pykeen.pipeline as pkp
import sys
import os
import torch
import torch.nn
import numpy as np
import torch.nn.functional as F

In [2]:
# PyKEEN pipeline settings for TransE on FB15k, mirroring the published run:
#   bash run.sh train TransE FB15k 0 0 1024 256 1000 24.0 1.0 0.0001 150000 16
model_kwargs = {
    # --- Model ---
    "model": "TransE",
    # HIDDEN_DIM=1000.
    # NOTE(review): the reference implementation's TransE scorer appears to use
    # the L1 norm (p=1) -- confirm that the L2 norm is intended here.
    "model_kwargs": {"embedding_dim": 1000, "scoring_fct_norm": 2},
    # --- Loss ---
    "loss": "NSSALoss",
    "loss_kwargs": {
        "margin": 24.0,  # GAMMA
        "adversarial_temperature": 1.0,  # ALPHA
    },
    # --- Training ---
    "training_kwargs": {
        # presumably the 150,000 published steps divided by ~472 batches per
        # epoch (~317.8), truncated to 317 -- TODO confirm
        "num_epochs": 317,
        "batch_size": 1024,  # BATCH_SIZE
        "checkpoint_frequency": 0,
        "checkpoint_name": "TransE_FB15k.pt",
    },
    # --- Negative sampler ---
    "negative_sampler": "basic",
    "negative_sampler_kwargs": {
        # NEGATIVE_SAMPLE_SIZE=256 (a count per positive, not a ratio).
        # corruption_scheme is not set here, so the sampler's default applies.
        "num_negs_per_pos": 256,
        # Filter sampled negatives (default 'Bloom' filter) to reduce false negatives.
        "filtered": True,
    },
    # --- Optimizer ---
    "optimizer": "Adam",
    "optimizer_kwargs": {"lr": 1e-4},  # LEARNING_RATE
    # --- LR scheduler ---
    "lr_scheduler": "StepLR",
    # Multiply the learning rate by 0.1 every 158 epochs (about half of num_epochs).
    "lr_scheduler_kwargs": {"gamma": 0.1, "step_size": 158},
    # --- Tracking ---
    "result_tracker": "wandb",
    "result_tracker_kwargs": {
        "project": "KGE-on-time-results",
        "group": "FB15k",
        "id": "TransE-FB15k",
    },
    # --- Misc ---
    "device": "cuda:1",  # GPU at index 1
}

In [None]:
# Load the FB15k dataset through PyKEEN, then run the PyKEEN pipeline on it
# with the settings collected in model_kwargs.
dataset = pkd.get_dataset(dataset="FB15k")
res = pkp.pipeline(dataset=dataset, **model_kwargs)

In [None]:
# Drop the reference to the pipeline result, then ask PyTorch to release its
# cached CUDA memory before the next configuration is run.
del res
torch.cuda.empty_cache()

## Train a TransE model on FB15k-237

```bash
#             1     2       3      4 5   6   7   8    9    10  11    12     13
bash run.sh train TransE FB15k-237 0 0 1024 256 1000 9.0 1.0 0.00005 100000 16
```

### import dataset

In [2]:
# Load the FB15k-237 dataset through PyKEEN; it is reused for the epoch
# computation and the pipeline call below.
dataset = pkd.get_dataset(dataset="FB15k-237")

  data = dict(torch.load(path.joinpath(cls.base_file_name)))
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
  metadata = torch.load(metadata_path) if metadata_path.is_file() else None


In [3]:
# Epochs needed to reach the published 100,000 training steps at batch size
# 1024: total steps divided by (training triples / batch size).
f"Number of Epochs: {100000 /(dataset.training.num_triples/1024)}"

'Number of Epochs: 376.31148595263033'

In [6]:
# PyKEEN pipeline settings for TransE on FB15k-237, mirroring the published run:
#   bash run.sh train TransE FB15k-237 0 0 1024 256 1000 9.0 1.0 0.00005 100000 16
model_kwargs = {
    # --- Model ---
    "model": "TransE",
    # HIDDEN_DIM=1000.
    # NOTE(review): the reference implementation's TransE scorer appears to use
    # the L1 norm (p=1) -- confirm that the L2 norm is intended here.
    "model_kwargs": {"embedding_dim": 1000, "scoring_fct_norm": 2},
    # --- Loss ---
    "loss": "NSSALoss",
    "loss_kwargs": {
        "margin": 9.0,  # GAMMA
        "adversarial_temperature": 1.0,  # ALPHA
    },
    # --- Training ---
    "training_kwargs": {
        # 272,115 training triples and 100,000 iterations at a batch size of
        # 1024 work out to ~376 epochs.
        "num_epochs": 376,
        "batch_size": 1024,  # BATCH_SIZE
        "checkpoint_frequency": 0,
        "checkpoint_name": "TransE_FB15k237.pt",
    },
    # --- Negative sampler ---
    "negative_sampler": "basic",
    "negative_sampler_kwargs": {
        # NEGATIVE_SAMPLE_SIZE=256 (a count per positive, not a ratio).
        # corruption_scheme is not set here, so the sampler's default applies.
        "num_negs_per_pos": 256,
        # Filter sampled negatives (default 'Bloom' filter) to reduce false negatives.
        "filtered": True,
    },
    # --- Optimizer ---
    "optimizer": "Adam",
    "optimizer_kwargs": {"lr": 5e-5},  # LEARNING_RATE
    # --- LR scheduler ---
    "lr_scheduler": "StepLR",
    # Multiply the learning rate by 0.1 every 188 epochs (half of num_epochs).
    "lr_scheduler_kwargs": {"gamma": 0.1, "step_size": 188},
    # --- Tracking ---
    "result_tracker": "wandb",
    "result_tracker_kwargs": {
        "project": "KGE-on-time-results",
        "group": "FB15k-237",
        "id": "TransE-FB15k237",
    },
    # --- Misc ---
    "device": "cuda:1",  # GPU at index 1
}

In [None]:
# Run the PyKEEN pipeline on the loaded dataset with the settings in model_kwargs.
res = pkp.pipeline(dataset=dataset, **model_kwargs)

In [None]:
# Drop the reference to the pipeline result, then ask PyTorch to release its
# cached CUDA memory before the next configuration is run.
del res
torch.cuda.empty_cache()

## Train a TransE model on WN18

```bash
#             1     2     3   4 5   6   7   8    9   10   11    12  13
bash run.sh train TransE wn18 0 0 512 1024 500 12.0 0.5 0.0001 80000 8
```

In [2]:
# Load the WN18 dataset through PyKEEN; it is reused for the epoch
# computation and the pipeline call below.
dataset = pkd.get_dataset(dataset="WN18")

  data = dict(torch.load(path.joinpath(cls.base_file_name)))
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
  metadata = torch.load(metadata_path) if metadata_path.is_file() else None


In [3]:
# NOTE: despite the "Epoch Size" label, this value is the number of epochs
# needed to reach the published 80,000 training steps at batch size 512.
f"Epoch Size: {80000/(dataset.training.num_triples/512)}"

'Epoch Size: 289.5886653186465'

In [4]:
# PyKEEN pipeline settings for TransE on WN18, mirroring the published run:
#   bash run.sh train TransE wn18 0 0 512 1024 500 12.0 0.5 0.0001 80000 8
model_kwargs = {
    # --- Model ---
    "model": "TransE",
    # HIDDEN_DIM=500.
    # NOTE(review): the reference implementation's TransE scorer appears to use
    # the L1 norm (p=1) -- confirm that the L2 norm is intended here.
    "model_kwargs": {"embedding_dim": 500, "scoring_fct_norm": 2},
    # --- Loss ---
    "loss": "NSSALoss",
    "loss_kwargs": {
        "margin": 12.0,  # GAMMA
        "adversarial_temperature": 0.5,  # ALPHA
    },
    # --- Training ---
    "training_kwargs": {
        # 80,000 steps / (training triples / 512) is ~289.6, rounded up to 290.
        "num_epochs": 290,
        "batch_size": 512,  # BATCH_SIZE
        "checkpoint_frequency": 0,
        "checkpoint_name": "TransE_WN18.pt",
    },
    # --- Negative sampler ---
    "negative_sampler": "basic",
    "negative_sampler_kwargs": {
        # NEGATIVE_SAMPLE_SIZE=1024 (a count per positive, not a ratio).
        # corruption_scheme is not set here, so the sampler's default applies.
        "num_negs_per_pos": 1024,
        # Filter sampled negatives (default 'Bloom' filter) to reduce false negatives.
        "filtered": True,
    },
    # --- Optimizer ---
    "optimizer": "Adam",
    "optimizer_kwargs": {"lr": 1e-4},  # LEARNING_RATE
    # --- LR scheduler ---
    "lr_scheduler": "StepLR",
    # Multiply the learning rate by 0.1 every 145 epochs (half of num_epochs).
    "lr_scheduler_kwargs": {"gamma": 0.1, "step_size": 145},
    # --- Tracking ---
    "result_tracker": "wandb",
    "result_tracker_kwargs": {
        "project": "KGE-on-time-results",
        "group": "WN18",
        "id": "TransE-WN18",
    },
    # --- Misc ---
    "device": "cuda:1",  # GPU at index 1
}

In [None]:
# Run the PyKEEN pipeline on the loaded dataset with the settings in model_kwargs.
res = pkp.pipeline(dataset=dataset, **model_kwargs)

In [5]:
# Drop the reference to the pipeline result, then ask PyTorch to release its
# cached CUDA memory before the next configuration is run.
del res
torch.cuda.empty_cache()

## Train a TransE model on WN18RR
```bash
#             1     2      3    4 5   6   7   8    9    10  11    12  13
bash run.sh train TransE wn18rr 0 0 512 1024 500 6.0 0.5 0.00005 80000 8
```

In [6]:
# Load the WN18RR dataset through PyKEEN; it is reused for the epoch
# computation and the pipeline call below.
dataset = pkd.get_dataset(dataset="WN18RR")

INFO:pykeen.datasets.utils:Loading cached preprocessed dataset from file:///home/rogertu/.data/pykeen/datasets/wn18rr/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
INFO:pykeen.triples.triples_factory:Loading from file:///home/rogertu/.data/pykeen/datasets/wn18rr/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/training
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
INFO:pykeen.triples.triples_factory:Loading from file:///home/rogertu/.data/pykeen/datasets/wn18rr/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/testing
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
INFO:pykeen.triples.triples_factory:Loading from file:///home/rogertu/.data/pykeen/datasets/wn18rr/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/validation
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
  metadata = torch.load(metadata_path) if metadata_path.is_file() else None


In [7]:
# NOTE: despite the "Epoch Size" label, this value is the number of epochs
# needed to reach the published 80,000 training steps at batch size 512.
f"Epoch Size: {80000/(dataset.training.num_triples/512)}"

'Epoch Size: 471.699199631485'

In [5]:
# PyKEEN pipeline settings for TransE on WN18RR, mirroring the published run:
#   bash run.sh train TransE wn18rr 0 0 512 1024 500 6.0 0.5 0.00005 80000 8
model_kwargs = {
    # --- Model ---
    "model": "TransE",
    # HIDDEN_DIM=500.
    # NOTE(review): the reference implementation's TransE scorer appears to use
    # the L1 norm (p=1) -- confirm that the L2 norm is intended here.
    "model_kwargs": {"embedding_dim": 500, "scoring_fct_norm": 2},
    # --- Loss ---
    "loss": "NSSALoss",
    "loss_kwargs": {
        "margin": 6.0,  # GAMMA
        "adversarial_temperature": 0.5,  # ALPHA
    },
    # --- Training ---
    "training_kwargs": {
        # 80,000 steps / (training triples / 512) is ~471.7, rounded up to 472.
        "num_epochs": 472,
        "batch_size": 512,  # BATCH_SIZE
        "checkpoint_frequency": 0,
        "checkpoint_name": "TransE_WN18RR.pt",
    },
    # --- Negative sampler ---
    "negative_sampler": "basic",
    "negative_sampler_kwargs": {
        # NEGATIVE_SAMPLE_SIZE=1024 (a count per positive, not a ratio).
        # corruption_scheme is not set here, so the sampler's default applies.
        "num_negs_per_pos": 1024,
        # Filter sampled negatives (default 'Bloom' filter) to reduce false negatives.
        "filtered": True,
    },
    # --- Optimizer ---
    "optimizer": "Adam",
    "optimizer_kwargs": {"lr": 5e-5},  # LEARNING_RATE
    # --- LR scheduler ---
    "lr_scheduler": "StepLR",
    # Multiply the learning rate by 0.1 every 236 epochs (half of num_epochs).
    "lr_scheduler_kwargs": {"gamma": 0.1, "step_size": 236},
    # --- Tracking ---
    "result_tracker": "wandb",
    "result_tracker_kwargs": {
        "project": "KGE-on-time-results",
        "group": "WN18RR",
        "id": "TransE-WN18RR",
    },
    # --- Misc ---
    "device": "cuda:1",  # GPU at index 1
}

In [None]:
# Run the PyKEEN pipeline on the loaded dataset with the settings in model_kwargs.
res = pkp.pipeline(dataset=dataset, **model_kwargs)

In [6]:
# Drop the reference to the pipeline result, then ask PyTorch to release its
# cached CUDA memory.
del res
torch.cuda.empty_cache()