## Setup: clone the query2vec repository and download the FB15k-237 dataset (torch-geometric and its dependencies are installed in a later cell)

In [None]:
# Fetch the query2vec sources into ./query2vec; later cells invoke its scripts by that path.
# No branch, tag, or commit is pinned, so this clones whatever the default branch currently holds.
# NOTE(review): `git clone` refuses to write into an existing non-empty directory, so re-running
# this cell in the same session reports an error — harmless, but worth knowing.
!git clone https://github.com/tolios/query2vec.git

In [None]:
# Download the FB15K-237 archive, unpack it, rename the `Release` directory
# (presumably what the archive extracts to — confirm) to ./FB15k_237, then delete the zip.
# The order matters: each step consumes what the previous one produced.
!wget https://download.microsoft.com/download/8/7/0/8700516A-AB3D-4850-B4BB-805C515AECE1/FB15K-237.2.zip
!unzip ./FB15K-237.2.zip
!mv ./Release ./FB15k_237
!rm FB15K-237.2.zip

## Data Generation


In [None]:
# Add this in a Google Colab cell to install the correct version of Pytorch Geometric.
import torch

def format_pytorch_version(version):
  """Return the part of a torch version string that precedes the first '+'.

  A string without '+' is returned unchanged, e.g. '2.0.1+cu118' -> '2.0.1'.
  """
  base, _sep, _local = version.partition('+')
  return base

# Version string of the installed torch, then the same string with everything
# from the first '+' onward removed; TORCH is interpolated into the wheel index
# URLs of the pip commands further down in this cell.
TORCH_version = torch.__version__
TORCH = format_pytorch_version(TORCH_version)

def format_cuda_version(version):
  """Turn a CUDA version string into the tag used in the wheel index URL.

  Args:
    version: CUDA version such as '11.8', or None.

  Returns:
    'cu' followed by the version with dots removed (e.g. 'cu118'), or 'cpu'
    when `version` is None.
  """
  if version is None:
    # torch.version.cuda is None on CPU-only torch builds; calling .replace on
    # it raised AttributeError. The wheel index names those builds '+cpu'.
    return 'cpu'
  return 'cu' + version.replace('.', '')

# CUDA version string reported by the installed torch build, converted to the
# tag that appears in the wheel index URLs below.
CUDA_version = torch.version.cuda
CUDA = format_cuda_version(CUDA_version)

# %pip (instead of !pip) installs into the environment of the running kernel
# rather than whichever `pip` a subshell finds first on PATH. {TORCH} and {CUDA}
# are still interpolated from the Python variables defined in this cell.
# The -f index serves prebuilt wheels matching this exact torch/CUDA pair.
%pip install torch-scatter     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
%pip install torch-sparse      -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
%pip install torch-cluster     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
%pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
%pip install torch-geometric
%pip install mlflow==2.5

In [None]:
# Data-generation script: written to generate.sh and executed with bash below.
# It first runs query2vec/graph.py on the train/valid/test files, then
# query2vec/create_filter.py on paths under the chosen qa folder.
#
# The literal is a raw string so the trailing backslashes reach the file as
# bash line continuations instead of being consumed by Python's string parser.
# The shebang sits on the very first line, where it is actually recognised.
# NOTE(review): the last test query order samples 500 while the others use
# 5000 — confirm this is intended.
bash_script = r'''#!/usr/bin/env bash
# Stop at the first failing command or unset variable, so create_filter.py
# never runs against missing output when graph.py fails.
set -euo pipefail

dataset_path="./FB15k_237/"
qa_folder="qa_ionly_full" #149689
train_path=$dataset_path"train.txt"
val_path=$dataset_path"valid.txt"
test_path=$dataset_path"test.txt"
train_query_orders="[[(1, -1), (2, 149689), (3, 149689)]]"
val_query_orders="[[(1, -1), (2, 5000), (3, 5000)]]"
test_query_orders="[[], [(2, 5000)], [(3, 5000)], [(3, 5000)], [(3, 500)]]"
include_train='[["1p", "2i", "3i"]]'
include_val='[["1p", "2i", "3i"]]'
include_test='[["1p"], ["2i"], ["3i"], ["pi"], ["ip"]]'

python ./query2vec/graph.py $train_path $val_path $test_path \
            --qa_folder=$qa_folder --train_query_orders="$train_query_orders" \
            --val_query_orders="$val_query_orders" --test_query_orders="$test_query_orders" \
            --include_train="$include_train" --include_val="$include_val" --include_test="$include_test" \
            --add_inverse=true

test_filter=$dataset_path$qa_folder"/filter.pkl"
val_filter=$dataset_path$qa_folder"/val_filter.pkl"
train_ds=$dataset_path$qa_folder"/train_qa_1.txt"
val_ds=$dataset_path$qa_folder"/val_qa_1.txt"

python ./query2vec/create_filter.py $test_filter $train_ds $val_ds $val_filter
'''

# Save the Bash script to a file
with open('generate.sh', 'w') as f:
    f.write(bash_script)

!bash ./generate.sh

Define model architecture and training specs...

In [12]:
import json

# Model architecture: the JSON text is parsed immediately, so `model_json`
# only ever names the resulting dict.
model_json = json.loads('''{
    "model_type": "MDRTkernel",
    "model_params": {
        "kernel": "rgcn",
        "T": 0.1,
        "T_emb": 0.25,
        "aggregation": "sum",
        "dynamic": false,
        "conv_dims" : [300],
        "num_bases" : null,
        "num_blocks" : null,
        "linear_dims": [],
        "heads": 1,
        "emb_dim": 200,
        "p": 0.0,
        "margin": 1.0
    }
}
''')

# Experiment / training settings, parsed the same way. The data paths point at
# files under ./FB15k_237/qa_ionly_full/.
config_json = json.loads('''{
    "experiment": "query2vec_init",
    "experiment_id": null,
    "run": "qa_MDRTkernel_ionlyfull_1",
    "config": {
        "seed": 42,
        "train_data": "./FB15k_237/qa_ionly_full/train_qa_1.txt",
        "val_data": "./FB15k_237/qa_ionly_full/val_qa_1.txt",
        "epochs": 20,
        "train_batch_size": 1024,
        "val_batch_size": 1024,
        "num_negs": 50,
        "lr": 0.0001,
        "wd": 0.000001,
        "patience": 2,
        "val_every": 1,
        "scheduler_patience": 4,
        "scheduler_factor": 0.1,
        "scheduler_threshold": 0.1,
        "pretrained": null
    }
}
''')

# Serialise each dict to its file in the working directory.
for out_path, payload in (('model.json', model_json),
                          ('train_config.json', config_json)):
    with open(out_path, 'w') as f:
        json.dump(payload, f)

Training ...

In [None]:
# Launch training through the cloned repository's run.py, passing the current
# directory and the --all_scaled flag.
# Presumably run.py looks in "./" for the model.json and train_config.json
# written earlier in this notebook — confirm against run.py.
!python query2vec/run.py ./ --all_scaled