In [None]:
import os
import pykeen
import polars as pl
import numpy as np

# How to create a TriplesFactory object from a tab-separated file
* Load the triple files as NumPy arrays
* Get the unique entities and relations
* Assign an integer id to each unique entity and relation
* Shuffle the ids so that entities and relations are mapped to ids at random
* Create your TriplesFactory objects

## Import your triples with pykeen

In [None]:
# Read each tab-separated split (train, test, valid - in that order) into an
# array of labeled triples.
train, test, valid = (
    pykeen.triples.utils.load_triples(split_path)
    for split_path in (
        "./semmed/data/time_networks-6_metanode/1987/hpo_train_notime.txt",
        "./semmed/data/time_networks-6_metanode/1987/hpo_test_notime.txt",
        "./semmed/data/time_networks-6_metanode/1987/hpo_valid_notime.txt",
    )
)

## Check dimensions
* Confirm that the splits have matching column counts, so that test and valid can be combined into the inference group

In [None]:
# Dimensions of the training triples array (rows are triples).
train.shape

In [None]:
# Dimensions of the test triples array (rows are triples).
test.shape

In [None]:
# Dimensions of the validation triples array (rows are triples).
valid.shape

### Check what the imported files look like

In [None]:
# Peek at the first five training triples.
train[:5]

### Create inference array

In [None]:
# Stack the test and validation triples row-wise into one inference array.
inference = np.concatenate((test, valid), axis=0)

# Sanity check: the combined array has exactly the rows of both inputs.
expected_rows = len(test) + len(valid)
assert (
    expected_rows == len(inference)
), "Assumption that test and valid are added together in the same dimension is incorrect"

## Create entity2id and relation2id mapping
* read the training file into a DataFrame
* combine the head and tail columns to get the entities
* get the unique relations
* assign integer ids to each dictionary's labels and shuffle them

### training file

In [None]:
# Re-read the training triples as a polars DataFrame so that the unique
# entities and relations can be collected with DataFrame operations.
train_df = pl.read_csv(
    "./semmed/data/time_networks-6_metanode/1987/hpo_train_notime.txt",
    separator="\t",
    has_header=False,
    # Read every column as a string instead of letting polars infer dtypes.
    # These labels become the keys of the label -> id dictionaries and have to
    # match the labels in `train` (pykeen presumably loads them as text -
    # confirm for the installed version). With inference on, purely numeric
    # identifiers would be parsed as integers, and `head` / `tail` could end up
    # with different dtypes, which breaks stacking them.
    infer_schema_length=0,
).rename({"column_1": "head", "column_2": "relation", "column_3": "tail"})

train_df.head()

### get unique entities

In [None]:
# Collect every label that occurs as a head or as a tail, once each.
# `tail` is renamed to `head` so the two single-column frames can be stacked.
entities = (
    train_df.select("head")
    .vstack(train_df.select("tail").rename({"tail": "head"}))
    # Keep the first occurrence of each label and preserve row order.
    # By default `unique` does not guarantee any order, so the row order - and
    # with it the id each entity receives later - could differ between runs.
    .unique("head", keep="first", maintain_order=True)
)
entities.head(2)

### get unique relations

In [None]:
# Collect each relation label once.
# keep="first" with maintain_order=True preserves the order of first
# appearance; by default `unique` does not guarantee any order, so the id
# each relation receives later could differ between runs.
relations = train_df.select("relation").unique(
    "relation", keep="first", maintain_order=True
)

relations.head(2)

### Create NumPy arrays of integer ids for all entities and relations, then shuffle them

In [None]:
# Draw a random permutation of 0..n-1 for the relations and for the entities;
# these arrays hold the shuffled ids handed to the rows of `relations` and
# `entities`.
# A fixed seed makes the permutations repeatable across kernel restarts, and
# re-running this cell alone also redraws the same permutations.
SHUFFLE_SEED = 42
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)
rel_arr = shuffle_rng.permutation(relations.shape[0])  # shuffled relation ids
ent_arr = shuffle_rng.permutation(entities.shape[0])  # shuffled entity ids

### Assign shuffled ids to the nodes
* create a dictionary

In [None]:
# Attach the shuffled ids to the entity labels, then turn the two columns
# (label, id) into a label -> id lookup.
e2id = entities.with_columns(pl.Series("new_id", ent_arr)).select(["head", "new_id"])
e2id_dict = dict(e2id.rows())

# Same for the relation labels.
r2id = relations.with_columns(pl.Series("new_id", rel_arr)).select(["relation", "new_id"])
r2id_dict = dict(r2id.rows())

## Create Triples Factory Object

In [None]:
# Turn the labeled training triples into a TriplesFactory, using the shuffled
# label -> id dictionaries instead of letting pykeen assign ids itself.
training_factory_kwargs = dict(
    entity_to_id=e2id_dict,
    relation_to_id=r2id_dict,
    create_inverse_triples=True,
)
training_factory = pykeen.triples.TriplesFactory.from_labeled_triples(
    triples=train, **training_factory_kwargs
)

training_factory

In [None]:
# Build the factory for the combined test + valid triples with the same
# label -> id dictionaries that the training factory receives, so ids agree.
# NOTE(review): triples whose entity or relation never occurs in the training
# file have no entry in these dictionaries - confirm how pykeen treats them.
inference_factory_kwargs = dict(
    entity_to_id=e2id_dict,
    relation_to_id=r2id_dict,
    create_inverse_triples=True,  # must be set to true for nodepiece
)
inference_factory = pykeen.triples.TriplesFactory.from_labeled_triples(
    triples=inference, **inference_factory_kwargs
)

inference_factory