# Make entities.dict and relations.dict files for MIND, MIND_CtD and MIND_CiD

In [1]:
import pandas as pd
import os
import random

## Entities and Relations dict files for MIND

In [3]:
# Load the MIND triples: the full graph plus the train / test / valid splits.
# Each file is read as tab-separated text; because the column names are
# supplied explicitly, the first line of every file is treated as data.
# Columns are named h, r, t (presumably head, relation, tail).
graph, train, test, valid = (
    pd.read_csv(triples_path, sep="\t", names=["h", "r", "t"])
    for triples_path in (
        "../../KnowledgeGraphEmbedding00/data/MIND/graph.txt",
        "../../KnowledgeGraphEmbedding00/data/MIND/train.txt",
        "../../KnowledgeGraphEmbedding00/data/MIND/test.txt",
        "../../KnowledgeGraphEmbedding00/data/MIND/valid.txt",
    )
)

In [4]:
# Stack the full graph and the three splits into one frame, keeping a single
# copy of every (h, r, t) row.
all_graph = pd.concat([graph, train, test, valid]).drop_duplicates()

In [5]:
# Preview the first five rows of the combined, de-duplicated triples.
all_graph.head(n=5)

Unnamed: 0,h,r,t
0,UNII:BTY153760O,inhibits_CinG,NCBIGene:3605
1,CHEBI:10056,activates_CaG,NCBIGene:1129
2,CHEBI:10056,activates_CaG,NCBIGene:1131
3,CHEBI:10056,activates_CaG,NCBIGene:1133
4,CHEBI:10056,activates_CaG,NCBIGene:3350


In [6]:
# Distinct entities are everything that occurs in the h or the t column;
# distinct relations are the unique values of the r column.
entity_set = set(all_graph["h"]) | set(all_graph["t"])
relation_set = set(all_graph["r"])

all_graph_nodes = list(entity_set)
all_graph_relations = list(relation_set)

#### Each line of the dict files should be ordered as...
number \t id

In [7]:
# Assign every entity and every relation a unique integer ID by shuffling
# the ranges 0..N-1.
#
# The lists were built from sets of strings, whose iteration order can change
# between interpreter sessions (string hashing is randomized per process by
# default). Sorting them here, and shuffling with a fixed-seed generator,
# makes the generated ID mapping identical on every run, so regenerated dict
# files stay consistent with anything that already relies on the IDs.
# key=str keeps the sort from failing if a column held non-string values.
all_graph_nodes = sorted(all_graph_nodes, key=str)
all_graph_relations = sorted(all_graph_relations, key=str)

# Private generator: reproducible, and leaves the global `random` state alone.
id_rng = random.Random(0)
node_num = id_rng.sample(range(len(all_graph_nodes)), len(all_graph_nodes))
rel_num = id_rng.sample(range(len(all_graph_relations)), len(all_graph_relations))

In [8]:
# Write the MIND entity mapping: one "<number>\t<entity>" pair per line,
# with no header row and no index column.
entity_table = pd.DataFrame({"node_num": node_num, "nodes": all_graph_nodes})
entity_table.to_csv(
    "../../KnowledgeGraphEmbedding00/data/MIND/entities.dict",
    sep="\t",
    header=False,
    index=False,
)

In [9]:
# Write the MIND relation mapping: one "<number>\t<relation>" pair per line,
# with no header row and no index column.
relation_table = pd.DataFrame({"rel_num": rel_num, "rel": all_graph_relations})
relation_table.to_csv(
    "../../KnowledgeGraphEmbedding00/data/MIND/relations.dict",
    sep="\t",
    header=False,
    index=False,
)

## MIND_CtD Entities and Relations dict generation

In [10]:
# Load the MIND_CtD triples: the full graph plus the train / test / valid
# splits. Each file is read as tab-separated text; because the column names
# are supplied explicitly, the first line of every file is treated as data.
# Columns are named h, r, t (presumably head, relation, tail).
graph, train, test, valid = (
    pd.read_csv(triples_path, sep="\t", names=["h", "r", "t"])
    for triples_path in (
        "../../KnowledgeGraphEmbedding00/data/MIND_CtD/graph.txt",
        "../../KnowledgeGraphEmbedding00/data/MIND_CtD/train.txt",
        "../../KnowledgeGraphEmbedding00/data/MIND_CtD/test.txt",
        "../../KnowledgeGraphEmbedding00/data/MIND_CtD/valid.txt",
    )
)

In [11]:
# Combine the MIND_CtD graph and splits, then derive the entity / relation
# vocabularies and give each item a unique integer ID.

# One frame with a single copy of every (h, r, t) row.
all_graph = pd.concat([graph, train, test, valid]).drop_duplicates()

# Entities are everything seen in h or t; relations are the values of r.
# The sets are sorted because set iteration order for strings can change
# between interpreter sessions (string hashing is randomized per process by
# default); sorting gives the lists a stable order on every run.
# key=str keeps the sort from failing if a column held non-string values.
all_graph_nodes = sorted(set(all_graph.h) | set(all_graph.t), key=str)
all_graph_relations = sorted(set(all_graph.r), key=str)

# IDs are a shuffle of 0..N-1. A fixed-seed private generator makes the
# shuffle repeatable, so regenerated dict files keep the same mapping, and
# it leaves the global `random` state untouched.
id_rng = random.Random(0)
node_num = id_rng.sample(range(len(all_graph_nodes)), len(all_graph_nodes))
rel_num = id_rng.sample(range(len(all_graph_relations)), len(all_graph_relations))

In [12]:
# Entities: write the MIND_CtD mapping as one "<number>\t<entity>" pair per
# line, with no header row and no index column.
entity_table = pd.DataFrame({"node_num": node_num, "nodes": all_graph_nodes})
entity_table.to_csv(
    "../../KnowledgeGraphEmbedding00/data/MIND_CtD/entities.dict",
    sep="\t",
    header=False,
    index=False,
)

In [13]:
# Relations: write the MIND_CtD mapping as one "<number>\t<relation>" pair
# per line, with no header row and no index column.
relation_table = pd.DataFrame({"rel_num": rel_num, "rel": all_graph_relations})
relation_table.to_csv(
    "../../KnowledgeGraphEmbedding00/data/MIND_CtD/relations.dict",
    sep="\t",
    header=False,
    index=False,
)

## MIND_CiD Entities and Relations dict generation

In [14]:
# Load the MIND_CiD triples: the full graph plus the train / test / valid
# splits. Each file is read as tab-separated text; because the column names
# are supplied explicitly, the first line of every file is treated as data.
# Columns are named h, r, t (presumably head, relation, tail).
graph, train, test, valid = (
    pd.read_csv(triples_path, sep="\t", names=["h", "r", "t"])
    for triples_path in (
        "../../KnowledgeGraphEmbedding00/data/MIND_CiD/graph.txt",
        "../../KnowledgeGraphEmbedding00/data/MIND_CiD/train.txt",
        "../../KnowledgeGraphEmbedding00/data/MIND_CiD/test.txt",
        "../../KnowledgeGraphEmbedding00/data/MIND_CiD/valid.txt",
    )
)

In [15]:
# Combine the MIND_CiD graph and splits, then derive the entity / relation
# vocabularies and give each item a unique integer ID.

# One frame with a single copy of every (h, r, t) row.
all_graph = pd.concat([graph, train, test, valid]).drop_duplicates()

# Entities are everything seen in h or t; relations are the values of r.
# The sets are sorted because set iteration order for strings can change
# between interpreter sessions (string hashing is randomized per process by
# default); sorting gives the lists a stable order on every run.
# key=str keeps the sort from failing if a column held non-string values.
all_graph_nodes = sorted(set(all_graph.h) | set(all_graph.t), key=str)
all_graph_relations = sorted(set(all_graph.r), key=str)

# IDs are a shuffle of 0..N-1. A fixed-seed private generator makes the
# shuffle repeatable, so regenerated dict files keep the same mapping, and
# it leaves the global `random` state untouched.
id_rng = random.Random(0)
node_num = id_rng.sample(range(len(all_graph_nodes)), len(all_graph_nodes))
rel_num = id_rng.sample(range(len(all_graph_relations)), len(all_graph_relations))

In [16]:
# Entities: write the MIND_CiD mapping as one "<number>\t<entity>" pair per
# line, with no header row and no index column.
entity_table = pd.DataFrame({"node_num": node_num, "nodes": all_graph_nodes})
entity_table.to_csv(
    "../../KnowledgeGraphEmbedding00/data/MIND_CiD/entities.dict",
    sep="\t",
    header=False,
    index=False,
)

In [17]:
# Relations: write the MIND_CiD mapping as one "<number>\t<relation>" pair
# per line, with no header row and no index column.
relation_table = pd.DataFrame({"rel_num": rel_num, "rel": all_graph_relations})
relation_table.to_csv(
    "../../KnowledgeGraphEmbedding00/data/MIND_CiD/relations.dict",
    sep="\t",
    header=False,
    index=False,
)