# Fix MIND-CtD relation names
* This notebook focuses on fixing indication relations and the types associated with each metanode. 
* Previously there were issues with incorrect node type assignments appended to the relations.
* This notebook will serve to double check the node type assignments, as well as check for duplicates, and re-map some REACT/KEGG pathway targets to diseases.

In [5]:
import pandas as pd
import numpy as np
import re

## Fix MIND-CtD relations

In [6]:
# Load the three edge-list splits. Column names are supplied explicitly
# (head, relation, tail), so the first line of each file is read as data.
triple_csv_opts = {"names": ["h", "r", "t"], "sep": "\t"}
train = pd.read_csv("../data/MIND_CtD/train.txt", **triple_csv_opts)
test = pd.read_csv("../data/MIND_CtD/test.txt", **triple_csv_opts)
valid = pd.read_csv("../data/MIND_CtD/valid.txt", **triple_csv_opts)

In [7]:
# Keep the leading lowercase/underscore part of each relation name
# (e.g. "inhibits_CinG" -> "inhibits"). Only `train` receives this column.
relation_prefix = re.compile("[a-z_]*[a-z]")
train["r2"] = [relation_prefix.match(name)[0] for name in train["r"]]
train.head()

Unnamed: 0,h,r,t,r2
0,UNII:BTY153760O,inhibits_CinG,NCBIGene:3605,inhibits
1,NCBIGene:4116,part_of_GpoBP,GO:0003723,part_of
2,NCBIGene:4116,part_of_GpoBP,GO:0005515,part_of
3,NCBIGene:4116,part_of_GpoA,GO:0005634,part_of
4,NCBIGene:4116,part_of_GpoA,GO:0005737,part_of


In [8]:
# Tag every edge with the split it came from, then stack the three splits
# into a single edge table (original row indices are kept as-is).
for split_frame, split_name in ((train, "train"), (test, "test"), (valid, "valid")):
    split_frame["set"] = split_name

graph = pd.concat([train, test, valid])

In [9]:
# import nodes
# NOTE(review): absolute, user-specific path -- the notebook cannot run on
# another machine without editing this line.
nodes_csv = "/home/rogertu/MRN_dataset/nodes_biolink.csv"
nodes = pd.read_csv(nodes_csv)
nodes.head()

  nodes = pd.read_csv("/home/rogertu/MRN_dataset/nodes_biolink.csv")


Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets
0,UBERON:0000002,cervix,AnatomicalEntity,MESH:D002584|UBERON:0000002,,,,
1,UBERON:0000004,human nose,AnatomicalEntity,MESH:D009666|UBERON:0000004,,,,
2,UBERON:0000006,islet of Langerhans,AnatomicalEntity,MESH:D007515|UBERON:0000006,,,,
3,UBERON:0000007,pituitary gland,AnatomicalEntity,MESH:D010902|UBERON:0000007,,,,
4,UBERON:0000010,peripheral nervous system,AnatomicalEntity,MESH:D017933|UBERON:0000010,,,,


In [10]:
# expand nodes
# Split the pipe-delimited `xrefs` into one row per xref so that an external
# identifier can later be joined back to the node `id`.
has_xrefs = nodes["xrefs"].notna()

# Nodes without any xref are carried through unchanged.
nodes_noxrefs = nodes[~has_xrefs]

# Build the exploded frame with `.assign` on a fresh selection rather than
# assigning a new column into a slice, which avoids SettingWithCopyWarning.
nodes_xrefs = (
    nodes.loc[has_xrefs, ["id", "label", "xrefs"]]
    .assign(xrefs=lambda df: df["xrefs"].str.split("|"))  # single-char pattern is literal
    .explode("xrefs")
)

nodes = pd.concat([nodes_noxrefs[["id", "label", "xrefs"]], nodes_xrefs])
nodes.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nodes_xrefs["xrefs_expand"] = nodes_xrefs["xrefs"].apply(lambda x: x.split("|"))


Unnamed: 0,id,label,xrefs
8,UBERON:0000016,AnatomicalEntity,
9,UBERON:0000017,AnatomicalEntity,
10,UBERON:0000018,AnatomicalEntity,
11,UBERON:0000019,AnatomicalEntity,
12,UBERON:0000020,AnatomicalEntity,


In [11]:
# Lookup table from node id to node label; for a repeated id the last row wins.
nodee2label = {node_id: label for node_id, label in zip(nodes["id"], nodes["label"])}

### Map start and end node to a label

In [12]:
# Attach the node label of the head and tail of every edge.
# A missing id raises KeyError, so every endpoint must exist in `nodes`.
graph["hlabel"] = [nodee2label[head_id] for head_id in graph["h"]]
graph["tlabel"] = [nodee2label[tail_id] for tail_id in graph["t"]]
graph.head()

Unnamed: 0,h,r,t,r2,set,hlabel,tlabel
0,UNII:BTY153760O,inhibits_CinG,NCBIGene:3605,inhibits,train,ChemicalSubstance,MacromolecularMachine
1,NCBIGene:4116,part_of_GpoBP,GO:0003723,part_of,train,MacromolecularMachine,BiologicalProcessOrActivity
2,NCBIGene:4116,part_of_GpoBP,GO:0005515,part_of,train,MacromolecularMachine,BiologicalProcessOrActivity
3,NCBIGene:4116,part_of_GpoA,GO:0005634,part_of,train,MacromolecularMachine,AnatomicalEntity
4,NCBIGene:4116,part_of_GpoA,GO:0005737,part_of,train,MacromolecularMachine,AnatomicalEntity


### get relations

In [13]:
# Bare relation name: the leading lowercase/underscore part of `r`
# (e.g. "part_of_GpoBP" -> "part_of").
relation_prefix = re.compile("[a-z_]*[a-z]")
graph["rel"] = [relation_prefix.match(name)[0] for name in graph["r"]]
graph.head()

Unnamed: 0,h,r,t,r2,set,hlabel,tlabel,rel
0,UNII:BTY153760O,inhibits_CinG,NCBIGene:3605,inhibits,train,ChemicalSubstance,MacromolecularMachine,inhibits
1,NCBIGene:4116,part_of_GpoBP,GO:0003723,part_of,train,MacromolecularMachine,BiologicalProcessOrActivity,part_of
2,NCBIGene:4116,part_of_GpoBP,GO:0005515,part_of,train,MacromolecularMachine,BiologicalProcessOrActivity,part_of
3,NCBIGene:4116,part_of_GpoA,GO:0005634,part_of,train,MacromolecularMachine,AnatomicalEntity,part_of
4,NCBIGene:4116,part_of_GpoA,GO:0005737,part_of,train,MacromolecularMachine,AnatomicalEntity,part_of


### Map REACT/KEGG indications to diseases

In [14]:
# The text before the first ":" of the tail id is its source namespace.
graph["tsource"] = [tail_id.split(":")[0] for tail_id in graph["t"]]

# Partition the edges by whether the tail comes from KEGG.
tail_is_kegg = graph["tsource"] == "KEGG"
graph_kegg = graph[tail_is_kegg]
graph_notkegg = graph[~tail_is_kegg]

In [15]:
# Remove the first "hsa" from each KEGG tail id ("KEGG:hsa04510" -> "KEGG:04510")
# so it can be matched against the `xrefs` column of `nodes`.
# `.assign` returns a new frame, which avoids SettingWithCopyWarning, and
# `str.replace` leaves ids without "hsa" unchanged instead of raising IndexError.
graph_kegg = graph_kegg.assign(
    new_t=graph_kegg["t"].str.replace("hsa", "", n=1, regex=False)
)
graph_kegg.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  graph_kegg["new_t"] = graph_kegg["t"].apply(


Unnamed: 0,h,r,t,r2,set,hlabel,tlabel,rel,tsource,new_t
3017856,IKEY:AXTAPYRUEKNRBA-JTQLQIEISA-N,associated_with_CawPW,KEGG:hsa04510,associated_with,train,ChemicalSubstance,Pathway,associated_with,KEGG,KEGG:04510
3017857,IKEY:AXTAPYRUEKNRBA-JTQLQIEISA-N,associated_with_CawPW,KEGG:hsa04068,associated_with,train,ChemicalSubstance,Pathway,associated_with,KEGG,KEGG:04068
3017858,IKEY:AXTAPYRUEKNRBA-JTQLQIEISA-N,associated_with_CawPW,KEGG:hsa04012,associated_with,train,ChemicalSubstance,Pathway,associated_with,KEGG,KEGG:04012
3017859,IKEY:AXTAPYRUEKNRBA-JTQLQIEISA-N,associated_with_CawPW,KEGG:hsa05231,associated_with,train,ChemicalSubstance,Pathway,associated_with,KEGG,KEGG:05231
3017860,IKEY:AXTAPYRUEKNRBA-JTQLQIEISA-N,associated_with_CawPW,KEGG:hsa04062,associated_with,train,ChemicalSubstance,Pathway,associated_with,KEGG,KEGG:04062


In [16]:
# Size of the KEGG-tail edge table before it is joined against node xrefs.
graph_kegg.shape

(400482, 10)

In [17]:
# Left-join the KEGG edges to nodes whose xref equals the normalised tail id.
# Unmatched edges keep NaN in `id`/`xrefs`.
xref_lookup = nodes[["id", "xrefs"]]
graph_kegg = graph_kegg.merge(
    xref_lookup, left_on="new_t", right_on="xrefs", how="left"
)

graph_kegg

Unnamed: 0,h,r,t,r2,set,hlabel,tlabel,rel,tsource,new_t,id,xrefs
0,IKEY:AXTAPYRUEKNRBA-JTQLQIEISA-N,associated_with_CawPW,KEGG:hsa04510,associated_with,train,ChemicalSubstance,Pathway,associated_with,KEGG,KEGG:04510,,
1,IKEY:AXTAPYRUEKNRBA-JTQLQIEISA-N,associated_with_CawPW,KEGG:hsa04068,associated_with,train,ChemicalSubstance,Pathway,associated_with,KEGG,KEGG:04068,,
2,IKEY:AXTAPYRUEKNRBA-JTQLQIEISA-N,associated_with_CawPW,KEGG:hsa04012,associated_with,train,ChemicalSubstance,Pathway,associated_with,KEGG,KEGG:04012,,
3,IKEY:AXTAPYRUEKNRBA-JTQLQIEISA-N,associated_with_CawPW,KEGG:hsa05231,associated_with,train,ChemicalSubstance,Pathway,associated_with,KEGG,KEGG:05231,,
4,IKEY:AXTAPYRUEKNRBA-JTQLQIEISA-N,associated_with_CawPW,KEGG:hsa04062,associated_with,train,ChemicalSubstance,Pathway,associated_with,KEGG,KEGG:04062,,
...,...,...,...,...,...,...,...,...,...,...,...,...
400477,CHEBI:79699,indication_CiPW,KEGG:hsa05211,,valid,ChemicalSubstance,Pathway,indication,KEGG,KEGG:05211,,
400478,CHEBI:41774,indication_CiPW,KEGG:hsa05224,,valid,ChemicalSubstance,Pathway,indication,KEGG,KEGG:05224,,
400479,CHEBI:45863,indication_CiPW,KEGG:hsa05224,,valid,ChemicalSubstance,Pathway,indication,KEGG,KEGG:05224,,
400480,CHEBI:50924,indication_CiPW,KEGG:hsa05211,,valid,ChemicalSubstance,Pathway,indication,KEGG,KEGG:05211,,


In [18]:
# Only diseases in the id column
# (collect the namespace prefix of every matched node id to verify this)
{node_id.split(":")[0] for node_id in graph_kegg["id"].dropna()}

set()

In [19]:
# Distinct tail labels present among the KEGG edges.
{label for label in graph_kegg["tlabel"]}

{'Pathway'}

In [20]:
# For each KEGG edge: use the matched node id as the new tail when the xref
# join found one (and label it "Disease"); otherwise keep the original tail
# (and label it "Pathway").
# Vectorised with `notna()` instead of a Python loop: the old `type(v) == float`
# test was a fragile NaN check, and `graph_kegg["t"][i]` was a label lookup
# that only worked while the index happened to be 0..n-1.
xref_matched = graph_kegg["id"].notna()

tmp_ls = graph_kegg["id"].where(xref_matched, graph_kegg["t"]).tolist()  # new tail ids
tmp_ls2 = np.where(xref_matched, "Disease", "Pathway").tolist()  # new tail labels

In [21]:
# Replace the tail id and tail label with the values computed above.
graph_kegg = graph_kegg.assign(t=tmp_ls, tlabel=tmp_ls2)
graph_kegg

Unnamed: 0,h,r,t,r2,set,hlabel,tlabel,rel,tsource,new_t,id,xrefs
0,IKEY:AXTAPYRUEKNRBA-JTQLQIEISA-N,associated_with_CawPW,KEGG:hsa04510,associated_with,train,ChemicalSubstance,Pathway,associated_with,KEGG,KEGG:04510,,
1,IKEY:AXTAPYRUEKNRBA-JTQLQIEISA-N,associated_with_CawPW,KEGG:hsa04068,associated_with,train,ChemicalSubstance,Pathway,associated_with,KEGG,KEGG:04068,,
2,IKEY:AXTAPYRUEKNRBA-JTQLQIEISA-N,associated_with_CawPW,KEGG:hsa04012,associated_with,train,ChemicalSubstance,Pathway,associated_with,KEGG,KEGG:04012,,
3,IKEY:AXTAPYRUEKNRBA-JTQLQIEISA-N,associated_with_CawPW,KEGG:hsa05231,associated_with,train,ChemicalSubstance,Pathway,associated_with,KEGG,KEGG:05231,,
4,IKEY:AXTAPYRUEKNRBA-JTQLQIEISA-N,associated_with_CawPW,KEGG:hsa04062,associated_with,train,ChemicalSubstance,Pathway,associated_with,KEGG,KEGG:04062,,
...,...,...,...,...,...,...,...,...,...,...,...,...
400477,CHEBI:79699,indication_CiPW,KEGG:hsa05211,,valid,ChemicalSubstance,Pathway,indication,KEGG,KEGG:05211,,
400478,CHEBI:41774,indication_CiPW,KEGG:hsa05224,,valid,ChemicalSubstance,Pathway,indication,KEGG,KEGG:05224,,
400479,CHEBI:45863,indication_CiPW,KEGG:hsa05224,,valid,ChemicalSubstance,Pathway,indication,KEGG,KEGG:05224,,
400480,CHEBI:50924,indication_CiPW,KEGG:hsa05211,,valid,ChemicalSubstance,Pathway,indication,KEGG,KEGG:05211,,


In [22]:
# Put the KEGG edges back with the rest. Only the shared columns are taken
# from `graph_kegg`, so `r2` is NaN for those rows and the join helper
# columns (`new_t`, `id`, `xrefs`) are dropped.
kegg_cols = ["h", "r", "t", "set", "hlabel", "tlabel", "rel", "tsource"]
graph = pd.concat([graph_notkegg, graph_kegg.loc[:, kegg_cols]])
graph

Unnamed: 0,h,r,t,r2,set,hlabel,tlabel,rel,tsource
0,UNII:BTY153760O,inhibits_CinG,NCBIGene:3605,inhibits,train,ChemicalSubstance,MacromolecularMachine,inhibits,NCBIGene
1,NCBIGene:4116,part_of_GpoBP,GO:0003723,part_of,train,MacromolecularMachine,BiologicalProcessOrActivity,part_of,GO
2,NCBIGene:4116,part_of_GpoBP,GO:0005515,part_of,train,MacromolecularMachine,BiologicalProcessOrActivity,part_of,GO
3,NCBIGene:4116,part_of_GpoA,GO:0005634,part_of,train,MacromolecularMachine,AnatomicalEntity,part_of,GO
4,NCBIGene:4116,part_of_GpoA,GO:0005737,part_of,train,MacromolecularMachine,AnatomicalEntity,part_of,GO
...,...,...,...,...,...,...,...,...,...
400477,CHEBI:79699,indication_CiPW,KEGG:hsa05211,,valid,ChemicalSubstance,Pathway,indication,KEGG
400478,CHEBI:41774,indication_CiPW,KEGG:hsa05224,,valid,ChemicalSubstance,Pathway,indication,KEGG
400479,CHEBI:45863,indication_CiPW,KEGG:hsa05224,,valid,ChemicalSubstance,Pathway,indication,KEGG
400480,CHEBI:50924,indication_CiPW,KEGG:hsa05211,,valid,ChemicalSubstance,Pathway,indication,KEGG


### Replace REACT:R-HSA-2160456 with DOID:9281.

In [23]:
# Index labels of the edges whose tail is this pathway.
graph.index[graph["t"] == "REACT:R-HSA-2160456"]

Int64Index([], dtype='int64')

In [24]:
# Rows whose tail is this pathway.
graph[graph["t"] == "REACT:R-HSA-2160456"]

Unnamed: 0,h,r,t,r2,set,hlabel,tlabel,rel,tsource


In [25]:
# Inspect the edge stored under index label 9646746.
graph.loc[9646746]

h                  MESH:D001555
r          associated_with_CawD
t                     DOID:9744
r2              associated_with
set                       train
hlabel        ChemicalSubstance
tlabel                  Disease
rel             associated_with
tsource                    DOID
Name: 9646746, dtype: object

In [26]:
# Re-point every edge whose tail is this pathway to the disease node.
# A boolean mask with `.loc` replaces the per-index loop: the old
# `graph.t.loc()[i] = ...` was chained assignment, and because index labels
# repeat after the concatenations above, a label-based write could also hit
# unrelated rows that share the same label.
react_mask = graph["t"] == "REACT:R-HSA-2160456"
graph.loc[react_mask, "t"] = "DOID:9281"
graph.loc[react_mask, "tlabel"] = "Disease"

# Should now be empty.
graph.query('t=="REACT:R-HSA-2160456"')

Unnamed: 0,h,r,t,r2,set,hlabel,tlabel,rel,tsource


## Generate new relation labels

In [27]:
# Abbreviation used for each node label inside a relation name.
node_types = dict(
    ChemicalSubstance="C",
    MacromolecularMachine="G",
    BiologicalProcessOrActivity="BP",
    Disease="D",
    AnatomicalEntity="A",
    PhenotypicFeature="P",
    Pathway="PW",
    OrganismTaxon="T",
    GeneFamily="F",
)

# Abbreviation used for each relation inside a relation name.
rel_type = dict(
    inhibits="in",
    activates="a",
    affects="af",
    associated_with="aw",
    causes="c",
    disrupts="d",
    part_of="po",
    in_taxon="it",
    site_of="so",
    presents="ps",
    treats="t",
    regulates="r",
    capable_of="co",
    produces="p",
    has_input="hi",
    negatively_regulates="nr",
    positively_regulates="pr",
    in_reaction_with="rx",
    palliates="pl",
    prevents="pv",
    marker_or_mechanism="m",
    indication="i",
)

In [28]:
# Preview the edge table before the new relation labels are built.
graph.head()

Unnamed: 0,h,r,t,r2,set,hlabel,tlabel,rel,tsource
0,UNII:BTY153760O,inhibits_CinG,NCBIGene:3605,inhibits,train,ChemicalSubstance,MacromolecularMachine,inhibits,NCBIGene
1,NCBIGene:4116,part_of_GpoBP,GO:0003723,part_of,train,MacromolecularMachine,BiologicalProcessOrActivity,part_of,GO
2,NCBIGene:4116,part_of_GpoBP,GO:0005515,part_of,train,MacromolecularMachine,BiologicalProcessOrActivity,part_of,GO
3,NCBIGene:4116,part_of_GpoA,GO:0005634,part_of,train,MacromolecularMachine,AnatomicalEntity,part_of,GO
4,NCBIGene:4116,part_of_GpoA,GO:0005737,part_of,train,MacromolecularMachine,AnatomicalEntity,part_of,GO


In [29]:
# Rebuild each relation name as
#   <relation>_<head abbrev><relation abbrev><tail abbrev>
# from the (possibly corrected) head/tail labels, e.g. "inhibits_CinG".
# An unknown label or relation raises KeyError.
graph["new_r"] = [
    rel + "_" + node_types[head_label] + rel_type[rel] + node_types[tail_label]
    for rel, head_label, tail_label in zip(
        graph["rel"], graph["hlabel"], graph["tlabel"]
    )
]

graph.head()

Unnamed: 0,h,r,t,r2,set,hlabel,tlabel,rel,tsource,new_r
0,UNII:BTY153760O,inhibits_CinG,NCBIGene:3605,inhibits,train,ChemicalSubstance,MacromolecularMachine,inhibits,NCBIGene,inhibits_CinG
1,NCBIGene:4116,part_of_GpoBP,GO:0003723,part_of,train,MacromolecularMachine,BiologicalProcessOrActivity,part_of,GO,part_of_GpoBP
2,NCBIGene:4116,part_of_GpoBP,GO:0005515,part_of,train,MacromolecularMachine,BiologicalProcessOrActivity,part_of,GO,part_of_GpoBP
3,NCBIGene:4116,part_of_GpoA,GO:0005634,part_of,train,MacromolecularMachine,AnatomicalEntity,part_of,GO,part_of_GpoA
4,NCBIGene:4116,part_of_GpoA,GO:0005737,part_of,train,MacromolecularMachine,AnatomicalEntity,part_of,GO,part_of_GpoA


### Separate into train, test, valid

In [30]:
# Size of the four columns that are kept for the output triples.
graph[["h", "new_r", "t", "set"]].shape

(9658118, 4)

In [31]:
# Keep only the triple columns plus the split tag. The explicit copy makes
# `graph2` an independent frame, so later column assignments do not raise
# SettingWithCopyWarning.
graph2 = graph[["h", "new_r", "t", "set"]].copy()

In [32]:
# sort values first prior to dropping duplicates
# (the category order train < test < valid decides which copy sorts first)
split_dtype = pd.CategoricalDtype(["train", "test", "valid"])
graph2["set"] = graph2["set"].astype(split_dtype)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  graph2["set"] = pd.Categorical(graph2["set"], ["train", "test", "valid"])


In [33]:
# Order rows by split, then keep the first occurrence of every (h, new_r, t)
# triple, so a triple present in several splits survives in the earliest one.
sorted_by_split = graph2.sort_values(by="set")
graph2 = sorted_by_split.drop_duplicates(subset=["h", "new_r", "t"])

In [34]:
# Size after de-duplication, for comparison with the shape before it.
graph2.shape

(9658118, 4)

In [35]:
# Slice the de-duplicated table back into its three splits (triple columns only).
triple_cols = ["h", "new_r", "t"]
train2 = graph2.loc[graph2["set"] == "train", triple_cols]
test2 = graph2.loc[graph2["set"] == "test", triple_cols]
valid2 = graph2.loc[graph2["set"] == "valid", triple_cols]

### quick check on splits

In [36]:
# Outer-join train against test on the full triple. `_merge` records which
# side each triple came from, so a non-zero "both" count means a shared triple.
train_vs_test = train2.merge(
    test2, on=["h", "new_r", "t"], how="outer", indicator=True
)
side_counts = train_vs_test.groupby("_merge").count().reset_index()
side_counts[["_merge", "h"]].rename(columns={"_merge": "dir", "h": "count"})

Unnamed: 0,dir,count
0,left_only,9657134
1,right_only,511
2,both,0


In [37]:
# Outer-join train against valid on the full triple. `_merge` records which
# side each triple came from, so a non-zero "both" count means a shared triple.
train_vs_valid = train2.merge(
    valid2, on=["h", "new_r", "t"], how="outer", indicator=True
)
side_counts = train_vs_valid.groupby("_merge").count().reset_index()
side_counts[["_merge", "h"]].rename(columns={"_merge": "dir", "h": "count"})

Unnamed: 0,dir,count
0,left_only,9657134
1,right_only,473
2,both,0


In [38]:
# Outer-join test against valid on the full triple. `_merge` records which
# side each triple came from, so a non-zero "both" count means a shared triple.
test_vs_valid = test2.merge(
    valid2, on=["h", "new_r", "t"], how="outer", indicator=True
)
side_counts = test_vs_valid.groupby("_merge").count().reset_index()
side_counts[["_merge", "h"]].rename(columns={"_merge": "dir", "h": "count"})

Unnamed: 0,dir,count
0,left_only,511
1,right_only,473
2,both,0


In [45]:
# Preview the de-duplicated triples (row order reflects the sort by split).
graph2.head()

Unnamed: 0,h,new_r,t,set
0,UNII:BTY153760O,inhibits_CinG,NCBIGene:3605,train
6838555,GO:0086100,associated_with_BPawD,DOID:3910,train
6838556,GO:0097009,associated_with_BPawD,DOID:0111704,train
6838557,GO:0097009,associated_with_BPawD,DOID:3910,train
6838558,GO:0086100,associated_with_BPawD,DOID:971,train


In [49]:
# Total number of de-duplicated triples.
graph2.shape

(9658118, 4)

In [50]:
# Sanity check: the three splits together must account for every row of
# `graph2`, so this should be 0. Computed from the frames rather than from
# pasted numbers so it stays valid when the data changes.
len(graph2) - (len(train2) + len(test2) + len(valid2))

0

In [57]:
# Bare relation name recovered from the rebuilt label. This uses the same
# pattern as the earlier `rel` extraction: the previous
# `re.findall('[a-z]+', x)[0]` stopped at the first underscore, turning
# "part_of_GpoBP" into "part" and "associated_with_..." into "associated".
graph2["r"] = graph2["new_r"].apply(lambda x: re.match("[a-z_]*[a-z]", x)[0])

In [58]:
# All edges whose bare relation is "indication".
graph2[graph2["r"] == "indication"]

Unnamed: 0,h,new_r,t,set,r
9450278,CHEBI:22907,indication_CiD,DOID:3070,train,indication
9450279,CHEBI:22907,indication_CiD,DOID:8552,train,indication
9450280,CHEBI:2038,indication_CiD,DOID:8552,train,indication
9450281,CHEBI:2038,indication_CiD,DOID:3070,train,indication
9450282,CHEBI:2038,indication_CiD,DOID:10283,train,indication
...,...,...,...,...,...
159,CHEBI:36691,indication_CiD,DOID:50433,valid,indication
158,CHEBI:3997,indication_CiD,DOID:50952,valid,indication
157,CHEBI:32181,indication_CiD,DOID:8552,valid,indication
155,CHEBI:6444,indication_CiD,DOID:60145,valid,indication


In [59]:
# All edges labelled exactly "treats_CtD".
graph2[graph2["new_r"] == "treats_CtD"]

Unnamed: 0,h,new_r,t,set,r
6824409,MESH:C474576,treats_CtD,DOID:9975,train,treats
6824411,MESH:C062458,treats_CtD,DOID:809,train,treats
6824412,MESH:C103477,treats_CtD,DOID:9975,train,treats
6824414,CHEBI:5784,treats_CtD,DOID:0070238,train,treats
6824408,MESH:C474576,treats_CtD,DOID:809,train,treats
...,...,...,...,...,...
3698585,IKEY:XXRCUYVCPSWGCC-UHFFFAOYSA-N,treats_CtD,MESH:D019446,train,treats
3698586,IKEY:XXRCUYVCPSWGCC-UHFFFAOYSA-N,treats_CtD,MESH:D007238,train,treats
3698556,CHEBI:4974,treats_CtD,MESH:D019694,train,treats
3698559,CHEBI:38677,treats_CtD,MESH:D002418,train,treats


In [60]:
# All edges whose bare relation is "treats", whatever the head/tail types.
graph2[graph2["r"] == "treats"]

Unnamed: 0,h,new_r,t,set,r
6824409,MESH:C474576,treats_CtD,DOID:9975,train,treats
6824411,MESH:C062458,treats_CtD,DOID:809,train,treats
6824412,MESH:C103477,treats_CtD,DOID:9975,train,treats
6824414,CHEBI:5784,treats_CtD,DOID:0070238,train,treats
6824408,MESH:C474576,treats_CtD,DOID:809,train,treats
...,...,...,...,...,...
3698586,IKEY:XXRCUYVCPSWGCC-UHFFFAOYSA-N,treats_CtD,MESH:D007238,train,treats
3698587,IKEY:XXRCUYVCPSWGCC-UHFFFAOYSA-N,treats_CtP,MESH:D007249,train,treats
3698556,CHEBI:4974,treats_CtD,MESH:D019694,train,treats
3698559,CHEBI:38677,treats_CtD,MESH:D002418,train,treats


In [48]:
# Number of non-null values in each remaining column, per new relation label.
graph2.groupby('new_r').count()

Unnamed: 0_level_0,h,t,set
new_r,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
activates_CaBP,38936,38936,38936
activates_CaG,472943,472943,472943
affects_CafBP,62523,62523,62523
affects_CafG,924308,924308,924308
associated_with_AawD,144217,144217,144217
...,...,...,...
site_of_AsoPW,2610,2610,2610
treats_CtD,65009,65009,65009
treats_CtP,3360,3360,3360
treats_GtD,2158,2158,2158


In [34]:
# Write each split as a headerless, tab-separated (h, new_r, t) file.
# NOTE: these are the same paths the notebook reads from at the top, so the
# input files are overwritten in place.
triple_cols = ["h", "new_r", "t"]
split_files = {
    "train": "../data/MIND_CtD/train.txt",
    "test": "../data/MIND_CtD/test.txt",
    "valid": "../data/MIND_CtD/valid.txt",
}
for split_name, out_path in split_files.items():
    split_rows = graph2.loc[graph2["set"] == split_name, triple_cols]
    split_rows.to_csv(out_path, sep="\t", header=False, index=False)