In [1]:
from tqdm import tqdm, trange
import pandas as pd

In [2]:
from searchapp.models import Entity, Label

In [3]:
df_nodes = pd.read_csv("../data/kg_nodes.csv", low_memory=False)

In [4]:
df_nodes

Unnamed: 0,id:ID,:LABEL,name,source,original_name
0,NCBI_9796_PHYHIP,Gene/Protein,phyhip,NCBI,PHYHIP
1,NCBI_7918_GPANK1,Gene/Protein,gpank1,NCBI,GPANK1
2,NCBI_8233_ZRSR2,Gene/Protein,zrsr2,NCBI,ZRSR2
3,NCBI_4899_NRF1,Gene/Protein,nrf1,NCBI,NRF1
4,NCBI_5297_PI4KA,Gene/Protein,pi4ka,NCBI,PI4KA
...,...,...,...,...,...
129370,REACTOME_R-HSA-936837_Ion_transport_by_P-type_...,Pathway,ion transport by p-type atpases,REACTOME,Ion transport by P-type ATPases
129371,REACTOME_R-HSA-997272_Inhibition__of_voltage_g...,Pathway,inhibition of voltage gated ca2+ channels via...,REACTOME,Inhibition of voltage gated Ca2+ channels via...
129372,UBERON_1062_anatomical_entity,Anatomy,anatomical entity,UBERON,anatomical entity
129373,UBERON_468_multi-cellular_organism,Anatomy,multi-cellular organism,UBERON,multi-cellular organism


In [5]:
df_nodes.drop_duplicates(subset=['id:ID'])

Unnamed: 0,id:ID,:LABEL,name,source,original_name
0,NCBI_9796_PHYHIP,Gene/Protein,phyhip,NCBI,PHYHIP
1,NCBI_7918_GPANK1,Gene/Protein,gpank1,NCBI,GPANK1
2,NCBI_8233_ZRSR2,Gene/Protein,zrsr2,NCBI,ZRSR2
3,NCBI_4899_NRF1,Gene/Protein,nrf1,NCBI,NRF1
4,NCBI_5297_PI4KA,Gene/Protein,pi4ka,NCBI,PI4KA
...,...,...,...,...,...
129370,REACTOME_R-HSA-936837_Ion_transport_by_P-type_...,Pathway,ion transport by p-type atpases,REACTOME,Ion transport by P-type ATPases
129371,REACTOME_R-HSA-997272_Inhibition__of_voltage_g...,Pathway,inhibition of voltage gated ca2+ channels via...,REACTOME,Inhibition of voltage gated Ca2+ channels via...
129372,UBERON_1062_anatomical_entity,Anatomy,anatomical entity,UBERON,anatomical entity
129373,UBERON_468_multi-cellular_organism,Anatomy,multi-cellular organism,UBERON,multi-cellular organism


# Import Label

In [6]:
label_data_list = df_nodes.drop_duplicates(subset=[':LABEL']).to_dict("records")
print(len(label_data_list))
label_data_list[:2]

10


[{'id:ID': 'NCBI_9796_PHYHIP',
  ':LABEL': 'Gene/Protein',
  'name': 'phyhip',
  'source': 'NCBI',
  'original_name': 'PHYHIP'},
 {'id:ID': 'DrugBank_DB09130_Copper',
  ':LABEL': 'Drug',
  'name': 'copper',
  'source': 'DrugBank',
  'original_name': 'Copper'}]

In [17]:
label_bulk_data = [Label(name=row.get(":LABEL")) for row in label_data_list]
label_bulk_data

[<Label: Gene/Protein>,
 <Label: Drug>,
 <Label: Effect/Phenotype>,
 <Label: Disease>,
 <Label: Biological_Process>,
 <Label: Molecular_Function>,
 <Label: Cellular_Component>,
 <Label: Exposure>,
 <Label: Pathway>,
 <Label: Anatomy>]

In [19]:
Label.objects.bulk_create(label_bulk_data)

[<Label: Gene/Protein>,
 <Label: Drug>,
 <Label: Effect/Phenotype>,
 <Label: Disease>,
 <Label: Biological_Process>,
 <Label: Molecular_Function>,
 <Label: Cellular_Component>,
 <Label: Exposure>,
 <Label: Pathway>,
 <Label: Anatomy>]

# Import Nodes

In [10]:
step = 1_000

In [11]:
nodes_list = df_nodes.to_dict("records")

In [13]:
nodes_list[:3]

[{'id:ID': 'NCBI_9796_PHYHIP',
  ':LABEL': 'Gene/Protein',
  'name': 'phyhip',
  'source': 'NCBI',
  'original_name': 'PHYHIP'},
 {'id:ID': 'NCBI_7918_GPANK1',
  ':LABEL': 'Gene/Protein',
  'name': 'gpank1',
  'source': 'NCBI',
  'original_name': 'GPANK1'},
 {'id:ID': 'NCBI_8233_ZRSR2',
  ':LABEL': 'Gene/Protein',
  'name': 'zrsr2',
  'source': 'NCBI',
  'original_name': 'ZRSR2'}]

In [12]:
data_len = len(df_nodes)
data_len

129375

In [20]:
for i in tqdm(range(0, data_len, step), desc="import nodes to django: Entity"):
    bulk_data = []
    for row in nodes_list[i: i+step]:
        label = Label.objects.get(name=row.get(":LABEL"))
        entity = Entity(
            label=label,
            id=row.get("id:ID"),
            name=row.get("original_name"),
            attribute={
                "source": row.get("source", None),
                "label": row.get(":LABEL")
            }
        )
        bulk_data.append(entity)
    Entity.objects.bulk_create(bulk_data)

import nodes to django: Entity: 100%|██████████| 130/130 [02:28<00:00,  1.14s/it]


In [21]:
# Label.objects.all().delete()

In [22]:
# Entity.objects.all().delete()