In [3]:
import re
from typing import List
from tqdm import tqdm, trange
import pandas as pd

In [7]:
from searchapp.models import Entity, Label

In [8]:
# Load the knowledge-graph node table; the path is relative to the notebook's
# working directory.
# NOTE(review): low_memory=False is presumably here to avoid pandas' chunked
# dtype inference on columns that mix numeric and string ids (e.g. x_id) — confirm.
df_nodes = pd.read_csv("../data/kg_nodes_id.csv", low_memory=False)

In [9]:
df_nodes

Unnamed: 0,id:ID,x_id,:LABEL,name,source,original_name
0,NCBI_9796_PHYHIP,9796,Gene/Protein,phyhip,NCBI,PHYHIP
1,NCBI_7918_GPANK1,7918,Gene/Protein,gpank1,NCBI,GPANK1
2,NCBI_8233_ZRSR2,8233,Gene/Protein,zrsr2,NCBI,ZRSR2
3,NCBI_4899_NRF1,4899,Gene/Protein,nrf1,NCBI,NRF1
4,NCBI_5297_PI4KA,5297,Gene/Protein,pi4ka,NCBI,PI4KA
...,...,...,...,...,...,...
129370,REACTOME_R-HSA-936837_Ion_transport_by_P-type_...,R-HSA-936837,Pathway,ion transport by p-type atpases,REACTOME,Ion transport by P-type ATPases
129371,REACTOME_R-HSA-997272_Inhibition__of_voltage_g...,R-HSA-997272,Pathway,inhibition of voltage gated ca2+ channels via...,REACTOME,Inhibition of voltage gated Ca2+ channels via...
129372,UBERON_1062_anatomical_entity,1062,Anatomy,anatomical entity,UBERON,anatomical entity
129373,UBERON_468_multi-cellular_organism,468,Anatomy,multi-cellular organism,UBERON,multi-cellular organism


In [10]:
# Inspection only: show the frame with duplicate id:ID rows removed. The result
# is displayed but not assigned, so df_nodes itself is left unchanged.
df_nodes.drop_duplicates(subset=['id:ID'])

Unnamed: 0,id:ID,x_id,:LABEL,name,source,original_name
0,NCBI_9796_PHYHIP,9796,Gene/Protein,phyhip,NCBI,PHYHIP
1,NCBI_7918_GPANK1,7918,Gene/Protein,gpank1,NCBI,GPANK1
2,NCBI_8233_ZRSR2,8233,Gene/Protein,zrsr2,NCBI,ZRSR2
3,NCBI_4899_NRF1,4899,Gene/Protein,nrf1,NCBI,NRF1
4,NCBI_5297_PI4KA,5297,Gene/Protein,pi4ka,NCBI,PI4KA
...,...,...,...,...,...,...
129370,REACTOME_R-HSA-936837_Ion_transport_by_P-type_...,R-HSA-936837,Pathway,ion transport by p-type atpases,REACTOME,Ion transport by P-type ATPases
129371,REACTOME_R-HSA-997272_Inhibition__of_voltage_g...,R-HSA-997272,Pathway,inhibition of voltage gated ca2+ channels via...,REACTOME,Inhibition of voltage gated Ca2+ channels via...
129372,UBERON_1062_anatomical_entity,1062,Anatomy,anatomical entity,UBERON,anatomical entity
129373,UBERON_468_multi-cellular_organism,468,Anatomy,multi-cellular organism,UBERON,multi-cellular organism


# Import Label

In [11]:
# Keep the first row seen for each distinct :LABEL value and convert those rows
# to plain dicts; each dict serves as the sample row for its label.
label_data_list = (
    df_nodes
    .drop_duplicates(subset=[":LABEL"])
    .to_dict(orient="records")
)
print(len(label_data_list))
label_data_list

10


[{'id:ID': 'NCBI_9796_PHYHIP',
  'x_id': '9796',
  ':LABEL': 'Gene/Protein',
  'name': 'phyhip',
  'source': 'NCBI',
  'original_name': 'PHYHIP'},
 {'id:ID': 'DrugBank_DB05282_MCC',
  'x_id': '4163',
  ':LABEL': 'Drug',
  'name': 'mcc',
  'source': 'DrugBank',
  'original_name': 'MCC'},
 {'id:ID': 'CTD_D003634_DDT',
  'x_id': '1652',
  ':LABEL': 'Exposure',
  'name': 'ddt',
  'source': 'CTD',
  'original_name': 'DDT'},
 {'id:ID': 'REACTOME_R-HSA-8877627_Vitamin_E',
  'x_id': 'DB00163',
  ':LABEL': 'Pathway',
  'name': 'vitamin e',
  'source': 'REACTOME',
  'original_name': 'Vitamin E'},
 {'id:ID': 'HPO_1507_Growth_abnormality',
  'x_id': '1507',
  ':LABEL': 'Effect/Phenotype',
  'name': 'growth abnormality',
  'source': 'HPO',
  'original_name': 'Growth abnormality'},
 {'id:ID': 'MONDO_22330_4-hydroxyphenylacetic_aciduria',
  'x_id': '3607',
  ':LABEL': 'Disease',
  'name': '4-hydroxyphenylacetic aciduria',
  'source': 'MONDO',
  'original_name': '4-hydroxyphenylacetic aciduria'},
 {'i

In [12]:
def create_labels(data) -> List[Label]:
    """Build one ``Label`` instance per input row without saving it.

    Each row's ``source`` is matched case-insensitively against a fixed table
    of base URLs, and the matched URL is stored next to the source name in the
    label's ``attribute`` dict.

    Args:
        data: Iterable of dict-like rows providing the ``":LABEL"`` and
            ``"source"`` keys.

    Returns:
        A list of ``Label`` instances in the same order as ``data``. The
        instances are only constructed here; persisting them is left to the
        caller.

    Raises:
        AssertionError: If a row's ``source`` is missing, is not a string, or
            has no entry in the URL table.
    """
    # Keys are upper-case because the lookup upper-cases the row's source
    # (the data contains mixed-case values such as "DrugBank").
    source_site_mapping = {
        "HPO": "https://hpo.jax.org/app/browse/term/",
        "DRUGBANK": "https://go.drugbank.com/drugs/",
        "MONDO":  "http://purl.obolibrary.org/obo/MONDO_",
        "REACTOME": "https://reactome.org/content/detail/",
        "UBERON":  "http://purl.obolibrary.org/obo/UBERON_",
        "NCBI": "https://www.ncbi.nlm.nih.gov/gene/",
        "GO": "https://www.ebi.ac.uk/QuickGO/term/GO:",
        "CTD": "https://ctdbase.org/detail.go?type=chem&acc="
    }

    label_bulk_data = []
    for row in data:
        name = row.get(":LABEL")
        source = row.get("source")

        # A missing source shows up as None (or a float NaN for an empty CSV
        # cell); treat it like an unknown source instead of failing on
        # `.upper()` with an unrelated AttributeError.
        site_url = (
            source_site_mapping.get(source.upper())
            if isinstance(source, str)
            else None
        )

        # Fail fast and say which row is at fault.
        assert site_url is not None, (
            f"site url cannot be Null (label={name!r}, source={source!r})"
        )

        label_bulk_data.append(
            Label(
                name=name,
                attribute={
                    "source": source,
                    "url": site_url
                }
            )
        )

    return label_bulk_data

In [13]:
# Turn the per-label sample rows into Label instances (not yet written to the
# database) and display them for a quick visual check.
label_bulk_data = create_labels(label_data_list)
label_bulk_data

[<Label: Gene/Protein>,
 <Label: Drug>,
 <Label: Exposure>,
 <Label: Pathway>,
 <Label: Effect/Phenotype>,
 <Label: Disease>,
 <Label: Anatomy>,
 <Label: Biological_Process>,
 <Label: Molecular_Function>,
 <Label: Cellular_Component>]

In [50]:
# Insert all Label instances in bulk.
# NOTE(review): re-running this cell will presumably insert the same labels a
# second time unless the model enforces uniqueness on name — confirm before re-running.
Label.objects.bulk_create(label_bulk_data)

[<Label: Gene/Protein>,
 <Label: Drug>,
 <Label: Exposure>,
 <Label: Pathway>,
 <Label: Effect/Phenotype>,
 <Label: Disease>,
 <Label: Anatomy>,
 <Label: Biological_Process>,
 <Label: Molecular_Function>,
 <Label: Cellular_Component>]

# Import Nodes

In [51]:
# Batch size: number of node rows sliced from nodes_list and passed to each
# Entity bulk_create call in the import loop.
step = 1_000

In [52]:
# Every node row as a plain dict keyed by column name, so the import loop can
# slice batches by position.
nodes_list = df_nodes.to_dict("records")

In [53]:
nodes_list[:3]

[{'id:ID': 'NCBI_9796_PHYHIP',
  'x_id': '9796',
  ':LABEL': 'Gene/Protein',
  'name': 'phyhip',
  'source': 'NCBI',
  'original_name': 'PHYHIP'},
 {'id:ID': 'NCBI_7918_GPANK1',
  'x_id': '7918',
  ':LABEL': 'Gene/Protein',
  'name': 'gpank1',
  'source': 'NCBI',
  'original_name': 'GPANK1'},
 {'id:ID': 'NCBI_8233_ZRSR2',
  'x_id': '8233',
  ':LABEL': 'Gene/Protein',
  'name': 'zrsr2',
  'source': 'NCBI',
  'original_name': 'ZRSR2'}]

In [34]:
# Total number of node rows; used as the upper bound when stepping through
# nodes_list in batches.
data_len = len(df_nodes)
data_len

129375

In [57]:
# Cache Label rows by name so each distinct label is fetched from the database
# once, instead of issuing one SELECT per node row.
labels_by_name = {}

# Batch boundaries are derived from nodes_list itself (the sequence being
# sliced), so the loop cannot drift out of sync with a stale length variable.
for i in tqdm(range(0, len(nodes_list), step), desc="import nodes to django: Entity"):
    bulk_data = []
    for row in nodes_list[i: i+step]:
        node_label_name = row.get(":LABEL")

        label = labels_by_name.get(node_label_name)
        if label is None:
            # First occurrence of this label: look it up (raises
            # Label.DoesNotExist if the label was never imported) and remember it.
            label = Label.objects.get(name=node_label_name)
            labels_by_name[node_label_name] = label

        entity = Entity(
            label=label,
            id=row.get("id:ID"),
            name=row.get("original_name"),
            attribute={
                "source": row.get("source"),
                "label": node_label_name,
                "id": row.get("x_id"),
                # NOTE(review): the URL is taken from the label, not from this
                # row's own source — confirm every label maps to a single source.
                "source_url": label.attribute.get("url")
            }
        )
        bulk_data.append(entity)
    # One bulk insert per batch of at most `step` entities.
    Entity.objects.bulk_create(bulk_data)

import nodes to django: Entity: 100%|██████████| 130/130 [02:35<00:00,  1.20s/it]


In [58]:
Label.objects.count()

10

In [59]:
Entity.objects.count()

129375

In [60]:
# Label.objects.all().delete()

In [61]:
# Link.objects.all().delete()

In [62]:
# Entity.objects.all().delete()

In [63]:
Label.objects.all().first()

<Label: Gene/Protein>

In [65]:
# Sanity check: take the first Entity returned by the default queryset and
# print its stored base URL joined with its raw source id.
entity = Entity.objects.all().first()
print(f'{entity.attribute.get("source_url")}{entity.attribute.get("id")}')

https://ctdbase.org/detail.go?type=chem&acc=C000188
