# Connect ChEMBL mechanisms of action (MOA) to NSC numbers

## Init

In [13]:
import json
from os.path import join as path_join

import regex as re
import requests
from py2neo import Graph
from tqdm import tqdm

In [2]:
# Load the MOA records from results/all_moa.json into `all_moa`.
moa_path = path_join("results", "all_moa.json")
with open(moa_path, "r") as moa_file:
    all_moa = json.load(moa_file)

In [3]:
# Connection settings are read from config.json in the working directory.
with open("config.json") as config_file:
    config = json.load(config_file)

# Optional settings: fall back to the defaults given here when absent.
neo4j_url = config.get("neo4jUrl", "bolt://localhost:7687")
user, pswd = config.get("user", "neo4j"), config.get("pswd", "password")
# Required setting: a missing key raises KeyError.
neo4j_import_loc = config["neo4j_import_loc"]

# Graph handle used by every query cell below.
credentials = (user, pswd)
graph = Graph(neo4j_url, auth=credentials)

## Get compound ID

In [47]:
def update_synonym(name):
    """Request a PubChem synonym update for ``name`` from the local service.

    Issues a GET request to the ``updatePubchemSynonymsByName`` endpoint on
    ``127.0.0.1:81`` with ``name`` as the ``synonym_name`` query parameter.
    The response is not inspected and nothing is returned.

    Args:
        name: Synonym text to look up (str).
    """
    from urllib.parse import quote

    # Percent-encode the whole value, not only spaces: characters such as
    # '&', '#', '+', '?' or '%' would otherwise split or truncate the query
    # parameter. Spaces are still sent as %20.
    parsed_name = quote(name, safe="")
    requests.get(
        f"http://127.0.0.1:81/updatePubchemSynonymsByName/?synonym_name={parsed_name}"
    )
# For every MOA record: ask the synonym service to refresh the synonyms for
# the ChEMBL id/name variants, then full-text search the synonym index and
# attach the best-matching synonym (name + id) to the record.
all_moa_nsc = []
# Runs of two or more letters/digits; used to strip punctuation and
# single-character fragments from the compound name. Compiled once, outside
# the loop.
name_token_pattern = re.compile(r"[\p{L}\d]{2,}")
for moa in tqdm(all_moa):
    chembl_id = moa.get("chem_id")
    chembl_name = moa.get("chem_name")

    # Both values are dereferenced below (str.replace / regex search), so a
    # record missing either one cannot be processed.
    if chembl_id is None or chembl_name is None:
        continue

    # Keep only the part of the id after the "CHEMBL" prefix.
    chembl_id = chembl_id.replace("CHEMBL", "")
    update_synonym(f"CHEMBL{chembl_id}")
    update_synonym(f"CHEMBL {chembl_id}")
    update_synonym(f"CHEMBL-{chembl_id}")
    update_synonym(chembl_name)
    clean_chembl_name = " ".join(name_token_pattern.findall(chembl_name))
    # A name with no usable tokens cleans to "", which is nothing to look up.
    if clean_chembl_name:
        update_synonym(clean_chembl_name)

    # Build the full-text query; the name clause is left out when the cleaned
    # name is empty so the query never contains an empty "()" group.
    query_clauses = [f"CHEMBL{chembl_id}", f"(chembl AND {chembl_id})"]
    if clean_chembl_name:
        query_clauses.append(f"({clean_chembl_name})")

    # Take the top 10 full-text hits, group them by the compound they are
    # attached to, and return one row per compound with the first hit's
    # name/id and the number of hits (`num`) for that compound.
    response = graph.run(
        """
        CALL {
            CALL db.index.fulltext.queryNodes('synonymsFullText', $lucense_query)
            YIELD node, score
            return node, score limit 10
        }
        MATCH (node)-[:IS_ATTRIBUTE_OF]->(c:Compound)
        WITH c.pubChemCompId as compoundId, collect(node)[0] as node, count(*) as num
        RETURN compoundId, node.name as syn_name, node.pubChemSynId as syn_id, num
        """, lucense_query=" OR ".join(query_clauses)
    ).data()

    if not response:
        continue

    # Use the compound backed by the most hits. The row picked here is the
    # one that gets stored; with a single row, max() simply returns it.
    selected_response = max(response, key=lambda row: row["num"])
    moa["syn_name"] = selected_response["syn_name"]
    moa["syn_id"] = selected_response["syn_id"]
    all_moa_nsc.append(moa)

100%|██████████| 6259/6259 [6:11:15<00:00,  3.56s/it]   


In [48]:
# Persist the MOA records to results/all_moa_with_synonym.json.
# NOTE(review): this writes `all_moa` (every record, matched or not), not the
# filtered `all_moa_nsc` list -- confirm that this is the intended output.
synonym_json_path = path_join("results", "all_moa_with_synonym.json")
with open(synonym_json_path, "w") as out_file:
    json.dump(all_moa, out_file)

## Add NSC number

In [None]:
# Reload the saved records so this section can run without repeating the
# matching loop above.
saved_moa_path = path_join("results", "all_moa_with_synonym.json")
with open(saved_moa_path, "r") as saved_file:
    all_moa = json.load(saved_file)

In [None]:
# Exploratory lookup for one hard-coded synonym id: find its compound, then
# every synonym of that compound whose name contains "nsc".
# NOTE(review): CONTAINS is case-sensitive in Cypher -- confirm that synonym
# names are stored in lower case, otherwise "NSC ..." entries are missed.
nsc_lookup_query = """
    MATCH (s:Synonym {pubChemSynId: "a6ea5c34d27a943e59816bf307b0e8c1"})
    MATCH (s)-[:IS_ATTRIBUTE_OF]->(c:Compound)
    MATCH (c)<-[:IS_ATTRIBUTE_OF]-(s2:Synonym)
    WHERE s2.name CONTAINS "nsc"
    RETURN c, s2
    """
response = graph.run(nsc_lookup_query).data()
print(response)