# Connect ChEMBL mechanisms of action (MOA) to NSC numbers

## Init

In [1]:
import json
from os.path import join as path_join

import regex as re
import requests
from py2neo import Graph
from tqdm import tqdm

In [2]:
# Read the MOA records from results/all_moa.json into `all_moa`.
moa_input_path = path_join("results", "all_moa.json")
with open(moa_input_path, "r") as moa_file:
    all_moa = json.load(moa_file)

In [3]:
# Read the connection settings from config.json.
with open("config.json") as config_file:
    config = json.load(config_file)

# "neo4j_import_loc" is mandatory (KeyError when absent); the remaining
# settings fall back to defaults for a local Neo4j instance.
neo4j_import_loc = config["neo4j_import_loc"]
neo4j_url = config.get("neo4jUrl", "bolt://localhost:7687")
user = config.get("user", "neo4j")
pswd = config.get("pswd", "password")

# Shared graph handle used by the query cells.
credentials = (user, pswd)
graph = Graph(neo4j_url, auth=credentials)

## Get compound ID

In [47]:
def update_synonym(name):
    """Request a PubChem synonym update for ``name`` from the local service.

    Issues a GET to the ``updatePubchemSynonymsByName`` endpoint on
    127.0.0.1:81 with ``name`` as the ``synonym_name`` query parameter.
    The response is not inspected; presumably the call matters only for
    its effect on the service side.

    Args:
        name: Synonym text to send (a ChEMBL ID spelling or a compound name).
    """
    # Function-scope import so the cell stays self-contained.
    from urllib.parse import quote

    # Percent-encode every reserved character, not only spaces: chemical
    # names routinely contain "&", "#", "+" or "%", which would otherwise
    # truncate or corrupt the query parameter. Spaces still become "%20".
    parsed_name = quote(name, safe="")
    requests.get(
        f"http://127.0.0.1:81/updatePubchemSynonymsByName/?synonym_name={parsed_name}"
    )
# For every MOA record, refresh its synonyms in the local service, then look
# the compound up through the Neo4j full-text synonym index and attach the
# best matching synonym (name + id) to the record.
all_moa_nsc = []
for moa in tqdm(all_moa):
    chembl_id = moa.get("chem_id")
    chembl_name = moa.get("chem_name")

    # Both values are dereferenced below, so a record missing either one
    # cannot be processed.
    if chembl_id is None or chembl_name is None:
        continue

    # Register the usual spellings of the ChEMBL ID and the raw name.
    chembl_id = chembl_id.replace("CHEMBL", "")
    update_synonym(f"CHEMBL{chembl_id}")
    update_synonym(f"CHEMBL {chembl_id}")
    update_synonym(f"CHEMBL-{chembl_id}")
    update_synonym(chembl_name)

    # Keep only runs of two or more letters/digits (`re` is the `regex`
    # module here, hence \p{L}); this drops punctuation that would otherwise
    # be interpreted as Lucene query syntax.
    clean_chembl_name = " ".join(re.findall(r"[\p{L}\d]{2,}", chembl_name))

    query_clauses = [f"CHEMBL{chembl_id}", f"(chembl AND {chembl_id})"]
    if clean_chembl_name:
        # An empty name would yield an empty "()" clause in the query, so the
        # name is only used when something is left after cleaning.
        update_synonym(clean_chembl_name)
        query_clauses.append(f"({clean_chembl_name})")

    # Top 10 full-text hits, grouped per compound; `num` is the number of
    # hits that point at that compound.
    response = graph.run(
        """
        CALL {
            CALL db.index.fulltext.queryNodes('synonymsFullText', $lucense_query)
            YIELD node, score
            return node, score limit 10
        }
        MATCH (node)-[:IS_ATTRIBUTE_OF]->(c:Compound)
        WITH c.pubChemCompId as compoundId, collect(node)[0] as node, count(*) as num
        RETURN compoundId, node.name as syn_name, node.pubChemSynId as syn_id, num
        """,
        lucense_query=" OR ".join(query_clauses),
    ).data()

    if not response:
        continue

    # Use the compound backed by the most hits (first one wins on ties).
    # Previously this selection was computed but response[0] was stored.
    selected_response = max(response, key=lambda p: p["num"])
    moa["syn_name"] = selected_response["syn_name"]
    moa["syn_id"] = selected_response["syn_id"]
    all_moa_nsc.append(moa)

100%|██████████| 6259/6259 [6:11:15<00:00,  3.56s/it]   


In [48]:
# Write all MOA records (matched or not) to results/all_moa_with_synonym.json.
synonym_output_path = path_join("results", "all_moa_with_synonym.json")
with open(synonym_output_path, "w") as synonym_file:
    json.dump(all_moa, synonym_file)

## Add NSC number

In [4]:
# Reload the synonym-annotated MOA records from disk into `all_moa`.
synonym_input_path = path_join("results", "all_moa_with_synonym.json")
with open(synonym_input_path, "r") as synonym_file:
    all_moa = json.load(synonym_file)

In [36]:
# For each record with a matched synonym, collect the numbers found in the
# "nsc" synonyms of the same compound. A single number is stored under
# "NSC"; several distinct numbers are stored as a list under "all_NSC".
for moa in tqdm(all_moa):
    syn_id = moa.get("syn_id")
    if syn_id is None:
        continue

    # Raw string: "\d" is not a valid Python escape sequence, so a plain
    # literal triggers a SyntaxWarning on recent interpreters. The text sent
    # to Neo4j is unchanged.
    response = graph.run(
        r"""
        MATCH (s:Synonym {pubChemSynId: $synId})
        MATCH (s)-[:IS_ATTRIBUTE_OF]->(c:Compound)
        MATCH (c)<-[:IS_ATTRIBUTE_OF]-(s2:Synonym)
        WHERE s2.name CONTAINS "nsc"
        WITH toInteger(apoc.text.regexGroups(s2.name, "\d+")[0][0]) as num
        WHERE NOT num IS NULL
        RETURN DISTINCT num
        """,
        synId=syn_id,
    ).data()

    nsc_numbers = [row["num"] for row in response]
    if not nsc_numbers:
        continue
    if len(nsc_numbers) > 1:
        # Ambiguous: keep every candidate instead of picking one.
        moa["all_NSC"] = nsc_numbers
    else:
        moa["NSC"] = nsc_numbers[0]

100%|██████████| 6259/6259 [58:43<00:00,  1.78it/s]  


In [5]:
# Write the MOA records to results/all_moa_with_NSC.json.
nsc_output_path = path_join("results", "all_moa_with_NSC.json")
with open(nsc_output_path, "w") as nsc_file:
    json.dump(all_moa, nsc_file)

In [52]:
def clean_name(name):
    """Return a lowercased, simplified form of ``name`` for comparison.

    Stereo-descriptor prefixes such as ``(+/-)-`` or ``(-)-``, parentheses
    and ``"+ "`` are removed (in that order), then every remaining hyphen is
    replaced by a space.
    """
    # Order matters: the multi-character markers must be removed before the
    # bare parentheses, otherwise they would no longer match.
    removable_tokens = ("(+/-)-", "(+-)-", "(+)-", "(-)-", "(", ")", "+ ")

    normalized = name.lower()
    for token in removable_tokens:
        normalized = normalized.replace(token, "")
    return normalized.replace("-", " ")
# Print every NSC-annotated record whose matched synonym, once cleaned,
# equals neither the cleaned ChEMBL name nor the cleaned ChEMBL ID.
for moa in all_moa:
    if not moa.get("NSC") and not moa.get("all_NSC"):
        continue

    raw_syn_name = moa.get("syn_name", "")
    raw_chem_name = moa.get("chem_name", "")
    raw_chem_id = moa.get("chem_id", "")

    syn_name, chem_name, chem_id = (
        clean_name(raw_value)
        for raw_value in (raw_syn_name, raw_chem_name, raw_chem_id)
    )

    if syn_name not in (chem_name, chem_id):
        print(f"{raw_syn_name} \t:\t {raw_chem_name} \t:\t {raw_chem_id}")

oxybate (sodium) 	:	 SODIUM OXYBATE 	:	 CHEMBL1200682
perhexiline maleate, (-)- 	:	 PERHEXILINE MALEATE 	:	 CHEMBL1334033
butoxide, piperonyl 	:	 PIPERONYL BUTOXIDE 	:	 CHEMBL1201131
butoconazole, butoconazole nitrate 	:	 BUTOCONAZOLE NITRATE 	:	 CHEMBL1200398
decamethonium  bromide 	:	 DECAMETHONIUM BROMIDE 	:	 CHEMBL1134
deferoxamine, deferoxamine mesylate 	:	 DEFEROXAMINE MESYLATE 	:	 CHEMBL1234
prednisolone phosphate sodium 	:	 PREDNISOLONE SODIUM PHOSPHATE 	:	 CHEMBL1201014
hydrocortisone butyrate (hydrocortisone 17-butyrate) 	:	 HYDROCORTISONE BUTYRATE 	:	 CHEMBL1683
3-chloro-alfa,alfa,alfa-trifluorotoluene 	:	 VELAGLUCERASE ALFA 	:	 CHEMBL1201865
silver sulfadiazine 	:	 SULFADIAZINE, SILVER 	:	 CHEMBL1382627
calcium trisodium pentetate 	:	 PENTETATE CALCIUM TRISODIUM 	:	 CHEMBL1200945
sodium thiamylal 	:	 THIAMYLAL SODIUM 	:	 CHEMBL1201065
sodium thiopental 	:	 THIOPENTAL SODIUM 	:	 CHEMBL738
disodium ticarcillin 	:	 TICARCILLIN DISODIUM 	:	 CHEMBL1200855
betamethasone valerate 

## Remove filtered chemicals

In [11]:
# Read the list of rejected synonym/ChEMBL pairings. Each line is split on
# " \t:\t " and the third field is taken as the ChEMBL ID (presumably the
# lines use the "<syn_name> \t:\t <chem_name> \t:\t <chem_id>" layout).
with open("incorrect_chembl_nsc_combi") as f:
    all_incorrect_chemicals = f.readlines()

all_incorrect_chembl_ids = []
for line in all_incorrect_chemicals:
    fields = line.split(" \t:\t ")
    if len(fields) < 3:
        # Blank or truncated line: there is no ID field to extract.
        continue
    # strip() also removes "\r" and stray trailing spaces, which would
    # otherwise make the ID silently never match during filtering.
    incorrect_chembl_id = fields[2].strip()
    if incorrect_chembl_id:
        all_incorrect_chembl_ids.append(incorrect_chembl_id)

In [14]:
# Keep the records that carry a ChEMBL ID which is not on the rejected list.
filtered_moa = [
    moa
    for moa in all_moa
    if (chem_id := moa.get("chem_id")) is not None
    and chem_id not in all_incorrect_chembl_ids
]

# Report how many records survived versus how many there were.
print(len(filtered_moa))
print(len(all_moa))

5141
6259


In [15]:
# Write the filtered records to results/all_moa_with_NSC_filtered.json.
filtered_output_path = path_join("results", "all_moa_with_NSC_filtered.json")
with open(filtered_output_path, "w") as filtered_file:
    json.dump(filtered_moa, filtered_file)