In [1]:
from zipfile import ZipFile
# Namespace prefix map handed to the ElementTree find() calls that use the 'db:' prefix.
ns = {'db': 'http://www.drugbank.ca'}
import xml.etree.ElementTree as ET
from tqdm import tqdm
from Bio import Entrez
# Parallel lists: position k in each list describes the same DrugBank drug.
# They are zipped together into lookup dicts later, so they must stay aligned.
drug_ids = []
drug_names = []
cas_numbers = []
drugbank_filepath = '/project/reproduct_paper/biodata/data/sources/drugbank_all_full_database.xml.zip'
with ZipFile(drugbank_filepath, 'r') as dbzip:
    with dbzip.open('full database.xml', force_zip64=True) as xmlfile:
        # Stream the XML instead of loading the whole document at once.
        for _, elem in tqdm(ET.iterparse(xmlfile), 'Processing Drugbank mapping'):
            # Check the length of the drug element as pathways also contain drug elements
            if elem.tag == '{http://www.drugbank.ca}drug' and len(elem) > 2:
                drug_id_elem = elem.find('./db:drugbank-id[@primary="true"]', ns)
                did = drug_id_elem.text
                name_elem = elem.find('./db:name', ns)
                cas_number = elem.find('./db:cas-number', ns)
                if cas_number is not None:
                    # Append to all three lists in one place: appending the name
                    # unconditionally (as before) would shift drug_names relative to
                    # drug_ids as soon as one drug has no <cas-number> element.
                    drug_ids.append(did)
                    drug_names.append(name_elem.text)
                    # .text is None when the <cas-number> element is present but empty.
                    cas_numbers.append(cas_number.text)
                # Release the subtree of the drug that was just read.
                elem.clear()


Processing Drugbank mapping: 29846478it [01:41, 294747.29it/s]


In [93]:
# import pandas as pd
# # Build a dictionary that maps each list to its column name
# data = {
#     'drug_id': drug_ids,
#     'drug_name': drug_names,
#     'cas_number': cas_numbers
# }
# 
# # Create the DataFrame
# df = pd.DataFrame(data)
# 
# # Save the DataFrame as a CSV file
# df.to_csv('./drugbank_id_name_csv.csv',header=['drug_id','drug_name','cas_number'], index=False)
# df

Unnamed: 0,drug_id,drug_name,cas_number
0,DB00001,Lepirudin,138068-37-8
1,DB00002,Cetuximab,205923-56-4
2,DB00003,Dornase alfa,143831-71-4
3,DB00004,Denileukin diftitox,173146-27-5
4,DB00005,Etanercept,185243-69-0
...,...,...,...
16576,DB18713,Recombinant stabilized RSV A prefusion F antigen,
16577,DB18714,Recombinant stabilized RSV B prefusion F antigen,
16578,DB18715,Tolebrutinib,1971920-73-6
16579,DB18716,Enmetazobactam,1001404-83-6


In [3]:
import gzip
# Parallel lists filled from the CTD chemicals table: chem_ids[k] belongs to chem_cas_numbers[k].
nb_entries = 0
chem_ids = []
chem_cas_numbers = []
chemical_id_mapping_fp = "/project/reproduct_paper/biodata/data/sources/CTD_chemicals.tsv.gz"
with gzip.open(chemical_id_mapping_fp, 'rt') as chem_map_fd:
    for line in chem_map_fd:
        # Lines starting with '#' are skipped (header/comment lines).
        if line.startswith('#'):
            continue
        # Remove only the line terminator. A bare strip() also eats leading/trailing
        # tabs, which shifts or drops columns whenever the edge fields are empty.
        parts = line.rstrip('\r\n').split('\t')
        if len(parts) < 2:
            # Blank or truncated line without an ID column: nothing to record.
            continue
        nb_entries += 1
        chem_id = parts[1].strip().replace('MESH:', '')
        # Third column is read as the CAS number; treat a missing column as empty.
        cas_number = parts[2].strip() if len(parts) > 2 else ''
        chem_ids.append(chem_id)
        chem_cas_numbers.append(cas_number)

In [5]:
# Lookup tables built from the parallel lists collected in the earlier cells.
drugbank_dict = {d: c for d, c in zip(drug_ids, cas_numbers)}        # DrugBank ID -> CAS number
ctd_dict = {c: m for c, m in zip(chem_cas_numbers, chem_ids)}        # CAS number -> CTD chemical ID
drugid_name_dict = {d: n for d, n in zip(drug_ids, drug_names)}      # DrugBank ID -> drug name

drugid_no_map = []          # DrugBank IDs whose CAS number is not a key of ctd_dict
chemical_to_drugbank = {}   # CTD chemical ID -> DrugBank ID (first drug seen for that ID)
repeat_chemical_id = []     # CTD chemical IDs reached by more than one DrugBank ID
i = 0                       # number of DrugBank IDs whose CAS number was found in ctd_dict

for drugid, casid in drugbank_dict.items():
    if casid not in ctd_dict:
        drugid_no_map.append(drugid)
        continue
    chem_mapid = ctd_dict[casid]
    if chem_mapid not in chemical_to_drugbank:
        chemical_to_drugbank[chem_mapid] = drugid
    else:
        repeat_chemical_id.append(chem_mapid)
    i += 1

# Remove the duplicated IDs so only one-to-one mappings remain.
for repeatid in repeat_chemical_id:
    if repeatid in chemical_to_drugbank:
        del chemical_to_drugbank[repeatid]
len(chemical_to_drugbank)

3808

In [6]:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import random
import time

# Assuming chemical_to_drugbank and drugid_name_dict are predefined by an earlier cell.
# Result container filled by process_drug(): scraped link text -> DrugBank ID.
chemical_to_drugbank1 = {}

# Function to process each drugid
def process_drug(drugid):
    """Query CTD by drug name and record the first MeSH-looking link for ``drugid``.

    The drug name is taken from ``drugid_name_dict``. On success the text of the
    first ``<a>`` whose ``href`` contains ``"mesh"`` is stored as a key of the
    module-level ``chemical_to_drugbank1`` with ``drugid`` as its value.

    Args:
        drugid: DrugBank ID; must be a key of ``drugid_name_dict``.

    Returns:
        None. The result is written to ``chemical_to_drugbank1``.

    Raises:
        KeyError: if ``drugid`` is not in ``drugid_name_dict``.
        ValueError: if the stored name is empty or ``None``.
    """
    drug_name = drugid_name_dict[drugid]
    if not drug_name:
        # Without a name the query would silently degrade to a search with no term.
        raise ValueError(f"no drug name available for {drugid}")

    try:
        # Pass the name through `params` so requests URL-encodes it; concatenating it
        # into the URL breaks on names containing '&', '#', '+' and similar characters.
        # The timeout keeps a stalled connection from blocking a worker thread forever.
        response = requests.get(
            "https://ctdbase.org/basicQuery.go",
            params={"bqCat": "chem", "bq": drug_name},
            timeout=30,
        )
        # Do not scrape error pages; HTTPError is a RequestException and is handled below.
        response.raise_for_status()

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")

        # Extract the MeSH ID; find() returns None when no such link exists,
        # which surfaces as AttributeError on .get_text().
        mesh_id = soup.find("a", href=lambda href: href and "mesh" in href).get_text()
        chemical_to_drugbank1[mesh_id] = drugid
    except (AttributeError, requests.RequestException):
        # Best effort: a drug with no match or a failed request is simply left unmapped.
        pass
    # Random pause after each lookup to spread out requests to the server.
    time.sleep(random.uniform(1, 2))

# Function to handle multi-threading
def process_all_drugs_concurrently(drugid_list, max_workers=30):
    """Run ``process_drug`` for every ID in ``drugid_list`` on a thread pool.

    Args:
        drugid_list: iterable of IDs, each passed to ``process_drug``.
        max_workers: size of the thread pool.

    Returns:
        None. An exception raised by a task is printed together with the ID
        it belongs to; the remaining tasks keep running.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember which ID each future belongs to so failures can be reported.
        futures = {}
        for drugid in drugid_list:
            futures[executor.submit(process_drug, drugid)] = drugid

        # The progress bar advances as tasks finish, not in submission order.
        finished = tqdm(as_completed(futures), total=len(futures))
        for future in finished:
            try:
                # result() re-raises whatever the task raised.
                future.result()
            except Exception as e:
                print(f"Error processing {futures[future]}: {e}")

# Run the multi-threaded processing for the IDs collected in drugid_no_map.
# process_drug stores its results in chemical_to_drugbank1.
process_all_drugs_concurrently(drugid_no_map)

100%|██████████| 12756/12756 [19:21<00:00, 10.98it/s]


In [15]:
# Combine both mappings. Start from the name-based results, then overlay the
# CAS-based ones so that chemical_to_drugbank wins whenever a key is in both.
chemical_to_drugbank_merged_dict = dict(chemical_to_drugbank1)
chemical_to_drugbank_merged_dict.update(chemical_to_drugbank)
print(len(chemical_to_drugbank_merged_dict))

7205


In [17]:
import gzip
import csv

# Save the dictionary to a gzip-compressed file, one tab-separated "key, value" row per entry.
# newline='' leaves line-ending handling to the csv writer.
with gzip.open('chemical_drugbank.txt.gz', 'wt', newline='') as gz_file:
    csv.writer(gz_file, delimiter='\t').writerows(chemical_to_drugbank_merged_dict.items())