# Learning a Patent-Informed Biomedical Knowledge Graph Reveals Technological Potential of Drug Repositioning Candidates

## Import Library

In [1]:
import pandas as pd
import itertools as it
import pymysql
import numpy as np
import datetime
from sqlalchemy import create_engine
pymysql.install_as_MySQLdb()
import MySQLdb
import networkx as nx
import pymysql
import xml.etree.ElementTree as ET
from tqdm import tqdm
import requests
import json
import time
import csv
import re
import os
import warnings
import torch
import pickle
from node2vec import Node2Vec

## DB Connection

In [2]:
# First, you must construct dgidb and disgenet db
# Change below codes for your environment

import os
import pymysql

host = os.getenv("DB_HOST")
port = int(os.getenv("DB_PORT"))
user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
db_name = os.getenv("DB_NAME")
charset = os.getenv("DB_CHARSET", "utf8mb4")

conn = pymysql.connect(
    host=host,
    port=port,
    user=user,
    password=password,
    db=db_name,
    charset=charset,
)
cursor = conn.cursor(pymysql.cursors.DictCursor)


## DGIdb

In [3]:
# Connect DGIdb

cursor.execute('''SELECT * FROM db_name.dgidb''')
data = cursor.fetchall()

In [4]:
dgidb = pd.DataFrame(data)

In [5]:
# all edges
dgidb.head()

Unnamed: 0,drug_concept_id,entrez_id,interaction_types,interaction_group_score
0,CHEMBL10,1432,,0.13
1,CHEMBL10,1452,,0.14
2,CHEMBL10,1453,,0.11
3,CHEMBL10,1557,,0.01
4,CHEMBL10,1559,,0.01


In [6]:
# Connect each data with CHEMBL db

cursor.execute('''SELECT * FROM db_name.dgidb_drug_chembl''')
nodes = cursor.fetchall()

In [7]:
dgidb_drug = pd.DataFrame(nodes)

In [8]:
# all drug nodes
dgidb_drug.head()

Unnamed: 0,concept_id,drug_claim_name,drug_name,mw_freebase,alogp,hba,hbd,psa,rtb,ro3_pass,...,cx_logd,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations
0,CHEMBL10,SB-203580,SB-203580,377.44,4.68,3.0,1.0,58.64,4.0,N,...,3.14,NEUTRAL,377.44,4.0,27.0,0.56,377.0998,4.0,1.0,0.0
1,CHEMBL1000,chembl:CHEMBL1000,CETIRIZINE,388.9,3.15,4.0,1.0,53.01,8.0,N,...,0.65,ACID,388.9,2.0,27.0,0.7,388.1554,5.0,1.0,0.0
2,CHEMBL100014,AN-9,AN-9,202.25,1.88,4.0,0.0,52.6,4.0,N,...,2.73,,202.25,0.0,14.0,0.52,202.1205,4.0,0.0,0.0
3,CHEMBL100109,chembl:CHEMBL100109,CHEMBL100109,432.33,,,,,,,...,,,432.33,,,,432.2544,,,
4,CHEMBL100116,PENTAZOCINE,PENTAZOCINE,285.43,3.88,2.0,1.0,23.47,2.0,N,...,1.99,BASE,285.43,1.0,21.0,0.83,285.2093,2.0,1.0,0.0


In [9]:
dgidb

Unnamed: 0,drug_concept_id,entrez_id,interaction_types,interaction_group_score
0,CHEMBL10,1432,,0.13
1,CHEMBL10,1452,,0.14
2,CHEMBL10,1453,,0.11
3,CHEMBL10,1557,,0.01
4,CHEMBL10,1559,,0.01
...,...,...,...,...
45767,CHEMBL998,6863,,0.54
45768,CHEMBL998,7253,,0.04
45769,CHEMBL99946,6530,,0.87
45770,CHEMBL99946,6531,inhibitor,0.35


In [10]:
dgidb_drug

Unnamed: 0,concept_id,drug_claim_name,drug_name,mw_freebase,alogp,hba,hbd,psa,rtb,ro3_pass,...,cx_logd,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations
0,CHEMBL10,SB-203580,SB-203580,377.44,4.68,3.0,1.0,58.64,4.0,N,...,3.14,NEUTRAL,377.44,4.0,27.0,0.56,377.0998,4.0,1.0,0.0
1,CHEMBL1000,chembl:CHEMBL1000,CETIRIZINE,388.90,3.15,4.0,1.0,53.01,8.0,N,...,0.65,ACID,388.90,2.0,27.0,0.70,388.1554,5.0,1.0,0.0
2,CHEMBL100014,AN-9,AN-9,202.25,1.88,4.0,0.0,52.60,4.0,N,...,2.73,,202.25,0.0,14.0,0.52,202.1205,4.0,0.0,0.0
3,CHEMBL100109,chembl:CHEMBL100109,CHEMBL100109,432.33,,,,,,,...,,,432.33,,,,432.2544,,,
4,CHEMBL100116,PENTAZOCINE,PENTAZOCINE,285.43,3.88,2.0,1.0,23.47,2.0,N,...,1.99,BASE,285.43,1.0,21.0,0.83,285.2093,2.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13008,CHEMBL998,loratadine,LORATADINE,382.89,4.89,3.0,0.0,42.43,1.0,N,...,4.55,NEUTRAL,382.89,2.0,27.0,0.70,382.1448,4.0,0.0,0.0
13009,CHEMBL99842,chembl:CHEMBL99842,CHEMBL99842,267.29,2.06,5.0,1.0,72.18,2.0,N,...,0.83,NEUTRAL,267.29,3.0,20.0,0.77,267.1120,6.0,1.0,0.0
13010,CHEMBL99889,chembl:CHEMBL99889,CHEMBL99889,443.91,3.28,3.0,3.0,85.43,5.0,N,...,2.47,NEUTRAL,443.91,3.0,31.0,0.57,443.1412,6.0,3.0,0.0
13011,CHEMBL99896,chembl:CHEMBL99896,CHEMBL99896,370.86,4.02,3.0,2.0,40.52,5.0,N,...,3.97,ACID,370.86,2.0,24.0,0.47,370.0831,4.0,1.0,0.0


In [11]:
cursor.execute('''SELECT * FROM db_name.dgidb_genes''')
genes = cursor.fetchall()

In [12]:
dgidb_gene = pd.DataFrame(genes)

In [13]:
# all gene nodes
dgidb_gene.head()

Unnamed: 0,entrez_id,gene_claim_name,gene_name
0,1,ENSG00000121410,A1BG
1,10,NAT2,NAT2
2,100,ENSG00000196839,ADA
3,1000,CDH2,CDH2
4,10000,AKT3,AKT3


In [14]:
dgidb_gene

Unnamed: 0,entrez_id,gene_claim_name,gene_name
0,1,ENSG00000121410,A1BG
1,10,NAT2,NAT2
2,100,ENSG00000196839,ADA
3,1000,CDH2,CDH2
4,10000,AKT3,AKT3
...,...,...,...
37464,9992,ENSG00000159197,KCNE2
37465,9993,ENSG00000070413,DGCR2
37466,9994,ENSG00000118412,CASP8AP2
37467,9995,ENSG00000234402,ELK2BP


## DisGeNET

In [15]:
cursor.execute('''SELECT * FROM db_name.disgenet''')
data = cursor.fetchall()

In [16]:
disgenet = pd.DataFrame(data)

In [17]:
disgenet.head()

Unnamed: 0,NID,diseaseId,geneId,association,associationType,score,EL,EI,year
0,1,C0000727,1468,,Biomarker,0.01,,1.0,2018
1,10,C0000731,1482,,Biomarker,0.1,,0.0,0
2,100,C0000731,8022,,Biomarker,0.1,,0.0,0
3,1000,C0000768,1718,,AlteredExpression,0.02,,1.0,2018
4,10000,C0001344,3553,,AlteredExpression,0.01,,1.0,2018


In [18]:
cursor.execute('''SELECT * FROM db_name.disgenet_geneattributes''')
data = cursor.fetchall()

In [19]:
disgenet_gene = pd.DataFrame(data)

In [20]:
disgenet_gene.head()

Unnamed: 0,geneNID,geneId,geneName,geneDescription,pLI,DSI,DPI
0,1,1,A1BG,alpha-1-B glycoprotein,4.9917e-09,0.7,0.538
1,5,10,NAT2,N-acetyltransferase 2,3.2744e-06,0.451,0.885
2,68,100,ADA,adenosine deaminase,2.8825e-12,0.44,0.885
3,751,1000,CDH2,cadherin 2,0.99171,0.535,0.731
4,7138,10000,AKT3,AKT serine/threonine kinase 3,0.99958,0.512,0.808


In [21]:
disgenet_gene

Unnamed: 0,geneNID,geneId,geneName,geneDescription,pLI,DSI,DPI
0,1,1,A1BG,alpha-1-B glycoprotein,4.991700e-09,0.700,0.538
1,5,10,NAT2,N-acetyltransferase 2,3.274400e-06,0.451,0.885
2,68,100,ADA,adenosine deaminase,2.882500e-12,0.440,0.885
3,751,1000,CDH2,cadherin 2,9.917100e-01,0.535,0.731
4,7138,10000,AKT3,AKT serine/threonine kinase 3,9.995800e-01,0.512,0.808
...,...,...,...,...,...,...,...
26132,7133,9991,PTBP3,polypyrimidine tract binding protein 3,8.629500e-01,0.751,0.154
26133,7134,9992,KCNE2,potassium voltage-gated channel subfamily E re...,1.014500e-04,0.628,0.423
26134,7135,9993,DGCR2,DiGeorge syndrome critical region gene 2,5.157300e-06,0.623,0.731
26135,7136,9994,CASP8AP2,caspase 8 associated protein 2,0.000000e+00,0.659,0.500


In [22]:
cursor.execute('''SELECT * FROM db_name.disgenet_disease''')
data = cursor.fetchall()

In [23]:
disgenet_disease = pd.DataFrame(data)

In [24]:
disgenet_disease.head()

Unnamed: 0,diseaseNID,diseaseId,diseaseName,diseaseType,diseaseSemanticType,NofGenes,NofPmids
0,1,C0000727,"Abdomen, Acute",phenotype,Sign or Symptom,2,2
1,2,C0000729,Abdominal Cramps,phenotype,Sign or Symptom,1,1
2,3,C0000731,Abdomen distended,phenotype,Finding,103,0
3,4,C0000734,Abdominal mass,phenotype,Finding,2,0
4,5,C0000735,Abdominal Neoplasms,group,Neoplastic Process,13,13


In [25]:
disgenet_disease

Unnamed: 0,diseaseNID,diseaseId,diseaseName,diseaseType,diseaseSemanticType,NofGenes,NofPmids
0,1,C0000727,"Abdomen, Acute",phenotype,Sign or Symptom,2,2
1,2,C0000729,Abdominal Cramps,phenotype,Sign or Symptom,1,1
2,3,C0000731,Abdomen distended,phenotype,Finding,103,0
3,4,C0000734,Abdominal mass,phenotype,Finding,2,0
4,5,C0000735,Abdominal Neoplasms,group,Neoplastic Process,13,13
...,...,...,...,...,...,...,...
30165,30289,C4755314,Autosomal recessive cutis laxa type 2B,disease,Disease or Syndrome,1,1
30166,30290,C4757950,Isolated ATP synthase deficiency,disease,Disease or Syndrome,8,10
30167,30291,C4757951,Desmoplastic infantile astrocytoma and ganglio...,disease,Neoplastic Process,1,1
30168,30292,C4759295,Non-metastatic prostate cancer,disease,Neoplastic Process,18,32


## s-BKG Construction

In [23]:
# Create an empty graph
G = nx.Graph()

# Add gene nodes from dgidb_gene and disgenet_gene
for i, row in disgenet_gene.iterrows():
    gene_id = row['geneId']
    if gene_id not in G:
        G.add_node(gene_id, type='gene', **row.to_dict())
for i, row in dgidb_gene.iterrows():
    gene_id = row['entrez_id'] # geneId, entrez_id same index
    if gene_id not in G:
        G.add_node(gene_id, type='gene', **row.to_dict())
        
# Add disease nodes from disgenet_disease
for i, row in disgenet_disease.iterrows():
    disease_id = row['diseaseId']
    if disease_id not in G:
        G.add_node(disease_id, type='disease', **row.to_dict())

# Add drug nodes from dgidb_drug
for i, row in dgidb_drug.iterrows():
    drug_id = row['concept_id']
    if drug_id not in G:
        G.add_node(drug_id, type='drug', **row.to_dict())

# Add gene-disease edges from disgenet_gene_disease
for i, row in disgenet.iterrows():
    gene_id = row['geneId']
    disease_id = row['diseaseId']
    asso_type = row['associationType']
    if gene_id in G and disease_id in G:
        G.add_edge(gene_id, disease_id, type=asso_type, **row.to_dict())

# Add gene-drug edges from dgidb_gene_drug
for i, row in dgidb.iterrows():
    gene_id = row['entrez_id']
    drug_id = row['drug_concept_id']
    interaction_type = row['interaction_types']
    if gene_id in G and drug_id in G:
        G.add_edge(gene_id, drug_id, type='dgidb_'+interaction_type, **row.to_dict())

In [181]:
G.number_of_nodes()

85470

In [182]:
G.number_of_edges()

1178058

In [29]:
# lists for drug nodes, disease nodes and gene nodes

drug_nodes = [n for n,d in G.nodes(data=True) if d["type"] == "drug"]
disease_nodes = [n for n,d in G.nodes(data=True) if d["type"] == "disease"]
gene_nodes = [n for n,d in G.nodes(data=True) if d["type"] == "gene"]

## Applying Node2Vec

In [25]:
model = Node2Vec(G, dimensions=15, walk_length=5, num_walks=100, workers=1)

Computing transition probabilities:   0%|          | 0/85470 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████| 100/100 [1:12:14<00:00, 43.34s/it]


In [17]:
# Train the model
model_n2v = model.fit(window=5)

In [27]:
embeddings = {}

In [28]:
for word in G.nodes():
    embeddings[word] = model_n2v.wv[word]

## Screening drug repositioning candidates with s-BKG

In [39]:
# List of therpeutic drugs for Alzheimer's Disease

ctd = pd.read_csv("ctd_file.csv")

In [40]:
ctd

Unnamed: 0,Chemical Name,Chemical ID,CAS RN,Direct Evidence,Inference Network,Inference Score,Reference Count,drug_concept_id
0,Folic Acid,D005492,59-30-3,marker/mechanism|therapeutic,ABCA7|ACE|ACHE|ADAMTS1|APH1B|APOE|APP|ARC|BACE...,31.03,101,CHEMBL1622
1,Lithium,D008094,7439-93-2,marker/mechanism|therapeutic,APP|BAX|BCL2|BDNF|CASP3|GSK3B|HMOX1|IL1B|MAPT|...,26.57,55,CHEMBL2146126
2,Vitamin B 12,D014805,68-19-9,marker/mechanism|therapeutic,APP|CASP3|IL1B|MAOB|MTHFR|PPARG|PSEN1|TNF,16.10,53,x
3,Melatonin,D008550,73-31-4,therapeutic,ACHE|APP|BACE1|BAX|BCL2|BDNF|CALM1|CASP3|EIF2S...,59.34,67,CHEMBL45
4,Curcumin,D003474,458-37-7,therapeutic,ACE|ACHE|APOE|APP|BACE1|BAX|BCL2|BDNF|CASP3|CY...,53.84,76,CHEMBL140
...,...,...,...,...,...,...,...,...
72,Molindone,D008972,7416-34-4,therapeutic,,,1,CHEMBL460
73,nirogacestat,C550722,,therapeutic,,,1,CHEMBL1770916
74,Perphenazine,D010546,58-39-9,therapeutic,,,1,CHEMBL567
75,Pindolol,D010869,13523-86-9,therapeutic,,,1,CHEMBL500


In [42]:
# The threshold is 0.7
# C0002395 means AD

similarity_scores_07df = []
for drug_node in drug_nodes:
    similarity_score = np.dot(embeddings[drug_node], embeddings['C0002395'])/(np.linalg.norm(embeddings[drug_node])*np.linalg.norm(embeddings['C0002395']))
    if similarity_score >= 0.7:
        similarity_scores_07df.append([drug_node, similarity_score, embeddings[drug_node]])
        
# Store the similarity scores in a pandas DataFrame
similarity_scores_07df = pd.DataFrame(similarity_scores_07df, columns=["drug_concept_id", "cosine_similarity", 'embedding_vec'])

# Sort the DataFrame by the similarity score in descending order
similarity_scores_07df.sort_values(by="cosine_similarity", ascending=False, inplace=True)

In [43]:
similarity_scores_07df

Unnamed: 0,drug_concept_id,cosine_similarity,embedding_vec
92,CHEMBL254836,0.918763,"[-0.04922188, 0.016398478, 0.033988, -0.036887..."
173,CHEMBL4298172,0.864723,"[-0.8197736, 0.51376766, 0.80798775, -0.311405..."
82,CHEMBL225411,0.821627,"[-0.48495638, 0.58420175, 0.6934009, -0.298105..."
117,CHEMBL3527358,0.808408,"[-0.7783383, 0.6663578, 0.8943033, 0.13076878,..."
167,CHEMBL4297504,0.807645,"[-0.66165805, 0.5607038, 0.93603456, -0.029648..."
...,...,...,...
122,CHEMBL357076,0.701239,"[-0.1970351, 0.73980576, 0.3411201, -0.8149018..."
83,CHEMBL2311194,0.701226,"[-0.35792848, 0.074165925, 0.8919105, -1.05470..."
46,CHEMBL185515,0.701110,"[-0.08127207, -0.17383432, 0.20218584, -0.2289..."
32,CHEMBL1677,0.700845,"[-0.16574568, 0.5572725, 0.5874922, -1.1577334..."


In [45]:
# Three drugs are therapeutic drugs

pd.merge(similarity_scores_07df, ctd, on='drug_concept_id')

Unnamed: 0,drug_concept_id,cosine_similarity,embedding_vec,Chemical Name,Chemical ID,CAS RN,Direct Evidence,Inference Network,Inference Score,Reference Count
0,CHEMBL1090771,0.726821,"[-0.99992114, 0.51871026, 1.3514118, -0.641754...",BMS 708163,C554092,,therapeutic,APP,4.01,38
1,CHEMBL247471,0.717062,"[-0.8848872, 0.9137875, 1.2488608, -0.8595964,...","4-(2-((1R)-1-(((4-chlorophenyl)sulfonyl)-2,5-d...",C497115,,therapeutic,,,1
2,CHEMBL54,0.703278,"[-0.93252087, 0.9202858, -0.004359702, -1.2978...",Haloperidol,D006220,52-86-8,therapeutic,APOC1|ARC|BAX|BCL2|BDNF|CASP3|CRH|CST3|CYP2D6|...,20.57,25


In [48]:
dr_candidates = df.query('_merge == "left_only" ').drop(columns=['_merge']).iloc[:, [0,1]]

In [79]:
# Screening candidates

dr_candidates

Unnamed: 0,drug_concept_id,cosine_similarity
0,CHEMBL254836,0.918763
1,CHEMBL4298172,0.864723
2,CHEMBL225411,0.821627
3,CHEMBL3527358,0.808408
4,CHEMBL4297504,0.807645
...,...,...
227,CHEMBL357076,0.701239
228,CHEMBL2311194,0.701226
229,CHEMBL185515,0.701110
230,CHEMBL1677,0.700845


In [28]:
merged_score07 = pd.merge(similarity_scores_07df, ctd, on='drug_concept_id')

In [29]:
merged_score07

Unnamed: 0,drug_concept_id,cosine_similarity,embedding_vec,Chemical Name,Chemical ID,CAS RN,Direct Evidence,Inference Network,Inference Score,Reference Count
0,CHEMBL1090771,0.726821,"[-0.99992114, 0.51871026, 1.3514118, -0.641754...",BMS 708163,C554092,,therapeutic,APP,4.01,38
1,CHEMBL247471,0.717062,"[-0.8848872, 0.9137875, 1.2488608, -0.8595964,...","4-(2-((1R)-1-(((4-chlorophenyl)sulfonyl)-2,5-d...",C497115,,therapeutic,,,1
2,CHEMBL54,0.703278,"[-0.93252087, 0.9202858, -0.004359702, -1.2978...",Haloperidol,D006220,52-86-8,therapeutic,APOC1|ARC|BAX|BCL2|BDNF|CASP3|CRH|CST3|CYP2D6|...,20.57,25


In [46]:
df = pd.merge(similarity_scores_07df, ctd, how='outer', indicator=True)

In [47]:
df

Unnamed: 0,drug_concept_id,cosine_similarity,embedding_vec,Chemical Name,Chemical ID,CAS RN,Direct Evidence,Inference Network,Inference Score,Reference Count,_merge
0,CHEMBL254836,0.918763,"[-0.04922188, 0.016398478, 0.033988, -0.036887...",,,,,,,,left_only
1,CHEMBL4298172,0.864723,"[-0.8197736, 0.51376766, 0.80798775, -0.311405...",,,,,,,,left_only
2,CHEMBL225411,0.821627,"[-0.48495638, 0.58420175, 0.6934009, -0.298105...",,,,,,,,left_only
3,CHEMBL3527358,0.808408,"[-0.7783383, 0.6663578, 0.8943033, 0.13076878,...",,,,,,,,left_only
4,CHEMBL4297504,0.807645,"[-0.66165805, 0.5607038, 0.93603456, -0.029648...",,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...
301,CHEMBL460,,,Molindone,D008972,7416-34-4,therapeutic,,,1.0,right_only
302,CHEMBL1770916,,,nirogacestat,C550722,,therapeutic,,,1.0,right_only
303,CHEMBL567,,,Perphenazine,D010546,58-39-9,therapeutic,,,1.0,right_only
304,CHEMBL500,,,Pindolol,D010869,13523-86-9,therapeutic,,,1.0,right_only


In [None]:
dr_candidates = df.query('_merge == "left_only" ').drop(columns=['_merge']).iloc[:, [0,1]]

In [49]:
dr_candidates

Unnamed: 0,drug_concept_id,cosine_similarity
0,CHEMBL254836,0.918763
1,CHEMBL4298172,0.864723
2,CHEMBL225411,0.821627
3,CHEMBL3527358,0.808408
4,CHEMBL4297504,0.807645
...,...,...
227,CHEMBL357076,0.701239
228,CHEMBL2311194,0.701226
229,CHEMBL185515,0.701110
230,CHEMBL1677,0.700845


## Disease MESH MAPPING (If needed)

- Diseases provided by Disgenet are provided as a UMLS CUI. Therefore, we convert them to MeSH codes. (because the entity extraction is MeSH code)
- Utilising Disgenet API REQUEST

In [None]:
#For this example we are going to use the python default http library
import requests

#Build a dict with the following format, change the value of the two keys your DisGeNET account credentials, if you don't have an account you can create one here https://www.disgenet.org/signup/ 
auth_params = {"email":"id","password":"password"}

api_host = "https://www.disgenet.org/api"

api_key = "your key"
s = requests.Session()
try:
    r = s.post(api_host+'/auth/', data=auth_params)
    if(r.status_code == 200):
        #Lets store the api key in a new variable and use it again in new requests
        json_response = r.json()
        api_key = json_response.get("token")
    else:
        print(r.status_code)
        print(r.text)
except requests.exceptions.RequestException as req_ex:
    print(req_ex)
    print("Something went wrong with the request.")

if api_key:
    #Add the api key to the requests headers of the requests Session object in order to use the restricted endpoints.
    s.headers.update({"Authorization": "Bearer %s" % api_key}) 
    gda_response = s.get(api_host+'/disease/mesh/D005334')
    print(gda_response.json())

if s:
    s.close()

In [67]:
disease_info

Unnamed: 0,entityNID,entityID,entityName
0,8449,C000597569,atypical teratoid/rhabdoid tumor
1,9496,C000656824,Tinea manuum
2,9497,C000656865,Penicillium marneffei infection
3,9498,C000656904,Tinea nigra
4,9499,C000656924,scedosporiosis
...,...,...,...
3001,30536,D065634,CSF
3002,30537,D065635,BK virus
3003,30538,D065666,mesenteric infarction
3004,30540,D065886,neurodevelopmental diseases


In [95]:
dataset = []

In [116]:
for i in tqdm(range(2553, disease_info.shape[0])):
    nid = disease_info.iloc[i, 0]
    mesh = disease_info.iloc[i, 1]
    if api_key:
        #Add the api key to the requests headers of the requests Session object in order to use the restricted endpoints.
        s.headers.update({"Authorization": "Bearer %s" % api_key}) 
        gda_response = s.get(api_host+'/disease/mesh/{0}'.format(mesh))
        if(gda_response.status_code == 400):
            dataset.append([nid, mesh, ''])
        elif(gda_response.status_code == 404):
            dataset.append([nid, mesh, ''])
        else:
            disease_id = gda_response.json()[0]['diseaseid']
            dataset.append([nid, mesh, disease_id])

100%|████████████████████████████████████████████████████████████████████████████████| 453/453 [04:38<00:00,  1.63it/s]


In [119]:
disease_mapping = pd.DataFrame(dataset, columns=["entityNID", "mesh_id", "disease_id"])

In [31]:
disease_mapping

Unnamed: 0,entityNID,mesh_id,disease_id
0,8449,C000597569,
1,9496,C000656824,
2,9497,C000656865,
3,9498,C000656904,
4,9499,C000656924,
...,...,...,...
3001,30536,D065634,
3002,30537,D065635,C0155502
3003,30538,D065666,C0267412
3004,30540,D065886,C1535926


## p-BKG Construction

- You must construct database for FDA Orange Book Database USPTO Patents 
- For FDA Orange Book data, you can visit https://www.fda.gov/drugs/drug-approvals-and-databases/orange-book-data-files
- For USPTO Patents data, you can visit https://patentsview.org/download/data-download-tables

In [89]:
cursor.execute('''SELECT *
FROM db_name.fda_patent;''')
data = cursor.fetchall()

In [90]:
fda_patents = pd.DataFrame(data)

In [91]:
fda_patents

Unnamed: 0,id,Patent_No,Appl_No,Product_No,Drug_Substance_Flag,Drug_Product_Flag,Delist_Flag,Patent_Expire_Date,Submission_Date
0,13217_3,7122566,13217,3,,,,2026-02-06,
1,17697_1,6803046,17697,1,,Y,,2022-08-16,
2,18207_1,8133893,18207,1,Y,Y,,2029-03-13,2015-07-20
3,18207_2,8133893,18207,2,Y,Y,,2029-03-13,2015-07-20
4,18207_3,8133893,18207,3,Y,Y,,2029-03-13,2015-07-20
...,...,...,...,...,...,...,...,...,...
12268,50819_2,10137142,50819,2,,Y,,2029-06-03,2018-11-28
12269,50819_2,10220049,50819,2,,Y,,2029-06-03,2019-03-08
12270,50819_2,8288434,50819,2,,Y,,2029-08-05,2014-12-03
12271,50819_2,9504704,50819,2,,Y,,2029-06-03,2016-12-01


In [92]:
cursor.execute('''SELECT distinct c.reg_num, c.ipc4
FROM uspto_db_name.ipcr c
inner JOIN db_name.fda_patent_filtered f
ON c.reg_num = f.Patent_No;''')
data = cursor.fetchall()

In [93]:
pat_ipc = pd.DataFrame(data)

In [94]:
pat_ipc.columns = ['Patent_No', 'ipc4']

In [95]:
pat_ipc

Unnamed: 0,Patent_No,ipc4
0,7122566,A61K
1,6803046,A61K
2,8133893,A61K
3,8133893,C07D
4,8323683,A24B
...,...,...
8369,10137142,A61Q
8370,9504704,A61K
8371,9504704,A61Q
8372,9561208,A61K


In [96]:
import pandas as pd

# Merge the two dataframes on 'reg_num' column
merged_df = pd.merge(fda_patents, pat_ipc, on='Patent_No')

# Create one-hot encoding vectors
one_hot_df = pd.get_dummies(merged_df['ipc4'])

# Concatenate the reg_num column and the one-hot encoding vectors
result_df = pd.concat([merged_df['Patent_No'], one_hot_df], axis=1)

# Group the rows by 'reg_num' and sum the one-hot encoding vectors
result_df = result_df.groupby('Patent_No').sum()

# Reset the index to get 'reg_num' as a column
result_df.reset_index(inplace=True)

In [97]:
result_df

Unnamed: 0,Patent_No,A01K,A01N,A01P,A06K,A23C,A23G,A23L,A23P,A24B,...,G16Z,G21G,H01B,H01M,H01Q,H02H,H04B,H04L,H04N,H04Q
0,10004700,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10004717,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,10004729,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,10004743,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,10004746,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4840,9994575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4841,9994851,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4842,9999593,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4843,9999608,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [98]:
merged_df = fda_patents.merge(result_df, on='Patent_No', how='left')
merged_df = merged_df.fillna(0)

In [99]:
merged_df

Unnamed: 0,id,Patent_No,Appl_No,Product_No,Drug_Substance_Flag,Drug_Product_Flag,Delist_Flag,Patent_Expire_Date,Submission_Date,A01K,...,G16Z,G21G,H01B,H01M,H01Q,H02H,H04B,H04L,H04N,H04Q
0,13217_3,7122566,13217,3,0,0,0,2026-02-06,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,17697_1,6803046,17697,1,0,Y,0,2022-08-16,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,18207_1,8133893,18207,1,Y,Y,0,2029-03-13,2015-07-20,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,18207_2,8133893,18207,2,Y,Y,0,2029-03-13,2015-07-20,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,18207_3,8133893,18207,3,Y,Y,0,2029-03-13,2015-07-20,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12268,50819_2,10137142,50819,2,0,Y,0,2029-06-03,2018-11-28,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12269,50819_2,10220049,50819,2,0,Y,0,2029-06-03,2019-03-08,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12270,50819_2,8288434,50819,2,0,Y,0,2029-08-05,2014-12-03,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12271,50819_2,9504704,50819,2,0,Y,0,2029-06-03,2016-12-01,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [100]:
cursor.execute('''SELECT *
FROM dgidb_drugs dd
INNER JOIN fda_drugs d
ON dd.drug_claim_name = d.Ingredient
UNION
SELECT *
FROM dgidb_drugs dd
INNER JOIN fda_drugs d
ON dd.drug_name = d.Ingredient
UNION
SELECT *
FROM dgidb_drugs dd
INNER JOIN fda_drugs d
ON dd.drug_name = d.Trade_Name
UNION
SELECT *
FROM dgidb_drugs dd
INNER JOIN fda_drugs d
ON dd.drug_claim_name = d.Trade_Name;''')
data = cursor.fetchall()

In [101]:
fda_drugs = pd.DataFrame(data)

In [102]:
fda_drugs

Unnamed: 0,concept_id,drug_claim_name,drug_name,id,Appl_No,Product_No,Applicant_Full_Name,Applicant,Trade_Name,Ingredient,Strength,NDA_Type,TE_Code,Approval_Date,RLD,RS,Type,Dosage_Form,Route
0,CHEMBL1201646,Inulin,INULIN,2282_1,2282,1,ISO TEX DIAGNOSTICS INC,ISO TEX,INULIN AND SODIUM CHLORIDE,INULIN,100MG/ML,N,,,No,No,DISCN,INJECTABLE,INJECTION
1,CHEMBL1536,ERGOCALCIFEROL,ERGOCALCIFEROL,3444_1,3444,1,VALIDUS PHARMACEUTICALS LLC,VALIDUS PHARMS,DRISDOL,ERGOCALCIFEROL,"50,000 IU",N,AA,,Yes,Yes,RX,CAPSULE,ORAL
2,CHEMBL1405,ESTRONE,ESTRONE,3977_1,3977,1,PARKEDALE PHARMACEUTICALS INC,PARKEDALE,THEELIN,ESTRONE,1MG/ML,N,,,No,No,DISCN,INJECTABLE,INJECTION
3,CHEMBL1405,ESTRONE,ESTRONE,3977_2,3977,2,PARKEDALE PHARMACEUTICALS INC,PARKEDALE,THEELIN,ESTRONE,2MG/ML,N,,,No,No,DISCN,INJECTABLE,INJECTION
4,CHEMBL1405,ESTRONE,ESTRONE,3977_3,3977,3,PARKEDALE PHARMACEUTICALS INC,PARKEDALE,THEELIN,ESTRONE,5MG/ML,N,,,No,No,DISCN,INJECTABLE,INJECTION
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33458,CHEMBL719,Mupirocin,MUPIROCIN,213053_1,213053,1,ALEOR DERMACEUTICALS LTD,ALEOR DERMACEUTICALS,MUPIROCIN,MUPIROCIN CALCIUM,EQ 2% BASE,A,AB,2021-11-16,No,No,RX,CREAM,TOPICAL
33459,CHEMBL719,Mupirocin,MUPIROCIN,213076_1,213076,1,ENCUBE ETHICALS PRIVATE LTD,ENCUBE,MUPIROCIN,MUPIROCIN CALCIUM,EQ 2% BASE,A,AB,2021-08-31,No,No,RX,CREAM,TOPICAL
33460,CHEMBL981,CHEMBL981,FENOFIBRIC ACID,213450_1,213450,1,MICRO LABS LTD,MICRO LABS,FENOFIBRIC ACID,CHOLINE FENOFIBRATE,EQ 45MG FENOFIBRIC ACID,A,AB,2020-06-16,No,No,RX,"CAPSULE, DELAYED RELEASE",ORAL
33461,CHEMBL981,CHEMBL981,FENOFIBRIC ACID,213450_2,213450,2,MICRO LABS LTD,MICRO LABS,FENOFIBRIC ACID,CHOLINE FENOFIBRATE,EQ 135MG FENOFIBRIC ACID,A,AB,2020-06-16,No,No,RX,"CAPSULE, DELAYED RELEASE",ORAL


In [103]:
drug_patent_asso = fda_drugs.merge(fda_patents, on='id').loc[:,['concept_id', 'Patent_No']].groupby(['concept_id', 'Patent_No']).size().reset_index(name='count')

In [111]:
drug_patent_asso

Unnamed: 0,drug_concept_id,Patent_No,count
0,CHEMBL1009,7182961,1
1,CHEMBL1009,7384649,1
2,CHEMBL1009,8404276,1
3,CHEMBL1009,8545878,1
4,CHEMBL1009,8586093,1
...,...,...,...
3570,CHEMBL981,7741373,2
3571,CHEMBL981,7741374,2
3572,CHEMBL981,7915247,2
3573,CHEMBL989,8871241,2


In [105]:
drug_patent_asso.columns = ['drug_concept_id', 'Patent_No', 'count']

In [112]:
drug_patent_asso

Unnamed: 0,drug_concept_id,Patent_No,count
0,CHEMBL1009,7182961,1
1,CHEMBL1009,7384649,1
2,CHEMBL1009,8404276,1
3,CHEMBL1009,8545878,1
4,CHEMBL1009,8586093,1
...,...,...,...
3570,CHEMBL981,7741373,2
3571,CHEMBL981,7741374,2
3572,CHEMBL981,7915247,2
3573,CHEMBL989,8871241,2


In [107]:
cursor.execute('''SELECT a.reg_num, f.id, f.Trade_Name,a.count
FROM db_name i
INNER JOIN entity_abstract_filtered a
ON a.entityNID = i.entityNID
INNER JOIN fda_drugs f
ON i.entityName = f.Trade_Name
WHERE i.entityType = 'drug'
ORDER BY a.reg_num;''')
data = cursor.fetchall()

In [108]:
drug_pat_co = pd.DataFrame(data)

In [109]:
drug_pat_co

Unnamed: 0,reg_num,id,Trade_Name,count
0,10004717,40568_1,GLYCOPYRROLATE,3
1,10004717,40568_2,GLYCOPYRROLATE,3
2,10004717,40653_1,GLYCOPYRROLATE,3
3,10004717,40653_2,GLYCOPYRROLATE,3
4,10004717,40821_1,GLYCOPYRROLATE,3
...,...,...,...,...
44311,RE47301,203883_2,ADENOSINE,1
44312,RE47301,205331_1,ADENOSINE,1
44313,RE47301,205331_2,ADENOSINE,1
44314,RE47301,205568_1,ADENOSINE,1


In [113]:
fda_patents

Unnamed: 0,id,Patent_No,Appl_No,Product_No,Drug_Substance_Flag,Drug_Product_Flag,Delist_Flag,Patent_Expire_Date,Submission_Date
0,13217_3,7122566,13217,3,,,,2026-02-06,
1,17697_1,6803046,17697,1,,Y,,2022-08-16,
2,18207_1,8133893,18207,1,Y,Y,,2029-03-13,2015-07-20
3,18207_2,8133893,18207,2,Y,Y,,2029-03-13,2015-07-20
4,18207_3,8133893,18207,3,Y,Y,,2029-03-13,2015-07-20
...,...,...,...,...,...,...,...,...,...
12268,50819_2,10137142,50819,2,,Y,,2029-06-03,2018-11-28
12269,50819_2,10220049,50819,2,,Y,,2029-06-03,2019-03-08
12270,50819_2,8288434,50819,2,,Y,,2029-08-05,2014-12-03
12271,50819_2,9504704,50819,2,,Y,,2029-06-03,2016-12-01


In [114]:
drug_pat_co

Unnamed: 0,reg_num,id,Trade_Name,count
0,10004717,40568_1,GLYCOPYRROLATE,3
1,10004717,40568_2,GLYCOPYRROLATE,3
2,10004717,40653_1,GLYCOPYRROLATE,3
3,10004717,40653_2,GLYCOPYRROLATE,3
4,10004717,40821_1,GLYCOPYRROLATE,3
...,...,...,...,...
44311,RE47301,203883_2,ADENOSINE,1
44312,RE47301,205331_1,ADENOSINE,1
44313,RE47301,205331_2,ADENOSINE,1
44314,RE47301,205568_1,ADENOSINE,1


In [115]:
drug_pat_co.merge(fda_drugs, on='id')

Unnamed: 0,reg_num,id,Trade_Name_x,count,concept_id,drug_claim_name,drug_name,Appl_No,Product_No,Applicant_Full_Name,...,Ingredient,Strength,NDA_Type,TE_Code,Approval_Date,RLD,RS,Type,Dosage_Form,Route
0,10004717,40568_1,GLYCOPYRROLATE,3,CHEMBL1201027,GLYCOPYRROLATE,GLYCOPYRROLATE,40568,1,RENATA LTD,...,GLYCOPYRROLATE,1MG,A,,2004-12-22,No,No,DISCN,TABLET,ORAL
1,10052267,40568_1,GLYCOPYRROLATE,1,CHEMBL1201027,GLYCOPYRROLATE,GLYCOPYRROLATE,40568,1,RENATA LTD,...,GLYCOPYRROLATE,1MG,A,,2004-12-22,No,No,DISCN,TABLET,ORAL
2,10543192,40568_1,GLYCOPYRROLATE,3,CHEMBL1201027,GLYCOPYRROLATE,GLYCOPYRROLATE,40568,1,RENATA LTD,...,GLYCOPYRROLATE,1MG,A,,2004-12-22,No,No,DISCN,TABLET,ORAL
3,10548875,40568_1,GLYCOPYRROLATE,3,CHEMBL1201027,GLYCOPYRROLATE,GLYCOPYRROLATE,40568,1,RENATA LTD,...,GLYCOPYRROLATE,1MG,A,,2004-12-22,No,No,DISCN,TABLET,ORAL
4,7638552,40568_1,GLYCOPYRROLATE,4,CHEMBL1201027,GLYCOPYRROLATE,GLYCOPYRROLATE,40568,1,RENATA LTD,...,GLYCOPYRROLATE,1MG,A,,2004-12-22,No,No,DISCN,TABLET,ORAL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44602,RE44186,206483_2,GLYBURIDE,1,CHEMBL472,glibenclamide,GLYBURIDE,206483,2,ORIENT PHARMA CO LTD,...,GLYBURIDE,2.5MG,A,AB1,2019-02-22,No,No,RX,TABLET,ORAL
44603,RE44186,206483_3,GLYBURIDE,1,CHEMBL472,glibenclamide,GLYBURIDE,206483,3,ORIENT PHARMA CO LTD,...,GLYBURIDE,5MG,A,AB1,2019-02-22,No,No,RX,TABLET,ORAL
44604,RE44186,206749_1,GLYBURIDE,1,CHEMBL472,glibenclamide,GLYBURIDE,206749,1,ZYDUS PHARMACEUTICALS USA INC,...,GLYBURIDE,1.25mg,A,AB1,2016-05-10,No,No,RX,TABLET,ORAL
44605,RE44186,206749_2,GLYBURIDE,1,CHEMBL472,glibenclamide,GLYBURIDE,206749,2,ZYDUS PHARMACEUTICALS USA INC,...,GLYBURIDE,2.5mg,A,AB1,2016-05-10,No,No,RX,TABLET,ORAL


In [82]:
drug_pat_co.merge(fda_drugs, on='id').loc[:,['concept_id', 'Trade_Name_x', 'drug_claim_name', 'reg_num', 'count']]

Unnamed: 0,concept_id,Trade_Name_x,drug_claim_name,reg_num,count
0,CHEMBL1201027,GLYCOPYRROLATE,GLYCOPYRROLATE,10004717,3
1,CHEMBL1201027,GLYCOPYRROLATE,GLYCOPYRROLATE,10052267,1
2,CHEMBL1201027,GLYCOPYRROLATE,GLYCOPYRROLATE,10543192,3
3,CHEMBL1201027,GLYCOPYRROLATE,GLYCOPYRROLATE,10548875,3
4,CHEMBL1201027,GLYCOPYRROLATE,GLYCOPYRROLATE,7638552,4
...,...,...,...,...,...
44602,CHEMBL472,GLYBURIDE,glibenclamide,RE44186,1
44603,CHEMBL472,GLYBURIDE,glibenclamide,RE44186,1
44604,CHEMBL472,GLYBURIDE,glibenclamide,RE44186,1
44605,CHEMBL472,GLYBURIDE,glibenclamide,RE44186,1


In [116]:
drug_patent_co = drug_pat_co.merge(fda_drugs, on='id').loc[:,['concept_id', 'reg_num', 'count']]

In [117]:
drug_patent_co.columns = ['drug_concept_id', 'Patent_No', 'count']

In [118]:
cursor.execute('''SELECT *
FROM tmp_jegal_drugs.fda_patent_filtered f''')
data = cursor.fetchall()

In [119]:
orange_book = pd.DataFrame(data)

In [120]:
orange_book

Unnamed: 0,id,Patent_No,Appl_No,Product_No,Drug_Substance_Flag,Drug_Product_Flag,Delist_Flag,Patent_Expire_Date,Submission_Date
0,13217_3,7122566,13217,3,,,,2026-02-06,
1,17697_1,6803046,17697,1,,Y,,2022-08-16,
2,18207_1,8133893,18207,1,Y,Y,,2029-03-13,2015-07-20
3,18207_2,8133893,18207,2,Y,Y,,2029-03-13,2015-07-20
4,18207_3,8133893,18207,3,Y,Y,,2029-03-13,2015-07-20
...,...,...,...,...,...,...,...,...,...
12268,50819_2,10137142,50819,2,,Y,,2029-06-03,2018-11-28
12269,50819_2,10220049,50819,2,,Y,,2029-06-03,2019-03-08
12270,50819_2,8288434,50819,2,,Y,,2029-08-05,2014-12-03
12271,50819_2,9504704,50819,2,,Y,,2029-06-03,2016-12-01


In [122]:
cursor.execute('''SELECT * FROM db_name.fda_company''')
data = cursor.fetchall()

In [123]:
fda_company = pd.DataFrame(data)

In [124]:
fda_company

Unnamed: 0,Applicant
0,3D IMAGING DRUG
1,3M
2,3M HEALTH CARE
3,60 DEGREES PHARMS
4,AAA USA INC
...,...
1786,ZYDUS NOVELTECH INC
1787,ZYDUS PHARMS
1788,ZYDUS PHARMS USA
1789,ZYDUS PHARMS USA INC


In [125]:
cursor.execute('''SELECT a.*, i.mesh_id, i.disease_id FROM entity_abstract_filtered a
INNER JOIN entity_disease_mapping i
ON a.entityNID = i.entityNID
WHERE LENGTH(i.disease_id) > 1;''')
data = cursor.fetchall()

In [126]:
disease_abst = pd.DataFrame(data)

In [127]:
disease_abst['disease_id'] = disease_abst['disease_id'].str.rstrip()

In [128]:
disease_abst

Unnamed: 0,reg_num,entityNID,count,mesh_id,disease_id
0,8062652,22900,1,C535343,C1854311
1,8110225,22931,1,C535698,C2930984
2,8604072,22931,1,C535698,C2930984
3,8685460,22931,1,C535698,C2930984
4,9884044,22931,1,C535698,C2930984
...,...,...,...,...,...
3827,7858605,30533,1,D065446,C0520676
3828,8829005,30535,1,D065631,C2607914
3829,9254286,30535,1,D065631,C2607914
3830,9750684,30535,1,D065631,C2607914


In [95]:
disease_abst[disease_abst['mesh_id'] == 'D000544']

Unnamed: 0,reg_num,entityNID,count,mesh_id,disease_id
155,10118881,25465,1,D000544,C0002395
156,10300025,25465,1,D000544,C0002395
157,10307379,25465,1,D000544,C0002395
158,10941095,25465,1,D000544,C0002395
159,11103463,25465,1,D000544,C0002395
160,7270800,25465,2,D000544,C0002395
161,7335799,25465,1,D000544,C0002395
162,7351401,25465,2,D000544,C0002395
163,7511041,25465,1,D000544,C0002395
164,8058291,25465,1,D000544,C0002395


In [129]:
cursor.execute('''SELECT a.*, substring(i.entityID, 10) entityID, i.entityName, i.entityType FROM entity_abstract_filtered a
INNER JOIN entity_info i
ON a.entityNID = i.entityNID
WHERE i.entityType = 'target' AND i.entityID LIKE 'NCBIGene%';''')
data = cursor.fetchall()

In [130]:
gene_abst = pd.DataFrame(data)

In [131]:
gene_abst

Unnamed: 0,reg_num,entityNID,count,entityID,entityName,entityType
0,10004746,34373,1,695,Bruton's tyrosine kinase,target
1,10005761,34298,1,673,B-RAF,target
2,10006924,31568,2,1392,CRH,target
3,10006924,33592,3,5443,ACTH,target
4,10010507,34373,3,695,Bruton's tyrosine kinase,target
...,...,...,...,...,...,...
1399,RE48608,32328,2,2717,-galactosidase A,target
1400,RE48825,32684,1,3356,serotonin-2,target
1401,RE48839,31784,1,1813,(B) 2 D-Har,target
1402,RE48839,32684,1,3356,serotonin-2,target


In [132]:
cursor.execute('''SELECT DISTINCT p.Patent_No, p.Applicant
FROM fda_drug_patent p;''')
data = cursor.fetchall()

In [133]:
patent_company_info = pd.DataFrame(data)

In [134]:
patent_company_info

Unnamed: 0,Patent_No,Applicant
0,8623935,3M HEALTH CARE
1,10342791,60 DEGREES PHARMS
2,10888558,60 DEGREES PHARMS
3,10596276,AAA USA INC
4,10596278,AAA USA INC
...,...,...
5188,9180095,ZYLA
5189,9180096,ZYLA
5190,9186328,ZYLA
5191,9492443,ZYLA


In [135]:
orange_book

Unnamed: 0,id,Patent_No,Appl_No,Product_No,Drug_Substance_Flag,Drug_Product_Flag,Delist_Flag,Patent_Expire_Date,Submission_Date
0,13217_3,7122566,13217,3,,,,2026-02-06,
1,17697_1,6803046,17697,1,,Y,,2022-08-16,
2,18207_1,8133893,18207,1,Y,Y,,2029-03-13,2015-07-20
3,18207_2,8133893,18207,2,Y,Y,,2029-03-13,2015-07-20
4,18207_3,8133893,18207,3,Y,Y,,2029-03-13,2015-07-20
...,...,...,...,...,...,...,...,...,...
12268,50819_2,10137142,50819,2,,Y,,2029-06-03,2018-11-28
12269,50819_2,10220049,50819,2,,Y,,2029-06-03,2019-03-08
12270,50819_2,8288434,50819,2,,Y,,2029-08-05,2014-12-03
12271,50819_2,9504704,50819,2,,Y,,2029-06-03,2016-12-01


In [103]:
fda_drugs

Unnamed: 0,concept_id,drug_claim_name,drug_name,id,Appl_No,Product_No,Applicant_Full_Name,Applicant,Trade_Name,Ingredient,Strength,NDA_Type,TE_Code,Approval_Date,RLD,RS,Type,Dosage_Form,Route
0,CHEMBL1201646,Inulin,INULIN,2282_1,2282,1,ISO TEX DIAGNOSTICS INC,ISO TEX,INULIN AND SODIUM CHLORIDE,INULIN,100MG/ML,N,,,No,No,DISCN,INJECTABLE,INJECTION
1,CHEMBL1536,ERGOCALCIFEROL,ERGOCALCIFEROL,3444_1,3444,1,VALIDUS PHARMACEUTICALS LLC,VALIDUS PHARMS,DRISDOL,ERGOCALCIFEROL,"50,000 IU",N,AA,,Yes,Yes,RX,CAPSULE,ORAL
2,CHEMBL1405,ESTRONE,ESTRONE,3977_1,3977,1,PARKEDALE PHARMACEUTICALS INC,PARKEDALE,THEELIN,ESTRONE,1MG/ML,N,,,No,No,DISCN,INJECTABLE,INJECTION
3,CHEMBL1405,ESTRONE,ESTRONE,3977_2,3977,2,PARKEDALE PHARMACEUTICALS INC,PARKEDALE,THEELIN,ESTRONE,2MG/ML,N,,,No,No,DISCN,INJECTABLE,INJECTION
4,CHEMBL1405,ESTRONE,ESTRONE,3977_3,3977,3,PARKEDALE PHARMACEUTICALS INC,PARKEDALE,THEELIN,ESTRONE,5MG/ML,N,,,No,No,DISCN,INJECTABLE,INJECTION
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33458,CHEMBL719,Mupirocin,MUPIROCIN,213053_1,213053,1,ALEOR DERMACEUTICALS LTD,ALEOR DERMACEUTICALS,MUPIROCIN,MUPIROCIN CALCIUM,EQ 2% BASE,A,AB,2021-11-16,No,No,RX,CREAM,TOPICAL
33459,CHEMBL719,Mupirocin,MUPIROCIN,213076_1,213076,1,ENCUBE ETHICALS PRIVATE LTD,ENCUBE,MUPIROCIN,MUPIROCIN CALCIUM,EQ 2% BASE,A,AB,2021-08-31,No,No,RX,CREAM,TOPICAL
33460,CHEMBL981,CHEMBL981,FENOFIBRIC ACID,213450_1,213450,1,MICRO LABS LTD,MICRO LABS,FENOFIBRIC ACID,CHOLINE FENOFIBRATE,EQ 45MG FENOFIBRIC ACID,A,AB,2020-06-16,No,No,RX,"CAPSULE, DELAYED RELEASE",ORAL
33461,CHEMBL981,CHEMBL981,FENOFIBRIC ACID,213450_2,213450,2,MICRO LABS LTD,MICRO LABS,FENOFIBRIC ACID,CHOLINE FENOFIBRATE,EQ 135MG FENOFIBRIC ACID,A,AB,2020-06-16,No,No,RX,"CAPSULE, DELAYED RELEASE",ORAL


In [136]:
fda_drug_list = fda_drugs.iloc[:,[0, 1]].drop_duplicates()

In [137]:
fda_drug_list

Unnamed: 0,concept_id,drug_claim_name
0,CHEMBL1201646,Inulin
1,CHEMBL1536,ERGOCALCIFEROL
2,CHEMBL1405,ESTRONE
5,CHEMBL1466,Dicumarol
10,CHEMBL1200517,DIHYDROERGOTAMINE MESYLATE
...,...,...
33410,CHEMBL84,TOPOTECAN
33431,CHEMBL490,Paroxetine
33435,CHEMBL86,metoclopramide
33450,CHEMBL457547,MICAFUNGIN


In [138]:
drug_patent_co

Unnamed: 0,drug_concept_id,Patent_No,count
0,CHEMBL1201027,10004717,3
1,CHEMBL1201027,10052267,1
2,CHEMBL1201027,10543192,3
3,CHEMBL1201027,10548875,3
4,CHEMBL1201027,7638552,4
...,...,...,...
44602,CHEMBL472,RE44186,1
44603,CHEMBL472,RE44186,1
44604,CHEMBL472,RE44186,1
44605,CHEMBL472,RE44186,1


In [99]:
# Create an empty graph
P = nx.Graph()

# Add gene nodes from dgidb_gene and disgenet_gene
for i, row in disgenet_gene.iterrows():
    gene_id = row['geneId']
    if gene_id not in P:
        P.add_node(gene_id, type='gene', **row.to_dict())
for i, row in dgidb_gene.iterrows():
    gene_id = row['entrez_id'] # geneId, entrez_id same index
    if gene_id not in P:
        P.add_node(gene_id, type='gene', **row.to_dict())

# Add disease nodes from disgenet_disease
for i, row in disgenet_disease_m.iterrows():
    disease_id = row['diseaseId']
    if disease_id not in P:
        P.add_node(disease_id, type='disease', **row.to_dict())

# Add drug nodes from dgidb_drug & fda_drug_list
for i, row in dgidb_drug.iterrows():
    drug_id = row['concept_id']
    if drug_id not in P:
        P.add_node(drug_id, type='drug', **row.to_dict())
for i, row in fda_drug_list.iterrows():
    drug_id = row['concept_id']
    if drug_id not in P:
        P.add_node(drug_id, type='drug', **row.to_dict())
        
        
# Add gene-disease edges from disgenet_gene_disease
for i, row in disgenet_m.iterrows():
    gene_id = row['geneId']
    disease_id = row['diseaseId']
    asso_type = row['associationType']
    if gene_id in P and disease_id in P:
        P.add_edge(gene_id, disease_id, type=asso_type, **row.to_dict())

# Add gene-drug edges from dgidb_gene_drug
for i, row in dgidb.iterrows():
    gene_id = row['entrez_id']
    drug_id = row['drug_concept_id']
    interaction_type = row['interaction_types']
    if gene_id in P and drug_id in P:
        P.add_edge(gene_id, drug_id, type='dgidb_'+interaction_type, **row.to_dict())
        
# Add company nodes from fda_company
for i, row in fda_company.iterrows():
    company = row['Applicant']
    if company not in P:
        P.add_node(company, type='company')
        
# Add patent nodes from fda_patents
for i, row in result_df.iterrows():
    Patent_No = row['Patent_No']
    if Patent_No not in P:
        P.add_node(Patent_No, type='patent', **row.to_dict())
        
# Add drug-patent from drug_patent_asso
for i, row in drug_patent_asso.iterrows():
    Patent_No = row['Patent_No']
    drug_id = row['drug_concept_id']
    count = row['count']
    if Patent_No in P and drug_id in P:
        P.add_edge(Patent_No, drug_id, type='FDA_ORANGE_BOOK', count=count)

        
        
# Add drug-patent edges from drug_patent_co
for i, row in drug_patent_co.iterrows():
    drug_id = row['drug_concept_id']
    Patent_No = row['Patent_No']
    count = row['count']
    if drug_id in P and Patent_No in P:
        P.add_edge(drug_id, Patent_No, type='FROM_ABSTRACT', count=count)

# Add gene-patent edges from gene_abst
for i, row in gene_abst.iterrows():
    gene_id = row['entityID']
    Patent_No = row['reg_num']
    count = row['count']
    if gene_id in P and Patent_No in P:
        P.add_edge(gene_id, Patent_No, type='FROM_ABSTRACT', count=count)
        
# Add disease-patent edges from disease_abst
for i, row in disease_abst.iterrows():
    disease_id = row['disease_id']
    Patent_No = row['reg_num']
    count = row['count']
    if disease_id in P and Patent_No in P:
        P.add_edge(disease_id, Patent_No, type='FROM_ABSTRACT', count=count)
        
# Add patent-company edges from drug_company_info
for i, row in patent_company_info.iterrows():
    Patent_No = row['Patent_No']
    company = row['Applicant']
    if company in P and drug_id in P:
        P.add_edge(company, Patent_No, type='PATENT_COMPANY')

In [86]:
with open("P.pickle","wb") as f:
    pickle.dump(P, f)

In [112]:
with open("P.pickle","rb") as f:
    P = pickle.load(f)

## Identifying candidates with technological potential with p-BKG

In [254]:
model_fda = Node2Vec(P, dimensions=15, walk_length=5, num_walks=100, workers=1)

Computing transition probabilities:   0%|          | 0/92280 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████| 100/100 [1:26:33<00:00, 51.94s/it]


In [255]:
# Train the model
model_n2v_fda = model_fda.fit(window=5)

In [115]:
embeddings_fda = {}

In [151]:
for word in P.nodes():
    embeddings_fda[word] = model_n2v_fda.wv[word]

In [146]:
# Compute similarity scores between drug nodes and disease nodes
drug_nodes_fda = [n for n,d in P.nodes(data=True) if 'type' in d and d["type"] == "drug"]
disease_nodes_fda = [n for n,d in P.nodes(data=True) if 'type' in d and d["type"] == "disease"]

In [147]:
patent_nodes_fda = [n for n,d in P.nodes(data=True) if 'type' in d and d["type"] == "patent"]

In [149]:
embeddings_fda['CHEMBL1678']

array([ 0.56665725,  1.0202807 ,  0.87059534, -0.9308079 ,  2.5168183 ,
       -0.907763  ,  3.1285372 , -0.8974309 ,  0.41505972, -0.47207978,
        3.71154   ,  0.39422137,  2.7420104 , -0.15829195, -1.2371354 ],
      dtype=float32)

In [150]:
drug_patent_co

Unnamed: 0,drug_concept_id,Patent_No,count
0,CHEMBL1201027,10004717,3
1,CHEMBL1201027,10052267,1
2,CHEMBL1201027,10543192,3
3,CHEMBL1201027,10548875,3
4,CHEMBL1201027,7638552,4
...,...,...,...
44602,CHEMBL472,RE44186,1
44603,CHEMBL472,RE44186,1
44604,CHEMBL472,RE44186,1
44605,CHEMBL472,RE44186,1


In [151]:
drug_patent_co_n = drug_patent_co[drug_patent_co['Patent_No'].str.isnumeric()]

In [97]:
fin = []

In [152]:
drug_patent_co_n

Unnamed: 0,drug_concept_id,Patent_No,count
0,CHEMBL1201027,10004717,3
1,CHEMBL1201027,10052267,1
2,CHEMBL1201027,10543192,3
3,CHEMBL1201027,10548875,3
4,CHEMBL1201027,7638552,4
...,...,...,...
44572,CHEMBL547,9750711,1
44573,CHEMBL547,9700535,1
44574,CHEMBL547,9750711,1
44575,CHEMBL547,9700535,1


In [153]:
alzheimer_pat = disease_abst[disease_abst['mesh_id'] == 'D000544']

In [154]:
alzheimer_pat['reg_num']

155    10118881
156    10300025
157    10307379
158    10941095
159    11103463
160     7270800
161     7335799
162     7351401
163     7511041
164     8058291
165     8283379
166     8293794
167     8338486
168     8362085
169     8497301
170     8598233
171     8932557
172     9000041
173     9125910
174     9624152
175     9993466
176     RE41783
Name: reg_num, dtype: object

In [124]:
drug_patent_asso

Unnamed: 0,drug_concept_id,Patent_No,count
0,CHEMBL1009,7182961,1
1,CHEMBL1009,7384649,1
2,CHEMBL1009,8404276,1
3,CHEMBL1009,8545878,1
4,CHEMBL1009,8586093,1
...,...,...,...
3570,CHEMBL981,7741373,2
3571,CHEMBL981,7741374,2
3572,CHEMBL981,7915247,2
3573,CHEMBL989,8871241,2


In [185]:
cursor.execute('''SELECT *
FROM disgenet d
WHERE d.diseaseId = 'C0002395';''')
data = cursor.fetchall()

In [186]:
alzheimer_genes = pd.DataFrame(data)

In [187]:
alz_gene_list = list(set(alzheimer_genes['geneId']))

In [167]:
pat_drug_list = ['CHEMBL1678', 'CHEMBL502', 'CHEMBL66115', 'CHEMBL807']

In [168]:
repurposing_candidate_group_patdrug = []

In [169]:
for drug_node in dr_candidates['drug_concept_id']:
    for pat_drug in pat_drug_list:
        sim = np.dot(embeddings_fda[drug_node], embeddings_fda[pat_drug])/(np.linalg.norm(embeddings_fda[drug_node])*np.linalg.norm(embeddings_fda[pat_drug]))
        repurposing_candidate_group_patdrug.append([drug_node, pat_drug, sim])
        
repurposing_candidate_group_patdrug_df = pd.DataFrame(repurposing_candidate_group_patdrug, columns=["drug_concept_id", "pat_drug", "cosine_similarity"])

In [166]:
dr_candidates

Unnamed: 0,drug_concept_id,cosine_similarity
0,CHEMBL254836,0.918763
1,CHEMBL4298172,0.864723
2,CHEMBL225411,0.821627
3,CHEMBL3527358,0.808408
4,CHEMBL4297504,0.807645
...,...,...
227,CHEMBL357076,0.701239
228,CHEMBL2311194,0.701226
229,CHEMBL185515,0.701110
230,CHEMBL1677,0.700845


In [None]:
repurposing_candidate_group_fda = []

for drug_node in dr_candidates['drug_concept_id']:
    for reg_num in alzheimer_pat[:-1]['reg_num']:
        sim = np.dot(embeddings_fda[drug_node], embeddings_fda[reg_num])/(np.linalg.norm(embeddings_fda[drug_node])*np.linalg.norm(embeddings_fda[reg_num]))
        repurposing_candidate_group_fda.append([drug_node, reg_num, sim])
        
repurposing_candidate_fda_df = pd.DataFrame(repurposing_candidate_group_fda, columns=["drug_concept_id", "reg_num", "cosine_similarity"])

# mean
repurposing_candidate_fda_df = repurposing_candidate_fda_df.groupby(['drug_concept_id']).mean().sort_values(by='cosine_similarity',ascending=False).reset_index()

In [172]:
repurposing_candidate_fda_df

Unnamed: 0,drug_concept_id,cosine_similarity
0,CHEMBL1163419,0.748771
1,CHEMBL2104538,0.744177
2,CHEMBL4297207,0.741137
3,CHEMBL1829173,0.737349
4,CHEMBL277845,0.730751
...,...,...
224,CHEMBL451930,0.450039
225,CHEMBL4068108,0.397234
226,CHEMBL1078549,-0.042330
227,CHEMBL125511,-0.151783


In [173]:
repurposing_candidate_fda_df['rank'] = repurposing_candidate_fda_df['cosine_similarity'].rank(ascending=False)

In [157]:
repurposing_candidate_fda_df

Unnamed: 0,drug_concept_id,cosine_similarity,rank
0,CHEMBL1163419,0.748771,1.0
1,CHEMBL2104538,0.744177,2.0
2,CHEMBL4297207,0.741137,3.0
3,CHEMBL1829173,0.737349,4.0
4,CHEMBL277845,0.730751,5.0
...,...,...,...
224,CHEMBL451930,0.450039,225.0
225,CHEMBL4068108,0.397234,226.0
226,CHEMBL1078549,-0.042330,227.0
227,CHEMBL125511,-0.151783,228.0


In [158]:
repurposing_candidate_fda_df['gene_count'] = 0

In [159]:
repurposing_candidate_fda_df

Unnamed: 0,drug_concept_id,cosine_similarity,rank,gene_count
0,CHEMBL1163419,0.748771,1.0,0
1,CHEMBL2104538,0.744177,2.0,0
2,CHEMBL4297207,0.741137,3.0,0
3,CHEMBL1829173,0.737349,4.0,0
4,CHEMBL277845,0.730751,5.0,0
...,...,...,...,...
224,CHEMBL451930,0.450039,225.0,0
225,CHEMBL4068108,0.397234,226.0,0
226,CHEMBL1078549,-0.042330,227.0,0
227,CHEMBL125511,-0.151783,228.0,0


In [195]:
repurposing_candidate_fda_df['drug_gene'] = 0
repurposing_candidate_fda_df['gene_count'] = 0

In [196]:
for i in range(repurposing_candidate_fda_df.shape[0]):
    try:
        chembl = repurposing_candidate_fda_df.iloc[i, 0]
        cursor.execute("SELECT d.entrez_id FROM dgidb d WHERE d.drug_concept_id = '{}';".format(chembl))
        data = pd.DataFrame(cursor.fetchall())
        drug_gene = data.shape[0]
        if drug_gene > 0:
            count = len(pd.merge(pd.DataFrame(alz_gene_list, columns=['entrez_id']), data, on='entrez_id'))
            repurposing_candidate_fda_df.loc[i,'drug_gene'] = drug_gene
            repurposing_candidate_fda_df.loc[i, 'gene_count'] = count
    except:
        print(i)
        repurposing_candidate_fda_df.iloc[i, 'drug_gene'] = 0
        repurposing_candidate_fda_df.iloc[i, 'gene_count'] = 0

In [266]:
repurposing_candidate_fda_df['gene_count_rank'] = repurposing_candidate_fda_df['gene_count'].rank(method='min', ascending=False)

In [267]:
repurposing_candidate_fda_df['drug_gene_rank'] = repurposing_candidate_fda_df['drug_gene'].rank(method='min', ascending=False)

In [289]:
repurposing_candidate_fda_df

Unnamed: 0,drug_concept_id,cosine_similarity,rank,gene_count,gene_count_rank
14,CHEMBL2218919,0.719644,15.0,3,16.0
0,CHEMBL1163419,0.748771,1.0,2,24.0
2,CHEMBL4297207,0.741137,3.0,2,24.0
18,CHEMBL1269597,0.718696,19.0,3,16.0
20,CHEMBL447664,0.716660,21.0,2,24.0
...,...,...,...,...,...
207,CHEMBL473806,0.527673,208.0,0,209.0
216,CHEMBL431210,0.504950,217.0,0,209.0
226,CHEMBL1078549,-0.042330,227.0,0,209.0
227,CHEMBL125511,-0.151783,228.0,0,209.0


In [291]:
repurposing_candidate_fda_df['final_rank'] = ((repurposing_candidate_fda_df['gene_count_rank'] + repurposing_candidate_fda_df['rank']) / 2).rank(method='min', ascending=True)

In [292]:
repurposing_candidate_fda_df

Unnamed: 0,drug_concept_id,cosine_similarity,rank,gene_count,gene_count_rank,final_rank
14,CHEMBL2218919,0.719644,15.0,3,16.0,3.0
0,CHEMBL1163419,0.748771,1.0,2,24.0,1.0
2,CHEMBL4297207,0.741137,3.0,2,24.0,2.0
18,CHEMBL1269597,0.718696,19.0,3,16.0,5.0
20,CHEMBL447664,0.716660,21.0,2,24.0,7.0
...,...,...,...,...,...,...
207,CHEMBL473806,0.527673,208.0,0,209.0,225.0
216,CHEMBL431210,0.504950,217.0,0,209.0,226.0
226,CHEMBL1078549,-0.042330,227.0,0,209.0,227.0
227,CHEMBL125511,-0.151783,228.0,0,209.0,228.0


In [None]:
repurposing_candidate_fda_df['final_rank'] = ((repurposing_candidate_fda_df['gene_set_rank'] + repurposing_candidate_fda_df['rank']) / 2).rank(method='min', ascending=True)

In [293]:
repurposing_candidate_fda_df.sort_values(by='final_rank', ascending=True, inplace=True)

In [296]:
repurposing_candidate_fda_df

Unnamed: 0,drug_concept_id,cosine_similarity,rank,gene_count,gene_count_rank,final_rank
0,CHEMBL1163419,0.748771,1.0,2,24.0,1.0
2,CHEMBL4297207,0.741137,3.0,2,24.0,2.0
14,CHEMBL2218919,0.719644,15.0,3,16.0,3.0
7,CHEMBL2106841,0.728552,8.0,2,24.0,4.0
18,CHEMBL1269597,0.718696,19.0,3,16.0,5.0
...,...,...,...,...,...,...
207,CHEMBL473806,0.527673,208.0,0,209.0,225.0
216,CHEMBL431210,0.504950,217.0,0,209.0,226.0
226,CHEMBL1078549,-0.042330,227.0,0,209.0,227.0
227,CHEMBL125511,-0.151783,228.0,0,209.0,228.0


In [None]:
cursor.execute('''SELECT p.reg_num, p.type, p.lifetime
FROM uspto p
WHERE p.reg_num IN (
SELECT distinct i.reg_num
FROM ipcr i
WHERE i.ipc4 = 'A61K' AND i.main_group <> '8'
);''')
data = cursor.fetchall()

In [None]:
pharma_pat = pd.DataFrame(data)

In [141]:
# threshold 변경시
cs2_scores_065df = []
for drug_node in drug_nodes:
    similarity_score = np.dot(embeddings[drug_node], embeddings['C0678222'])/(np.linalg.norm(embeddings[drug_node])*np.linalg.norm(embeddings['C0678222']))
    if similarity_score >= 0.65:
        cs2_scores_065df.append([drug_node, similarity_score])
# Store the similarity scores in a pandas DataFrame
cs2_scores_065df = pd.DataFrame(cs2_scores_065df, columns=["drug_concept_id", "cosine_similarity"])

# Sort the DataFrame by the similarity score in descending order
cs2_scores_065df.sort_values(by="cosine_similarity", ascending=False, inplace=True)

In [142]:
cs2_scores_065df

Unnamed: 0,drug_concept_id,cosine_similarity
17,CHEMBL1242564,0.812367
62,CHEMBL254836,0.805213
60,CHEMBL2436861,0.805201
18,CHEMBL1242745,0.803604
105,CHEMBL480328,0.794541
...,...,...
103,CHEMBL467399,0.651164
111,CHEMBL487611,0.650565
58,CHEMBL2365665,0.650259
9,CHEMBL1200983,0.650024


In [143]:
cs2_scores_065df.to_csv("cs2_bc.csv")

In [146]:
# 최종 약물재창출 후보군 DB에서 가져오는 코드
cursor.execute('''SELECT *
FROM dgidb_drugs ddd
WHERE ddd.concept_id IN (
SELECT cs.drug_concept_id
FROM cs2_bc cs
WHERE cs.cosine_similarity >= 0.65 and cs.drug_concept_id NOT IN (
SELECT csp.drug_concept_id
FROM cs2_bc csp
WHERE csp.drug_concept_id IN (
SELECT DISTINCT dgi.drug_concept_id
FROM dgidb dgi
WHERE dgi.entrez_id IN (
SELECT distinct ds.geneId
FROM disgenet ds
WHERE ds.geneId IN (
SELECT d.entrez_id
FROM dgidb d
WHERE d.drug_concept_id IN (
SELECT p.drug_concept_id
FROM cs2_bc p)
)
AND ds.diseaseNID = '10934'
))))''')
data = cursor.fetchall()

In [147]:
dr_candidates2 = pd.DataFrame(data)

In [148]:
dr_candidates2.columns = ['drug_concept_id', 'drug_claim_name', 'drug_name']

In [149]:
dr_candidates2 = dr_candidates2.merge(cs2_scores_065df)

In [150]:
dr_candidates2

Unnamed: 0,drug_concept_id,drug_claim_name,drug_name,cosine_similarity
0,CHEMBL254836,chembl:CHEMBL254836,CHEMBL254836,0.805213
1,CHEMBL480328,3'-DCTP,CHEMBL480328,0.794541
2,CHEMBL1200679,chembl:CHEMBL1200679,ZINC CHLORIDE,0.793318
3,CHEMBL78704,chembl:CHEMBL78704,CHEMBL78704,0.782484
4,CHEMBL377312,chembl:CHEMBL377312,CHEMBL377312,0.779972
5,CHEMBL486231,PSI-7409,CHEMBL486231,0.775888
6,CHEMBL497296,chembl:CHEMBL497296,NEOCYCLOCITRINOL B,0.743778
7,CHEMBL487258,chembl:CHEMBL487258,CYNARIN,0.736514
8,CHEMBL496266,24-EPI-CYCLOCITRINOL,CHEMBL496266,0.735038
9,CHEMBL1669042,chembl:CHEMBL1669042,TERPESTACIN,0.734619


In [151]:
repurposing_candidate_group_fda2 = []
for drug_node in drug_nodes_fda:
    sim = np.dot(embeddings_fda[drug_node], embeddings_fda['C0678222'])/(np.linalg.norm(embeddings_fda[drug_node])*np.linalg.norm(embeddings_fda['C0678222']))
    repurposing_candidate_group_fda2.append([drug_node, sim])
    
repurposing_candidate_fda_df2 = pd.DataFrame(repurposing_candidate_group_fda2, columns=["drug_concept_id", "cosine_similarity"])
repurposing_candidate_fda_df2.sort_values(by='cosine_similarity', ascending=True, inplace=True)

cs2_list = dr_candidates2.merge(repurposing_candidate_fda_df2, on='drug_concept_id')

In [152]:
cs2_list

Unnamed: 0,drug_concept_id,drug_claim_name,drug_name,cosine_similarity_x,cosine_similarity_y
0,CHEMBL254836,chembl:CHEMBL254836,CHEMBL254836,0.805213,0.094129
1,CHEMBL480328,3'-DCTP,CHEMBL480328,0.794541,0.737423
2,CHEMBL1200679,chembl:CHEMBL1200679,ZINC CHLORIDE,0.793318,-0.371768
3,CHEMBL78704,chembl:CHEMBL78704,CHEMBL78704,0.782484,-0.350904
4,CHEMBL377312,chembl:CHEMBL377312,CHEMBL377312,0.779972,0.304686
5,CHEMBL486231,PSI-7409,CHEMBL486231,0.775888,0.798597
6,CHEMBL497296,chembl:CHEMBL497296,NEOCYCLOCITRINOL B,0.743778,0.743788
7,CHEMBL487258,chembl:CHEMBL487258,CYNARIN,0.736514,0.793672
8,CHEMBL496266,24-EPI-CYCLOCITRINOL,CHEMBL496266,0.735038,0.751589
9,CHEMBL1669042,chembl:CHEMBL1669042,TERPESTACIN,0.734619,0.666253


In [153]:
for i, drug in cs2_list['drug_concept_id'].iteritems():
    ssum = []
    for patent in drug_patent_co_n['Patent_No']:
        sim = np.dot(embeddings_fda[drug], embeddings_fda[patent])/(np.linalg.norm(embeddings_fda[drug])*np.linalg.norm(embeddings_fda[patent]))
        ssum.append(sim)
    result = np.max(ssum)
    cs2_list.loc[i, 'pat_value'] = result

In [154]:
cs2_list

Unnamed: 0,drug_concept_id,drug_claim_name,drug_name,cosine_similarity_x,cosine_similarity_y,pat_value
0,CHEMBL254836,chembl:CHEMBL254836,CHEMBL254836,0.805213,0.094129,0.255383
1,CHEMBL480328,3'-DCTP,CHEMBL480328,0.794541,0.737423,0.757896
2,CHEMBL1200679,chembl:CHEMBL1200679,ZINC CHLORIDE,0.793318,-0.371768,0.335302
3,CHEMBL78704,chembl:CHEMBL78704,CHEMBL78704,0.782484,-0.350904,0.54941
4,CHEMBL377312,chembl:CHEMBL377312,CHEMBL377312,0.779972,0.304686,0.678883
5,CHEMBL486231,PSI-7409,CHEMBL486231,0.775888,0.798597,0.795427
6,CHEMBL497296,chembl:CHEMBL497296,NEOCYCLOCITRINOL B,0.743778,0.743788,0.849077
7,CHEMBL487258,chembl:CHEMBL487258,CYNARIN,0.736514,0.793672,0.799057
8,CHEMBL496266,24-EPI-CYCLOCITRINOL,CHEMBL496266,0.735038,0.751589,0.853536
9,CHEMBL1669042,chembl:CHEMBL1669042,TERPESTACIN,0.734619,0.666253,0.758599


In [155]:
cs2_list[cs2_list['pat_value'] < np.min(fin)]

Unnamed: 0,drug_concept_id,drug_claim_name,drug_name,cosine_similarity_x,cosine_similarity_y,pat_value
0,CHEMBL254836,chembl:CHEMBL254836,CHEMBL254836,0.805213,0.094129,0.255383
2,CHEMBL1200679,chembl:CHEMBL1200679,ZINC CHLORIDE,0.793318,-0.371768,0.335302
21,CHEMBL376095,chembl:CHEMBL376095,CHEMBL376095,0.665724,0.071698,-0.005713
23,CHEMBL426560,chembl:CHEMBL426560,CHEMBL426560,0.658578,0.091934,0.455131


In [157]:
cs3_scores_065df

Unnamed: 0,drug_concept_id,cosine_similarity
345,CHEMBL4297669,0.859709
39,CHEMBL1229,0.857766
13,CHEMBL1098427,0.839949
222,CHEMBL267373,0.832310
426,CHEMBL555196,0.821889
...,...,...
324,CHEMBL428496,0.650575
270,CHEMBL3545378,0.650526
206,CHEMBL244268,0.650491
451,CHEMBL594695,0.650090


In [158]:
cs3_scores_065df.to_csv("cs3_gb.csv")

In [160]:
# 최종 약물재창출 후보군 DB에서 가져오는 코드
cursor.execute('''SELECT *
FROM dgidb_drugs ddd
WHERE ddd.concept_id IN (
SELECT cs.drug_concept_id
FROM cs3_gb cs
WHERE cs.cosine_similarity >= 0.65 and cs.drug_concept_id NOT IN (
SELECT csp.drug_concept_id
FROM cs3_gb csp
WHERE csp.drug_concept_id IN (
SELECT DISTINCT dgi.drug_concept_id
FROM dgidb dgi
WHERE dgi.entrez_id IN (
SELECT distinct ds.geneId
FROM disgenet ds
WHERE ds.geneId IN (
SELECT d.entrez_id
FROM dgidb d
WHERE d.drug_concept_id IN (
SELECT p.drug_concept_id
FROM cs3_gb p)
)
AND ds.diseaseNID = '1193'
))))''')
data = cursor.fetchall()

In [161]:
dr_candidates3 = pd.DataFrame(data)

In [162]:
dr_candidates3

Unnamed: 0,concept_id,drug_claim_name,drug_name
0,CHEMBL1098427,chembl:CHEMBL1098427,HIPPURISTANOL
1,CHEMBL555196,chembl:CHEMBL555196,SILVESTROL
2,CHEMBL1673039,chembl:CHEMBL1673039,CHEMBL1673039
3,CHEMBL593726,chembl:CHEMBL593726,CHEMBL593726
4,CHEMBL53325,178103293,CHEMBL53325
...,...,...,...
80,CHEMBL520733,CHEMBL520733,SEMAGACESTAT
81,CHEMBL459574,(+)-ISOZONARONE,CHEMBL459574
82,CHEMBL496266,24-EPI-CYCLOCITRINOL,CHEMBL496266
83,CHEMBL119596,chembl:CHEMBL119596,2-ETHOXYETHANOL


In [163]:
dr_candidates3.columns = ['drug_concept_id', 'drug_claim_name', 'drug_name']

In [164]:
dr_candidates3 = dr_candidates3.merge(cs3_scores_065df)

In [165]:
dr_candidates3

Unnamed: 0,drug_concept_id,drug_claim_name,drug_name,cosine_similarity
0,CHEMBL1098427,chembl:CHEMBL1098427,HIPPURISTANOL,0.839949
1,CHEMBL555196,chembl:CHEMBL555196,SILVESTROL,0.821889
2,CHEMBL1673039,chembl:CHEMBL1673039,CHEMBL1673039,0.815335
3,CHEMBL593726,chembl:CHEMBL593726,CHEMBL593726,0.803804
4,CHEMBL53325,178103293,CHEMBL53325,0.793284
...,...,...,...,...
80,CHEMBL520733,CHEMBL520733,SEMAGACESTAT,0.653845
81,CHEMBL459574,(+)-ISOZONARONE,CHEMBL459574,0.653343
82,CHEMBL496266,24-EPI-CYCLOCITRINOL,CHEMBL496266,0.651436
83,CHEMBL119596,chembl:CHEMBL119596,2-ETHOXYETHANOL,0.651376


In [166]:
repurposing_candidate_group_fda3 = []
for drug_node in drug_nodes_fda:
    sim = np.dot(embeddings_fda[drug_node], embeddings_fda['C0017636'])/(np.linalg.norm(embeddings_fda[drug_node])*np.linalg.norm(embeddings_fda['C0017636']))
    repurposing_candidate_group_fda3.append([drug_node, sim])
    
repurposing_candidate_fda_df3 = pd.DataFrame(repurposing_candidate_group_fda3, columns=["drug_concept_id", "cosine_similarity"])
repurposing_candidate_fda_df3.sort_values(by='cosine_similarity', ascending=True, inplace=True)

cs3_list = dr_candidates3.merge(repurposing_candidate_fda_df3, on='drug_concept_id')

In [167]:
cs3_list

Unnamed: 0,drug_concept_id,drug_claim_name,drug_name,cosine_similarity_x,cosine_similarity_y
0,CHEMBL1098427,chembl:CHEMBL1098427,HIPPURISTANOL,0.839949,0.795967
1,CHEMBL555196,chembl:CHEMBL555196,SILVESTROL,0.821889,0.793317
2,CHEMBL1673039,chembl:CHEMBL1673039,CHEMBL1673039,0.815335,0.783939
3,CHEMBL593726,chembl:CHEMBL593726,CHEMBL593726,0.803804,0.793764
4,CHEMBL53325,178103293,CHEMBL53325,0.793284,0.745268
...,...,...,...,...,...
80,CHEMBL520733,CHEMBL520733,SEMAGACESTAT,0.653845,0.658517
81,CHEMBL459574,(+)-ISOZONARONE,CHEMBL459574,0.653343,0.589898
82,CHEMBL496266,24-EPI-CYCLOCITRINOL,CHEMBL496266,0.651436,0.554504
83,CHEMBL119596,chembl:CHEMBL119596,2-ETHOXYETHANOL,0.651376,-0.074737


In [168]:
for i, drug in cs3_list['drug_concept_id'].iteritems():
    ssum = []
    for patent in drug_patent_co_n['Patent_No']:
        sim = np.dot(embeddings_fda[drug], embeddings_fda[patent])/(np.linalg.norm(embeddings_fda[drug])*np.linalg.norm(embeddings_fda[patent]))
        ssum.append(sim)
    result = np.max(ssum)
    cs3_list.loc[i, 'pat_value'] = result

In [169]:
cs3_list

Unnamed: 0,drug_concept_id,drug_claim_name,drug_name,cosine_similarity_x,cosine_similarity_y,pat_value
0,CHEMBL1098427,chembl:CHEMBL1098427,HIPPURISTANOL,0.839949,0.795967,0.702642
1,CHEMBL555196,chembl:CHEMBL555196,SILVESTROL,0.821889,0.793317,0.715729
2,CHEMBL1673039,chembl:CHEMBL1673039,CHEMBL1673039,0.815335,0.783939,0.758408
3,CHEMBL593726,chembl:CHEMBL593726,CHEMBL593726,0.803804,0.793764,0.603478
4,CHEMBL53325,178103293,CHEMBL53325,0.793284,0.745268,0.665040
...,...,...,...,...,...,...
80,CHEMBL520733,CHEMBL520733,SEMAGACESTAT,0.653845,0.658517,0.738455
81,CHEMBL459574,(+)-ISOZONARONE,CHEMBL459574,0.653343,0.589898,0.802731
82,CHEMBL496266,24-EPI-CYCLOCITRINOL,CHEMBL496266,0.651436,0.554504,0.853536
83,CHEMBL119596,chembl:CHEMBL119596,2-ETHOXYETHANOL,0.651376,-0.074737,0.356956


In [170]:
cs3_list[cs3_list['pat_value'] < np.min(fin)]

Unnamed: 0,drug_concept_id,drug_claim_name,drug_name,cosine_similarity_x,cosine_similarity_y,pat_value
5,CHEMBL254836,chembl:CHEMBL254836,CHEMBL254836,0.776793,-0.074599,0.255383
22,CHEMBL292821,chembl:CHEMBL292821,CHEMBL292821,0.694438,0.155481,0.282414
23,CHEMBL230354,chembl:CHEMBL230354,CHEMBL230354,0.690837,0.151265,0.273134
34,CHEMBL442427,chembl:CHEMBL442427,IGERNELLIN,0.680459,-0.193759,0.255052
39,CHEMBL1255,chembl:CHEMBL1255,FORMALDEHYDE,0.676551,-0.340769,0.133999
66,CHEMBL207729,chembl:CHEMBL207729,CHEMBL207729,0.66496,0.106765,0.454373
83,CHEMBL119596,chembl:CHEMBL119596,2-ETHOXYETHANOL,0.651376,-0.074737,0.356956


In [399]:
cs3_group_fda = []

In [400]:
for drug_node in drug_nodes_fda:
    sim = np.dot(embeddings_fda[drug_node], embeddings_fda['C0017636'])/(np.linalg.norm(embeddings_fda[drug_node])*np.linalg.norm(embeddings_fda['C0017636']))
    cs3_group_fda.append([drug_node, sim])

In [401]:
# Store the similarity scores in a pandas DataFrame
cs3_group_fda = pd.DataFrame(cs3_group_fda, columns=["drug_concept_id", "cosine_similarity"])

# Sort the DataFrame by the similarity score in descending order
cs3_group_fda.sort_values(by="cosine_similarity", ascending=False, inplace=True)

In [402]:
cs3_group_fda

Unnamed: 0,drug_concept_id,cosine_similarity
1522,CHEMBL1229,0.861489
6957,CHEMBL290352,0.846586
1875,CHEMBL1242196,0.846064
11637,CHEMBL589270,0.844276
2590,CHEMBL13960,0.843326
...,...,...
9949,CHEMBL474696,-0.704699
1645,CHEMBL1232472,-0.709635
6517,CHEMBL264950,-0.716802
5792,CHEMBL228224,-0.720355


In [403]:
cs3_group_fin = cs3_scores_065df.merge(cs3_group_fin, on='drug_concept_id')

In [404]:
cs3_group_fin

Unnamed: 0,drug_concept_id,cosine_similarity,drug_claim_name,drug_name
0,CHEMBL1098427,0.829262,chembl:CHEMBL1098427,HIPPURISTANOL
1,CHEMBL1650559,0.790628,Plazomicin,PLAZOMICIN
2,CHEMBL555196,0.788305,chembl:CHEMBL555196,SILVESTROL
3,CHEMBL1673039,0.780168,chembl:CHEMBL1673039,CHEMBL1673039
4,CHEMBL464249,0.772355,chembl:CHEMBL464249,CHEMBL464249
...,...,...,...,...
63,CHEMBL294144,0.655341,chembl:CHEMBL294144,CHEMBL294144
64,CHEMBL4297270,0.654440,chembl:CHEMBL4297270,BMS-986020
65,CHEMBL1161632,0.652941,chembl:CHEMBL1161632,BICARBONATE
66,CHEMBL3414626,0.651929,EPZ-5676,PINOMETOSTAT


In [405]:
cs3_fin_df = cs3_group_fin.merge(cs3_group_fda, on='drug_concept_id')

In [406]:
cs3_fin_df

Unnamed: 0,drug_concept_id,cosine_similarity_x,drug_claim_name,drug_name,cosine_similarity_y
0,CHEMBL1098427,0.829262,chembl:CHEMBL1098427,HIPPURISTANOL,0.843118
1,CHEMBL1650559,0.790628,Plazomicin,PLAZOMICIN,0.734540
2,CHEMBL555196,0.788305,chembl:CHEMBL555196,SILVESTROL,0.836165
3,CHEMBL1673039,0.780168,chembl:CHEMBL1673039,CHEMBL1673039,0.789529
4,CHEMBL464249,0.772355,chembl:CHEMBL464249,CHEMBL464249,0.762185
...,...,...,...,...,...
63,CHEMBL294144,0.655341,chembl:CHEMBL294144,CHEMBL294144,0.120808
64,CHEMBL4297270,0.654440,chembl:CHEMBL4297270,BMS-986020,0.720032
65,CHEMBL1161632,0.652941,chembl:CHEMBL1161632,BICARBONATE,-0.150117
66,CHEMBL3414626,0.651929,EPZ-5676,PINOMETOSTAT,0.703469


In [407]:
cs3_fin_df['gap'] = cs3_fin_df['cosine_similarity_y'] - cs3_fin_df['cosine_similarity_x']

In [408]:
cs3_fin_df

Unnamed: 0,drug_concept_id,cosine_similarity_x,drug_claim_name,drug_name,cosine_similarity_y,gap
0,CHEMBL1098427,0.829262,chembl:CHEMBL1098427,HIPPURISTANOL,0.843118,0.013856
1,CHEMBL1650559,0.790628,Plazomicin,PLAZOMICIN,0.734540,-0.056088
2,CHEMBL555196,0.788305,chembl:CHEMBL555196,SILVESTROL,0.836165,0.047860
3,CHEMBL1673039,0.780168,chembl:CHEMBL1673039,CHEMBL1673039,0.789529,0.009361
4,CHEMBL464249,0.772355,chembl:CHEMBL464249,CHEMBL464249,0.762185,-0.010170
...,...,...,...,...,...,...
63,CHEMBL294144,0.655341,chembl:CHEMBL294144,CHEMBL294144,0.120808,-0.534534
64,CHEMBL4297270,0.654440,chembl:CHEMBL4297270,BMS-986020,0.720032,0.065592
65,CHEMBL1161632,0.652941,chembl:CHEMBL1161632,BICARBONATE,-0.150117,-0.803057
66,CHEMBL3414626,0.651929,EPZ-5676,PINOMETOSTAT,0.703469,0.051540


In [409]:
cs3_fin_df.to_csv("cs3_fin_df.csv")

In [410]:
cs3_pat = drug_patent_asso.merge(cs3_fin_df, on='drug_concept_id')

In [411]:
cs3_pat

Unnamed: 0,drug_concept_id,Patent_No,count,cosine_similarity_x,drug_claim_name,drug_name,cosine_similarity_y,gap
0,CHEMBL3545188,7473761,5,0.668063,chembl:CHEMBL3545188,PASIREOTIDE PAMOATE,0.504702,-0.163361
1,CHEMBL3545188,7759308,5,0.668063,chembl:CHEMBL3545188,PASIREOTIDE PAMOATE,0.504702,-0.163361
2,CHEMBL3545188,8822637,3,0.668063,chembl:CHEMBL3545188,PASIREOTIDE PAMOATE,0.504702,-0.163361
3,CHEMBL3545188,9351923,5,0.668063,chembl:CHEMBL3545188,PASIREOTIDE PAMOATE,0.504702,-0.163361
4,CHEMBL43452,10555939,4,0.683394,POMALIDOMIDE,POMALIDOMIDE,0.64595,-0.037444
5,CHEMBL43452,8198262,4,0.683394,POMALIDOMIDE,POMALIDOMIDE,0.64595,-0.037444
6,CHEMBL43452,8673939,4,0.683394,POMALIDOMIDE,POMALIDOMIDE,0.64595,-0.037444
7,CHEMBL43452,8735428,4,0.683394,POMALIDOMIDE,POMALIDOMIDE,0.64595,-0.037444
8,CHEMBL43452,8828427,4,0.683394,POMALIDOMIDE,POMALIDOMIDE,0.64595,-0.037444
9,CHEMBL43452,9993467,4,0.683394,POMALIDOMIDE,POMALIDOMIDE,0.64595,-0.037444
