In [1]:
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.preprocessing import LabelEncoder
import pickle

Initialize variables

In [2]:
# Notebook-wide configuration.
data_path = "data/"  # root folder for every input read and every output written below
exp_id = "v0"  # experiment tag embedded in the names of the pickled outputs
codes = {}  # lookup tables filled in by later cells and pickled at the end

## Drug/target interactions

In [3]:
# Load drugs: keep only rows that carry both direct evidence and an inference network.
chemicals = pd.read_csv(data_path + 'CTD/chemicals.csv').dropna(
    subset=["Direct Evidence", "Inference Network"]
)
# The inference network is a "|"-separated list; give each listed entry its own row
# in a new "Gene Symbol" column.
chemicals = chemicals.assign(
    **{"Gene Symbol": chemicals["Inference Network"].str.split("|")}
).explode("Gene Symbol")

# Load genes
gene = pd.read_csv(data_path + "CTD/genes.csv")

In [4]:
# Generate codes: symbol <-> id lookup tables. Symbols are stored upper-cased,
# so any lookup by symbol must upper-case its key first.
gene_pairs = gene[['Gene Symbol', 'Gene ID']].drop_duplicates()
codes['gene_symbol2id'] = {
    symbol.upper(): gene_id
    for symbol, gene_id in zip(gene_pairs['Gene Symbol'], gene_pairs['Gene ID'])
}
codes['gene_id2symbol'] = {
    gene_id: symbol.upper()
    for symbol, gene_id in zip(gene_pairs['Gene Symbol'], gene_pairs['Gene ID'])
}

In [5]:
# Generate gene/drug interactions.
# The left merge keeps chemicals whose gene symbol has no match in the gene table;
# those rows get a NaN "Gene ID", which would make the int cast below raise.
# Rows missing either endpoint cannot form an edge, so they are dropped first.
gene_drug = chemicals.merge(gene[["Gene Symbol", "Gene ID"]], how="left")
gene_drug = gene_drug[['Gene ID', 'Chemical Name']].dropna(subset=['Gene ID', 'Chemical Name'])

# Prefix every node with its type so gene ids and drug names cannot collide.
gene_drug['Chemical Name'] = 'drug_' + gene_drug['Chemical Name'].str.upper()
gene_drug['Gene ID'] = 'gene_' + gene_drug['Gene ID'].astype(int).astype("string")

# Column order (gene first, drug second) is relied upon when the columns are
# later renamed to node1/node2.
gene_drug = gene_drug.drop_duplicates()

## SARS-CoV-2 and human protein interactions

From [Gordon et al., Nature 2020](https://www.nature.com/articles/s41586-020-2286-9#Sec36)
Pulled from Supplementary Table 5

In [6]:
# Pull the data
baits_gene = pd.read_csv(data_path+'biology-database/baits-prey-mist.csv')

# only MiST > 0.8, and keep only necessary columns.
# .copy() makes this an independent frame so the assignments below do not act on
# a view of the raw table (avoids SettingWithCopyWarning).
baits_gene = baits_gene.loc[baits_gene['MIST'] > 0.8, ['Bait', 'PreyGene']].copy()

# Process data
baits_gene['Bait'] = 'bait_' + baits_gene['Bait']
# The keys of codes['gene_symbol2id'] are upper-cased, so upper-case the prey
# symbol before the lookup; otherwise mixed-case symbols never match and are
# silently dropped below.
baits_gene['PreyGene'] = baits_gene['PreyGene'].str.upper().replace(codes['gene_symbol2id'])
# Anything that is still not numeric had no gene id: turn it into NaN and drop it.
baits_gene['PreyGene'] = pd.to_numeric(baits_gene['PreyGene'], errors='coerce')
baits_gene = baits_gene.dropna()
baits_gene['PreyGene'] = 'gene_' + baits_gene['PreyGene'].astype(int).astype('string')

baits_gene.head()

Unnamed: 0,Bait,PreyGene
9,bait_SARS-CoV2 E,gene_8546
56,bait_SARS-CoV2 E,gene_23476
187,bait_SARS-CoV2 E,gene_6046
562,bait_SARS-CoV2 E,gene_10283
685,bait_SARS-CoV2 E,gene_6391


## Biological pathways

Load KEGG Pathways

In [7]:
# Using KEGG pathway provided by author (tab-separated, no header row)
pathways_kegg = pd.read_csv(
    data_path + 'biology-database/KegglinkevaluationPPPN_1', sep='\t', header=None
)
pathways_kegg.columns = ['gene1', 'gene2', 'label']

# keep only the gene pair of the rows labelled PP
is_pp = pathways_kegg['label'] == 'PP'
pathways_kegg = pathways_kegg.loc[is_pp, ['gene1', 'gene2']]

# convert gene id to symbol to stay consistent with the other datasets;
# ids that are not in the lookup table are left as they are
id_to_symbol = codes["gene_id2symbol"]
pathways_kegg = pathways_kegg.replace({"gene1": id_to_symbol, "gene2": id_to_symbol}).dropna()

pathways_kegg.head()

Unnamed: 0,gene1,gene2
0,10725,TNF
1,10725,CSF2
2,NFATC1,TNF
3,NFATC1,CSF2
4,NFATC2,TNF


Load CTD Pathways

In [8]:
pathways = pd.read_csv(data_path + 'CTD/pathways.csv')

# "Association inferred via" holds a "|"-separated list per row; every unordered
# pair of entries within one row becomes one (gene1, gene2) record.
pathways_ctd_temp = [
    gene_pair
    for inferred_via in pathways["Association inferred via"]
    for gene_pair in combinations(inferred_via.split("|"), 2)
]

pathways_ctd = pd.DataFrame(pathways_ctd_temp, columns=['gene1', 'gene2'])

pathways_ctd.head()

Unnamed: 0,gene1,gene2
0,ATP11A,BTK
1,ATP11A,CCL2
2,ATP11A,CCL3
3,ATP11A,CCR2
4,ATP11A,CD209


Load PharmGKB Pathways
Pathways were obtained from https://www.pharmgkb.org/page/COVID, Section "Pathways involving candidate genes/drugs"

In [9]:
# Gene lists of the hand-entered pathways, keyed by a short pathway name.
# Note: "ace" and "renin" list the same genes, so their pairs appear twice here.
pathways = {
    "ace": ["ACE", "ACE2", "AGT", "AGTR1", "AGTR2", "ATP6AP2", "BDKRB1", "BDKRB2", "CMA1", "CTSG", "CYP11B2", "KNG1", "MAPK1", "MAPK3", "MAS1", "MME", "NOS3", "NR3C2", "REN", "TGFB1"],
    "renin": ["ACE", "ACE2", "AGT", "AGTR1", "AGTR2", "ATP6AP2", "BDKRB1", "BDKRB2", "CMA1", "CTSG", "CYP11B2", "KNG1", "MAPK1", "MAPK3", "MAS1", "MME", "NOS3", "NR3C2", "REN", "TGFB1"],
    "losartan": ["AGTR1", "CYP2C9", "CYP3A4", "UGT1A1", "UGT2B7"],
    "antiarrhymtic": ["ABCC8", "ABCC9", "ADRA1D", "ADRA2A", "ADRB1", "ADRB2", "ANK2", "ATP1A1", "ATP2A1", "ATP2A2", "CACNA1C", "CACNA1D", "CACNA1G", "CACNA1H", "CACNB2", "CASQ1", "CASQ2", "CHRM2", "DSP", "FKBP1B", "GJA1", "GJA5", "GJD3", "HCN2", "HCN4", "JUP", "KCNA5", "KCNAB2", "KCND3", "KCNE1", "KCNE2", "KCNE3", "KCNE4", "KCNE5", "KCNH2", "KCNIP2", "KCNJ11", "KCNJ2", "KCNJ3", "KCNJ4", "KCNJ5", "KCNJ8", "KCNK1", "KCNK3", "KCNQ1", "LMNA", "PLN", "RYR2", "SCN1B", "SCN2B", "SCN3B", "SCN4B", "SCN5A", "SLC8A2", "SLC8A3"],
    "fluvoxamine": ["ADH1A", "ADH1B", "ADH1C", "CYP1A2", "CYP2C19", "CYP2D6", "CYP3A"],
}

# Every unordered pair of genes within one pathway becomes one (gene1, gene2) record.
pathways_pgkb_temp = [
    gene_pair
    for gene_list in pathways.values()
    for gene_pair in combinations(gene_list, 2)
]

pathways_pgkb = pd.DataFrame(pathways_pgkb_temp, columns=['gene1', 'gene2'])

pathways_pgkb.head()

Unnamed: 0,gene1,gene2
0,ACE,ACE2
1,ACE,AGT
2,ACE,AGTR1
3,ACE,AGTR2
4,ACE,ATP6AP2


Combine all pathways from CTD, KEGG, and PharmGKB

In [10]:
all_pathways = pd.concat([pathways_ctd, pathways_kegg, pathways_pgkb])

# symbol to id; entries that are already numeric ids pass through unchanged
symbol_to_id = codes['gene_symbol2id']
all_pathways = all_pathways.replace({'gene1': symbol_to_id, 'gene2': symbol_to_id})

# whatever is still non-numeric had no id: coerce it to NaN and drop the pair
for endpoint in ('gene1', 'gene2'):
    all_pathways[endpoint] = pd.to_numeric(all_pathways[endpoint], errors='coerce')
all_pathways = all_pathways.dropna()

# prefix the node type, then remove pairs contributed by more than one source
for endpoint in ('gene1', 'gene2'):
    all_pathways[endpoint] = 'gene_' + all_pathways[endpoint].astype(int).astype("string")
all_pathways = all_pathways.drop_duplicates()

all_pathways.head()

Unnamed: 0,gene1,gene2
0,gene_23250,gene_695
1,gene_23250,gene_6347
2,gene_23250,gene_6348
3,gene_23250,gene_729230
4,gene_23250,gene_30835


## Gene/drug-phenotype interactions 

In [11]:
# Load phenotype, keeping only the columns we're interested in
phenotype_columns = [
    "Phenotype Term Name",
    "Phenotype Term ID",
    "Chemical Inference Network",
    "Gene Inference Network",
]
phenotypes = pd.read_csv(data_path + 'CTD/phenotype.csv')[phenotype_columns]

# store code: phenotype term id -> phenotype term name
phenotype_terms = phenotypes[['Phenotype Term ID', 'Phenotype Term Name']].drop_duplicates()
codes['phenotype_id2name'] = dict(
    zip(phenotype_terms['Phenotype Term ID'], phenotype_terms['Phenotype Term Name'])
)

Phenotype/Drug interactions

In [12]:
# Build phenotype/drug edges from the two columns that define them.
# - Only these two columns take part in the NaN filter, so a phenotype with a
#   chemical network but no gene network (or no term name) is no longer dropped.
# - `phenotypes` itself is left untouched, so this cell can be re-run safely
#   (splitting the column in place would turn it into lists, and a second
#   .str.split on lists yields NaN).
phenotypes_drugs = phenotypes[['Phenotype Term ID', 'Chemical Inference Network']].dropna()

# The chemical network is a "|"-separated list: upper-case it, split it, and
# give every chemical its own row.
phenotypes_drugs['Chemical Inference Network'] = (
    phenotypes_drugs['Chemical Inference Network'].str.upper().str.split('|')
)
phenotypes_drugs = phenotypes_drugs.explode('Chemical Inference Network')

# Prefix every node with its type.
phenotypes_drugs['Phenotype ID'] = "phenotype_" + phenotypes_drugs['Phenotype Term ID'].astype("string")
phenotypes_drugs['drug'] = "drug_" + phenotypes_drugs['Chemical Inference Network']
phenotypes_drugs = phenotypes_drugs[['Phenotype ID', 'drug']]

phenotypes_drugs

Unnamed: 0,Phenotype ID,drug
0,phenotype_GO:0008283,drug_AZITHROMYCIN
0,phenotype_GO:0008283,drug_BETULINIC ACID
0,phenotype_GO:0008283,drug_CAMOSTAT
0,phenotype_GO:0008283,drug_CEPHARANTHINE
0,phenotype_GO:0008283,drug_CHLOROQUINE
...,...,...
367,phenotype_GO:0007611,drug_ZINC
368,phenotype_GO:0046960,drug_NIFEDIPINE
369,phenotype_GO:0050996,drug_CHOLINE
370,phenotype_GO:0035702,drug_ACTIVE HEXOSE CORRELATED COMPOUND


Phenotype/gene interaction

In [13]:
# Build phenotype/gene edges from the two columns that define them.
# Only these columns take part in the NaN filter, so the result does not depend
# on whether the unrelated chemical network of the same row is present, and
# `phenotypes` itself is not modified.
phenotypes_genes = phenotypes[['Phenotype Term ID', 'Gene Inference Network']].dropna()

# The gene network is a "|"-separated list of symbols. The keys of
# codes["gene_symbol2id"] are upper-cased, so upper-case before the lookup.
phenotypes_genes['gene'] = phenotypes_genes['Gene Inference Network'].str.upper().str.split('|')
phenotypes_genes = phenotypes_genes.explode('gene')

# Symbol -> id. A symbol without an id stays a string after replace(); coerce it
# to NaN and drop it, otherwise the int cast below raises ValueError.
phenotypes_genes['gene'] = pd.to_numeric(
    phenotypes_genes['gene'].replace(codes["gene_symbol2id"]), errors='coerce'
)
phenotypes_genes = phenotypes_genes.dropna(subset=['gene'])

# Prefix every node with its type.
phenotypes_genes['Phenotype ID'] = "phenotype_" + phenotypes_genes['Phenotype Term ID'].astype("string")
phenotypes_genes['gene'] = "gene_" + phenotypes_genes['gene'].astype(int).astype("string")
phenotypes_genes = phenotypes_genes[['Phenotype ID', 'gene']]
phenotypes_genes

Unnamed: 0,Phenotype ID,gene
0,phenotype_GO:0008283,gene_183
0,phenotype_GO:0008283,gene_7297
1,phenotype_GO:0006915,gene_695
1,phenotype_GO:0006915,gene_3553
1,phenotype_GO:0006915,gene_3559
...,...,...
367,phenotype_GO:0007611,gene_3553
368,phenotype_GO:0046960,gene_695
369,phenotype_GO:0050996,gene_3553
370,phenotype_GO:0035702,gene_6347


## Map all gene/drug/phenotype/baits

In [14]:

# Each edge table is given the common (node1, node2) column names and tagged with
# its edge type, in place. The list order below is the row order of `edges`.
edge_tables = [
    (gene_drug, 'gene-drug'),
    (all_pathways, 'gene-gene'),
    (baits_gene, 'bait-gene'),
    (phenotypes_genes, 'phenotype-gene'),
    (phenotypes_drugs, 'phenotype-drug'),
]

for edge_table, edge_type in edge_tables:
    edge_table.columns = ['node1', 'node2']
    edge_table['type'] = edge_type

edges = pd.concat([edge_table for edge_table, _ in edge_tables])

In [15]:
# Create labeler over every node name that appears at either end of an edge
le = LabelEncoder().fit(pd.concat([edges['node1'], edges['node2']]))

# convert the strings to labels
for endpoint in ('node1', 'node2'):
    edges[endpoint] = le.transform(edges[endpoint])

## Embed DRKG

Get [DRKG](https://github.com/gnn4dr/DRKG)

In [16]:
# get dataset from https://dgl-data.s3-us-west-2.amazonaws.com/dataset/DRKG/drkg.tar.gz
# Place downloaded /embed folder in ./data/DRKG folder

# Load DRKG files: the entity embedding matrix and the name -> row id tables
drkg = np.load(data_path + 'DRKG/embed/DRKG_TransE_l2_entity.npy')
drkg_shape = drkg.shape[1]  # width of one embedding vector
entity_df = pd.read_csv(data_path + 'DRKG/embed/entities.tsv', sep='\t', names=['value', 'id'])
relation_df = pd.read_csv(data_path + 'DRKG/embed/relations.tsv', sep='\t', names=['value', 'id'])

# Determine language used by DRKG: entity names look like "<node type>::<identifier>"
entity_df["node_type"] = entity_df["value"].str.split("::").str[0]

# Create mapping for DRKG entity and relation (name -> id)
entity_val2id = dict(zip(entity_df['value'], entity_df['id']))
relation_val2id = dict(zip(relation_df['value'], relation_df['id']))

# Create drkg drug mapping (drug name -> identifier used after "Compound::")
alldrugbank = pd.read_csv(data_path + 'CTD/alldrugbank.csv', index_col=0).rename(columns={'drugbank_id': 'id'})
nondrugbank = pd.read_csv(data_path + 'CTD/nondrugbank.csv', index_col=0).rename(columns={'chembl': 'id'})
drugbank_df = pd.concat([alldrugbank, nondrugbank])

# Only the first row of each identifier is kept, so a later drug name sharing
# that identifier gets no entry in the mapping.
# NOTE(review): the graph's drug names are upper-cased elsewhere in this notebook;
# this lookup presumably expects `drugname` to be upper-case already - confirm.
drugbank_df = drugbank_df.drop_duplicates('id', keep='first')
drugbank = dict(zip(drugbank_df['drugname'], drugbank_df['id']))

Write our processed graph in DRKG's format

In [17]:
# DRKG-style entity names for our nodes, one list per node type. Each list keeps
# the order in which its labels occur in le.classes_.
gene_drkg = []
phenotype_drkg = []
bait_drkg = []
drug_drkg = []

for this_class in le.classes_:
    # Labels have the form "<type>_<value>". Split on the FIRST underscore only:
    # the value itself may contain underscores (e.g. in a bait or drug name), and
    # splitting on every underscore would truncate it and look up the wrong entity.
    class_type, _sep, class_val = this_class.partition('_')
    if class_type == 'gene':
        gene_drkg.append('Gene::' + class_val)
    elif class_type == 'phenotype':
        phenotype_drkg.append('Biological Process::' + class_val)
    elif class_type == 'bait':
        bait_drkg.append('Disease::' + class_val)
    elif class_type == 'drug':
        # Drugs without a known identifier become the bare "Compound::" name.
        drug_drkg.append('Compound::' + drugbank.get(class_val, ''))

Map own data to DRKG

In [19]:
# handle the ID mapping - from original author's code
def _drkg_ids(entity_names):
    """Return the DRKG id of each entity name, or None where DRKG has no such entity."""
    return [entity_val2id.get(entity_name) for entity_name in entity_names]


def _drkg_embeddings(entity_ids):
    """Stack one embedding per id; ids that are None get an all-zero vector.

    The result always has shape (len(entity_ids), drkg_shape), also for an empty
    list, so that the groups can be concatenated even when one of them is empty.
    """
    vectors = [
        drkg[entity_id] if entity_id is not None else np.zeros(drkg_shape)
        for entity_id in entity_ids
    ]
    return np.array(vectors).reshape(-1, drkg_shape)


# Helper functions are used instead of module-level `for gene in ...` loops so
# the loop variable no longer overwrites the `gene` DataFrame.
bait_ids = _drkg_ids(bait_drkg)
gene_ids = _drkg_ids(gene_drkg)
drug_ids = _drkg_ids(drug_drkg)
phenotype_ids = _drkg_ids(phenotype_drkg)

bait_emb = _drkg_embeddings(bait_ids)
drug_emb = _drkg_embeddings(drug_ids)
gene_emb = _drkg_embeddings(gene_ids)
phenotype_emb = _drkg_embeddings(phenotype_ids)

# Generate node features.
# Row i must describe the node that the LabelEncoder encodes as i. LabelEncoder
# sorts its classes, and the prefixes sort as bait < drug < gene < phenotype, so
# stacking the groups in exactly this order reproduces the encoder's order.
node_features = np.concatenate((bait_emb, drug_emb, gene_emb, phenotype_emb))

# Every encoded node needs exactly one feature row.
assert len(node_features) == len(le.classes_), "node_features is not aligned with le.classes_"

## Save as pickle

In [20]:
# Persist the edge list, the label encoder, the node features and the lookup
# tables. All file names carry the experiment id.
edges.to_pickle(data_path+'edge_index_'+exp_id+'.pkl')

# `with` closes (and therefore flushes) each file even if pickling fails; a bare
# open() passed straight to pickle.dump leaves the handle open.
with open(data_path+'LabelEncoder_'+exp_id+'.pkl', 'wb') as encoder_file:
    pickle.dump(le, encoder_file)
with open(data_path+'node_feature_'+exp_id+'.pkl', 'wb') as feature_file:
    pickle.dump(node_features, feature_file)
with open(data_path+'codes_'+exp_id+'.pkl', 'wb') as codes_file:
    pickle.dump(codes, codes_file)