In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem as Chem
from PyBioMed import Pyprotein
from PyBioMed.PyProtein import CTD
from sklearn.decomposition import PCA
import tensorflow as tf
tf.get_logger().setLevel('ERROR')
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from deepctr.models import AutoInt
from deepctr.feature_column import SparseFeat,DenseFeat,get_feature_names
from tensorflow.python.keras.callbacks import EarlyStopping
from tensorflow.python.keras.optimizers import Adam,Adagrad,Adamax
from tensorflow import keras
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [2]:
# Min-max scaler mapping each feature column onto the [0, 1] interval;
# it is fitted further down on the assembled feature matrix.
mms = MinMaxScaler(
    feature_range=(0, 1),
)

In [3]:
# Protein features.
# The sequence table provides the protein ids; the descriptor matrix is read
# from a separate comma-delimited text file and is expected to hold one row
# per protein, in the same order as the sequence table.
df_proseq = pd.read_csv('data/transport_pro_seq.txt', sep='\t')
df_proseq.columns = ['pro_id', 'seq']
pro_id = df_proseq['pro_id']

# ndmin=2 keeps the matrix two-dimensional even when the file holds a single
# row or a single column (np.loadtxt would otherwise collapse it to 1-D).
pro_feats = np.loadtxt('data/dt_pro_ctd.txt', delimiter=',', ndmin=2)

# pd.concat(axis=1) aligns on the index, so a row-count mismatch would be
# padded with NaN silently instead of failing. Check explicitly.
if pro_feats.shape[0] != len(pro_id):
    raise ValueError(
        "Protein descriptor matrix has {} rows but the sequence table has {} "
        "proteins; the two files must describe the same proteins in the same "
        "order.".format(pro_feats.shape[0], len(pro_id))
    )

# Descriptor table: 'pro_id' followed by integer-named feature columns.
prodes_df = pd.concat([pro_id, pd.DataFrame(pro_feats)], axis=1)

In [4]:
# Load the natural-product table and key it by PubChem id.
natural_product = (
    pd.read_csv("data/natural_product/Luteolin.csv")
    .set_index(["pubchem_id"])
)
natural_product.head()

Unnamed: 0_level_0,NP,smiles
pubchem_id,Unnamed: 1_level_1,Unnamed: 2_level_1
drug_5280445,Luteolin,C1=CC(=C(C=C1C2=CC(=O)C3=C(C=C(C=C3O2)O)O)O)O


In [5]:
# SMILES string of the query compound, looked up by its PubChem id label.
smi = natural_product.loc["drug_5280445", "smiles"]
smi

'C1=CC(=C(C=C1C2=CC(=O)C3=C(C=C(C=C3O2)O)O)O)O'

In [6]:
# Parse the SMILES string into an RDKit molecule. RDKit signals a parse
# failure by returning None instead of raising, so check before continuing;
# otherwise the fingerprint call fails with an unrelated-looking error.
m = Chem.MolFromSmiles(smi)
if m is None:
    raise ValueError("RDKit could not parse SMILES string: {!r}".format(smi))

# Morgan fingerprint with radius 2, folded into a 1024-bit vector.
fp = Chem.GetMorganFingerprintAsBitVect(m, 2, nBits=1024)

In [7]:
# Single-element list holding the id of the query compound.
mol_name = ["drug_5280445"]

In [8]:
# One-column frame of compound ids, with the column named 'drug_id'.
fp_id = pd.DataFrame({'drug_id': mol_name})

In [9]:
# Lay the fingerprint bits out as a single row (one column per bit).
fp_df = pd.DataFrame(np.array(list(fp)).reshape(1, -1))

In [10]:
# Fingerprint table: 'drug_id' followed by one integer-named column per bit.
fp_df = pd.concat(
    [fp_id, pd.DataFrame(np.array(list(fp)).reshape(1, -1))],
    axis=1,
)

In [11]:
# Display the fingerprint table (compound id plus one column per bit).
fp_df

Unnamed: 0,drug_id,0,1,2,3,4,5,6,7,8,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,drug_5280445,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# 3. Integer-encode head (protein) and tail (compound) ids for the model input.
# NOTE(review): both encoders are fitted here on the prediction-time ids. If
# the saved model was trained with encoders fitted on a different id set, the
# integer codes will not line up with the trained embeddings -- confirm.
head_le, tail_le = LabelEncoder(), LabelEncoder()
head_le.fit(df_proseq['pro_id'].values)
tail_le.fit(fp_id["drug_id"].values)

LabelEncoder()

In [13]:
# Start the prediction table with one row per protein; the id column is
# called 'head'.
dataset = pd.DataFrame({'head': pro_id})

In [14]:
# Pair every protein with the same relation label and the same query compound.
dataset = dataset.assign(relation="drug", tail="drug_5280445")

In [15]:
# Attach descriptor columns to every (head, relation, tail) row.
# After each merge the first four columns are head, relation, tail and the
# right-hand key, so the feature columns start at position 4.
# Compound side: the slice end (1029) lies past the last fingerprint column
# and is clipped by iloc, so all bit columns are kept.
drug_features = (
    dataset
    .merge(fp_df, how='left', left_on='tail', right_on='drug_id')
    .iloc[:, 4:1029]
    .values
)
# Protein side: keeps positions 4..104, i.e. at most the first 101 descriptor
# columns. NOTE(review): confirm this matches the descriptor width used when
# the model was trained.
pro_features = (
    dataset
    .merge(prodes_df, how='left', left_on='head', right_on='pro_id')
    .iloc[:, 4:105]
    .values
)

In [16]:
# Number of (protein, compound) rows in the compound feature matrix.
drug_features.shape[0]

423

In [17]:
# Place compound and protein descriptors side by side, one row per pair.
feature = np.hstack((drug_features, pro_features))

In [18]:
# Rebuild the combined descriptor matrix and reduce it to 400 principal
# components.
# NOTE(review): the PCA is fitted on this prediction matrix itself, and
# n_components=400 needs `feature` to have at least 400 rows and columns.
# Despite the variable name, no scaling is applied in this cell.
feature = np.hstack((drug_features, pro_features))
pca = PCA(n_components=400)
scaled_pca_features = pca.fit_transform(feature)

DeepCTR version 0.9.0 detected. Your version is 0.8.4.
Use `pip install -U deepctr` to upgrade.Changelog: https://github.com/shenweichen/DeepCTR/releases/tag/v0.9.0


In [19]:
# Entity embedding table, keyed by entity name.
embedding_list = (
    pd.read_csv("data/natural_product/Luteolin_RESCAL_entity.csv")
    .set_index(["ent_name"])
)

In [20]:
# Look up the stored embedding text for every head and tail entity and turn
# it into a Python object.
# NOTE(review): eval() executes whatever text the CSV contains. That is only
# acceptable if the file is trusted; if the cells are plain list literals,
# ast.literal_eval would be the safer parser -- confirm the stored format
# before switching.
sub_embeddings = [
    eval(embedding_list.loc[ent_name, 'ent_embedding'])
    for ent_name in dataset['head']
]
obj_embeddings = [
    eval(embedding_list.loc[ent_name, 'ent_embedding'])
    for ent_name in dataset['tail']
]

In [21]:
# Head and tail embeddings joined column-wise, one row per pair.
merge_feats = np.hstack((sub_embeddings, obj_embeddings))

In [22]:
# Final dense feature block: PCA components followed by the entity embeddings,
# then min-max scaled.
# NOTE(review): the scaler is fitted on this prediction matrix (fit_transform),
# not on any training data -- confirm this is intended.
all_feats = np.hstack((scaled_pca_features, merge_feats))
all_feats_scaled = mms.fit_transform(all_feats)

In [23]:
# Named inputs passed to the model: integer-encoded head and tail ids plus the
# scaled dense feature matrix.
predict_model_input = dict(
    head=head_le.transform(dataset['head'].values),
    tail=tail_le.transform(dataset['tail'].values),
    feats=all_feats_scaled,
)

In [24]:
from deepctr.layers import custom_objects

# Load the saved model; the DeepCTR layer registry is passed in so that the
# library's custom layers can be deserialised.
autoint_model = tf.keras.models.load_model(
    "model/all_pos_model.h5",
    custom_objects=custom_objects,
)

In [25]:
# Score every (protein, compound) row, 64 rows per batch.
pred_y = autoint_model.predict(
    x=predict_model_input,
    batch_size=64,
)

In [26]:
# Wrap the raw prediction array in a frame (columns get integer names).
pred_df = pd.DataFrame(data=pred_y)

In [27]:
# Put the protein ids next to their predicted scores (aligned on the index).
result_df = pd.concat(
    objs=[pro_id, pred_df],
    axis=1,
)

In [28]:
# Rank proteins by predicted score (column 0), highest first.
result_df = result_df.sort_values(by=[0], ascending=False)

In [29]:
# Ten highest-scoring proteins.
result_df.head(10)

Unnamed: 0,pro_id,0
22,dt_O76082,1.0
1,dt_Q8TCC7,1.0
6,dt_Q9UNQ0,1.0
7,dt_Q96FL8,1.0
9,dt_P33527,1.0
13,dt_O15439,1.0
32,dt_Q15758,1.0
187,dt_Q9NVC3,1.0
5,dt_Q4U2R8,0.999999
12,dt_O15245,0.999999
