# Computational prediction of drug-target interactions

## Exploratory Data Analysis

Original article: Computational prediction of drug–target interactions using chemogenomic approaches: an empirical survey, A. Ezzat, others.
<br>Data link: http://web.kuicr.kyoto-u.ac.jp/supp/yoshi/drugtarget/

<p>

Data supplement. Organic molecules (QM9 file): https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/molnet_publish/qm9.zip


## 1 - Pre-setup

### 1.1 - Imports (dependencies)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
import os
import json
import requests
from tqdm import tqdm
import time



from io import StringIO #retrive information for mlflow
import sys #retrive information for mlflow




#Chemistry Libraries
from rdkit import Chem
from rdkit.Chem import AllChem



# Pubchem DB API https://pubchem.ncbi.nlm.nih.gov/compound/5388962
import pubchempy as pcp # to retrive features and SMILES

C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


## 2 - Data imports

### 2.1 - Predicted drug-target interaction networks

Five types of data:
<ul>
<li>Predicted compound-protein interaction pairs</li>
<li>Binary relation list of the gold standard drug-target interaction data</li>
<li>Adjacency matrix of the gold standard drug-target interaction data</li>
<li>Compound structure similarity matrix</li>
<li>Protein sequence similarity matrix</li>
</ul>

In [2]:
# Path configuration used by the cells below.
# base_dir is the current working directory of the kernel (not the location
# of the notebook file), so data paths are resolved relative to wherever the
# notebook was launched from.
base_dir = os.getcwd()

# Names of the dataset families; each one is used as a sub-directory name
# when the data file paths are built further down.
ligants_type = [
    'enzyme',
    'GPCR',
    'ion_channel',
    'nuclear_receptor',
]

### 2.1.1 - GPCR

In [3]:
#GPCR
# Load the three GPCR matrices (adjacency matrix plus the two similarity
# matrices) from <base_dir>/data/split/<ltype>/ into DataFrames.

ltype = ligants_type[1]

# Maps the name of the DataFrame to create -> tab-separated file that holds it.
files_matrix_temp = {
    'df_adjacency_matrix_GPCR_Y': 'gpcr_admat_dgc.txt',
    'df_similarity_matrix_GPCR_compound_St': 'gpcr_simmat_dc.txt',
    'df_similarity_matrix_GPCR_protein_Sd': 'gpcr_simmat_dg.txt',
}

df_temp_matrix = {}

for df_name, file_name in files_matrix_temp.items():
    # Construct the file path using base_dir
    file_path = os.path.join(base_dir, 'data', 'split', ltype, file_name)

    try:
        print("Trying to read file at:", file_path)  # Print the path for verification
        # Tab-separated text; the first column is used as the row index.
        data_frame = pd.read_csv(file_path, delimiter='\t', index_col=0)
    except FileNotFoundError:
        print(f'File not found at the specified path: {file_path}')
        # Stop here with the real cause. Swallowing the error would only
        # postpone the failure to the dictionary look-ups below, where it
        # would surface as an unrelated-looking KeyError.
        raise
    else:
        df_temp_matrix[df_name] = data_frame

# Expose each matrix under its own module-level name for the following cells.
df_adjacency_matrix_GPCR_Y = df_temp_matrix['df_adjacency_matrix_GPCR_Y']
df_similarity_matrix_GPCR_compound_St = df_temp_matrix['df_similarity_matrix_GPCR_compound_St']
df_similarity_matrix_GPCR_protein_Sd = df_temp_matrix['df_similarity_matrix_GPCR_protein_Sd']

Trying to read file at: C:\Users\riskf\OneDrive\DrugTargetSmilesBERT\data\split\GPCR\gpcr_admat_dgc.txt
Trying to read file at: C:\Users\riskf\OneDrive\DrugTargetSmilesBERT\data\split\GPCR\gpcr_simmat_dc.txt
Trying to read file at: C:\Users\riskf\OneDrive\DrugTargetSmilesBERT\data\split\GPCR\gpcr_simmat_dg.txt


In [4]:
# Adjacency matrix Y: report its dimensions and how many cells hold a known
# interaction (value 1) versus everything else.
n_rows_Y, n_cols_Y = df_adjacency_matrix_GPCR_Y.shape
total_cells_Y = df_adjacency_matrix_GPCR_Y.size

print('Lines (m): {}'.format(n_rows_Y))
print('Columns (n): {}'.format(n_cols_Y))
print('Size (m x n): {}'.format(total_cells_Y))

# NOTE: the variable name says "enzimes" but it counts interactions in the
# GPCR matrix; the name is kept unchanged in case later cells read it.
number_interactions_enzimes = (df_adjacency_matrix_GPCR_Y.values == 1).sum()
n_without_interaction_Y = total_cells_Y - number_interactions_enzimes

print('Known interactions: {}'.format(number_interactions_enzimes))
print('Known interactions (%): {:.4f}%'.format(number_interactions_enzimes / total_cells_Y * 100))
print('No interactions: {}'.format(n_without_interaction_Y))
print('No interactions(%): {:.4f}%'.format(n_without_interaction_Y / total_cells_Y * 100))


#print(df_adjacency_matrix_GPCR_Y.head(5))

Lines (m): 95
Columns (n): 223
Size (m x n): 21185
Known interactions: 635
Known interactions (%): 2.9974%
No interactions: 20550
No interactions(%): 97.0026%


In [5]:
# Compound similarity matrix: report its dimensions.
n_rows_compound, n_cols_compound = df_similarity_matrix_GPCR_compound_St.shape

print('Lines (m): {}'.format(n_rows_compound))
print('Columns (n): {}'.format(n_cols_compound))
print('Size (m x n): {}'.format(df_similarity_matrix_GPCR_compound_St.size))

#print(df_simmilarity_matrix_GPCR_compound_Sd.head(5))

Lines (m): 223
Columns (n): 223
Size (m x n): 49729


In [6]:
# Protein similarity matrix: report its dimensions.
n_rows_protein, n_cols_protein = df_similarity_matrix_GPCR_protein_Sd.shape

print('Lines (m): {}'.format(n_rows_protein))
print('Columns (n): {}'.format(n_cols_protein))
print('Size (m x n): {}'.format(df_similarity_matrix_GPCR_protein_Sd.size))

#print(df_simmilarity_matrix_GPCR_protein_St.head(5))

Lines (m): 95
Columns (n): 95
Size (m x n): 9025


### Logistic matrix factorization

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics import log_loss
import scipy.optimize as opt

K = 50  # Dimensionality of the latent feature space
LMF_SEED = 42  # Fixed seed: makes the factorization (and the CSV built from it) reproducible on re-run

# Dense NumPy array of the interaction matrix, used for element-wise operations.
Y = csr_matrix(df_adjacency_matrix_GPCR_Y.values).toarray()

# Initialize the factor matrices with small random values in [0, 0.01).
# A has one K-dimensional row per row of Y, B one per column of Y.
# A dedicated seeded generator is used instead of the global np.random state
# so that Restart & Run All always starts from the same initial factors.
lmf_rng = np.random.default_rng(LMF_SEED)
A = lmf_rng.random((Y.shape[0], K)) * 0.01
B = lmf_rng.random((Y.shape[1], K)) * 0.01

def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The naive formula evaluates ``exp(-x)``, which overflows (emitting a
    RuntimeWarning) once ``x`` is a large negative number. Here the
    exponential is always taken of a non-positive value, so it stays in
    ``(0, 1]`` for any finite input.

    Parameters
    ----------
    x : scalar or array-like
        Input value(s).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Element-wise sigmoid of ``x``, with the same shape as ``x``.
    """
    x = np.asarray(x, dtype=float)
    z = np.exp(-np.abs(x))  # exp of a non-positive number: cannot overflow
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- same function.
    result = np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))
    # [()] turns a 0-d array back into a NumPy scalar and leaves n-d arrays as arrays.
    return result[()]

def logistic_loss_and_grad(A, B, Y):
    """Evaluate the logistic loss of ``sigmoid(A @ B.T)`` against ``Y`` and its gradients.

    Parameters
    ----------
    A, B : numpy.ndarray
        Factor matrices; ``A.dot(B.T)`` must have the same shape as ``Y``.
    Y : numpy.ndarray
        Target matrix with 0/1 entries.

    Returns
    -------
    tuple
        ``(loss, grad_A, grad_B)`` where ``loss`` is the value returned by
        ``log_loss`` on the flattened targets/predictions, ``grad_A`` has the
        shape of ``A`` and ``grad_B`` the shape of ``B``.

    Notes
    -----
    The gradients are ``(P - Y) B`` and ``(P - Y)^T A``, i.e. those of the
    cross-entropy summed over all cells. ``log_loss`` normalises by the number
    of samples by default, so the reported loss and the gradients presumably
    differ by that constant factor.
    """
    probs = sigmoid(np.dot(A, B.T))
    residual = probs - Y

    # labels=[0, 1] keeps log_loss well-defined even if Y holds a single class.
    loss = log_loss(Y.ravel(), probs.ravel(), labels=[0, 1])

    return loss, np.dot(residual, B), np.dot(residual.T, A)

def update_matrices(A, B, Y, learning_rate=0.01, max_iter=100, verbose=True):
    """Fit the factor matrices with plain full-batch gradient descent.

    Parameters
    ----------
    A, B : array-like
        Initial factor matrices. They are copied (as float arrays), so the
        caller's arrays are left untouched and the call can be repeated from
        the same starting point.
    Y : numpy.ndarray
        Target matrix passed through to ``logistic_loss_and_grad``.
    learning_rate : float, default 0.01
        Step size applied to both gradients.
    max_iter : int, default 100
        Number of gradient steps.
    verbose : bool, default True
        When true, print the loss of every iteration. The printed value is
        the loss measured *before* that iteration's update is applied.

    Returns
    -------
    tuple of numpy.ndarray
        The updated ``(A, B)``.
    """
    # np.array(...) copies by default: avoids mutating the caller's arrays in
    # place and guarantees a float dtype for the in-place updates below.
    A = np.array(A, dtype=float)
    B = np.array(B, dtype=float)

    for _ in range(max_iter):
        loss, grad_A, grad_B = logistic_loss_and_grad(A, B, Y)
        A -= learning_rate * grad_A
        B -= learning_rate * grad_B
        if verbose:
            print(f"Loss: {loss}")

    return A, B

# Train the factor matrices on the interaction matrix Y.
A, B = update_matrices(A, B, Y)

# Reconstruct a score in (0, 1) for every cell of Y from the fitted factors.
Y_complete = sigmoid(A.dot(B.T))

# Generating the final dataset
def generate_final_dataset(A, B, Y):
    """Build one sample per cell ``(i, j)`` of ``Y`` from the factor matrices.

    Each output row is ``[A[i, :], B[j, :], Y[i, j]]``; rows are emitted in
    row-major order (all ``j`` for ``i = 0``, then ``i = 1``, ...).

    Parameters
    ----------
    A : numpy.ndarray, shape (>= Y.shape[0], K_a)
        Latent features indexed like the rows of ``Y``.
    B : numpy.ndarray, shape (>= Y.shape[1], K_b)
        Latent features indexed like the columns of ``Y``.
    Y : numpy.ndarray, 2-D
        Interaction labels.

    Returns
    -------
    numpy.ndarray, shape (Y.size, K_a + K_b + 1)
        The assembled dataset; the last column holds the label. For an empty
        ``Y`` the result has zero rows but still the full column count.

    Raises
    ------
    IndexError
        If ``A`` or ``B`` has fewer rows than ``Y`` has rows / columns.

    Notes
    -----
    NOTE(review): earlier code called the ``A`` part "drug" features and the
    ``B`` part "target" features. For the GPCR matrix loaded in this notebook
    the rows of ``Y`` appear to be proteins and the columns compounds (going
    by the printed shapes) -- confirm the orientation before labelling the
    output columns.
    """
    A = np.asarray(A)
    B = np.asarray(B)
    Y = np.asarray(Y)
    n_rows, n_cols = Y.shape

    if A.shape[0] < n_rows or B.shape[0] < n_cols:
        raise IndexError(
            f"Y has shape {Y.shape} but A has {A.shape[0]} rows and B has {B.shape[0]} rows"
        )

    # Vectorised equivalent of the nested (i, j) loop:
    # every row of A is repeated once per column of Y ...
    row_features = np.repeat(A[:n_rows], n_cols, axis=0)
    # ... while the whole block of B rows is cycled once per row of Y.
    col_features = np.tile(B[:n_cols], (n_rows, 1))
    # Row-major flattening matches the (i outer, j inner) ordering above.
    labels = Y.reshape(n_rows * n_cols, 1)

    return np.hstack([row_features, col_features, labels])


Loss: 0.6937359894406214
Loss: 0.6931273986222292
Loss: 0.6929267727741995
Loss: 0.6925046674548783
Loss: 0.6913190976880532
Loss: 0.6879595484208064
Loss: 0.6785190998487731
Loss: 0.6526525972352707
Loss: 0.5866768594138374
Loss: 0.44855498883307626
Loss: 0.27005361936980454
Loss: 0.16812724687923258
Loss: 0.1354147370693603
Loss: 0.12320513084769352
Loss: 0.11723416128476756
Loss: 0.11376325478302614
Loss: 0.11151875309769903
Loss: 0.10996033262414187
Loss: 0.10882108122486892
Loss: 0.10795476691421829
Loss: 0.10727512571700004
Loss: 0.10672830161341132
Loss: 0.10627909299453787
Loss: 0.10590358925874323
Loss: 0.1055850047947189
Loss: 0.10531121251487327
Loss: 0.10507322601785624
Loss: 0.10486423330123498
Loss: 0.10467896258380817
Loss: 0.10451325433189679
Loss: 0.10436376484484844
Loss: 0.10422775582648804
Loss: 0.10410294135627357
Loss: 0.10398737386640325
Loss: 0.10387935699457139
Loss: 0.1037773771155367
Loss: 0.10368004787427565
Loss: 0.10358606368675102
Loss: 0.1034941592637981

In [8]:
## OUTPUT matrix. Important for next steps.
# Build the per-cell dataset from the fitted factors and write it as CSV
# into the same data/split/<ltype> directory the input matrices came from.

# Densify Y only if it is still a scipy CSR matrix; otherwise use it unchanged.
if isinstance(Y, csr_matrix):
    Y_dense = Y.toarray()
else:
    Y_dense = Y

final_dataset = generate_final_dataset(A, B, Y_dense)
final_df = pd.DataFrame(final_dataset)

# The latent dimensionality K is encoded in the file name.
file_name = f'final_new_par_LMF_{K}.csv'
file_path = os.path.join(base_dir, 'data', 'split', ltype, file_name)

# index=False: the positional row index is not written to the file.
final_df.to_csv(file_path, index=False)
print(f"Final dataset saved at {file_path}!")

Final dataset saved at C:\Users\riskf\OneDrive\DrugTargetSmilesBERT\data\split\GPCR\final_new_par_LMF_50.csv!


In [9]:
# Plain-text dump of the generated dataset; pandas elides the middle rows and
# wraps the columns across several text blocks.
# NOTE(review): ending the cell with a bare `final_df.head()` would render a
# compact rich table instead -- consider switching.
print(final_df)

            0         1         2         3         4         5         6    \
0     -0.622757 -0.427074 -0.546902 -0.681086 -0.616897 -0.399852 -0.646469   
1     -0.622757 -0.427074 -0.546902 -0.681086 -0.616897 -0.399852 -0.646469   
2     -0.622757 -0.427074 -0.546902 -0.681086 -0.616897 -0.399852 -0.646469   
3     -0.622757 -0.427074 -0.546902 -0.681086 -0.616897 -0.399852 -0.646469   
4     -0.622757 -0.427074 -0.546902 -0.681086 -0.616897 -0.399852 -0.646469   
...         ...       ...       ...       ...       ...       ...       ...   
21180 -0.622019 -0.426315 -0.545463 -0.684833 -0.615999 -0.395611 -0.650696   
21181 -0.622019 -0.426315 -0.545463 -0.684833 -0.615999 -0.395611 -0.650696   
21182 -0.622019 -0.426315 -0.545463 -0.684833 -0.615999 -0.395611 -0.650696   
21183 -0.622019 -0.426315 -0.545463 -0.684833 -0.615999 -0.395611 -0.650696   
21184 -0.622019 -0.426315 -0.545463 -0.684833 -0.615999 -0.395611 -0.650696   

            7         8         9    ...       91  