# Computational prediction of drug-target interactions

## Exploratory Data Analysis

Original article: A. Ezzat et al., "Computational prediction of drug–target interactions using chemogenomic approaches: an empirical survey".
<br>Data link: http://web.kuicr.kyoto-u.ac.jp/supp/yoshi/drugtarget/

<p>

Data supplement. Organic molecules (QM9 file): https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/molnet_publish/qm9.zip


## 1 - Pre-setup

### 1.1 - Imports (dependencies)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
import os
import json
import requests
from tqdm import tqdm
import time


from io import StringIO #retrive information for mlflow
import sys #retrive information for mlflow

#Chemistry Libraries
from rdkit import Chem
from rdkit.Chem import AllChem



# Pubchem DB API https://pubchem.ncbi.nlm.nih.gov/compound/5388962
import pubchempy as pcp # to retrive features and SMILES

C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


## 2 - Data imports

### 2.1 - Predicted drug-target interaction networks

Five types of data:
<ul>
  <li>Predicted compound-protein interaction pairs</li>
  <li>Binary relation list of the gold standard drug-target interaction data</li>
  <li>Adjacency matrix of the gold standard drug-target interaction data</li>
  <li>Compound structure similarity matrix</li>
  <li>Protein sequence similarity matrix</li>
</ul>

In [2]:
# Directory the notebook runs from; data paths are built relative to it.
base_dir = os.getcwd()

# Target-family names, used as sub-directory names under data/split/.
# NOTE(review): 'enzyme' appears twice (positions 0 and 2) — confirm whether
# one of the two entries was meant to be a different target family.
ligants_type = [
    'enzyme',
    'GPCR',
    'enzyme',
    'nuclear_receptor',
]

### 2.1.1 - Enzyme

In [3]:
# Enzyme: load the adjacency matrix and the two similarity matrices.

ltype = ligants_type[0]

# DataFrame name -> file name inside data/split/<ltype>/.
files_matrix_temp = {
    'df_adjacency_matrix_enzyme_Y': 'e_admat_dgc.txt',
    'df_similarity_matrix_enzyme_compound_St': 'e_simmat_dc.txt',
    'df_similarity_matrix_enzyme_protein_Sd': 'e_simmat_dg.txt',
}

df_temp_matrix = {}
missing_files = []  # paths that could not be found, reported together below

for df_name, file_name in files_matrix_temp.items():
    # Construct the file path using base_dir
    file_path = os.path.join(base_dir, 'data', 'split', ltype, file_name)

    try:
        print("Trying to read file at:", file_path)  # Print the path for verification
        # Tab-separated file; the first column becomes the row index.
        data_frame = pd.read_csv(file_path, delimiter='\t', index_col=0)
    except FileNotFoundError:
        print(f'File not found at the specified path: {file_path}')
        missing_files.append(file_path)
    else:
        df_temp_matrix[df_name] = data_frame

# Fail here with the real cause instead of letting the lookups below raise an
# unrelated KeyError for a table that was never loaded.
if missing_files:
    raise FileNotFoundError(
        'Missing input file(s): ' + ', '.join(missing_files)
    )

df_adjacency_matrix_enzyme_Y = df_temp_matrix['df_adjacency_matrix_enzyme_Y']
df_similarity_matrix_enzyme_compound_St = df_temp_matrix['df_similarity_matrix_enzyme_compound_St']
df_similarity_matrix_enzyme_protein_Sd = df_temp_matrix['df_similarity_matrix_enzyme_protein_Sd']

Trying to read file at: C:\Users\riskf\OneDrive\DrugTargetSmilesBERT\data\split\enzyme\e_admat_dgc.txt
Trying to read file at: C:\Users\riskf\OneDrive\DrugTargetSmilesBERT\data\split\enzyme\e_simmat_dc.txt
Trying to read file at: C:\Users\riskf\OneDrive\DrugTargetSmilesBERT\data\split\enzyme\e_simmat_dg.txt


In [4]:
# Adjacency matrix Y: report its dimensions and how many cells equal 1.
print(f'Lines (m): {df_adjacency_matrix_enzyme_Y.shape[0]}')
print(f'Columns (n): {df_adjacency_matrix_enzyme_Y.shape[1]}')
print(f'Size (m x n): {df_adjacency_matrix_enzyme_Y.size}')

# A cell equal to 1 is counted as a known interaction; everything else as none.
number_interactions_enzimes = (df_adjacency_matrix_enzyme_Y.values == 1).sum()
print(f'Known interactions: {number_interactions_enzimes}')
print(f'Known interactions (%): {number_interactions_enzimes/df_adjacency_matrix_enzyme_Y.size*100:.4f}%')
print(f'No interactions: {df_adjacency_matrix_enzyme_Y.size-number_interactions_enzimes}')
print(f'No interactions(%): {(df_adjacency_matrix_enzyme_Y.size-number_interactions_enzimes)/df_adjacency_matrix_enzyme_Y.size*100:.4f}%')


#print(df_adjacency_matrix_enzyme_Y.head(5))

Lines (m): 664
Columns (n): 445
Size (m x n): 295480
Known interactions: 2926
Known interactions (%): 0.9903%
No interactions: 292554
No interactions(%): 99.0097%


In [5]:
# Compound similarity matrix: report its dimensions.
print(f'Lines (m): {df_similarity_matrix_enzyme_compound_St.shape[0]}')
print(f'Columns (n): {df_similarity_matrix_enzyme_compound_St.shape[1]}')
print(f'Size (m x n): {df_similarity_matrix_enzyme_compound_St.size}')

#print(df_simmilarity_matrix_enzyme_compound_Sd.head(5))

Lines (m): 445
Columns (n): 445
Size (m x n): 198025


In [6]:
# Protein similarity matrix: report its dimensions.
print(f'Lines (m): {df_similarity_matrix_enzyme_protein_Sd.shape[0]}')
print(f'Columns (n): {df_similarity_matrix_enzyme_protein_Sd.shape[1]}')
print(f'Size (m x n): {df_similarity_matrix_enzyme_protein_Sd.size}')

#print(df_simmilarity_matrix_enzyme_protein_St.head(5))

Lines (m): 664
Columns (n): 664
Size (m x n): 440896


### Logistic matrix factorization

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics import log_loss
import scipy.optimize as opt

K = 50  # Dimensionality of the latent feature space
LMF_SEED = 42  # Fixed seed so the random initialisation (and the fit) is reproducible

# The adjacency DataFrame is already dense, so take its values directly
# (as a copy) instead of round-tripping through a sparse matrix.
Y = df_adjacency_matrix_enzyme_Y.to_numpy(copy=True)

# Initialize matrices A and B with small random values in [0, 0.01):
# one K-dimensional latent vector per row of Y (A) and per column of Y (B).
rng = np.random.default_rng(LMF_SEED)
A = rng.random((Y.shape[0], K)) * 0.01
B = rng.random((Y.shape[1], K)) * 0.01

def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)), computed stably.

    Parameters
    ----------
    x : scalar or array-like
        Input value(s); converted to a float64 array.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Values in [0, 1] with the same shape as ``x``.

    Notes
    -----
    The naive formula evaluates ``exp(-x)``, which overflows for large
    negative ``x``. Here the exponential is only ever taken of a non-positive
    number, so it stays in [0, 1] and cannot overflow.
    """
    x = np.asarray(x, dtype=float)
    e = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) — same function.
    return np.where(x >= 0, 1.0, e) / (1.0 + e)

def logistic_loss_and_grad(A, B, Y):
    """Evaluate the logistic factorisation A·Bᵀ against Y.

    Parameters
    ----------
    A, B : arrays whose product ``A.dot(B.T)`` has the same shape as Y.
    Y : array of 0/1 labels.

    Returns
    -------
    (loss, grad_A, grad_B)
        ``loss`` is the value reported by ``log_loss`` on the flattened
        labels and predictions; ``grad_A`` and ``grad_B`` are
        ``(sigmoid(A·Bᵀ) - Y)·B`` and ``(sigmoid(A·Bᵀ) - Y)ᵀ·A``.
    """
    probabilities = sigmoid(A @ B.T)
    loss = log_loss(Y.ravel(), probabilities.ravel(), labels=[0,1])

    # Residual between predicted probability and label drives both gradients.
    residual = probabilities - Y
    return loss, residual @ B, residual.T @ A

def update_matrices(A, B, Y, learning_rate=0.01, max_iter=100):
    """Run plain gradient descent on the factor matrices A and B.

    Parameters
    ----------
    A, B : array-like
        Initial factor matrices. They are copied, so the caller's arrays are
        left untouched.
    Y : array
        Target matrix passed through to ``logistic_loss_and_grad``.
    learning_rate : float
        Step size applied to both gradients.
    max_iter : int
        Number of gradient steps.

    Returns
    -------
    (A, B) : tuple of numpy.ndarray
        The updated float copies of the factor matrices.
    """
    # Work on float copies: `-=` below would otherwise overwrite the caller's
    # arrays in place (and fail outright on integer-typed inputs).
    A = np.array(A, dtype=float)
    B = np.array(B, dtype=float)

    for _ in range(max_iter):
        loss, grad_A, grad_B = logistic_loss_and_grad(A, B, Y)
        A -= learning_rate * grad_A
        B -= learning_rate * grad_B
        # The printed loss is the one computed before this step's update.
        print(f"Loss: {loss}")

    return A, B

# Train the factorisation: gradient-descent updates of A and B against Y.
A, B = update_matrices(A, B, Y)

# Score every (row, column) pair of Y from the fitted factors.
Y_complete = sigmoid(A @ B.T)

# Generating the final dataset
def generate_final_dataset(A, B, Y):
    """Build one feature row per cell of Y: ``[A[i, :], B[j, :], Y[i, j]]``.

    Rows are ordered with ``i`` (row of Y) as the outer index and ``j``
    (column of Y) as the inner index, i.e. row ``i * Y.shape[1] + j`` of the
    result describes the pair ``(i, j)``.

    Parameters
    ----------
    A : numpy.ndarray
        2-D array with at least ``Y.shape[0]`` rows.
    B : numpy.ndarray
        2-D array with at least ``Y.shape[1]`` rows.
    Y : array-like
        2-D matrix whose cell values become the last column.

    Returns
    -------
    numpy.ndarray
        2-D array of shape
        ``(Y.shape[0] * Y.shape[1], A.shape[1] + B.shape[1] + 1)``.
        An empty Y yields zero rows with that same column count.
    """
    Y = np.asarray(Y)
    n_rows, n_cols = Y.shape

    # Each row of A is repeated once per column of Y (i is the slow index) ...
    drug_block = np.repeat(A[:n_rows], n_cols, axis=0)
    # ... while the rows of B cycle once per row of Y (j is the fast index).
    target_block = np.tile(B[:n_cols], (n_rows, 1))
    # Row-major flattening of Y matches the (i outer, j inner) ordering.
    labels = Y.reshape(-1, 1)

    return np.hstack([drug_block, target_block, labels])


In [8]:
## OUTPUT matrix (marked by the author as important for the next steps).
# Densify Y only if it is a CSR matrix; otherwise use it as it is.
if isinstance(Y, csr_matrix):
    Y_dense = Y.toarray()
else:
    Y_dense = Y

# One row per cell of Y: features from A, features from B, then the label.
final_dataset = generate_final_dataset(A, B, Y_dense)
final_df = pd.DataFrame(final_dataset)

# Write the CSV next to the input files of the current target family.
file_name = f'final_new_par_LMF_{K}.csv'
file_path = os.path.join(base_dir, 'data', 'split', ltype, file_name)
final_df.to_csv(file_path, index=False)
print(f"Final dataset saved at {file_path}!")

Final dataset saved at C:\Users\riskf\OneDrive\DrugTargetSmilesBERT\data\split\enzyme\final_new_par_LMF_50.csv!


In [9]:
# Print the plain-text representation of the exported dataset for a quick visual check.
print(final_df)

             0         1         2         3         4         5         6    \
0       0.378580  0.279222  0.529452  0.224391  0.269244  0.288321  0.116587   
1       0.378580  0.279222  0.529452  0.224391  0.269244  0.288321  0.116587   
2       0.378580  0.279222  0.529452  0.224391  0.269244  0.288321  0.116587   
3       0.378580  0.279222  0.529452  0.224391  0.269244  0.288321  0.116587   
4       0.378580  0.279222  0.529452  0.224391  0.269244  0.288321  0.116587   
...          ...       ...       ...       ...       ...       ...       ...   
295475  0.233042  0.225892  0.219956  0.389313  0.257460  0.331327  0.301114   
295476  0.233042  0.225892  0.219956  0.389313  0.257460  0.331327  0.301114   
295477  0.233042  0.225892  0.219956  0.389313  0.257460  0.331327  0.301114   
295478  0.233042  0.225892  0.219956  0.389313  0.257460  0.331327  0.301114   
295479  0.233042  0.225892  0.219956  0.389313  0.257460  0.331327  0.301114   

             7         8         9    .