In [2]:
import rdkit
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.EState import Fingerprinter
from rdkit.Chem import Descriptors
from rdkit.Chem import rdFMCS
from rdkit.Chem.rdmolops import RDKFingerprint
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit import DataStructs
from rdkit.Avalon.pyAvalonTools import GetAvalonFP
import numpy as np
import pandas as pd
import pubchempy as pc

ModuleNotFoundError: No module named 'pubchempy'

In [None]:
def get_similarity(a,b):
    """
    Compare two fingerprints using the Tanimoto metric.

    Thin wrapper around DataStructs.FingerprintSimilarity with the metric
    fixed to DataStructs.TanimotoSimilarity.

    Args:
        a: first fingerprint
        b: second fingerprint

    Returns:
        the value returned by DataStructs.FingerprintSimilarity for (a, b)
    """
    tanimoto_metric = DataStructs.TanimotoSimilarity
    score = DataStructs.FingerprintSimilarity(a, b, metric=tanimoto_metric)
    return score


# TODO: import Ellie's sid_to_smiles() function, which input_to_compound() below relies on
def input_to_compound(PubChemSID: str, master_df: pd.DataFrame) -> pd.DataFrame:
    """
    Score every compound in master_df against an input compound.

    The input compound is looked up by its PubChem SID, converted to SMILES
    with sid_to_smiles() and fingerprinted with FingerprintMols.FingerprintMol.
    Each row's fingerprint is compared to it with get_similarity(), and the
    scores are additionally averaged over all rows sharing the same enzyme
    'entry'.

    Note: the new columns keep their "dist" names, but the values come from
    get_similarity(), i.e. they are similarity scores rather than distances
    -- TODO confirm the naming with downstream users.

    Args:
        PubChemSID (str): PubChem SID of the compound to be queried
        master_df (pd.DataFrame): pre-calculated model dataframe; must contain
            the columns 'Fingerprint' and 'entry'

    Returns:
        pd.DataFrame: the same master_df object (modified in place) with two new columns
        1) 'dist_from_input': score between the input compound and each row's compound
        2) 'avg_dist_from_input': mean of those scores over the row's 'entry' group

    Raises:
        ValueError: if the SMILES obtained for PubChemSID cannot be parsed
    """

    input_SMILES = sid_to_smiles(PubChemSID)  # Ellie's function
    input_mol = Chem.MolFromSmiles(input_SMILES)
    if input_mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here with
        # a readable message instead of deep inside the fingerprinting call.
        raise ValueError(
            f"Could not parse SMILES {input_SMILES!r} for PubChem SID {PubChemSID!r}"
        )
    input_fingerprint = FingerprintMols.FingerprintMol(input_mol)

    # Score between the input compound and every compound, in row order.
    similarity = pd.Series(
        [get_similarity(fingerprint, input_fingerprint) for fingerprint in master_df['Fingerprint']],
        index=master_df.index,
        dtype=float,
    )

    # Per-enzyme average, broadcast back to one value per row. transform() keeps
    # the frame's own row order, so the result stays aligned with its rows even
    # when equal 'entry' values are not stored sorted and contiguous.
    # Grouping by the raw array keeps the grouping positional.
    avg_similarity = similarity.groupby(master_df['entry'].to_numpy()).transform('mean')

    # Assign as plain arrays so the columns are written positionally.
    master_df['dist_from_input'] = similarity.to_numpy()
    master_df['avg_dist_from_input'] = avg_similarity.to_numpy()

    return master_df
    
    

In [27]:
# Load the cleaned KEGG-to-PubChem dataset (path is relative to this notebook)
# and preview its first 10 rows.
promiscuous_csv_path = "../datasets/promiscuous_cleaned_KEGGtoPubChem.csv"
df = pd.read_csv(promiscuous_csv_path)
df.head(10)

Unnamed: 0.1,Unnamed: 0,entry,KEGG,formula,PubChem
0,0,1.1.1.1,C00071,CHOR,3371.0
1,2,1.1.1.1,C00226,CH3OR,3526.0
2,3,1.1.1.1,C01450,COR2,4627.0
3,4,1.1.1.1,C01612,CH2OR2,4764.0
4,5,1.1.1.110,C03964,C9H10O4,6685.0
5,7,1.1.1.110,C00331,C11H9NO3,3625.0
6,8,1.1.1.110,C01179,C9H8O4,4406.0
7,10,1.1.1.110,C22006,C11H11NO3,
8,11,1.1.1.110,C05607,C9H10O3,7930.0
9,16,1.1.1.153,C02953,C9H13N5O3,5871.0


In [29]:
# Sanity check of the per-enzyme averaging logic used in input_to_compound() above.
# NOTE: the resulting column does not make sense scientifically (it averages
# PubChem IDs); it only exists to check that the averaging works.
#
# Every row receives sum(PubChem) / number of rows of its enzyme 'entry' group.
# A missing PubChem value is deliberately NOT skipped, so it turns the whole
# group's average into NaN.
# groupby().transform() returns one value per row in the frame's own row order,
# so the result lines up with its rows even when equal 'entry' values are not
# stored sorted and contiguous.
master_avg_dist = (
    df.groupby('entry')['PubChem']
    .transform(lambda pubchem_ids: pubchem_ids.sum(skipna=False) / len(pubchem_ids))
)

df['master_avg_dist'] = master_avg_dist

df

Unnamed: 0.1,Unnamed: 0,entry,KEGG,formula,PubChem,master_avg_dist
0,0,1.1.1.1,C00071,CHOR,3371.0,4.072000e+03
1,2,1.1.1.1,C00226,CH3OR,3526.0,4.072000e+03
2,3,1.1.1.1,C01450,COR2,4627.0,4.072000e+03
3,4,1.1.1.1,C01612,CH2OR2,4764.0,4.072000e+03
4,5,1.1.1.110,C03964,C9H10O4,6685.0,
5,7,1.1.1.110,C00331,C11H9NO3,3625.0,
6,8,1.1.1.110,C01179,C9H8O4,4406.0,
7,10,1.1.1.110,C22006,C11H11NO3,,
8,11,1.1.1.110,C05607,C9H10O3,7930.0,
9,16,1.1.1.153,C02953,C9H13N5O3,5871.0,4.998250e+03
