# 3D Molecule Embedding

## Importing Libraries and Functions

In [157]:
import numpy as np
import embedding_3d as e3
import pandas as pd
from rdkit import Chem
import pickle


import h5py
from scipy.spatial import ConvexHull
from tqdm import tqdm as tq


import importlib

## Defining Constants

In [67]:
# --- Input location -------------------------------------------------------------------------
# Excel workbook holding the active-compound database (path is relative to this notebook).
Activedatabase_Dir = r'../active_database/Database_v4.xlsx'

# --- Per-element lookup tables, keyed by element symbol -------------------------------------
# Covalent radii, in angstroms.
covalent_atomic_radi = {
    'C': 0.76,
    'N': 0.71,
    'H': 0.31,
    'F': 0.71,
    'O': 0.66,
    'S': 1.05,
    'Cl': 1.02,
    'Br': 1.2,
    'I': 1.39,
    'P': 1.07,
}

# Atomic masses (same element set as the radii table).
atomic_mass = {
    'C': 12.011,
    'N': 14.007,
    'H': 1.008,
    'F': 18.998,
    'O': 15.999,
    'S': 32.06,
    'Cl': 35.45,
    'Br': 79.904,
    'I': 126.90,
    'P': 30.974,
}

# Four-component colour per element (presumably RGBA in the 0-1 range -- TODO confirm).
# NOTE(review): there is no 'F' entry here although the two tables above have one.
atomic_colors = {
    'C': [1, 1, 1, 0.95],
    'N': [0, 0, 1, 0.95],
    'H': [1, 1, 1, 0.75],
    'O': [1, 0, 0, 0.95],
    'S': [1, 1, 0, 0.95],
    'Cl': [0, 1, 0, 0.95],
    'Br': [0, 1, 0, 0.95],
    'I': [0, 1, 0, 0.95],
    'P': [0, 1, 0, 0.95],
}

# --- Antibiotic activity classification -----------------------------------------------------
# Cut-off values for determining antibiotic activity, per measurement type:
# [cutoff between high and moderate activity, cutoff between moderate and no activity]
antibiotic_activity_cutoffs = {
    'MIC (uM)': [100, 200],
    'rZOI (mm)': [0, 1],
}

# Values used to represent high-activity, mid-activity, and low-activity molecules.
activity_hml_value = [1.5, 1, 0]

## Generating 3D embeddings

#### Reading Indoles from database

In [135]:
# Parse the activity database workbook (new database parsing function, 11/18/23).
# NOTE(review): the third argument is the literal [2, 1, 0]; the `activity_hml_value`
# list ([1.5, 1, 0]) defined in the constants cell is not used here -- confirm which is intended.
l_Indoles = e3.ParseDB_excel(
    Activedatabase_Dir,
    antibiotic_activity_cutoffs,
    [2, 1, 0],
)

Parsing database sheets:
Experimental Values
Litrature Indole (Benz) Values
Litrature Quinilones
Performing initial molecule embedding


100%|██████████████████████████████████████████████████████████████████████████████████| 228/228 [00:03<00:00, 66.29it/s]
228it [00:00, 3259.53it/s]


#### Registering and embedding molecules

In [145]:
# Build a new list of registered indoles (with new conformers) from the parsed molecules.
# The call returns two values: the registered molecules and a list of molecule IDs, which
# later cells index with the same position.
# NOTE(review): the earlier note on this line said "10 ideal" while the last argument is 1 --
# confirm what the two numeric arguments mean before changing them.
(registered_Indoles, molecule_id_l) = e3.mol_registration(
    l_Indoles,
    0,
    1,
)

Attempting to register and generate conformers


228it [00:08, 26.93it/s]███████████████████████████████████████████████████████████████| 228/228 [00:08<00:00, 22.35it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 228/228 [00:08<00:00, 26.92it/s]


In [147]:
# Generate the 3D embedding for every registered molecule.
# `out_dir` points at a single .h5 file (presumably the HDF5 output -- confirm in embedding_3d);
# the recorded run of this cell reported a (228, 2, 30, 30, 30) shape.
e3.molecule_3d_embedding_v2(
    registered_Indoles,
    atomic_mass=atomic_mass,
    volume_size_px=30,
    batch_size=50,
    out_dir=r'/home/jovyan/data/DataSetOutputs/test_h5_030225/030225_test_30.h5',
    min_max_norm=True,
    check_shape=True,
)

50it [00:59,  1.20s/it]
50it [01:11,  1.44s/it]
50it [01:23,  1.67s/it]
50it [01:18,  1.57s/it]
28it [00:43,  1.55s/it]

228
(228, 2, 30, 30, 30)





## Exporting Drug Activity Data

### Generating Data

In [154]:
# Gram-positive activity of each registered molecule, in the same order as `registered_Indoles`.
Indole_GramPositiveActivities = [indole.GP_activity for indole in registered_Indoles]

# Parallel output lists; an entry is added only for molecules whose activity is not NA.
Indole_GramPositiveActivities_trunc = []  # activity values
molecule_id_trunc = []                    # molecule IDs taken from molecule_id_l
r_Indoles = []                            # registered molecule objects, kept for separate embedding (02/09/25)
SMILES = []                               # SMILES string written from each molecule's RDKit mol

for position, activity in enumerate(Indole_GramPositiveActivities):
    if pd.isna(activity):
        continue  # no activity value -> molecule is left out of the export
    kept_indole = registered_Indoles[position]
    Indole_GramPositiveActivities_trunc.append(activity)
    molecule_id_trunc.append(molecule_id_l[position])
    r_Indoles.append(kept_indole)
    SMILES.append(Chem.MolToSmiles(kept_indole.rdkit_mol))

# Source sheet of each kept molecule; the molecule ID is used as the index into `l_Indoles`.
moleucle_databasesheet_trunc = [l_Indoles[mol_id].DatabaseSheet for mol_id in molecule_id_trunc]

# Assemble the export table one column at a time (column order follows assignment order).
out_df = pd.DataFrame()
out_df['SMILES'] = SMILES
out_df['Molecule ID'] = molecule_id_trunc
out_df['Origin'] = moleucle_databasesheet_trunc  # database sheet name serves as the molecule origin
out_df['Gram-Positive Activity'] = Indole_GramPositiveActivities_trunc

In [155]:
# Show the assembled export table (the cell's last expression is rendered as its output).
out_df

Unnamed: 0,SMILES,Molecule ID,Origin,Gram-Positive Activity
0,[H]/C(=N\c1c([H])c([H])c([H])c([H])c1[H])c1c([...,0,Experimental Values,1.0
1,[H]Oc1c([H])c([H])c(/N=C(\[H])c2c([H])n([H])c3...,1,Experimental Values,1.5
2,[H]/C(=N\C([H])([H])C([H])([H])C([H])([H])[H])...,2,Experimental Values,1.5
3,[H]/C(=N\c1c([H])c([H])c(OC([H])([H])[H])c([H]...,3,Experimental Values,1.0
4,[H]/C(=N\c1c([H])c([H])c(Cl)c([H])c1[H])c1c([H...,4,Experimental Values,0.5
...,...,...,...,...
186,[H]OC(=O)c1c([H])n(C2([H])C([H])([H])C2([H])[H...,223,Litrature Quinilones,2.0
187,[H]OC(=O)c1c([H])n(C2([H])C([H])([H])C2([H])[H...,224,Litrature Quinilones,2.0
188,[H]OC(=O)c1c([H])n(C2([H])C([H])([H])C2([H])[H...,225,Litrature Quinilones,2.0
189,[H]OC(=O)c1c([H])n(-c2c([H])c([H])c(F)c([H])c2...,226,Litrature Quinilones,2.0


In [158]:
# Serialise the export table to disk with pickle.
# NOTE(review): absolute, machine-specific path -- the target directory must already exist.
smiles_pickle_path = "/home/jovyan/data/DataSetOutputs/test_dataset_graphembedding_030825/030825_SMILES_testdata.pickle"
with open(smiles_pickle_path, 'wb') as pickle_file:
    pickle.dump(out_df, pickle_file)