In [6]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
from esol_graph.esol_data import clean_data

In [8]:
from typing import Callable

import dgl
import pandas as pd
import torch
from dgllife.utils import mol_to_bigraph
from pyprojroot import here
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
#from dgllife.utils import smiles_to_bigraph,mol_to_graph, mol_to_bigraph 

In [9]:
# Raw Delaney (ESOL) table, resolved against the project root via pyprojroot.
data = pd.read_csv(here()/"data/delaney_df_revised.csv")
# NOTE(review): clean_data is given a bare relative path, unlike the read above;
# presumably it resolves the path against the project root itself — confirm.
cleaned_data = clean_data('data/delaney_df_revised.csv')

In [10]:
# Side-by-side schema/size check of the raw and cleaned frames (shown as one tuple).
# In the recorded output the cleaned frame has snake_case column names and keeps
# 814 of the 1128 raw rows.
data.columns, data.shape,cleaned_data.columns, cleaned_data.shape

(Index(['Compound ID', 'ESOL predicted log solubility in mols per litre',
        'Minimum Degree', 'Molecular Weight', 'Number of H-Bond Donors',
        'Number of Rings', 'Number of Rotatable Bonds', 'Polar Surface Area',
        'measured log solubility in mols per litre', 'smiles'],
       dtype='object'),
 (1128, 10),
 Index(['compound_id', 'esol_predicted_log_solubility_in_mols_per_litre',
        'minimum_degree', 'molecular_weight', 'number_of_h_bond_donors',
        'number_of_rings', 'number_of_rotatable_bonds', 'polar_surface_area',
        'measured_log_solubility_in_mols_per_litre', 'smiles'],
       dtype='object'),
 (814, 10))

In [7]:
# Only the last expression of a cell is rendered, so the preview has to be
# displayed explicitly; a bare `data.head()` here was silently discarded.
display(data.head())
# First raw SMILES string (the recorded value carries a trailing space).
data.smiles[0]

'OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)C(O)C3O '

In [8]:
# Parse every SMILES string into an RDKit Mol, in row order.
# Iterating the column directly avoids label-based lookups (data.smiles[i]),
# which fail or mis-order as soon as the frame has a non-default index.
# Chem.MolFromSmiles returns None for strings it cannot parse.
mols = [Chem.MolFromSmiles(smi) for smi in data.smiles]


In [13]:
from dgllife.utils import CanonicalAtomFeaturizer,CanonicalBondFeaturizer

In [14]:
# Sanity check: build a featurized DGL graph for the first molecule only.
mol1 = Chem.MolFromSmiles(data.smiles[0])
# atom_data_field names the ndata key that receives the atom features ('feat').
atom_featurizer = CanonicalAtomFeaturizer(atom_data_field='feat')
# Bond features use the featurizer's default field; the recorded output shows
# 74-dim node features under 'feat' and 12-dim edge features under 'e'.
another_graph = mol_to_bigraph(mol = mol1, node_featurizer = atom_featurizer, edge_featurizer = CanonicalBondFeaturizer() )
another_graph

Graph(num_nodes=32, num_edges=68,
      ndata_schemes={'feat': Scheme(shape=(74,), dtype=torch.float32)}
      edata_schemes={'e': Scheme(shape=(12,), dtype=torch.float32)})

In [15]:
from typing import List, Optional

In [16]:
from dgllife.utils import CanonicalAtomFeaturizer,CanonicalBondFeaturizer
def mol2graph(smiles: "str | Chem.Mol") -> dgl.DGLGraph:
    """Convert one molecule into a featurized DGL bigraph.

    Parameters
    ----------
    smiles : str or rdkit.Chem.Mol
        A SMILES string, or an already-parsed RDKit molecule. Strings are
        parsed with ``Chem.MolFromSmiles`` first; any other value is handed
        to ``mol_to_bigraph`` unchanged, so callers that already pass Mol
        objects keep working exactly as before.

    Returns
    -------
    dgl.DGLGraph
        Graph built by ``mol_to_bigraph`` with canonical atom features stored
        under ``ndata['feat']`` and canonical bond features on the edges.
        NOTE(review): the result for an unparsable SMILES (``MolFromSmiles``
        returns ``None``) is whatever ``mol_to_bigraph`` does with ``None`` —
        confirm against the installed dgllife version.
    """
    # mol_to_bigraph expects an RDKit Mol, not text, so parse strings up front.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else smiles
    return mol_to_bigraph(
        mol=mol,
        node_featurizer=CanonicalAtomFeaturizer(atom_data_field='feat'),
        edge_featurizer=CanonicalBondFeaturizer(),
    )
    

In [17]:
from joblib import Parallel, delayed

class SmileData(dgl.data.DGLDataset):
    """In-memory DGL dataset pairing one molecular graph with one label.

    Parameters
    ----------
    labels : torch.Tensor
        One label per molecule, indexed along the first dimension.
    molstrs : List[str]
        Molecule inputs, handed one at a time to ``graph_fn``.
    name : str
        Dataset name forwarded to ``DGLDataset``.
    graph_fn : Callable, optional
        Maps a single molecule input to a graph. Defaults to ``mol2graph``.
    ncpu : int, optional
        Number of joblib workers used to build the graphs. Defaults to 1.

    Raises
    ------
    ValueError
        If ``labels`` and ``molstrs`` do not have the same length.
    """

    def __init__(self,
                 labels: torch.Tensor,
                 molstrs: List[str],
                 name: str,
                 graph_fn: Callable = mol2graph,
                 ncpu: int = 1):
        # Fail early: a mismatch would otherwise only surface later as an
        # IndexError (or a silently ignored tail) inside __getitem__.
        if len(labels) != len(molstrs):
            raise ValueError(
                f"labels and molstrs must have the same length, "
                f"got {len(labels)} and {len(molstrs)}"
            )

        self.molstrs = molstrs
        self.labels = labels
        self.graph_fn = graph_fn
        self.ncpu = ncpu
        # The attributes above must exist before the base constructor runs:
        # DGLDataset.__init__ is what triggers process().
        super().__init__(name=name)

    def process(self):
        """Turn every entry of ``molstrs`` into a graph, in parallel via joblib."""
        self.graphs = Parallel(n_jobs=self.ncpu)(
            delayed(self.graph_fn)(smile) for smile in self.molstrs
        )

    def __getitem__(self, idx):
        """Return the ``(graph, label)`` pair stored at position ``idx``."""
        return self.graphs[idx], self.labels[idx]

    def __len__(self):
        """Return the number of molecules in the dataset."""
        return len(self.molstrs)
    
    
        

In [18]:
# Smoke-test dataset: all-zero placeholder labels, one per parsed molecule.
moldata = SmileData(
    labels=torch.Tensor([0] * len(mols)),
    molstrs=mols,
    name='smiledata',
)

In [19]:
moldata[3]

(Graph(num_nodes=22, num_edges=52,
       ndata_schemes={'feat': Scheme(shape=(74,), dtype=torch.float32)}
       edata_schemes={'e': Scheme(shape=(12,), dtype=torch.float32)}),
 tensor(0.))

In [20]:
moldata[4]

(Graph(num_nodes=5, num_edges=10,
       ndata_schemes={'feat': Scheme(shape=(74,), dtype=torch.float32)}
       edata_schemes={'e': Scheme(shape=(12,), dtype=torch.float32)}),
 tensor(0.))

In [None]:
# Resolve against the project root (as the raw Delaney table is loaded) so the
# cell does not depend on the kernel's current working directory.
feature = pd.read_csv(here()/'data/delaney_descriptors_df.csv')
feature.head()

In [None]:
# Resolve against the project root (as the raw Delaney table is loaded) so the
# cell does not depend on the kernel's current working directory.
process = pd.read_csv(here()/"data/delaney-processed.csv")
# Only the last expression of a cell is rendered; display the preview explicitly
# so it is not silently dropped in favour of the shape.
display(process.head())
process.shape

In [None]:
# NOTE(review): no visible cell above assigns one_graph; its only visible
# definition is the later mol_to_bigraph(...) cell that uses featurize_atoms /
# featurize_bonds, so this cell fails under Restart Kernel -> Run All.
one_graph

In [None]:
one_graph

In [None]:
# NOTE(review): the only visible definition of one_graph uses featurize_bonds,
# which returns its features under the key 'type', not 'bond_type' — this key
# presumably belongs to an earlier featurizer that is no longer in the notebook; confirm.
one_graph.edata['bond_type'].shape

In [None]:
def featurize_atoms(mol):
    """Node featurizer: one atomic-number feature per atom.

    Returns a dict with the single key 'atomic' holding a float tensor of
    shape (num_atoms, 1); rows follow the order of ``mol.GetAtoms()``.
    """
    atomic_numbers = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    # Flat list -> column vector: one row per atom, one feature per row.
    column = torch.tensor(atomic_numbers).reshape(-1, 1)
    return {'atomic': column.float()}

def featurize_bonds(mol):
    """Edge featurizer: integer-coded bond type for every directed edge.

    Bond types are coded by position in [SINGLE, DOUBLE, TRIPLE, AROMATIC];
    any other bond type raises ValueError from ``list.index``. Returns a dict
    with the single key 'type' holding a float tensor of shape
    (2 * num_bonds, 1).
    """
    BondType = Chem.rdchem.BondType
    known_types = [BondType.SINGLE, BondType.DOUBLE,
                   BondType.TRIPLE, BondType.AROMATIC]
    # Each bond u-v becomes two directed edges (u, v) and (v, u), so its code
    # is emitted twice, back to back.
    edge_codes = [
        code
        for bond in mol.GetBonds()
        for code in (known_types.index(bond.GetBondType()),) * 2
    ]
    return {'type': torch.tensor(edge_codes).reshape(-1, 1).float()}

In [None]:
# Build the graph with the hand-written featurizers instead of the canonical ones.
# The bare name `mol` is never assigned in any visible cell and would raise
# NameError under Restart Kernel -> Run All; mol1 (the first molecule, parsed
# earlier) is the single-molecule object this notebook actually defines.
one_graph = mol_to_bigraph(mol=mol1,
                           node_featurizer=featurize_atoms,
                           edge_featurizer=featurize_bonds)

In [None]:
one_graph.ndata

In [None]:
import networkx as nx

In [None]:
# Convert the DGL graph to a networkx graph and draw its topology.
nx.draw(dgl.to_networkx(one_graph))