In [11]:
import torch
import os
import sys
import pandas as pd
import numpy as np

#import sacorer 
from rdkit import Chem
from rdkit.Chem import QED, Crippen, Descriptors
from rdkit import RDConfig
sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
import sascorer
from rdkit import RDLogger

# Suppress RDKit warnings
RDLogger.DisableLog('rdApp.*')


def calc_qed(molecules):
    """Compute the QED score of every molecule.

    Args:
        molecules: Iterable of RDKit molecule objects.

    Returns:
        list: One ``QED.qed`` value per molecule, in input order.
    """
    scores = []
    for mol in molecules:
        scores.append(QED.qed(mol))
    return scores

def calc_logp(molecules, predictor=None):
    """Compute a logP value for every molecule with RDKit's Crippen method.

    Args:
        molecules: Iterable of RDKit molecule objects.
        predictor: Placeholder for an alternative logP model. Only ``None``
            (use ``Crippen.MolLogP``) is currently supported.

    Returns:
        list: One ``Crippen.MolLogP`` value per molecule, in input order.
        When ``predictor`` is not ``None`` an empty list is returned and a
        ``RuntimeWarning`` is emitted.
    """
    if predictor is not None:
        # No custom-predictor path was ever implemented. Keep the historical
        # empty result so existing callers do not break, but make the
        # situation visible instead of silently yielding no values.
        import warnings

        warnings.warn(
            "calc_logp: custom predictors are not supported; "
            "returning an empty list",
            RuntimeWarning,
            stacklevel=2,
        )
        return []

    return [Crippen.MolLogP(mol) for mol in molecules]

def calc_mw(molecules):
    """Compute the molecular weight of every molecule.

    Args:
        molecules: Iterable of RDKit molecule objects.

    Returns:
        list: One ``Descriptors.MolWt`` value per molecule, in input order.
    """
    mol_weights = []
    for mol in molecules:
        mol_weights.append(Descriptors.MolWt(mol))
    return mol_weights

def calc_sas(molecules):
    """Compute the synthetic accessibility score of every molecule.

    Args:
        molecules: Iterable of RDKit molecule objects.

    Returns:
        list | None: One ``sascorer.calculateScore`` value per molecule, in
        input order, or ``None`` if scoring any molecule raised. In the
        failure case a ``RuntimeWarning`` carrying the original exception is
        emitted so the cause is not lost.
    """
    try:
        return [sascorer.calculateScore(mol) for mol in molecules]
    except Exception as exc:
        # Callers rely on ``None`` to mean "SAS unavailable", so keep that
        # contract, but report why scoring failed instead of hiding it.
        import warnings

        warnings.warn(
            f"calc_sas: SA scoring failed, returning None ({exc!r})",
            RuntimeWarning,
            stacklevel=2,
        )
        return None



def calc_ic50(molecules):
    """Placeholder for an IC50 calculation; not implemented yet.

    Args:
        molecules: Iterable of molecules (currently ignored).

    Returns:
        None: Always, which callers treat as "no IC50 values available".
    """
    return None


# stats for gdb13 dataset

In [12]:
# Load the gdb13 SMILES file and parse every entry into an RDKit molecule.
df = pd.read_csv('gdb13/gdb13_rand1m.smi')
smiles = df['smiles']

# Chem.MolFromSmiles returns None for SMILES it cannot parse, and the
# descriptor functions raise on None, so drop those entries up front.
molecules = [
    mol for mol in (Chem.MolFromSmiles(smi) for smi in smiles)
    if mol is not None
]
n_skipped = len(smiles) - len(molecules)
if n_skipped:
    print(f'Skipped {n_skipped} unparseable SMILES')

# Per-molecule descriptors; calc_sas and calc_ic50 may return None when the
# values are unavailable.
qed = calc_qed(molecules)
logp = calc_logp(molecules)
mw = calc_mw(molecules)
sas = calc_sas(molecules)
ic50 = calc_ic50(molecules)

# Report mean ± standard deviation for each descriptor.
print(f'QED: {np.mean(qed)} ± {np.std(qed)}')
print(f'LogP: {np.mean(logp)} ± {np.std(logp)}')
print(f'mWt: {np.mean(mw)} ± {np.std(mw)}')

if sas is not None:
    print(f'SAS: {np.mean(sas)} ± {np.std(sas)}')

if ic50 is not None:
    print(f'ic50: {np.mean(ic50)} ± {np.std(ic50)}')




QED: 0.5060444260882511 ± 0.12374275713575746
LogP: 0.4718613914600001 ± 1.1103465660901581
mWt: 179.836813518 ± 8.317210586583487


# stats for moses dataset

In [None]:
# Load the moses SMILES file and parse every entry into an RDKit molecule.
df = pd.read_csv('moses/moses.smi')
smiles = df['smiles']

# Chem.MolFromSmiles returns None for SMILES it cannot parse, and the
# descriptor functions raise on None, so drop those entries up front.
molecules = [
    mol for mol in (Chem.MolFromSmiles(smi) for smi in smiles)
    if mol is not None
]
n_skipped = len(smiles) - len(molecules)
if n_skipped:
    print(f'Skipped {n_skipped} unparseable SMILES')

qed = calc_qed(molecules)
logp = calc_logp(molecules)
mw = calc_mw(molecules)

# Report mean ± standard deviation for each descriptor.
print(f'QED: {np.mean(qed)} ± {np.std(qed)}')
print(f'LogP: {np.mean(logp)} ± {np.std(logp)}')
print(f'mWt: {np.mean(mw)} ± {np.std(mw)}')

# stats for zinc dataset

In [None]:
# Load the zinc SMILES file and parse every entry into an RDKit molecule.
df = pd.read_csv('zinc/zinc.smi')
smiles = df['smiles']

# Chem.MolFromSmiles returns None for SMILES it cannot parse, and the
# descriptor functions raise on None, so drop those entries up front.
molecules = [
    mol for mol in (Chem.MolFromSmiles(smi) for smi in smiles)
    if mol is not None
]
n_skipped = len(smiles) - len(molecules)
if n_skipped:
    print(f'Skipped {n_skipped} unparseable SMILES')

qed = calc_qed(molecules)
logp = calc_logp(molecules)
mw = calc_mw(molecules)

# Report mean ± standard deviation for each descriptor.
print(f'QED: {np.mean(qed)} ± {np.std(qed)}')
print(f'LogP: {np.mean(logp)} ± {np.std(logp)}')
print(f'mWt: {np.mean(mw)} ± {np.std(mw)}')