# Import Modules

In [1]:
import pandas as pd
import rdkit
from rdkit import rdBase, Chem, DataStructs
from rdkit.Avalon import pyAvalonTools
from rdkit.Chem import AllChem, Draw, MolFromSmiles
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem.AtomPairs import Pairs, Torsions
import pickle
from utils.algorithms import greedy_baseline, greedy_logdet_max
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
# Print the RDKit version string so the environment used for this run is recorded in the cell output.
print(rdBase.rdkitVersion)


# Run configuration: these values are concatenated into pickle file names and
# used as keys of the nested result dictionary built further down.
single_tasks = ["qm9"]
model_name = "_attentivefp_"
# Suffixes '_1' ... '_5'.
nums = [f"_{idx}" for idx in range(1, 6)]

2020.09.1


# Make Real Property Values

In [19]:
# Build the nested result dictionary, indexed as df[<task key>][<num suffix>],
# e.g. df['qm9']['_3']. Every task gets a second entry under '<task>_normalize'.
df = {}
for task_name in single_tasks:
    normalized_key = task_name + '_normalize'
    if task_name not in df:
        df[task_name] = {}
        df[normalized_key] = {}
    # Common leading part of both pickle paths for this task.
    file_prefix = './save_pickle/' + task_name + model_name + task_name
    for num in nums:
        with open(file_prefix + '_relu' + num + '_predicted_data.pickle', 'rb') as f:
            df[task_name][num] = pickle.load(f)
        with open(file_prefix + '_normalize_relu' + num + '_predicted_data.pickle', 'rb') as f:
            df[normalized_key][num] = pickle.load(f)

# calculate real values for qm9 properties
# `processed[0]` supplies the per-property mean and standard deviation used to
# rescale the stored values. It is loaded here so that this cell no longer
# depends on a later cell having been executed first (NameError on a fresh kernel).
# pickle can execute arbitrary code on load: only open files you produced yourself.
with open('./save_pickle/qm9_atom_edge_data.pickle', 'rb') as f:
    processed = pickle.load(f)

# NOTE: this rescaling is applied in place, so running the cell twice on the
# same `df` rescales the values twice.
for task_name in df:
    for num in nums:
        frame = df[task_name][num]
        # Snapshot the column labels; the loop body assigns to columns of `frame`.
        for prop in list(frame.columns):
            # Only the columns that come before 'mols' are rescaled; iteration
            # stops at 'mols', so this relies on the column order of the frames.
            if prop == 'mols':
                break
            mean = processed[0][prop].mean()
            std = processed[0][prop].std()
            frame['pred ' + prop] = frame['pred ' + prop]*std + mean
            frame[prop] = frame[prop]*std + mean

# Calculate Intrinsic Values

In [5]:
from utils.calc_chem import get_natom, get_nelec

# Add per-molecule counts and size-normalised columns to every result frame.
for task_name in df:
    for num in nums:
        frame = df[task_name][num]
        frame['Nelec'] = frame['mols'].apply(get_nelec)
        frame['Natom'] = frame['mols'].apply(get_natom)
        # Same division for each of the four columns; the loop keeps the
        # resulting column order identical to writing them out one by one.
        for prop in ('u0', 'u298', 'h298', 'g298'):
            frame[prop + '/Nelec'] = frame[prop] / frame['Nelec']
        # Reuse the 'Natom' column computed above instead of calling get_natom
        # a second time for every row through a row-wise DataFrame.apply.
        # NOTE(review): 3*Natom - 6 is zero for Natom == 2 and negative below
        # that; confirm such molecules cannot occur in these frames.
        frame['cv/3n-6'] = frame['cv'] / (3 * frame['Natom'] - 6)

# Calculate MACCS Key and ECFP & Binary Based Selection

In [9]:
# Load the pickled qm9 atom/edge data into `processed`.
with open('./save_pickle/qm9_atom_edge_data.pickle', 'rb') as f:
    processed = pickle.load(f)

# MACCS keys and Morgan fingerprints (second argument 2) are computed once,
# from the molecules of df['qm9']['_1'], and the same lists are then attached
# to every frame.
# NOTE(review): this presumes every frame holds the same molecules in the same
# row order as df['qm9']['_1'] - confirm.
reference_mols = df['qm9']['_1']['mols']
maccs = [AllChem.GetMACCSKeysFingerprint(mol) for mol in reference_mols]
ecfp = [AllChem.GetMorganFingerprint(mol, 2) for mol in reference_mols]

for task_name in df:
    for num in nums:
        frame = df[task_name][num]
        frame['maccs'] = maccs
        frame['ecfp'] = ecfp

In [11]:
# Selection using Tanimoto coefficients: greedy_baseline is run on every frame
# for each fingerprint column suffix and each rule, in the order
# maccs/maxsum, maccs/maxmin, ecfp/maxsum, ecfp/maxmin. Each call's return
# value replaces the frame and is fed to the next call.
for task_name in df:
    for num in nums:
        for vector in ('_maccs', '_ecfp'):
            for rule in ('maxsum', 'maxmin'):
                df[task_name][num] = greedy_baseline(
                    df[task_name][num], 0.01, 'Tanimoto', rule=rule, vector=vector
                )

_1
_2
_3
_4
_5


# Add random sampling

In [None]:
# add random ranking
# NOTE(review): n_mols and n_select are not defined in any cell visible here and
# must exist before this cell runs. n_mols has to equal the number of rows of
# every frame, otherwise the column assignment below fails on a length mismatch.
for num in nums:
    # Seed BEFORE drawing, so the sample for this suffix is determined by its
    # own trailing digit. Seeding after the draw left the first draw unseeded
    # and made every later draw depend on the previous suffix's seed.
    np.random.seed(int(num[-1]))
    select = np.random.choice(range(n_mols), n_select, replace = False)
    # Rows that were not drawn get the value n_mols; drawn rows get
    # 0 .. n_select-1 in the order they were drawn.
    random_rank = np.ones(n_mols) * n_mols
    random_rank[select] = np.arange(n_select)
    random_rank = list(map(int, random_rank))
    # The same ranking is stored in every task's frame for this suffix.
    for task_name in df:
        df[task_name][num]['random_ranking'] = random_rank

# Save

In [13]:
import pickle
# Persist the whole result dictionary. The context manager closes the file
# handle (and flushes the data) even if pickling raises.
with open("./save_pickle/result_df.p", "wb") as f:
    pickle.dump(df, f)

# Load

In [2]:
# if pickle is saved
import pickle 
# Restore the result dictionary from disk; the context manager closes the file
# handle once loading is done.
# pickle can execute arbitrary code on load: only open files you produced yourself.
with open("./save_pickle/result_df.p", "rb") as f:
    df = pickle.load(f)


In [4]:
from utils.calc_chem import get_natom

In [5]:
# divide zpve by (3 * natom - 6) for every result frame
for task_name in df:
    for num in nums:
        frame = df[task_name][num]
        # Call get_natom on the 'mols' column only and do the arithmetic on
        # whole columns. This avoids a row-wise DataFrame.apply, which builds
        # a Series for every row just to read two of its fields.
        natom = frame['mols'].apply(get_natom)
        # NOTE(review): 3*natom - 6 is zero for natom == 2; confirm such
        # molecules cannot occur in these frames.
        frame['zpve/3n-6'] = frame['zpve'] / (3 * natom - 6)

TypeError: unhashable type: 'list'

In [11]:
# Column layout applied to every result frame via reindex. Per pandas
# semantics, labels listed here but absent from a frame come back as all-NaN
# columns, and columns not listed are dropped.
column_order = [
    'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'u0', 'u298',
    'h298', 'g298', 'cv', 'mols', 'smiles', 'pred mu', 'pred alpha',
    'pred homo', 'pred lumo', 'pred gap', 'pred r2', 'pred zpve', 'pred u0',
    'pred u298', 'pred h298', 'pred g298', 'pred cv', 'embedding',
    'logdet_ranking', 'maxsum_dissim_ranking', 'maxmin_dissim_ranking',
    'Nelec', 'Natom', 'u0/Nelec', 'u298/Nelec', 'h298/Nelec', 'g298/Nelec',
    'cv/3n-6', 'zpve/3n-6', 'maccs', 'ecfp', 'maxsum_dissim_ranking_maccs',
    'maxmin_dissim_ranking_maccs', 'maxsum_dissim_ranking_ecfp',
    'maxmin_dissim_ranking_ecfp', 'random_ranking',
]
for task_name in df:
    for num in nums:
        df[task_name][num] = df[task_name][num].reindex(columns=column_order)

In [12]:
# Display the column labels of one frame as a check of the final layout.
df['qm9']['_1'].columns

Index(['mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'u0', 'u298',
       'h298', 'g298', 'cv', 'mols', 'smiles', 'pred mu', 'pred alpha',
       'pred homo', 'pred lumo', 'pred gap', 'pred r2', 'pred zpve', 'pred u0',
       'pred u298', 'pred h298', 'pred g298', 'pred cv', 'embedding',
       'logdet_ranking', 'maxsum_dissim_ranking', 'maxmin_dissim_ranking',
       'Nelec', 'Natom', 'u0/Nelec', 'u298/Nelec', 'h298/Nelec', 'g298/Nelec',
       'cv/3n-6', 'zpve/3n-6', 'maccs', 'ecfp', 'maxsum_dissim_ranking_maccs',
       'maxmin_dissim_ranking_maccs', 'maxsum_dissim_ranking_ecfp',
       'maxmin_dissim_ranking_ecfp', 'random_ranking'],
      dtype='object')