In [None]:
!pip install rdkit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install mordred

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdkit
  Downloading rdkit-2022.3.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.8 MB)
[K     |████████████████████████████████| 36.8 MB 511 kB/s 
Installing collected packages: rdkit
Successfully installed rdkit-2022.3.5
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mordred
  Downloading mordred-1.2.0.tar.gz (128 kB)
[K     |████████████████████████████████| 128 kB 5.3 MB/s 
Building wheels for collected packages: mordred
  Building wheel for mordred (setup.py) ... [?25l[?25hdone
  Created wheel for mordred: filename=mordred-1.2.0-py3-none-any.whl size=176725 sha256=36d6b05d7fcdb36bbb683c8f5ab5351a4c180e2f45496a2c2cfd5f3d753afe79
  Stored in directory: /root/.cache/pip/wheels/02/c0/2e/e7e3d63b431777712ebc128bc4deb9ac5cb19afc7c1ea341ec
Successfully built mordred
Installing collected packages: mo

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from collections import defaultdict
from tqdm import tqdm

In [None]:
#MACCS Keys Fingerprints
def maccskeys_fingerprint(mols):
  """Build a DataFrame of MACCS keys fingerprints for the given molecules.

  Args:
    mols: Iterable of molecules; each one is passed to
      ``AllChem.GetMACCSKeysFingerprint``.

  Returns:
    pandas.DataFrame wrapping the integer array built from the fingerprints
    (presumably one row per molecule and one column per key bit).
  """
  fingerprints = []
  for molecule in mols:
    fingerprints.append(AllChem.GetMACCSKeysFingerprint(molecule))
  # Cast the collected fingerprints to an integer array before wrapping them.
  bit_matrix = np.array(fingerprints, dtype=int)
  return pd.DataFrame(bit_matrix)

#Mordred descriptors
def mordred_descriptor(mols, ignore_3D=True):
  """Compute Mordred descriptors for the given molecules as a float DataFrame.

  Args:
    mols: Iterable of molecules handed to ``Calculator.pandas``
      (presumably RDKit ``Mol`` objects -- confirm against the caller).
    ignore_3D: Forwarded to ``mordred.Calculator``; when True the calculator
      skips 3D descriptors. Defaults to True.

  Returns:
    pandas.DataFrame returned by ``Calculator.pandas`` with every column
    cast to float.
  """
  from mordred import Calculator, descriptors

  # Forward the caller's choice. This used to be hard-coded to True, so
  # passing ignore_3D=False silently had no effect.
  calc = Calculator(descriptors, ignore_3D=ignore_3D)
  df_mordred = calc.pandas(mols)
  # Mordred descriptor contains missing values, so need to change type to float.
  return df_mordred.astype(float)

#RDKit descriptors
def rdkit_descriptor(mols):
  """Compute every descriptor listed in ``Descriptors.descList`` per molecule.

  Args:
    mols: Iterable of molecules; each one is passed to the calculator's
      ``CalcDescriptors``.

  Returns:
    pandas.DataFrame with one row per molecule and one column per descriptor
    name taken from ``Descriptors.descList``.
  """
  from rdkit.ML.Descriptors import MoleculeDescriptors
  from rdkit.Chem import Descriptors

  # The first element of each descList entry is used as the descriptor name.
  names = [entry[0] for entry in Descriptors.descList]
  calculator = MoleculeDescriptors.MolecularDescriptorCalculator(names)
  rows = list(map(calculator.CalcDescriptors, mols))
  return pd.DataFrame(rows, columns=names)

#RDKit fingerprints
def rdkit_fingerprint(mols):
  """Build a DataFrame of RDKit fingerprints for the given molecules.

  Args:
    mols: Iterable of molecules; each one is passed to
      ``AllChem.RDKFingerprint``.

  Returns:
    pandas.DataFrame wrapping the integer array built from the fingerprints
    (presumably one row per molecule and one column per bit).
  """
  bit_vectors = []
  for molecule in mols:
    bit_vectors.append(AllChem.RDKFingerprint(molecule))
  # Cast the collected fingerprints to an integer array before wrapping them.
  bit_matrix = np.array(bit_vectors, dtype=int)
  return pd.DataFrame(bit_matrix)

# Mol2vec descriptors
def mol2vec(mols, radius=2, model_path='./drive/MyDrive/Material Informatics/Conductive polymer QSAR/model_300dim.pkl'):
    """Embed molecules with a pre-trained Mol2vec (word2vec) model.

    Each molecule is turned into a ``MolSentence`` via ``mol2alt_sentence``
    and the sentences are converted to vectors with ``sentences2vec``.

    Args:
        mols: Iterable of molecules passed to ``mol2alt_sentence``
            (presumably RDKit ``Mol`` objects -- confirm against the caller).
        radius: Radius argument forwarded to ``mol2alt_sentence``.
        model_path: Location of the saved word2vec model. Defaults to the
            Google Drive path this notebook originally hard-coded, so existing
            calls behave as before.

    Returns:
        pandas.DataFrame built from the output of ``sentences2vec``.
    """
    from mol2vec.features import mol2alt_sentence, MolSentence, sentences2vec

    #Loading pre-trained model via word2vec
    from gensim.models import word2vec
    # NOTE(review): gensim's load is pickle-based; only point model_path at
    # model files from a trusted source.
    model = word2vec.Word2Vec.load(model_path)
    sentences = [MolSentence(mol2alt_sentence(mol, radius)) for mol in tqdm(mols)]
    # Tokens missing from the model vocabulary are mapped to the 'UNK' entry.
    return pd.DataFrame(sentences2vec(sentences, model, unseen='UNK'))