In [None]:
!pip install git+https://github.com/samoturk/mol2vec

In [None]:
from pathlib import Path
from tempfile import NamedTemporaryFile
import fileinput
import os
import rdkit
import pandas as pd
import numpy as np
import mol2vec
from mol2vec import features
from mol2vec import helpers
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
import pkg_resources
# Check that the installed gensim matches the pinned version before importing it.
pkg_resources.require("gensim==3.8.3")
import gensim
from gensim.models import word2vec
from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from mol2vec.helpers import depict_identifier, plot_2D_vectors, IdentifierTable, mol_to_svg
import seaborn as sns
import matplotlib.pyplot as plt
import gensim.downloader
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')  # turn off RDKit warning messages

In [None]:
def featurizeMolecules(test_smiles, modelPath=None, radius=1):
    '''
    Uses a trained Mol2Vec model to featurize a list of SMILES into vectors
    (300 dimensional according to the original description; the actual size
    is determined by the loaded model).

    Inputs:
        test_smiles: list of SMILES representing the molecules to featurize.
            The caller's list is not modified; a canonicalised copy is used.
        modelPath: optional path to the saved Word2Vec model. Defaults to
            'mol2vec_model_final.pkl' in the current working directory.
        radius: radius argument passed to features.mol2alt_sentence
            (default 1, the previously hard-coded value).
    Returns: (vectors, smiles) where vectors is a list of per-molecule lists
        and smiles is the list of canonical SMILES strings that were
        successfully featurized, in matching order.
    '''
    if modelPath is None:
        modelPath = os.path.join(os.getcwd(), 'mol2vec_model_final.pkl')
    model = word2vec.Word2Vec.load(modelPath)

    # Canonicalise the input and drop every SMILES RDKit cannot parse.
    allMolList = [Chem.MolFromSmiles(smile) for smile in test_smiles]
    test_smiles = [Chem.MolToSmiles(mol) for mol in allMolList if mol is not None]

    # Featurize, drop molecules whose vector could not be built, and repeat
    # until a pass produces no invalid entries.
    iterationCount = 0
    while True:
        iterationCount += 1
        print(iterationCount)

        molecules = [Chem.MolFromSmiles(smile) for smile in test_smiles]
        sentences = [features.mol2alt_sentence(mol, radius) for mol in molecules]
        vectors = features.sentences2vec(sentences, model)

        # Entries that come back as a bare integer instead of an array are
        # treated as failed featurizations (presumably an empty sum when no
        # identifier of the molecule is in the model vocabulary -- confirm
        # against mol2vec). NumPy integer scalars are included so the case
        # where *every* molecule fails is caught as well.
        invalidIndices = [
            index for index, vector in enumerate(vectors)
            if isinstance(vector, (int, np.integer))
        ]
        print(invalidIndices)
        if invalidIndices:
            # deleteMultiple is defined outside this block; it is expected to
            # remove the given indices from the list in place.
            deleteMultiple(test_smiles, invalidIndices)
        print(len(test_smiles))
        print(invalidIndices)

        if not invalidIndices:
            break

    print(vectors.shape)

    # Plain Python lists so that callers can append extra features later.
    newVectors = [list(vector) for vector in vectors]

    return newVectors, test_smiles



In [None]:
def _countSubstructMatches(mol, patterns):
    '''Return the total number of substructure matches of all patterns in mol.'''
    return sum(len(mol.GetSubstructMatches(pattern)) for pattern in patterns)


def _count13CHybridization(mol):
    '''Return (sp, sp2, sp3) counts over the 13C-labelled carbon atoms of mol.'''
    counts = {"SP": 0, "SP2": 0, "SP3": 0}
    for atom in mol.GetAtoms():
        if atom.GetSymbol() == "C" and atom.GetIsotope() == 13:
            hybrid = str(atom.GetHybridization())
            if hybrid in counts:
                counts[hybrid] += 1
    return counts["SP"], counts["SP2"], counts["SP3"]


def addIsotopologueData(clusterSmiles, vectors):
    '''
    Adds 15 extra values to the end of each feature vector to encode the
    isotopic information of each molecule.

    Appended values, in order:
        occurrences in the SMILES text of '2H', '34S', '33S', '36S', '13C',
        '17O', '18O', '15N', '37Cl' (9 values);
        number of sp, sp2 and sp3 hybridised 13C atoms (3 values);
        number of C-D bond matches, O-D bond matches and 13C-O bond matches
        (single, double and the charged triple-bond form summed) (3 values).

    Inputs: SMILES strings and their corresponding molecular feature vectors
        (each vector must be a list; it is extended in place)
    Returns: the same vectors object, with isotopic information encoded
    '''
    isotopeLabels = ('2H', '34S', '33S', '36S', '13C', '17O', '18O', '15N', '37Cl')

    # The SMARTS queries do not depend on the molecule, so compile them once
    # per call instead of once per molecule.
    # NOTE(review): the generic 'C' / 'O' SMARTS atoms carry no isotope
    # constraint, so a labelled bond (e.g. 13C-D) appears to match both the
    # generic and the isotope-specific pattern and be counted twice. The
    # counts are left exactly as before because downstream models may depend
    # on these values -- confirm whether this is intended.
    cdPatterns = [Chem.MolFromSmarts(s) for s in ('C[2H]', '[13C][2H]')]
    odPatterns = [Chem.MolFromSmarts(s) for s in ('O[2H]', '[17O][2H]', '[18O][2H]')]
    coPatterns = [Chem.MolFromSmarts(s) for s in (
        '[13C]O', '[13C][17O]', '[13C][18O]',
        '[13C]=O', '[13C]=[18O]', '[13C]=[17O]',
        '[13C-]#[O+]', '[13C-]#[17O+]', '[13C-]#[18O+]',
    )]

    isotopeRows = []
    for smi in clusterSmiles:
        row = [smi.count(label) for label in isotopeLabels]

        m = Chem.MolFromSmiles(smi)

        cdCount = _countSubstructMatches(m, cdPatterns)
        odCount = _countSubstructMatches(m, odPatterns)
        coCount = _countSubstructMatches(m, coPatterns)

        # Hybridisation is only inspected when the SMILES text contains an
        # explicit '13C' token, as before (a SMILES holding only lowercase
        # aromatic '13c' labels therefore contributes zeros here).
        if smi.count('13C') != 0:
            spCount, sp2Count, sp3Count = _count13CHybridization(m)
        else:
            spCount, sp2Count, sp3Count = 0, 0, 0

        row.extend([spCount, sp2Count, sp3Count, cdCount, odCount, coCount])
        isotopeRows.append(row)

    # Vectors are matched to SMILES by position; indexing (rather than zip)
    # keeps the IndexError when more vectors than SMILES are supplied.
    for i, vector in enumerate(vectors):
        vector.extend(isotopeRows[i])

    return vectors
