In [6]:
from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import (
    GetMorganFingerprintAsBitVect, GetMACCSKeysFingerprint)
from PyFingerprint.All_Fingerprint import get_fingerprint
import pandas as pd

In [2]:
# Load the opioid-related compound table (InChI keys and canonical SMILES).
df = pd.read_csv(
    filepath_or_buffer="../dataset/chem_related_to_opioid.csv",
)

In [3]:
# Preview the first five rows of the raw table.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,InChI Key,Canonical SMILES
0,490,AAEIJYARJYLTIS-UHFFFAOYSA-N,Cc1cc(N)c2cc(NC(=O)c3ccccc3COc4ccc(CNCCCCCCCCC...
1,721,AAGKZYYLJCIWKQ-VMPREFPWSA-N,Oc1ccc2O[C@H]3CN(CCc4ccc(F)cc4)CC[C@@]3(CCCCc5...
2,822,AAHMFTBCVJDHBJ-UHFFFAOYSA-N,COc1ccc2sc(c3ccc(cc3)S(=O)(=O)C)c(C#Cc4cncn4C)...
3,1316,AAMCVRWFZKNHTH-AKSNCSMTSA-N,CC[C@@H](C)[C@H]1NC(=O)[C@H](CCCN=C(N)N)NC(=O)...
4,1317,AAMCVRWFZKNHTH-GILCVNLZSA-N,CC[C@@H](C)[C@H]1NC(=O)[C@H](CCCN=C(N)N)NC(=O)...


In [18]:
class Smiles2Fingerprint:
    """Callable that turns a SMILES string into a molecular fingerprint.

    An instance is meant to be passed to ``Series.map``: it is called once
    per value and returns either a fingerprint object or ``None`` when the
    value cannot be converted.

    Parameters
    ----------
    fingerprint_type : str
        One of ``"ecfp"``, ``"maccs"`` or ``"pubchem"``. The value is only
        validated when the instance is called, not at construction time.
    ecfp_radius : int, default 4
        Radius passed to ``GetMorganFingerprintAsBitVect`` for ``"ecfp"``.
        NOTE(review): ECFP names conventionally quote the diameter, so a
        radius of 4 is ECFP8-like; confirm ECFP4 (radius 2) was not intended.
    ecfp_n_bits : int, default 2048
        ``nBits`` passed to ``GetMorganFingerprintAsBitVect`` for ``"ecfp"``.
    """

    def __init__(self, fingerprint_type="", ecfp_radius=4, ecfp_n_bits=2048):
        self.fp_type = fingerprint_type
        self.ecfp_radius = ecfp_radius
        self.ecfp_n_bits = ecfp_n_bits

    def __call__(self, smiles):
        """Return the configured fingerprint for ``smiles``.

        Returns ``None`` when ``smiles`` is not a string (e.g. the NaN/None
        that pandas uses for missing values) or when the selected backend
        cannot produce a fingerprint.

        Raises
        ------
        ValueError
            If the configured fingerprint type is not supported.
        """
        handlers = {
            "ecfp": self._get_ecfp,
            "maccs": self._get_maccs,
            "pubchem": self._get_pubchem,
        }
        try:
            handler = handlers[self.fp_type]
        except (KeyError, TypeError):
            # TypeError covers unhashable fp_type values; report both cases
            # the same way the original equality chain did.
            raise ValueError(
                f"Fingerprint type must be one of ['ecfp', 'maccs', 'pubchem'], "
                f"got {self.fp_type!r}"
            ) from None

        # Missing values reach us as float NaN / None when mapping a pandas
        # column; the backends only accept strings, so short-circuit here
        # instead of letting them raise.
        if not isinstance(smiles, str):
            return None
        return handler(smiles)

    def _get_ecfp(self, smiles):
        """Morgan bit-vector fingerprint, or ``None`` if RDKit cannot parse."""
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        return GetMorganFingerprintAsBitVect(
            mol, self.ecfp_radius, nBits=self.ecfp_n_bits
        )

    def _get_pubchem(self, smiles):
        """PubChem fingerprint via PyFingerprint, or ``None`` on ``OSError``."""
        try:
            return get_fingerprint(smiles, fp_type="pubchem", output="vector")
        except OSError:
            # presumably raised by the backend for inputs it cannot process;
            # other exception types still propagate - TODO confirm coverage.
            return None

    def _get_maccs(self, smiles):
        """MACCS keys fingerprint, or ``None`` if RDKit cannot parse."""
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        return GetMACCSKeysFingerprint(mol)

In [8]:
# Morgan/ECFP bit vectors, one per canonical SMILES (None where conversion fails).
df["ECFP"] = df["Canonical SMILES"].map(
    Smiles2Fingerprint(fingerprint_type="ecfp")
)

In [11]:
# MACCS keys fingerprints, one per canonical SMILES (None where conversion fails).
df["maccs"] = df["Canonical SMILES"].map(
    Smiles2Fingerprint(fingerprint_type="maccs")
)

In [19]:
# PubChem fingerprints via PyFingerprint (None where the backend raises OSError).
df["pubchem"] = df["Canonical SMILES"].map(
    Smiles2Fingerprint(fingerprint_type="pubchem")
)

In [23]:
# Lower-case the ECFP column so all fingerprint columns share one naming style.
df = df.rename({"ECFP": "ecfp"}, axis="columns")

In [24]:
# Preview the first five rows with the three fingerprint columns attached.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,InChI Key,Canonical SMILES,ecfp,maccs,pubchem
0,490,AAEIJYARJYLTIS-UHFFFAOYSA-N,Cc1cc(N)c2cc(NC(=O)c3ccccc3COc4ccc(CNCCCCCCCCC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ..."
1,721,AAGKZYYLJCIWKQ-VMPREFPWSA-N,Oc1ccc2O[C@H]3CN(CCc4ccc(F)cc4)CC[C@@]3(CCCCc5...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, ..."
2,822,AAHMFTBCVJDHBJ-UHFFFAOYSA-N,COc1ccc2sc(c3ccc(cc3)S(=O)(=O)C)c(C#Cc4cncn4C)...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, ..."
3,1316,AAMCVRWFZKNHTH-AKSNCSMTSA-N,CC[C@@H](C)[C@H]1NC(=O)[C@H](CCCN=C(N)N)NC(=O)...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ..."
4,1317,AAMCVRWFZKNHTH-GILCVNLZSA-N,CC[C@@H](C)[C@H]1NC(=O)[C@H](CCCN=C(N)N)NC(=O)...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ..."


In [25]:
# Persist the table with the fingerprint columns attached.
# index=False keeps the default integer index out of the file: the input CSV
# already carries two stray "Unnamed: 0*" columns from earlier index
# round-trips, and writing the index again would add a third on reload.
# NOTE(review): the fingerprint columns hold Python objects, which to_csv
# writes via their string form - confirm that form is readable downstream.
df.to_csv("../dataset/chem_related_to_opioid_fingerprints.csv", index=False)