In [4]:
from rdkit.Chem import DataStructs,AllChem
from rdkit import Chem
import pandas as pd
from pathlib import Path

In [5]:
def ecfp4(smi):
    """Compute the 1024-bit ECFP4 fingerprint of a molecule.

    The fingerprint is a Morgan fingerprint with radius 2, folded to 1024
    bits, with chirality taken into account.

    Args:
        smi: SMILES string describing the molecule.

    Returns:
        The bit vector produced by ``AllChem.GetMorganFingerprintAsBitVect``.

    Raises:
        ValueError: If RDKit cannot parse ``smi`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending input instead of deep inside the RDKit call.
        raise ValueError(f"Could not parse SMILES: {smi!r}")
    return AllChem.GetMorganFingerprintAsBitVect(
        mol, 2, nBits=1024, useChirality=True)

def maccs(smi):
    """Compute the 167-bit MACCS keys fingerprint of a molecule.

    Args:
        smi: SMILES string describing the molecule.

    Returns:
        The bit vector produced by ``AllChem.GetMACCSKeysFingerprint``.

    Raises:
        ValueError: If RDKit cannot parse ``smi`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending input instead of deep inside the RDKit call.
        raise ValueError(f"Could not parse SMILES: {smi!r}")
    return AllChem.GetMACCSKeysFingerprint(mol)

def maccs_df(df):
    """Append the MACCS key bits of every molecule in ``df`` as columns.

    Args:
        df: DataFrame with a 'SMILES' column. Any row index is accepted;
            rows are processed in positional order.

    Returns:
        A new DataFrame holding the original columns followed by the
        integer-labelled columns 1..166, one per MACCS bit. Bit 0 is not
        included (presumably because RDKit's MACCS vector leaves index 0
        unused -- TODO confirm). The bit columns have an integer dtype.

    Raises:
        KeyError: If ``df`` has no 'SMILES' column.
    """
    # One list of 0/1 values per molecule, in the row order of df.
    bit_rows = [list(maccs(smi)) for smi in df['SMILES']]
    # Reuse df's own index so the column-wise concat lines up row for row
    # even when df does not carry a default 0..n-1 index.
    fp = pd.DataFrame(bit_rows, index=df.index, columns=range(167))
    return pd.concat([df, fp.iloc[:, 1:]], axis=1)

def ecfp4_df(df):
    """Append the ECFP4 bits of every molecule in ``df`` as columns.

    Args:
        df: DataFrame with a 'SMILES' column. Any row index is accepted;
            rows are processed in positional order.

    Returns:
        A new DataFrame holding the original columns followed by the
        integer-labelled columns 0..1023, one per fingerprint bit. The bit
        columns have an integer dtype.

    Raises:
        KeyError: If ``df`` has no 'SMILES' column.
    """
    # One list of 0/1 values per molecule, in the row order of df.
    bit_rows = [list(ecfp4(smi)) for smi in df['SMILES']]
    # Reuse df's own index so the column-wise concat lines up row for row
    # even when df does not carry a default 0..n-1 index.
    fp = pd.DataFrame(bit_rows, index=df.index, columns=range(1024))
    return pd.concat([df, fp], axis=1)

def fingerprint_filter(filter_txt, df):
    """Keep only the columns of ``df`` that are listed in a filter file.

    Args:
        filter_txt: Path to a header-less CSV/text file; the values in its
            first column are used as the column labels to keep.
        df: DataFrame to select columns from.

    Returns:
        ``df`` restricted to the listed columns, in the order they appear
        in the filter file.
    """
    listing = pd.read_csv(filter_txt, header=None)
    # Only the first column of the file carries the labels of interest.
    wanted_columns = listing.iloc[:, 0].tolist()
    return df[wanted_columns]


In [6]:
## Calculate and filter fingerprints for the new data
inpath = '../data/example_data.csv'
smi_col = 'SMILES'
filter_maccs = '../data/maccs.txt'

# Load only the SMILES column and normalise its name to 'SMILES', the column
# name that maccs_df reads. The rename is chained rather than done in place
# so re-running the cell always starts from the file contents.
df = pd.read_csv(inpath, usecols=[smi_col]).rename(columns={smi_col: 'SMILES'})
df_m = maccs_df(df)
df_m_f = fingerprint_filter(filter_maccs, df_m)
# Write the result next to the input file as '<input stem>_maccs_filter.csv',
# without the row index.
out_path = Path(inpath).with_name(Path(inpath).stem + '_maccs_filter.csv')
df_m_f.to_csv(out_path, index=False)

In [None]:
# filter_ecfp4 = '../data/ecfp4.txt'
# df_e = ecfp4_df(df)
# df_e_f = fingerprint_filter(filter_ecfp4, df_e)
# df_e_f.to_csv(Path(inpath).parent / (Path(inpath).stem + '_ecfp-4_filter.csv'), index=None)

# filter_corina = '../data/corina_SOM_RFE.txt'
# df_c_f = fingerprint_filter(filter_corina, df)
# df_c_f.to_csv(Path(inpath).parent / (Path(inpath).stem + '_corina_filter.csv'), index=None)