In [1]:
import pandas as pd
import numpy as np
import mordred
from rdkit import Chem
from rdkit.Chem import AllChem
from mordred import Calculator, descriptors
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
from dft_descriptors.prepocessing import preprocess

In [3]:
# Read the raw reaction table and pass it straight through the project's preprocessing step.
df = pd.read_csv("data_csv/Data_test11112021.csv").pipe(preprocess)

In [4]:
# Distinct entries of each reaction-component column.
smi_co, smi_ax, smi_lig, base_add = (
    df[column].unique()
    for column in (
        "Reactant Smile (C-O)",
        "A-X effectif",
        "Ligand effectif",
        "Base/additif après correction effective",
    )
)

# A base/additive entry may contain several '.'-separated parts; gather every
# distinct part across all entries (np.unique also sorts the result).
unique_bases_adds = np.unique(
    [part for entry in base_add for part in set(entry.split('.'))]
)

all_mols = np.concatenate((smi_co, smi_ax, smi_lig, unique_bases_adds))

In [5]:
# One row per distinct value of the "Solvent" column, stored under "names".
smi_solv = df["Solvent"].unique()
solv_df = pd.DataFrame({"names": smi_solv})
# Export of the name table is disabled:
#solv_df.to_csv("data_csv/solvents_mordred.csv", sep=',',index=False)

In [6]:
# Hand-written SMILES, one per row of solv_df.
# NOTE(review): the list is assigned positionally, so it presumably has to follow
# the order of df["Solvent"].unique() used to build solv_df - confirm whenever the
# input data changes.
solvs = [
    "Cc1ccccc1",
    "CCOCC",
    "C1COCC1",
    "C1OCCOC1",
    "COCCOC",
    "CC(=O)N(C)C",
    "OC(C)(C)C",
    "Cc1cc(C)ccc1",
    "Cc1c(C)cccc1",
    "Cc1ccc(C)cc1",
    "O=CN(C)C",
    "O=C1N(C)CCC1",
    "ClCCCl",
    "c1ccccc1",
    "CCCCCC",
    "CS(=O)C",
    "CCO",
    "COC1CCCC1",
    "OC(C)CC",
    "CC(O)C",
    "CO",
    "OC(C)(C)CC",
    "CC#N",
    "CCCCOCCCC",
    "CC(C)OC(C)C",
    "CCOCOCC",
    "COC(C)(C)CC",
    "COC(C)(C)C",
    "COC(C)(C)CC.CCOCC",
    "CCOCOCC.CCOCC",
    "Cc1ccccc1.O",
]
solv_df["smi"] = solvs
# Canonical form of each SMILES as produced by RDKit.
solv_df["rdkit_smi"] = [Chem.CanonSmiles(smiles) for smiles in solvs]
# From here on `df` refers to the solvent table, not the reaction table.
df = solv_df

In [7]:
#df = pd.DataFrame(data=all_mols, columns=["smi"])
# Parse every SMILES; MolFromSmiles yields None for strings RDKit cannot parse.
df["rdmol"] = df["smi"].map(Chem.MolFromSmiles)
# Drop unparsable rows. The explicit .copy() makes the result an independent
# frame, so the column assignment below cannot raise SettingWithCopyWarning.
df = df.dropna(subset=["rdmol"]).copy()
# InChI is used as the identity key for de-duplicating molecules.
df["inchi"] = df["rdmol"].map(Chem.MolToInchi)
# Same reason for .copy(): later cells add columns to this frame.
df = df.drop_duplicates(subset="inchi").copy()

In [8]:
# Optimize Geometry

In [9]:
%%time
# Hydrogens are added on a new molecule object; df['rdmol'] itself is left as is.
df['rdmolH'] = df['rdmol'].map(Chem.AddHs)
# Fixed seed so the embedded coordinates are reproducible between runs.
# EmbedMolecule reports failure with -1 instead of raising, so keep the status.
embed_status = df['rdmolH'].map(
    lambda x: AllChem.EmbedMolecule(x, useRandomCoords=True, randomSeed=42)
)
# Optimise only molecules that actually received a conformer; calling the MMFF
# optimiser on a molecule without one raises and would abort the whole cell.
_ = df.loc[embed_status != -1, 'rdmolH'].map(AllChem.MMFFOptimizeMolecule)

CPU times: user 375 ms, sys: 3.65 ms, total: 378 ms
Wall time: 379 ms


In [10]:
# Calculate Mordred Descriptors

In [11]:
# Mordred calculator over the full `descriptors` module, created with ignore_3D=True.
calc = Calculator(descriptors, ignore_3D=True)

In [12]:
%%time
# Run the calculator on every parsed molecule; one descriptor row per molecule (may take long).
md = calc.pandas(df["rdmol"])

100%|███████████████████████████████████████████| 31/31 [00:01<00:00, 15.59it/s]


CPU times: user 204 ms, sys: 77.2 ms, total: 281 ms
Wall time: 3.12 s


In [13]:
# Cells whose value is exactly a mordred Missing/Error object become NaN;
# every other cell is passed through unchanged.
mordred_failures = (mordred.error.Missing, mordred.error.Error)
md = md.applymap(lambda cell: np.nan if type(cell) in mordred_failures else cell)

In [14]:
# Keep only the descriptor columns that have no NaN in any row.
md = md.dropna(axis="columns")

In [15]:
# add smi
# md was computed from df['rdmol'], so its rows are taken to follow df's row order.
# Passing plain values assigns by position instead of aligning on index labels,
# which stays correct whether or not md carries the same index as df.
md.insert(0, 'smi', df['smi'].to_numpy())

In [17]:
def duplicates(X, round=None) -> list:
    """Find columns that repeat an earlier column of the dataframe

    Nothing is removed from ``X``; only the names of the repeated columns are reported.
    The first occurrence of a repeated column is never part of the result.

    :param X: input dataframe
    :type X: pandas.core.frame.DataFrame
    :param round: number of decimals used for the comparison. When given, a copy of the frame \
    is scaled with MinMaxScaler and rounded to that precision before comparing, so that columns \
    differing only by floating point noise are still recognised as duplicates
    :type round: int
    :return: list of columns that are duplicates
    """

    frame = X.copy()
    if round is not None:
        scaled_values = MinMaxScaler().fit_transform(frame)
        frame = pd.DataFrame(scaled_values,
                             columns=frame.columns,
                             index=frame.index).round(round)

    # Columns become rows after transposing, so row-wise duplicate detection applies.
    is_repeat = frame.T.duplicated()
    return frame.columns[is_repeat].tolist()


def correlated(X, threshold=0.95) -> list:
    """Decorrelate dataframe by finding which columns shall \
    be removed to achieve correlation level below threshold

    Columns are visited left to right. A column that is still kept marks every later
    column whose absolute correlation with it exceeds ``threshold`` for removal;
    columns already marked do not mark others.

    :param X: input dataframe
    :type X: pandas.core.frame.DataFrame
    :param threshold: maximum correlation allowed in the frame X
    :type threshold: float
    :return: list of columns to prune in order to achieve desired decorrelation level, \
    in their original column order
    """

    corr = X.corr().abs()
    # Work on the raw matrix with true positional indexing. The former
    # ``corr.iloc[i][j]`` looked ``j`` up by label on a Series, which breaks for
    # integer column labels and relies on a deprecated positional fallback otherwise.
    values = corr.to_numpy()
    n_cols = values.shape[0]

    to_drop = set()
    for i in range(n_cols - 1):
        if i in to_drop:
            # Only later columns are ever added, so this status cannot change mid-row.
            continue
        for j in range(i + 1, n_cols):
            # Comparisons against NaN (e.g. constant columns) are False: never dropped here.
            if values[i, j] > threshold:
                to_drop.add(j)

    # Names come from the correlation matrix itself so positions and labels always match;
    # sorting makes the output order deterministic.
    return [corr.columns[k] for k in sorted(to_drop)]


In [18]:
def setup_md(df):
    """Scale a descriptor table to [0, 1] and prune uninformative columns

    Steps, in order: cast every column except the first to float, drop object-typed
    columns, MinMax-scale, then remove zero-variance columns, duplicate columns
    (compared after rounding to 3 decimals) and columns correlated above 0.99.
    The number of columns removed at each step is reported via print/display.

    The input frame is not modified.

    :param df: descriptor table; the first column is skipped by the float cast \
    (presumably an identifier such as the SMILES string - confirm against caller)
    :type df: pandas.core.frame.DataFrame
    :return: scaled frame restricted to the surviving columns. Columns are selected with \
    ``Index.difference``, so their order may differ from the input order
    :raises ValueError: if a column other than the first cannot be converted to float
    """
    # Cast into a new frame rather than assigning column by column into the
    # caller's object, so the argument is left untouched.
    df = df.astype({col: float for col in df.columns[1:]}, errors='raise')

    df = df.select_dtypes(exclude='object')

    scaler = MinMaxScaler()
    df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

    X = df
    print (f"Number of features: {X.shape[1]}, number of samples: {X.shape[0]}")

    # Compute the per-column standard deviation once and reuse it.
    col_std = X.std()
    zero_std = col_std[col_std == 0].index
    display(f"Zero std columns: {len(zero_std)}")
    X = X[X.columns.difference(zero_std)]

    dupes = duplicates(X, round=3)
    display(f"Duplicate columns: {len(dupes)}")
    X = X[X.columns.difference(dupes)]

    high_corr = correlated(X, threshold=0.99)
    display(f"Correlated features: {len(high_corr)}")
    X = X[X.columns.difference(high_corr)]

    print (f"Number of features: {X.shape[1]}, number of samples: {X.shape[0]}")

    return X

In [19]:
# Scale the descriptor table and prune zero-variance, duplicate and highly
# correlated columns; `md` is rebound to the reduced frame.
md = setup_md(md)

Number of features: 923, number of samples: 31


'Zero std columns: 290'

'Duplicate columns: 43'

'Correlated features: 157'

Number of features: 433, number of samples: 31


In [21]:
# Write the reduced descriptor table to disk; the row index is not written.
# NOTE(review): setup_md drops object-typed columns, so the file presumably carries no
# solvent identifier and consumers must rely on row order - confirm this is intended.
md.to_csv("data_csv/solvents_mordred.csv", sep=",", index=False)