In [3]:
import os
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
from rdkit import Chem 
from rdkit.Chem import AllChem, Draw, rdFMCS

from rdkit import RDLogger
RDLogger.logger().setLevel(RDLogger.CRITICAL)

import dft_descriptors.prepocessing as pp
import dft_descriptors.featurisation as ft

In [8]:
# Load the raw reaction table from its comma-separated CSV file.
df = pd.read_csv("../data_csv/Data_test11132021.csv", sep = ',')

In [9]:
# Run the project's preprocessing step on the raw table; the cells below
# all read from the resulting `df2`.
df2 = pp.preprocess(df)

In [10]:
# Featurise the preprocessed table with the packaged implementation.
# NOTE(review): the output recorded under this cell ends with
# "ValueError: could not convert string to float: '5-20'", i.e. the call did
# not complete on this data set when the notebook was last executed.
X, y, DOIs, mechanisms, origins = ft.process_dataframe_dft(df2, data_path="../data_csv/")

NoLigand


ValueError: could not convert string to float: '5-20'

In [15]:
# Show the rows of the preprocessed table that use this ligand SMILES.
df2.loc[df2["Ligand effectif"].eq("[C]1N(C23CC4CC(CC(C4)C2)C3)C=CN1C12CC3CC(CC(C3)C1)C2")]

Unnamed: 0,Index,Reactant Smile (C-O),A-X,A-X effectif,Solvent,Time,Temperature,Precurseur Nickel,Base/additifs,Base/additif après correction effective,...,Notes : qu'est ce qui ne rentres pas dans le tableau,eq CO,eq A-X,eq Ni,eq Lig (lig + prec),eq B (si reducteur pas pris en c0mpte),2 Steps,Notes bis : modifications,Unnamed: 28,Unnamed: 29


In [None]:
# DFT descriptors of the ligands, re-indexed by RDKit canonical SMILES so the
# rows can be looked up with the values stored in df2["Ligand effectif"].
ligs = pd.read_csv("../data_csv/ligand_dft.csv", sep=',', index_col=0)
canon_rdkit = []
for smi in ligs.index.to_list():
    try:
        canon_rdkit.append(Chem.CanonSmiles(smi))
    except Exception:
        # Index entries RDKit cannot canonicalise are kept verbatim and
        # printed so they can be inspected.
        canon_rdkit.append(smi)
        print(smi)
ligs["can_rdkit"] = canon_rdkit
# Re-bind instead of mutating in place so re-running the cell stays predictable.
ligs = ligs.set_index("can_rdkit")
# One descriptor list per reaction, in the row order of df2.
ligands = [ligs.loc[ligand].to_list() for ligand in df2["Ligand effectif"]]

In [None]:
# Solvent descriptor table, indexed by the first CSV column.
solv = pd.read_csv("../data_csv/solvents.csv", index_col=0, sep=',')
# Quick check: the descriptor values stored for toluene.
solv.loc["toluene"].tolist()

In [None]:
# DFT descriptors of the A-X partners, keyed by RDKit canonical SMILES.
AX = pd.read_csv("../data_csv/AX_dft.csv", index_col=0, sep=',')
canon_rdkit = list(map(Chem.CanonSmiles, AX.index.to_list()))
AX = AX.assign(can_rdkit=canon_rdkit).set_index("can_rdkit")
# One descriptor list per reaction, in the row order of df2.
AXs = [list(AX.loc[ax_smiles]) for ax_smiles in df2["A-X effectif"]]

In [None]:
# DFT descriptors of the substrates, keyed by RDKit canonical SMILES.
# The table is loaded here so the cell also runs on a fresh kernel;
# previously `substrate` was only ever defined inside process_dataframe_dft.
substrate = pd.read_csv("../data_csv/substrate_dft.csv", sep=',', index_col=0)
substrate["can_rdkit"] = [Chem.CanonSmiles(smi_co) for smi_co in substrate.index.to_list()]
substrate = substrate.set_index("can_rdkit")
# One descriptor list per reaction, in the row order of df2.
substrates = [list(substrate.loc[sub]) for sub in df2["Reactant Smile (C-O)"]]

In [None]:
# Preview the first few substrate descriptor lists instead of dumping all of them.
substrates[:5]

In [None]:
# Draw every entry of the ligand descriptor table on a grid.
# `Draw` is imported explicitly at the top of the notebook: `Chem.Draw` only
# exists as an attribute once the rdkit.Chem.Draw submodule has been imported.
mols = [Chem.MolFromSmiles(smi) for smi in ligs.index]
Draw.MolsToGridImage(mols, subImgSize=(400, 400), maxMols=200)

In [None]:
def _load_descriptor_table(csv_path, keep_unparsable=False):
    """Read a descriptor CSV and re-index it by RDKit canonical SMILES.

    The first CSV column is used as the index and each index entry is passed
    through ``Chem.CanonSmiles``.

    Parameters
    ----------
    csv_path : str
        Path of the comma-separated descriptor file.
    keep_unparsable : bool, default False
        When True, an index entry that RDKit cannot canonicalise is printed
        and kept verbatim; when False the RDKit error propagates.

    Returns
    -------
    pandas.DataFrame
        The table indexed by the canonical SMILES (index name "can_rdkit").
    """
    table = pd.read_csv(csv_path, sep=',', index_col=0)
    canon_rdkit = []
    for smi in table.index.to_list():
        try:
            canon_rdkit.append(Chem.CanonSmiles(smi))
        except Exception:
            if not keep_unparsable:
                raise
            canon_rdkit.append(smi)
            print(smi)
    table["can_rdkit"] = canon_rdkit
    return table.set_index("can_rdkit")


def process_dataframe_dft(df, data_path="../data_csv/"):
    """Turn a preprocessed reaction table into feature vectors and labels.

    Each row is described by the concatenation of its solvent descriptors,
    ligand descriptors, one-hot encoded nickel precursor, one-hot encoded
    additive and substrate descriptors.

    The helpers ``one_hot_encoding``, ``precursor_mapping``,
    ``additives_mapping``, ``process_yield`` and ``origin_mapping`` must be in
    scope; they are not defined in the visible part of this notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Reaction table with the columns read below ("Solvent",
        "Ligand effectif", "Reactant Smile (C-O)", "A-X effectif",
        "Precurseur Nickel", "Base/additif après correction effective",
        "Isolated Yield", "GC/NMR Yield", "DOI", "Mechanism",
        "type of data (optimisation table or scope)").
    data_path : str, default "../data_csv/"
        Directory containing solvents.csv, ligand_dft.csv, substrate_dft.csv
        and AX_dft.csv.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, yields, DOIs, mechanisms, origins)``, one entry per row of ``df``.

    Raises
    ------
    KeyError
        If a solvent, ligand, substrate or A-X value of ``df`` has no entry in
        the corresponding descriptor table.
    """
    # physico-chemical description of solvents
    solv = pd.read_csv(os.path.join(data_path, "solvents.csv"), sep=',', index_col=0)
    solvents = [solv.loc[solvent].to_list() for solvent in df["Solvent"]]

    # dft description of ligands
    # TODO (open question from the original author): what should be used for nan?
    ligs = _load_descriptor_table(os.path.join(data_path, "ligand_dft.csv"), keep_unparsable=True)
    ligands = [ligs.loc[ligand].to_list() for ligand in df["Ligand effectif"]]

    # dft description for substrates
    substrate = _load_descriptor_table(os.path.join(data_path, "substrate_dft.csv"))
    substrates = [list(substrate.loc[sub]) for sub in df["Reactant Smile (C-O)"]]

    # dft description for AX
    AX = _load_descriptor_table(os.path.join(data_path, "AX_dft.csv"))
    # NOTE(review): AXs is looked up (so unknown A-X values still raise) but it
    # is not part of the feature vector below -- confirm whether it should be.
    AXs = [list(AX.loc[ax]) for ax in df["A-X effectif"]]

    precursors = one_hot_encoding(np.array([precursor_mapping(precursor) for precursor in df["Precurseur Nickel"]]).reshape(-1, 1))
    additives = one_hot_encoding(np.array([additives_mapping(additive) for additive in df["Base/additif après correction effective"]]).reshape(-1, 1))

    X = []
    yields = []
    DOIs = []
    mechanisms = []
    origins = []

    # The descriptor lists are positional, so rows are counted with enumerate
    # rather than indexed with the DataFrame label returned by iterrows.
    for pos, (_, row) in enumerate(df.iterrows()):
        yield_isolated = process_yield(row["Isolated Yield"])
        yield_gc = process_yield(row['GC/NMR Yield'])
        # If both yields are known, we keep the isolated yield
        if yield_isolated:
            y = yield_isolated
        elif yield_gc:
            y = yield_gc
        else:
            # Neither value is truthy: keep what process_yield returned (e.g. 0)
            # or NaN when nothing is available, instead of silently reusing the
            # yield of the previous row.
            leftover = yield_isolated if yield_isolated is not None else yield_gc
            y = np.nan if leftover is None else leftover
        feature_vector = np.concatenate((solvents[pos], ligands[pos], precursors[pos], additives[pos], substrates[pos], ))
        X.append(feature_vector)
        yields.append(y)
        DOIs.append(row["DOI"])
        mechanisms.append(row["Mechanism"])
        origins.append(origin_mapping(row["type of data (optimisation table or scope)"]))

    return np.array(X), np.array(yields), np.array(DOIs), np.array(mechanisms), np.array(origins)

In [None]:
from rdkit.Chem import AllChem, Draw
from rdkit.Chem.Draw import rdMolDraw2D
from IPython.display import SVG, display
from rdkit.Chem.Draw import IPythonConsole

def vis_rxn(index, df):
    """Build an RDKit reaction for the row of ``df`` whose "Index" equals ``index``.

    The reaction is assembled as ``reactant.A-X > ligand.precursor > product``
    from the SMARTS of the individual molecules.

    Parameters
    ----------
    index : value compared against the "Index" column
        Identifier of the reaction to visualise.
    df : pandas.DataFrame
        Table with the columns "Index", "A-X effectif", "Ligand effectif",
        "Precurseur Nickel", "Reactant Smile (C-O)" and "Product".

    Returns
    -------
    The object returned by ``AllChem.ReactionFromSmarts``.

    Raises
    ------
    IndexError
        If no row of ``df`` has the requested index.
    """
    row = df[df["Index"] == index]
    # SMARTS used when the ligand or the precursor cannot be converted.
    empty = Chem.MolToSmarts(Chem.MolFromSmiles(''))

    ax = Chem.MolFromSmiles(row["A-X effectif"].to_list()[0])
    ax = Chem.MolToSmarts(ax)

    try:
        # The lookup key must be the cell value itself: indexing the dict with
        # the whole list always raised and silently dropped the ligand.
        # NOTE(review): this assumes pp.dict_ligand is keyed by the values of
        # "Ligand effectif"; unknown keys fall back to no ligand -- confirm.
        ligand_key = row['Ligand effectif'].to_list()[0]
        lig = Chem.MolToSmarts(Chem.MolFromSmiles(pp.dict_ligand[ligand_key]))
    except Exception:
        lig = empty

    try:
        prec = Chem.MolFromSmiles(row['Precurseur Nickel'].to_list()[0])
        prec = Chem.MolToSmarts(prec)
    except Exception:
        prec = empty

    reactant = Chem.MolFromSmiles(row['Reactant Smile (C-O)'].to_list()[0])
    reactant = Chem.MolToSmarts(reactant)
    product = Chem.MolFromSmiles(row['Product'].to_list()[0])
    product = Chem.MolToSmarts(product)

    # Agents are separate molecules, so they are joined with '.'; plain
    # concatenation would fuse ligand and precursor into one SMARTS string.
    agents = '.'.join(part for part in (lig, prec) if part)
    reaction = reactant + '.' + ax + '>' + agents + '>' + product

    rxn = AllChem.ReactionFromSmarts(reaction)
    return rxn

def _display_reactions(df):
    """Draw each reaction of ``df`` as an SVG and print its index and yields."""
    for index in df["Index"].to_list():
        rxn = vis_rxn(index, df)
        row = df[df["Index"] == index]
        # Both yield columns are shown: [GC/NMR yields, isolated yields].
        Y = [row["GC/NMR Yield"].to_list(), row["Isolated Yield"].to_list()]
        d = Draw.MolDraw2DSVG(900, 300)
        d.DrawReaction(rxn)
        d.FinishDrawing()
        # Strip the 'svg:' prefixes from the generated markup before display.
        svg = d.GetDrawingText().replace('svg:', '')
        display(SVG(svg))
        print(index, Y)


def rxn_byDOI(DOI, df):
    """Display every reaction of ``df`` whose "DOI" column equals ``DOI``.

    Parameters
    ----------
    DOI : value compared against the "DOI" column
    df : pandas.DataFrame
        Reaction table with at least the columns "DOI", "Index",
        "GC/NMR Yield" and "Isolated Yield", plus those read by ``vis_rxn``.
    """
    _display_reactions(df[df["DOI"] == DOI])


def rxn_unique_byDOI(DOI, df):
    """Like ``rxn_byDOI`` but shows each distinct transformation only once.

    Rows sharing the same reactant, A-X partner and product are collapsed to
    their first occurrence before drawing.
    """
    selected = df[df["DOI"] == DOI]
    selected = selected.drop_duplicates(
        subset=['Reactant Smile (C-O)', 'A-X effectif', 'Product'], keep='first'
    )
    _display_reactions(selected)

In [None]:
# Sorted array of the distinct DOI values in the preprocessed table,
# followed by how many there are.
dois = np.unique(df2["DOI"].to_numpy())
len(dois)

In [None]:
def remove_double_couplings(df):
    """List the rows of ``df`` that look like double couplings.

    For each row the maximum common substructure (MCS) of reactant and product
    (both with explicit hydrogens) is deleted from the reactant. If what is
    left consists of several fragments (a '.' in its SMILES) the row is
    reported. Despite its name the function does not modify ``df``.

    ``rdFMCS`` and ``AllChem`` come from the notebook's top import cell.
    Previously ``rdFMCS`` was never imported, so every row failed inside the
    bare ``except`` and the result was always empty.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the SMILES columns "Reactant Smile (C-O)" and "Product".

    Returns
    -------
    list of tuple
        ``(positional row index, reactant SMILES, product SMILES)`` for every
        flagged row; the SMILES are those of the hydrogen-added molecules.
    """
    double_couplings = []
    for idx in range(len(df)):
        row = df.iloc[idx]
        try:
            r = Chem.MolFromSmiles(row["Reactant Smile (C-O)"])
            p = Chem.MolFromSmiles(row["Product"])
            r = Chem.rdmolops.AddHs(r)
            p = Chem.rdmolops.AddHs(p)
            res = rdFMCS.FindMCS([r, p])
            patt = Chem.MolFromSmarts(res.smartsString)
            rm = AllChem.DeleteSubstructs(r, patt)
            smi = Chem.MolToSmiles(rm)
            if '.' in smi:
                double_couplings.append((idx, Chem.MolToSmiles(r), Chem.MolToSmiles(p)))
        except Exception as err:
            # Best effort: rows that cannot be processed are reported together
            # with the reason, then skipped.
            print(f"row {idx} skipped ({err!r}):", row.to_list())

    return double_couplings