## Notebook 6 - Adding negative enzyme-substrate combinations

This notebook generates, for each cosine-score setting (0.75, 0.8, 0.85 and 0.9), a CSV file listing every enzyme-substrate combination and whether it was productive (AUC > 0) or not.

In [None]:
%run ../common.py

In [None]:
import umap
from sklearn.cluster import KMeans
from rdkit.Chem import AllChem, MolFromSmiles
from matplotlib import colormaps

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Substrate table (the "_clean" file); its 'Name' and 'CSMILES' columns are
# what the rest of the notebook uses to enumerate substrates and look up
# their structures.
substrates_csv = filepath_results + 'Substrates_VB_clean.csv'
df_substrates = pd.read_csv(substrates_csv, encoding='utf8')

In [6]:
# For every cosine-score setting, build a table of ALL enzyme-substrate
# combinations: combinations present in the screening results keep their AUC,
# every combination that is absent is added as a negative with AUC = 0.
# One CSV is written per setting.

# Loop-invariant lookups, derived once from the substrate table.
substrates = df_substrates['Name'].unique()
csmiles_map = dict(zip(df_substrates['Name'], df_substrates['CSMILES']))

for i in [0.75, 0.8, 0.85, 0.9]:

    df = pd.read_pickle(filepath_results + f"Screening_results_CosineScore_{i}.pkl")

    # Every screened substrate must also be present in the substrate table.
    # Raise explicitly (asserts are stripped under `python -O`) and report
    # which structures are missing.
    unknown_csmiles = df.loc[~df['CSMILES'].isin(df_substrates['CSMILES']), 'CSMILES'].unique()
    if len(unknown_csmiles) > 0:
        raise ValueError(
            f"Screening results for cosine score {i} contain CSMILES that are "
            f"missing from the substrate table: {list(unknown_csmiles)}"
        )

    # Combinations are compared as (enzyme, name) tuples. Joining the two with
    # '_' and splitting the string again is ambiguous as soon as a substrate
    # name itself contains an underscore.
    observed_pairs = set(zip(df['Enzyme_name'], df['Name']))

    # Walk the full enzyme x substrate grid in a fixed order so that the row
    # order of the output file is reproducible between runs (iterating over a
    # set of strings is not).
    negative_pairs = [
        (enzyme, name)
        for enzyme in dict.fromkeys(enzymes_inclusion)  # de-duplicate, keep order
        for name in substrates
        if (enzyme, name) not in observed_pairs
    ]

    df_negatives = pd.DataFrame(negative_pairs, columns=['Enzyme_name', 'Name'])
    df_negatives['CSMILES'] = [csmiles_map[name] for name in df_negatives['Name']]
    df_negatives['AUC'] = 0.

    # Keep one row per measured combination (the first one encountered).
    df = df.drop_duplicates(subset=['Name', 'Enzyme_name'], keep='first').copy()

    # Row-wise maximum of the single and double AUC. DataFrame.max skips NaN,
    # whereas the builtin max(a, b) returns NaN whenever `a` is NaN, which
    # would flag a productive combination as negative further down.
    df['AUC'] = df[['AUC_single', 'AUC_double']].max(axis=1)

    df = df[['Enzyme_name', 'Name', 'CSMILES', 'AUC']]
    df = pd.concat([df, df_negatives], ignore_index=True)

    # 1 = productive (AUC > 0), 0 = not productive.
    df['AUC_binary'] = (df['AUC'] > 0).astype(int)

    df.to_csv(filepath_results + f'/All_singlesordoubles_{i}.csv', index=False)