In [2]:
import pandas as pd
import numpy as np
from conformation_encode.scaffold_split import scaffold_split  
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold 
def search_elements_in_dataframe(df, smiles_column, element_symbol):
    """Return the index labels of rows whose molecule contains a given element.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one SMILES entry per row.
    smiles_column : str
        Name of the column in ``df`` that stores the SMILES entries.
    element_symbol : str
        Atomic symbol to look for, e.g. ``"Se"``.

    Returns
    -------
    list
        Index labels of ``df``, in row order, whose parsed molecule has at
        least one atom with that symbol. Rows whose SMILES entry is missing,
        is not a string, or cannot be parsed are skipped.
    """
    found_indices = []
    for index, smiles in df[smiles_column].items():
        # Missing cells arrive as None/NaN rather than text. They are skipped
        # here, the same way unparsable SMILES are skipped below, instead of
        # being handed to the RDKit parser.
        if not isinstance(smiles, str):
            continue
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit signals an unparsable SMILES by returning None.
            continue
        if any(atom.GetSymbol() == element_symbol for atom in mol.GetAtoms()):
            found_indices.append(index)
    return found_indices

# Load the raw CDR1 table, then discard every row whose molecule contains a
# selenium or tellurium atom.
data_cdr1 = pd.read_csv("./data/Official/cdr1_binary_0902.csv")
se_indices, te_indices = (
    search_elements_in_dataframe(data_cdr1, "Canonicalsmiles", symbol)
    for symbol in ("Se", "Te")
)
error_index = se_indices + te_indices
data_cdr1 = data_cdr1.drop(index=error_index).reset_index(drop=True)

## **Loading data**

In [3]:
# All featurizer tables live in one directory. Keeping that directory in a
# single constant means only this line has to change when the data moves.
FEATURIZED_DIR = "/Users/thechuongtrinh/Documents/Workspace/Master_thesis/Cdr1/Mol_Featurizer/Cdr1_featurized_scaffold"

# The resulting path strings are identical to the previously hard-coded ones.
rdk7_path = f"{FEATURIZED_DIR}/RDK7.csv"
rdk5_path = f"{FEATURIZED_DIR}/RDK5.csv"
rdk6_path = f"{FEATURIZED_DIR}/RDK6.csv"
avalon_path = f"{FEATURIZED_DIR}/Avalon.csv"
mordred_path = f"{FEATURIZED_DIR}/Mordred_preprocess.csv"
ph4_path = f"{FEATURIZED_DIR}/Ph4_gobbi.csv"

# One table per featurizer, read in the same order as before.
rdk7 = pd.read_csv(rdk7_path)
rdk5 = pd.read_csv(rdk5_path)
rdk6 = pd.read_csv(rdk6_path)
mordred = pd.read_csv(mordred_path)
ph4 = pd.read_csv(ph4_path)
avalon = pd.read_csv(avalon_path)
rdk7.head()

  rdk7 = pd.read_csv(rdk7_path)
  rdk5 = pd.read_csv(rdk5_path)
  rdk6 = pd.read_csv(rdk6_path)
  ph4 = pd.read_csv(ph4_path)
  avalon = pd.read_csv(avalon_path)


Unnamed: 0,ID,Standardize_smile,Activity,0,1,2,3,4,5,6,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,Isonitrile_1,[C-]#[N+]C1=CC(=CCC(=O)O)CC1,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,WK14,Oc1ccccc1/C=C/c1ccc2cccc(O)c2n1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,WK14B,CC(=O)Oc1ccccc1/C=C/c1ccc2cccc(OC(C)=O)c2n1,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
3,WK15,Oc1cccc(/C=C/c2ccc3cccc(O)c3n2)c1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,WK15B,CC(=O)Oc1cccc(/C=C/c2ccc3cccc(O)c3n2)c1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## **Drop low variance column**

In [4]:
from sklearn.feature_selection import VarianceThreshold
def remove_low_variance(df, threshold):
    """Filter feature columns with scikit-learn's ``VarianceThreshold``.

    The ``ID`` and ``Standardize_smile`` columns are set aside before the
    selector is fitted and are put back as the first two columns of the
    result. The ``Activity`` column is always retained, whatever its variance.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ID`` and ``Standardize_smile``; every other column is
        passed to the selector.
    threshold : float
        Forwarded unchanged to ``VarianceThreshold``.

    Returns
    -------
    pandas.DataFrame
        ``ID``, ``Standardize_smile`` and the surviving columns, in their
        original order.
    """
    id_smile_df = df[["ID", "Standardize_smile"]]
    features = df.drop(["ID", "Standardize_smile"], axis=1)
    selector = VarianceThreshold(threshold)
    selector.fit(features)
    keep = selector.get_support(indices=False)
    # Protect the label by name. The previous positional `feature[0] = True`
    # only protected it when "Activity" happened to be the first remaining
    # column.
    is_label = np.asarray(features.columns == "Activity")
    if is_label.any():
        keep = keep | is_label
    else:
        # No label column present: keep the first column, as before.
        keep[0] = True
    df_drop = features.iloc[:, keep]
    return pd.concat([id_smile_df, df_drop], axis=1)

# Apply the variance filter with threshold 0.0 to the Avalon and Ph4 tables.
avalon_remove_variance, ph4_remove_variance = (
    remove_low_variance(table, 0.0) for table in (avalon, ph4)
)

## **Active/Inactive dataframe**

In [5]:
def _rows_with_activity(table, label):
    """Rows of ``table`` whose ``Activity`` equals ``label``, renumbered from 0."""
    return table[table["Activity"] == label].reset_index(drop=True)


# Partition every featurizer table into active (1) and inactive (0) rows.
rdk5_active, rdk5_inactive = _rows_with_activity(rdk5, 1), _rows_with_activity(rdk5, 0)
rdk6_active, rdk6_inactive = _rows_with_activity(rdk6, 1), _rows_with_activity(rdk6, 0)
rdk7_active, rdk7_inactive = _rows_with_activity(rdk7, 1), _rows_with_activity(rdk7, 0)
mordred_active, mordred_inactive = (
    _rows_with_activity(mordred, 1),
    _rows_with_activity(mordred, 0),
)
# Ph4 and Avalon are partitioned from their variance-filtered versions.
ph4_active, ph4_inactive = (
    _rows_with_activity(ph4_remove_variance, 1),
    _rows_with_activity(ph4_remove_variance, 0),
)
avalon_active, avalon_inactive = (
    _rows_with_activity(avalon_remove_variance, 1),
    _rows_with_activity(avalon_remove_variance, 0),
)

## **Check outliers**

### **Check outlier inactive**

In [6]:
from sklearn.neighbors import LocalOutlierFactor
from numpy import where, random
def remove_outlier(df, n_neighbors=20):
    """Locate the rows that LocalOutlierFactor labels as outliers.

    Despite its name, this function removes nothing: it only reports which
    rows were flagged, and the caller decides what to do with them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Activity``, ``ID`` and ``Standardize_smile``; all
        remaining columns are used as the feature matrix.
    n_neighbors : int, optional
        Forwarded to ``LocalOutlierFactor``. Defaults to 20, the value that
        was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Zero-based row positions (not index labels) for which
        ``fit_predict`` returned -1.
    """
    X = df.drop(['Activity', "ID", "Standardize_smile"], axis=1)
    lof = LocalOutlierFactor(n_neighbors=n_neighbors)
    y_pred = lof.fit_predict(X)
    # fit_predict marks outliers with -1; where() returns a 1-tuple of positions.
    outlier_positions = where(y_pred == -1)[0]
    print(f"The number of outlier is {len(outlier_positions)}")
    return outlier_positions

In [7]:
# Run the outlier detector on the inactive rows of every featurizer.
# The calls (and therefore the printed counts) happen in list order.
# NOTE: `rdk7_outlier_inacitve` is misspelt but is read under that name by a
# later cell, so the spelling is kept.
(
    rdk7_outlier_inacitve,
    rdk5_outlier_inactive,
    rdk6_outlier_inactive,
    mordred_outlier_inactive,
    ph4_outlier_inactive,
    avalon_outlier_inactive,
) = [
    remove_outlier(table)
    for table in (
        rdk7_inactive,
        rdk5_inactive,
        rdk6_inactive,
        mordred_inactive,
        ph4_inactive,
        avalon_inactive,
    )
]

The number of outlier is 37
The number of outlier is 8
The number of outlier is 0
The number of outlier is 56
The number of outlier is 1086
The number of outlier is 1


In [8]:
# Pool the inactive outlier positions from all featurizers so that a row
# flagged by any of them is counted exactly once.
# NOTE(review): pooling positions across tables assumes every featurizer
# table lists the molecules in the same row order -- confirm.
index_outlier_rdk5_inactive = set(rdk5_outlier_inactive)
index_outlier_rdk6_inactive = set(rdk6_outlier_inactive)
index_outlier_rdk7_inactive = set(rdk7_outlier_inacitve)
index_outlier_mordred_inactive = set(mordred_outlier_inactive)
index_outlier_ph4_inactive = set(ph4_outlier_inactive)
index_outlier_avalon_inactive = set(avalon_outlier_inactive)
unique_indices_inactive = index_outlier_rdk5_inactive.union(
    index_outlier_rdk6_inactive,
    index_outlier_rdk7_inactive,
    index_outlier_mordred_inactive,
    index_outlier_ph4_inactive,
    index_outlier_avalon_inactive,
)
print(f"The number of unique outlier inactive is {len(unique_indices_inactive)}")


The number of unique outlier inactive is 1108


In [9]:
# Pull the pooled outlier rows out of every inactive table by position.
_inactive_outlier_rows = list(unique_indices_inactive)
rdk7_outlier_inactive_df = rdk7_inactive.iloc[_inactive_outlier_rows].reset_index(drop=True)
rdk5_outlier_inactive_df = rdk5_inactive.iloc[_inactive_outlier_rows].reset_index(drop=True)
rdk6_outlier_inactive_df = rdk6_inactive.iloc[_inactive_outlier_rows].reset_index(drop=True)
mordred_outlier_inactive_df = mordred_inactive.iloc[_inactive_outlier_rows].reset_index(drop=True)
ph4_outlier_inactive_df = ph4_inactive.iloc[_inactive_outlier_rows].reset_index(drop=True)
avalon_outlier_inactive_df = avalon_inactive.iloc[_inactive_outlier_rows].reset_index(drop=True)

In [506]:
# Write the inactive outliers of every featurizer to its "hard test" CSV.
for frame, target in (
    (rdk5_outlier_inactive_df, "/Users/thechuongtrinh/Documents/Workspace/Master_thesis/Cdr1/data/Official/Featurizer_data/rdk5_hard_test.csv"),
    (rdk6_outlier_inactive_df, "/Users/thechuongtrinh/Documents/Workspace/Master_thesis/Cdr1/data/Official/Featurizer_data/rdk6_hard_test.csv"),
    (rdk7_outlier_inactive_df, "/Users/thechuongtrinh/Documents/Workspace/Master_thesis/Cdr1/data/Official/Featurizer_data/rdk7_hard_test.csv"),
    (mordred_outlier_inactive_df, "/Users/thechuongtrinh/Documents/Workspace/Master_thesis/Cdr1/data/Official/Featurizer_data/mordred_hard_test.csv"),
    (ph4_outlier_inactive_df, "/Users/thechuongtrinh/Documents/Workspace/Master_thesis/Cdr1/data/Official/Featurizer_data/ph4_hard_test.csv"),
    (avalon_outlier_inactive_df, "/Users/thechuongtrinh/Documents/Workspace/Master_thesis/Cdr1/data/Official/Featurizer_data/avalon_hard_test.csv"),
):
    frame.to_csv(target, index=False)

In [10]:
# Inactive rows that no featurizer flagged: drop the pooled outlier labels.
# The inactive tables were re-indexed from 0, so labels equal positions.
_inactive_rows_to_drop = list(unique_indices_inactive)
rdk7_inactive_df = rdk7_inactive.drop(index=_inactive_rows_to_drop).reset_index(drop=True)
rdk5_inactive_df = rdk5_inactive.drop(index=_inactive_rows_to_drop).reset_index(drop=True)
rdk6_inactive_df = rdk6_inactive.drop(index=_inactive_rows_to_drop).reset_index(drop=True)
mordred_inactive_df = mordred_inactive.drop(index=_inactive_rows_to_drop).reset_index(drop=True)
ph4_inactive_df = ph4_inactive.drop(index=_inactive_rows_to_drop).reset_index(drop=True)
avalon_inactive_df = avalon_inactive.drop(index=_inactive_rows_to_drop).reset_index(drop=True)

### **Check outlier active**

In [11]:
# Run the outlier detector on the active rows of every featurizer.
# The calls (and therefore the printed counts) happen in list order.
# NOTE: `rdk7_outlier_acitve` is misspelt but is read under that name by a
# later cell, so the spelling is kept.
(
    rdk7_outlier_acitve,
    rdk5_outlier_active,
    rdk6_outlier_active,
    mordred_outlier_active,
    ph4_outlier_active,
    avalon_outlier_active,
) = [
    remove_outlier(table)
    for table in (
        rdk7_active,
        rdk5_active,
        rdk6_active,
        mordred_active,
        ph4_active,
        avalon_active,
    )
]

The number of outlier is 0
The number of outlier is 0
The number of outlier is 0
The number of outlier is 4
The number of outlier is 34
The number of outlier is 0


In [12]:
# Pool the active outlier positions from all featurizers so that a row
# flagged by any of them is counted exactly once.
# NOTE(review): pooling positions across tables assumes every featurizer
# table lists the molecules in the same row order -- confirm.
index_outlier_rdk5_active = set(rdk5_outlier_active)
index_outlier_rdk6_active = set(rdk6_outlier_active)
index_outlier_rdk7_active = set(rdk7_outlier_acitve)
index_outlier_mordred_active = set(mordred_outlier_active)
index_outlier_ph4_active = set(ph4_outlier_active)
index_outlier_avalon_active = set(avalon_outlier_active)
unique_indices_active = index_outlier_rdk5_active.union(
    index_outlier_rdk6_active,
    index_outlier_rdk7_active,
    index_outlier_mordred_active,
    index_outlier_ph4_active,
    index_outlier_avalon_active,
)
print(f"The number of unique outlier active is {len(unique_indices_active)}")

The number of unique outlier active is 36


In [13]:
# Pull the pooled outlier rows out of every active table by position.
_active_outlier_rows = list(unique_indices_active)
rdk7_outlier_active_df = rdk7_active.iloc[_active_outlier_rows].reset_index(drop=True)
rdk5_outlier_active_df = rdk5_active.iloc[_active_outlier_rows].reset_index(drop=True)
rdk6_outlier_active_df = rdk6_active.iloc[_active_outlier_rows].reset_index(drop=True)
mordred_outlier_active_df = mordred_active.iloc[_active_outlier_rows].reset_index(drop=True)
ph4_outlier_active_df = ph4_active.iloc[_active_outlier_rows].reset_index(drop=True)
avalon_outlier_active_df = avalon_active.iloc[_active_outlier_rows].reset_index(drop=True)

In [14]:
# Active rows that no featurizer flagged: drop the pooled outlier labels.
# The active tables were re-indexed from 0, so labels equal positions.
_active_rows_to_drop = list(unique_indices_active)
rdk7_active_df = rdk7_active.drop(index=_active_rows_to_drop).reset_index(drop=True)
rdk5_active_df = rdk5_active.drop(index=_active_rows_to_drop).reset_index(drop=True)
rdk6_active_df = rdk6_active.drop(index=_active_rows_to_drop).reset_index(drop=True)
mordred_active_df = mordred_active.drop(index=_active_rows_to_drop).reset_index(drop=True)
ph4_active_df = ph4_active.drop(index=_active_rows_to_drop).reset_index(drop=True)
avalon_active_df = avalon_active.drop(index=_active_rows_to_drop).reset_index(drop=True)

In [15]:
# Snapshot the RDK7 partitions into the current working directory.
for frame, filename in (
    (rdk7_active_df, "rdk7_normal_active.csv"),
    (rdk7_outlier_active_df, "rdk7_outliers_active.csv"),
    (rdk7_inactive_df, "rdk7_inactive_normal.csv"),
):
    frame.to_csv(filename, index=False)

## **Splitting data**

In [16]:
def splitting_data(data, test_size, valid_size ,seed):
    """Split ``data`` into train, validation and test frames.

    ``scaffold_split`` is called twice on the ``Standardize_smile`` column:
    first to set the test part aside, then on the remainder to set the
    validation part aside. ``valid_size`` is therefore a fraction of the
    remainder, not of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Standardize_smile`` column.
    test_size : float
        ``test_size`` passed to the first ``scaffold_split`` call.
    valid_size : float
        ``test_size`` passed to the second ``scaffold_split`` call.
    seed : int
        ``random_state`` passed to both calls.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, valid, test)``, each re-indexed from 0.
    """
    def _carve(frame, fraction):
        # One scaffold split; both parts get a fresh 0..n-1 index.
        kept, held_out = scaffold_split(
            frame,
            smiles_col="Standardize_smile",
            test_size=fraction,
            random_state=seed,
        )
        return kept.reset_index(drop=True), held_out.reset_index(drop=True)

    remainder, data_test = _carve(data, test_size)
    data_train, data_valid = _carve(remainder, valid_size)
    return data_train, data_valid, data_test

def _assemble_splits(active_df, inactive_df, outlier_active_df):
    """Split one featurizer's normal actives and inactives, then merge them.

    Actives are split with fractions 0.341 / 0.323 and inactives with
    0.252 / 0.2, both with seed 42. The active outliers are appended to the
    merged training frame only.

    Returns the six per-class frames followed by the merged train, test and
    valid frames.
    """
    train_act, valid_act, test_act = splitting_data(active_df, 0.341, 0.323, 42)
    train_inact, valid_inact, test_inact = splitting_data(inactive_df, 0.252, 0.2, 42)
    merged_train = pd.concat([train_act, train_inact, outlier_active_df], axis=0).reset_index(drop=True)
    merged_test = pd.concat([test_act, test_inact], axis=0).reset_index(drop=True)
    merged_valid = pd.concat([valid_act, valid_inact], axis=0).reset_index(drop=True)
    return (
        train_act, valid_act, test_act,
        train_inact, valid_inact, test_inact,
        merged_train, merged_test, merged_valid,
    )


(train_active_rdk7, valid_active_rdk7, test_active_rdk7,
 train_inactive_rdk7, valid_inactive_rdk7, test_inactive_rdk7,
 train_rdk7, test_rdk7, valid_rdk7) = _assemble_splits(
    rdk7_active_df, rdk7_inactive_df, rdk7_outlier_active_df)

(train_active_rdk5, valid_active_rdk5, test_active_rdk5,
 train_inactive_rdk5, valid_inactive_rdk5, test_inactive_rdk5,
 train_rdk5, test_rdk5, valid_rdk5) = _assemble_splits(
    rdk5_active_df, rdk5_inactive_df, rdk5_outlier_active_df)

(train_active_rdk6, valid_active_rdk6, test_active_rdk6,
 train_inactive_rdk6, valid_inactive_rdk6, test_inactive_rdk6,
 train_rdk6, test_rdk6, valid_rdk6) = _assemble_splits(
    rdk6_active_df, rdk6_inactive_df, rdk6_outlier_active_df)

(train_active_mordred, valid_active_mordred, test_active_mordred,
 train_inactive_mordred, valid_inactive_mordred, test_inactive_mordred,
 train_mordred, test_mordred, valid_mordred) = _assemble_splits(
    mordred_active_df, mordred_inactive_df, mordred_outlier_active_df)

(train_active_ph4, valid_active_ph4, test_active_ph4,
 train_inactive_ph4, valid_inactive_ph4, test_inactive_ph4,
 train_ph4, test_ph4, valid_ph4) = _assemble_splits(
    ph4_active_df, ph4_inactive_df, ph4_outlier_active_df)

(train_active_avalon, valid_active_avalon, test_active_avalon,
 train_inactive_avalon, valid_inactive_avalon, test_inactive_avalon,
 train_avalon, test_avalon, valid_avalon) = _assemble_splits(
    avalon_active_df, avalon_inactive_df, avalon_outlier_active_df)

#### **Splitting active outliers**

In [17]:
# Append the active outliers behind the active training rows.
# NOTE: this rebinds `rdk7_active`, which earlier held all active rows of
# rdk7; re-running earlier cells after this one would see the merged frame.
rdk7_active = pd.concat([train_active_rdk7, rdk7_outlier_active_df], axis=0).reset_index(drop=True)
data = rdk7_active

# Rows at or beyond this position come from rdk7_outlier_active_df. Deriving
# the boundary from the data replaces the literal 42, which was only correct
# for the recorded run (78 active rows in the folds minus 36 outliers).
first_outlier_idx = len(train_active_rdk7)

# Group row positions by Bemis-Murcko scaffold and report every outlier row
# whose scaffold was already seen on an earlier row.
scaffolds = {}
for idx, row in data.iterrows():
    smiles = row["Standardize_smile"]
    mol = Chem.MolFromSmiles(smiles)
    scaffold = MurckoScaffold.MurckoScaffoldSmiles(mol=mol, includeChirality=False)
    if scaffold in scaffolds and idx >= first_outlier_idx:
        print(f"Index {idx} is duplicated")
    scaffolds.setdefault(scaffold, []).append(idx)
# NOTE(review): the message fires when an outlier row repeats a scaffold seen
# on ANY earlier row, including an earlier outlier, so this check alone does
# not show whether outlier scaffolds overlap with the normal actives.
scaffold_lists = list(scaffolds.values())

Index 49 is duplicated
Index 53 is duplicated
Index 57 is duplicated
Index 63 is duplicated
Index 64 is duplicated
Index 68 is duplicated
Index 69 is duplicated
Index 71 is duplicated
Index 74 is duplicated
Index 77 is duplicated


In [18]:
def spliting_folds_active(data, seed):
    """Cut ``data`` into four folds with three successive scaffold splits.

    Fold 4 is peeled off first (``test_size=0.25``), then fold 3 from the
    remainder (``test_size=0.33``), and the rest is halved into folds 1 and 2.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Standardize_smile`` column.
    seed : int
        ``random_state`` passed to every ``scaffold_split`` call.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(fold_1, fold_2, fold_3, fold_4)``, each re-indexed from 0.
    """
    def _peel(frame, fraction):
        # One scaffold split; both parts get a fresh 0..n-1 index.
        rest, fold = scaffold_split(
            frame,
            smiles_col="Standardize_smile",
            test_size=fraction,
            random_state=seed,
        )
        return rest.reset_index(drop=True), fold.reset_index(drop=True)

    fold_1_2_3, fold_4 = _peel(data, 0.25)
    fold_1_2, fold_3 = _peel(fold_1_2_3, 0.33)
    fold_1, fold_2 = _peel(fold_1_2, 0.5)
    return fold_1, fold_2, fold_3, fold_4

In [19]:
def spliting_folds_inactive(data, seed):
    """Cut ``data`` into four folds with three successive scaffold splits.

    Fold 4 is peeled off first (``test_size=0.25``), then fold 3 from the
    remainder (``test_size=0.333``), and the rest is halved into folds 1 and 2.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Standardize_smile`` column.
    seed : int
        ``random_state`` passed to every ``scaffold_split`` call.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(fold_1, fold_2, fold_3, fold_4)``, each re-indexed from 0.
    """
    remaining = data
    peeled = []  # holds fold_4 first, then fold_3
    for fraction in (0.25, 0.333):
        remaining, fold = scaffold_split(
            remaining,
            smiles_col="Standardize_smile",
            test_size=fraction,
            random_state=seed,
        )
        remaining = remaining.reset_index(drop=True)
        peeled.append(fold.reset_index(drop=True))
    fold_4, fold_3 = peeled

    fold_1, fold_2 = scaffold_split(
        remaining,
        smiles_col="Standardize_smile",
        test_size=0.5,
        random_state=seed,
    )
    return fold_1.reset_index(drop=True), fold_2.reset_index(drop=True), fold_3, fold_4

In [20]:
# Fold the active rows (training actives plus active outliers) first and
# report the fold sizes.
(fold_1_active_rdk7, fold_2_active_rdk7,
 fold_3_active_rdk7, fold_4_active_rdk7) = spliting_folds_active(rdk7_active, 42)
print(
    f"Active fold 1: {fold_1_active_rdk7.shape[0]}",
    f"Active fold 2: {fold_2_active_rdk7.shape[0]}",
    f"Active fold 3: {fold_3_active_rdk7.shape[0]}",
    f"Active fold 4: {fold_4_active_rdk7.shape[0]}",
    sep="\n",
)

# Then fold the inactive training rows the same way.
(fold_1_inactive_rdk7, fold_2_inactive_rdk7,
 fold_3_inactive_rdk7, fold_4_inactive_rdk7) = spliting_folds_inactive(train_inactive_rdk7, 42)
print(
    f"Inactive fold 1: {fold_1_inactive_rdk7.shape[0]}",
    f"Inactive fold 2: {fold_2_inactive_rdk7.shape[0]}",
    f"Inactive fold 3: {fold_3_inactive_rdk7.shape[0]}",
    f"Inactive fold 4: {fold_4_inactive_rdk7.shape[0]}",
    sep="\n",
)

# Each final fold is the active fold stacked on the matching inactive fold.
fold_1_rdk7 = pd.concat([fold_1_active_rdk7, fold_1_inactive_rdk7], ignore_index=True)
fold_2_rdk7 = pd.concat([fold_2_active_rdk7, fold_2_inactive_rdk7], ignore_index=True)
fold_3_rdk7 = pd.concat([fold_3_active_rdk7, fold_3_inactive_rdk7], ignore_index=True)
fold_4_rdk7 = pd.concat([fold_4_active_rdk7, fold_4_inactive_rdk7], ignore_index=True)

Active fold 1: 20
Active fold 2: 20
Active fold 3: 19
Active fold 4: 19
Inactive fold 1: 87
Inactive fold 2: 86
Inactive fold 3: 86
Inactive fold 4: 86


In [463]:
import os
# One output directory per fold; later cells reuse the fold_N_path names.
fold_1_path = "./data/Official/Featurizer_data/fold_1/"
fold_2_path = "./data/Official/Featurizer_data/fold_2/"
fold_3_path = "./data/Official/Featurizer_data/fold_3/"
fold_4_path = "./data/Official/Featurizer_data/fold_4/"
for fold_dir in (fold_1_path, fold_2_path, fold_3_path, fold_4_path):
    os.makedirs(fold_dir, exist_ok=True)

# Write the RDK7 folds into their directories.
for frame, target in (
    (fold_1_rdk7, f"{fold_1_path}rdk7.csv"),
    (fold_2_rdk7, f"{fold_2_path}rdk7.csv"),
    (fold_3_rdk7, f"{fold_3_path}rdk7.csv"),
    (fold_4_rdk7, f"{fold_4_path}rdk7.csv"),
):
    frame.to_csv(target, index=False)

In [23]:
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
def create_scaffold_df(data):
    """Summarise ``data`` as one row per distinct Bemis-Murcko scaffold.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Standardize_smile`` and ``Activity`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID`` (running number from 0), ``Scaffold`` (scaffold
        SMILES) and ``Activity`` (1 if any molecule with that scaffold has
        ``Activity == 1``, otherwise 0). Scaffolds are listed in order of
        first appearance; the empty scaffold string is left out.
    """
    rows_by_scaffold = {}
    for idx, smiles in data["Standardize_smile"].items():
        mol = Chem.MolFromSmiles(smiles)
        key = MurckoScaffold.MurckoScaffoldSmiles(mol=mol, includeChirality=False)
        rows_by_scaffold.setdefault(key, []).append(idx)

    # MurckoScaffoldSmiles yielded '' for some molecules; those are excluded.
    scaffold_clean = [key for key in rows_by_scaffold if key != '']
    activity = [
        1 if (data.loc[rows_by_scaffold[key], "Activity"] == 1).any() else 0
        for key in scaffold_clean
    ]
    return pd.DataFrame(
        {
            "ID": list(range(len(scaffold_clean))),
            "Scaffold": scaffold_clean,
            "Activity": activity,
        }
    )

In [25]:
# Scaffolds of the active outliers, used as the reference set below.
active_outliers_scaffold = create_scaffold_df(rdk7_outlier_active_df)["Scaffold"].tolist()

# Fold 1
fold_1_active_scaffold = create_scaffold_df(fold_1_active_rdk7)
result = fold_1_active_scaffold['Scaffold'].isin(active_outliers_scaffold)
print(
    f"There are scaffolds in fold 1 active are also in active outliers {result.sum()}"
    if result.any()
    else "There are no scaffolds in fold 1 active are also in active outliers"
)

# Fold 2
fold_2_active_scaffold = create_scaffold_df(fold_2_active_rdk7)
result = fold_2_active_scaffold['Scaffold'].isin(active_outliers_scaffold)
print(
    f"There are scaffolds in fold 2 active are also in active outliers {result.sum()}"
    if result.any()
    else "There are no scaffolds in fold 2 active are also in active outliers"
)

# Fold 3
fold_3_active_scaffold = create_scaffold_df(fold_3_active_rdk7)
result = fold_3_active_scaffold['Scaffold'].isin(active_outliers_scaffold)
print(
    f"There are scaffolds in fold 3 active are also in active outliers {result.sum()}"
    if result.any()
    else "There are no scaffolds in fold 3 active are also in active outliers"
)

# Fold 4
fold_4_active_scaffold = create_scaffold_df(fold_4_active_rdk7)
result = fold_4_active_scaffold['Scaffold'].isin(active_outliers_scaffold)
print(
    f"There are scaffolds in fold 4 active are also in active outliers {result.sum()}"
    if result.any()
    else "There are no scaffolds in fold 4 active are also in active outliers"
)

# "Fold 5" is the validation set. valid_rdk7 holds both active and inactive
# rows, although the message says "active".
fold_5_active_scaffold = create_scaffold_df(valid_rdk7)
result = fold_5_active_scaffold['Scaffold'].isin(active_outliers_scaffold)
print(
    f"There are scaffolds in fold 5 active are also in active outliers {result.sum()}"
    if result.any()
    else "There are no scaffolds in fold 5 active are also in active outliers"
)

There are scaffolds in fold 1 active are also in active outliers 8
There are scaffolds in fold 2 active are also in active outliers 3
There are scaffolds in fold 3 active are also in active outliers 11
There are scaffolds in fold 4 active are also in active outliers 6
There are no scaffolds in fold 5 active are also in active outliers


In [428]:
# The RDK7 training set is the four folds stacked together; test and valid
# are aliases of the frames built earlier.
rdk7_train = pd.concat(
    [fold_1_rdk7, fold_2_rdk7, fold_3_rdk7, fold_4_rdk7], axis=0
).reset_index(drop=True)
rdk7_test, rdk7_valid = test_rdk7, valid_rdk7
for frame, target in (
    (rdk7_train, "./data/Official/Featurizer_data/rdk7_train.csv"),
    (rdk7_test, "./data/Official/Featurizer_data/rdk7_test.csv"),
    (rdk7_valid, "./data/Official/Featurizer_data/rdk7_valid.csv"),
):
    frame.to_csv(target, index=False)

In [464]:
def _stack(*frames):
    """Concatenate frames row-wise and renumber the rows from 0."""
    return pd.concat(list(frames), axis=0).reset_index(drop=True)


# 1) Append the active outliers behind the active training rows.
rdk5_active = _stack(train_active_rdk5, rdk5_outlier_active_df)
rdk6_active = _stack(train_active_rdk6, rdk6_outlier_active_df)
mordred_active = _stack(train_active_mordred, mordred_outlier_active_df)
ph4_active = _stack(train_active_ph4, ph4_outlier_active_df)
avalon_active = _stack(train_active_avalon, avalon_outlier_active_df)

# 2) Fold the active rows, one call per featurizer in the order listed.
_active_folds = {
    name: spliting_folds_active(frame, 42)
    for name, frame in (
        ("rdk5", rdk5_active),
        ("rdk6", rdk6_active),
        ("mordred", mordred_active),
        ("ph4", ph4_active),
        ("avalon", avalon_active),
        ("rdk7", rdk7_active),
    )
}
fold_1_active_rdk5, fold_2_active_rdk5, fold_3_active_rdk5, fold_4_active_rdk5 = _active_folds["rdk5"]
fold_1_active_rdk6, fold_2_active_rdk6, fold_3_active_rdk6, fold_4_active_rdk6 = _active_folds["rdk6"]
fold_1_active_mordred, fold_2_active_mordred, fold_3_active_mordred, fold_4_active_mordred = _active_folds["mordred"]
fold_1_active_ph4, fold_2_active_ph4, fold_3_active_ph4, fold_4_active_ph4 = _active_folds["ph4"]
fold_1_active_avalon, fold_2_active_avalon, fold_3_active_avalon, fold_4_active_avalon = _active_folds["avalon"]
fold_1_active_rdk7, fold_2_active_rdk7, fold_3_active_rdk7, fold_4_active_rdk7 = _active_folds["rdk7"]

# 3) Fold the inactive training rows, again in the order listed.
_inactive_folds = {
    name: spliting_folds_inactive(frame, 42)
    for name, frame in (
        ("rdk5", train_inactive_rdk5),
        ("rdk6", train_inactive_rdk6),
        ("mordred", train_inactive_mordred),
        ("ph4", train_inactive_ph4),
        ("avalon", train_inactive_avalon),
    )
}
fold_1_inactive_rdk5, fold_2_inactive_rdk5, fold_3_inactive_rdk5, fold_4_inactive_rdk5 = _inactive_folds["rdk5"]
fold_1_inactive_rdk6, fold_2_inactive_rdk6, fold_3_inactive_rdk6, fold_4_inactive_rdk6 = _inactive_folds["rdk6"]
fold_1_inactive_mordred, fold_2_inactive_mordred, fold_3_inactive_mordred, fold_4_inactive_mordred = _inactive_folds["mordred"]
fold_1_inactive_ph4, fold_2_inactive_ph4, fold_3_inactive_ph4, fold_4_inactive_ph4 = _inactive_folds["ph4"]
fold_1_inactive_avalon, fold_2_inactive_avalon, fold_3_inactive_avalon, fold_4_inactive_avalon = _inactive_folds["avalon"]

# 4) Each final fold is the active fold stacked on the matching inactive fold.
fold_1_rdk5, fold_2_rdk5, fold_3_rdk5, fold_4_rdk5 = (
    _stack(act, inact) for act, inact in zip(_active_folds["rdk5"], _inactive_folds["rdk5"])
)
fold_1_rdk6, fold_2_rdk6, fold_3_rdk6, fold_4_rdk6 = (
    _stack(act, inact) for act, inact in zip(_active_folds["rdk6"], _inactive_folds["rdk6"])
)
fold_1_mordred, fold_2_mordred, fold_3_mordred, fold_4_mordred = (
    _stack(act, inact) for act, inact in zip(_active_folds["mordred"], _inactive_folds["mordred"])
)
fold_1_ph4, fold_2_ph4, fold_3_ph4, fold_4_ph4 = (
    _stack(act, inact) for act, inact in zip(_active_folds["ph4"], _inactive_folds["ph4"])
)
fold_1_avalon, fold_2_avalon, fold_3_avalon, fold_4_avalon = (
    _stack(act, inact) for act, inact in zip(_active_folds["avalon"], _inactive_folds["avalon"])
)


In [476]:
# The RDK5 training set is the four folds stacked together; test and valid
# are aliases of the frames built earlier.
rdk5_train = pd.concat(
    [fold_1_rdk5, fold_2_rdk5, fold_3_rdk5, fold_4_rdk5], axis=0
).reset_index(drop=True)
rdk5_test, rdk5_valid = test_rdk5, valid_rdk5

# Write the three sets first, then the individual folds.
for frame, target in (
    (rdk5_train, "./data/Official/Featurizer_data/rdk5_train.csv"),
    (rdk5_test, "./data/Official/Featurizer_data/rdk5_test.csv"),
    (rdk5_valid, "./data/Official/Featurizer_data/rdk5_valid.csv"),
    (fold_1_rdk5, f"{fold_1_path}rdk5.csv"),
    (fold_2_rdk5, f"{fold_2_path}rdk5.csv"),
    (fold_3_rdk5, f"{fold_3_path}rdk5.csv"),
    (fold_4_rdk5, f"{fold_4_path}rdk5.csv"),
):
    frame.to_csv(target, index=False)

In [477]:
# The RDK6 training set is the four folds stacked together; test and valid
# are aliases of the frames built earlier.
rdk6_train = pd.concat(
    [fold_1_rdk6, fold_2_rdk6, fold_3_rdk6, fold_4_rdk6], axis=0
).reset_index(drop=True)
rdk6_test, rdk6_valid = test_rdk6, valid_rdk6

# Write the three sets first, then the individual folds.
for frame, target in (
    (rdk6_train, "./data/Official/Featurizer_data/rdk6_train.csv"),
    (rdk6_test, "./data/Official/Featurizer_data/rdk6_test.csv"),
    (rdk6_valid, "./data/Official/Featurizer_data/rdk6_valid.csv"),
    (fold_1_rdk6, f"{fold_1_path}rdk6.csv"),
    (fold_2_rdk6, f"{fold_2_path}rdk6.csv"),
    (fold_3_rdk6, f"{fold_3_path}rdk6.csv"),
    (fold_4_rdk6, f"{fold_4_path}rdk6.csv"),
):
    frame.to_csv(target, index=False)

In [478]:
# The Mordred training set is the four folds stacked together; test and
# valid are aliases of the frames built earlier.
mordred_train = pd.concat(
    [fold_1_mordred, fold_2_mordred, fold_3_mordred, fold_4_mordred], axis=0
).reset_index(drop=True)
mordred_test, mordred_valid = test_mordred, valid_mordred

# Write the three sets first, then the individual folds.
for frame, target in (
    (mordred_train, "./data/Official/Featurizer_data/mordred_train.csv"),
    (mordred_test, "./data/Official/Featurizer_data/mordred_test.csv"),
    (mordred_valid, "./data/Official/Featurizer_data/mordred_valid.csv"),
    (fold_1_mordred, f"{fold_1_path}mordred.csv"),
    (fold_2_mordred, f"{fold_2_path}mordred.csv"),
    (fold_3_mordred, f"{fold_3_path}mordred.csv"),
    (fold_4_mordred, f"{fold_4_path}mordred.csv"),
):
    frame.to_csv(target, index=False)

In [479]:
# The Avalon training set is the four folds stacked together; test and valid
# are aliases of the frames built earlier.
avalon_train = pd.concat(
    [fold_1_avalon, fold_2_avalon, fold_3_avalon, fold_4_avalon], axis=0
).reset_index(drop=True)
avalon_test, avalon_valid = test_avalon, valid_avalon

# Write the three sets first, then the individual folds.
for frame, target in (
    (avalon_train, "./data/Official/Featurizer_data/avalon_train.csv"),
    (avalon_test, "./data/Official/Featurizer_data/avalon_test.csv"),
    (avalon_valid, "./data/Official/Featurizer_data/avalon_valid.csv"),
    (fold_1_avalon, f"{fold_1_path}avalon.csv"),
    (fold_2_avalon, f"{fold_2_path}avalon.csv"),
    (fold_3_avalon, f"{fold_3_path}avalon.csv"),
    (fold_4_avalon, f"{fold_4_path}avalon.csv"),
):
    frame.to_csv(target, index=False)

In [480]:
# The Ph4 training set is the four folds stacked together; test and valid
# are aliases of the frames built earlier.
ph4_train = pd.concat(
    [fold_1_ph4, fold_2_ph4, fold_3_ph4, fold_4_ph4], axis=0
).reset_index(drop=True)
ph4_test, ph4_valid = test_ph4, valid_ph4

# Write the three sets first, then the individual folds.
for frame, target in (
    (ph4_train, "./data/Official/Featurizer_data/ph4_train.csv"),
    (ph4_test, "./data/Official/Featurizer_data/ph4_test.csv"),
    (ph4_valid, "./data/Official/Featurizer_data/ph4_valid.csv"),
    (fold_1_ph4, f"{fold_1_path}ph4.csv"),
    (fold_2_ph4, f"{fold_2_path}ph4.csv"),
    (fold_3_ph4, f"{fold_3_path}ph4.csv"),
    (fold_4_ph4, f"{fold_4_path}ph4.csv"),
):
    frame.to_csv(target, index=False)

## **BM scaffold**

In [22]:
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
# NOTE: a function with this name is also defined in an earlier cell of this
# notebook; whichever definition runs last is the one in effect.
def create_scaffold_df(data):
    """Summarise ``data`` as one row per distinct Bemis-Murcko scaffold.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Standardize_smile`` and ``Activity`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID`` (running number from 0), ``Scaffold`` (scaffold
        SMILES) and ``Activity`` (1 if any molecule with that scaffold has
        ``Activity == 1``, otherwise 0). Scaffolds are listed in order of
        first appearance; the empty scaffold string is left out.
    """
    rows_by_scaffold = {}
    for idx, smiles in data["Standardize_smile"].items():
        mol = Chem.MolFromSmiles(smiles)
        key = MurckoScaffold.MurckoScaffoldSmiles(mol=mol, includeChirality=False)
        rows_by_scaffold.setdefault(key, []).append(idx)

    # MurckoScaffoldSmiles yielded '' for some molecules; those are excluded.
    scaffold_clean = [key for key in rows_by_scaffold if key != '']
    activity = [
        1 if (data.loc[rows_by_scaffold[key], "Activity"] == 1).any() else 0
        for key in scaffold_clean
    ]
    return pd.DataFrame(
        {
            "ID": list(range(len(scaffold_clean))),
            "Scaffold": scaffold_clean,
            "Activity": activity,
        }
    )

In [484]:
def extract_scaffold(data_train, data_valid, data_test):
    """Build active and inactive scaffold tables for each of the three sets.

    Every input frame is filtered on ``Activity == 1`` and ``Activity == 0``
    and each subset is passed to ``create_scaffold_df``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(active_train, inactive_train, active_valid, inactive_valid,
        active_test, inactive_test)``.
    """
    tables = []
    for subset in (data_train, data_valid, data_test):
        for label in (1, 0):  # active first, then inactive
            tables.append(create_scaffold_df(subset[subset["Activity"] == label]))
    return tuple(tables)
# NOTE: despite the *_rdk7 suffix these six names hold scaffold tables
# (ID / Scaffold / Activity), not RDK7 feature rows.
(
    active_train_rdk7,
    inactive_train_rdk7,
    active_valid_rdk7,
    inactive_valid_rdk7,
    active_test_rdk7,
    inactive_test_rdk7,
) = extract_scaffold(rdk7_train, rdk7_valid, rdk7_test)

# One scaffold table per set: active scaffolds followed by inactive ones.
scaffold_train = pd.concat([active_train_rdk7, inactive_train_rdk7], ignore_index=True)
scaffold_valid = pd.concat([active_valid_rdk7, inactive_valid_rdk7], ignore_index=True)
scaffold_test = pd.concat([active_test_rdk7, inactive_test_rdk7], ignore_index=True)


In [161]:
# Write the three scaffold tables next to the featurizer data.
for frame, target in (
    (scaffold_train, "./data/Official/Featurizer_data/scaffold_train.csv"),
    (scaffold_valid, "./data/Official/Featurizer_data/scaffold_valid.csv"),
    (scaffold_test, "./data/Official/Featurizer_data/scaffold_test.csv"),
):
    frame.to_csv(target, index=False)