In [108]:
import matplotlib.pyplot as plt
import numpy as nm
import pandas as pd
import math
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from collections import Counter
from sklearn.model_selection import train_test_split

In [65]:
# Location of the input data; an empty prefix means the current working directory.
folder = ''
scaffold_split_path = folder + "scaffold_split.txt"
# The input is a tab-separated text file.
complete_df = pd.read_csv(scaffold_split_path, sep="\t")

In [66]:
# Create the fingerprints: one 1024-bit Morgan bit vector (second argument 2)
# per compound, built from the SMILES in the 'cpd_smiles' column.
# NOTE(review): presumably every SMILES parses; an unparsable one would
# surface as an error in the fingerprint call below -- confirm the input is clean.
smiles_strings = complete_df['cpd_smiles'].tolist()
molecules = list(map(Chem.MolFromSmiles, smiles_strings))
ecfp = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024) for mol in molecules]

# Store each fingerprint twice: as a list of 0/1 integers and as a single
# '0101...' string assembled from that list.
bit_rows = [[int(char) for char in fingerprint.ToBitString()] for fingerprint in ecfp]
complete_df['ecfp_bit_vectors'] = bit_rows
complete_df['ECFP'] = [''.join(map(str, bits)) for bits in complete_df['ecfp_bit_vectors']]


In [67]:
# Derive the potency columns from the measured apparent EC50.
# NOTE(review): the column name suggests the input is in micromol, so dividing
# by 1e6 presumably converts to mol -- confirm the units with the data source.
complete_df['ec50_mol'] = complete_df['apparent_ec50_umol'] / 1000000
# Replace exact zeros by a tiny positive value so the log10 below never sees 0.
complete_df['ec50_mol'] = complete_df['ec50_mol'].replace(0, 1e-10)
complete_df['ec50_molair'] = complete_df['ec50_mol'] / complete_df['MolWt']
complete_df['ec50_molair_transformed'] = -nm.log10(complete_df['ec50_molair'])

# Drop rows whose transformed value is below 1 or above 10.
condition = (complete_df['ec50_molair_transformed'] < 1) | (complete_df['ec50_molair_transformed'] > 10)
# Take an explicit copy of the filtered rows: later cells assign new columns
# to complete_df, and writing into a boolean-indexed slice of the original
# frame raises SettingWithCopyWarning (and is ambiguous about what is modified).
complete_df = complete_df[~condition].copy()

In [73]:
# We removed molair transformed, check again if the scaffold belongs to a group
number_counts = Counter(complete_df['recurring_scaffold'])
complete_df['count_recurring_scaffold'] = complete_df.groupby('recurring_scaffold')['recurring_scaffold'].transform('count')
complete_df.loc[complete_df['count_recurring_scaffold'] == 1, 'recurring_scaffold'] = 0
complete_df['count_recurring_scaffold'] = complete_df.groupby('recurring_scaffold')['recurring_scaffold'].transform('count')
complete_df = complete_df.sort_values(by=['count_recurring_scaffold', 'recurring_scaffold'], ascending=True)

In [74]:
# Target sizes for an 80/20 train/test split.
total_size = len(complete_df)
train_size = round(0.8 * total_size)
test_size = total_size - train_size

# Rows that belong to a recurring scaffold (group id other than 0), and the
# share of them that should land in train to keep the same ratio there.
has_scaffold = complete_df['recurring_scaffold'] != 0
scaffolds_size = int(has_scaffold.sum())
scaffolds_ratio = scaffolds_size / total_size
scaffolds_in_train = round(scaffolds_ratio * train_size)
scaffolds_in_test = scaffolds_size - scaffolds_in_train
print(
    f"We must divide the scaffolds such that there are {scaffolds_in_test} in the test,"
    f" and {scaffolds_in_train} in the train"
)

We must divide the scaffolds such that there are 9 in the test, and 37 in the train


In [126]:
# Split the recurring-scaffold groups between train and test so that all rows
# sharing a scaffold end up on the same side, then split the remaining
# (group 0) rows randomly 80/20.
unique_counts = complete_df[complete_df['recurring_scaffold'] != 0]['recurring_scaffold'].value_counts(ascending=True)

# Collect the per-scaffold frames in lists and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies all previously
# added rows on every iteration, and seeding it with an empty pd.DataFrame()
# relies on pandas' deprecated handling of empty entries in concat.
train_groups = []
test_groups = []
test_rows = 0  # number of rows assigned to the test side so far

# Despite its name, this flag means "the previous group went to test, so the
# next one must go to train": groups alternate test/train, smallest first,
# until the test side holds at least scaffolds_in_test rows.
put_in_train = False
for scaffold_id, _group_size in unique_counts.items():
    group = complete_df[complete_df['recurring_scaffold'] == scaffold_id]
    if test_rows < scaffolds_in_test and not put_in_train:
        test_groups.append(group)
        test_rows += len(group)
        put_in_train = True
    else:
        train_groups.append(group)
        put_in_train = False

# Keep the intermediate frames available under their original names.
scaffold_train = pd.concat(train_groups) if train_groups else pd.DataFrame()
scaffold_test = pd.concat(test_groups) if test_groups else pd.DataFrame()

non_scaffold_train, non_scaffold_test = train_test_split(
    complete_df[complete_df['recurring_scaffold'] == 0],
    test_size=0.2,
    random_state=42,  # fixed seed so the random part of the split is reproducible
)

# Same row order as before: scaffold groups first, then the non-scaffold rows.
train = pd.concat([*train_groups, non_scaffold_train])
test = pd.concat([*test_groups, non_scaffold_test])

In [135]:
# Show the four training rows with the largest 'recurring_scaffold' values.
train.sort_values(by='recurring_scaffold', ascending=False).head(4)

Unnamed: 0,apparent_ec50_umol,MolWt,cpd_smiles,spd_scaffold,recurring_scaffold,ecfp_bit_vectors,ECFP,ec50_mol,ec50_molair,ec50_molair_transformed,count_recurring_scaffold
3,69.4,0.08022,CC1=NN(C(=O)C\1=N\Nc1ccccc1C(O)=O)c1nc(cs1)-c1...,O=C1/C(=N/Nc2ccccc2)C=NN1c1nc(-c2ccccc2)cs1,22,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0001000000000000000000010000000001000000000000...,6.94e-05,0.000865,3.062922,2
2,0.5328,0.080233,CCOc1ccccc1N\N=C1C(=O)N(N=C\1C)c1nc(cs1)-c1ccccc1,O=C1/C(=N/Nc2ccccc2)C=NN1c1nc(-c2ccccc2)cs1,22,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0001000000000000000000000000000001000000000000...,5.328e-07,7e-06,5.177787,2
9,30.74,0.068278,CC(Nc1ccccc1)c1cc(C)cn2c1nc(cc2=O)N1CCOCC1,O=c1cc(N2CCOCC2)nc2c(CNc3ccccc3)cccn12,19,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0100000000000000000000000000100001000000000000...,3.074e-05,0.00045,3.346579,2
8,9.76,0.081099,C[C@@H](Nc1ccccc1C(O)=O)c1cc(C)cn2c1nc(cc2=O)N...,O=c1cc(N2CCOCC2)nc2c(CNc3ccccc3)cccn12,19,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0100000000000000000000010000100001000000000000...,9.76e-06,0.00012,3.919567,2


In [137]:
# Show the four test rows with the largest 'recurring_scaffold' values.
test.sort_values(by='recurring_scaffold', ascending=False).head(4)

Unnamed: 0,apparent_ec50_umol,MolWt,cpd_smiles,spd_scaffold,recurring_scaffold,ecfp_bit_vectors,ECFP,ec50_mol,ec50_molair,ec50_molair_transformed,count_recurring_scaffold
5,0.3449,0.132722,CN(C)[C@H]1[C@@H]2C[C@@H]3Cc4c(cc(NC(=O)CNC(C)...,O=C1CC(=O)C2C(=O)C3C(=O)c4ccccc4C[C@H]3C[C@H]2C1,21,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0000000000000000000000000000000001001000010000...,3.449e-07,3e-06,5.585251,2
4,4.981,0.070287,NC(=O)C1C(=O)C[C@@H]2C[C@@H]3Cc4cccc(O)c4C(=O)...,O=C1CC(=O)C2C(=O)C3C(=O)c4ccccc4C[C@H]3C[C@H]2C1,21,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",0000000000010001000000000000000000001000000000...,4.981e-06,7.1e-05,4.149561,2
11,0.2935,0.103287,CC(C)NC(=O)Nc1ccc2O[C@@H](CN(C)S(C)(=O)=O)[C@@...,O=C1Cc2ccccc2OCCCN1,18,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",0100100000010000000000000000000001000000000000...,2.935e-07,3e-06,5.546438,2
10,0.4939,0.063916,CNC[C@@H]1Oc2ccc(cc2CC(=O)N(C[C@@H]1C)[C@@H](C...,O=C1Cc2ccccc2OCCCN1,18,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",0100000000010000000000000000000001000000000000...,4.939e-07,8e-06,5.111971,2


In [138]:
# Extract the model inputs (ECFP bit strings) and targets as plain lists.
# NOTE(review): the targets are read from 'ec50_molair', not from
# 'ec50_molair_transformed' -- confirm the untransformed column is intended.
x_train, y_train = (list(train[column]) for column in ('ECFP', 'ec50_molair'))

x_test, y_test = (list(test[column]) for column in ('ECFP', 'ec50_molair'))