In [1]:
    ######## PREPARING DATA  ########

In [2]:
import matplotlib.pyplot as plt
import numpy as nm
import pandas as pd
import math
from rdkit import Chem
from tqdm import tqdm
from rdkit.Chem import AllChem, Descriptors, rdMolDescriptors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from rdkit.Chem.Scaffolds import MurckoScaffold
from sklearn import preprocessing as pre
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from rdkit.Chem.QED import qed

In [7]:
# INPUT
folder = 'C:\\Users\\vswen\\Documents\\1. Biomedische Technologie\\BMT JAAR 5\\Kwart 4\\4. Data\\CTRPv2.0_2015_ctd2_ExpandedDataset\\'
experiment_id = 419
df_large = pd.read_csv(f"{folder}v20.data.curves_post_qc.txt", sep='\t')
df_smiles = pd.read_csv(f"{folder}v20.meta.per_compound.txt", sep='\t')

# BUNDELING
df_large=df_large[df_large['experiment_id'] == experiment_id]
df_summary = df_large[[ 'master_cpd_id','apparent_ec50_umol']]
extracted_col = df_smiles[["master_cpd_id","cpd_smiles"]]
df_summary_sorted = pd.merge(df_summary, extracted_col, on='master_cpd_id', how='left')

# MOL DESCRIPTORS
def mol_descriptor(smiles: list, scale: bool = True) -> nm.ndarray:
    X = []
    for smi in tqdm(smiles):
        m = Chem.MolFromSmiles(smi)
        x = nm.array([Descriptors.TPSA(m),
                      Descriptors.MolLogP(m),
                      Descriptors.MolWt(m),
                      Descriptors.FpDensityMorgan2(m),
                      Descriptors.HeavyAtomMolWt(m),
                      Descriptors.MaxPartialCharge(m),
                      Descriptors.MinPartialCharge(m),
                      Descriptors.NumRadicalElectrons(m),
                      Descriptors.NumValenceElectrons(m),
                      rdMolDescriptors.CalcFractionCSP3(m),
                      rdMolDescriptors.CalcNumRings(m),
                      rdMolDescriptors.CalcNumRotatableBonds(m),
                      rdMolDescriptors.CalcNumLipinskiHBD(m),
                      rdMolDescriptors.CalcNumLipinskiHBA(m),
                      rdMolDescriptors.CalcNumHeterocycles(m),
                      rdMolDescriptors.CalcNumHeavyAtoms(m),
                      rdMolDescriptors.CalcNumAromaticRings(m),
                      rdMolDescriptors.CalcNumAtoms(m),
                      qed(m)])
        X.append(x)

    if scale:
        return pre.MinMaxScaler().fit_transform(nm.array(X))

    return nm.array(X)


# EXTENDING DATAFRAME
smiles_column = df_summary_sorted['cpd_smiles']
descriptors = mol_descriptor(smiles_column)

df_summary_sorted[['TPSA', 'MolLogP', 'MolWt', 'FpDensityMorgan2', 'HeavyAtomMolWt', 'MaxPartialCharge', 'MinPartialCharge', 'NumRadicalElectrons', 'NumValenceElectrons', 'CalcFractionCSP3', 'CalcNumRings', 'CalcNumRotatableBonds', 'CalcNumLipinskiHBD', 'CalcNumLipinskiHBA', 'CalcNumHeterocycles', 'CalcNumHeavyAtoms', 'CalcNumAromaticRings', 'CalcNumAtoms', 'qed']] = descriptors

# ECFP Maken
molecules = [Chem.MolFromSmiles(smile) for smile in df_summary_sorted['cpd_smiles'].tolist()]
ecfp = [AllChem.GetMorganFingerprintAsBitVect(molecule,2,nBits=1024) for molecule in molecules]
df_summary_sorted['ecfp_bit_vectors'] = [[int(bit) for bit in keys.ToBitString()] for keys in ecfp]
df_summary_sorted['ECFP'] = [''.join(str(value) for value in row) for row in df_summary_sorted['ecfp_bit_vectors']]

# TRANSFORMATION EC50 VALUE
df_summary_sorted['ec50_mol'] = df_summary_sorted['apparent_ec50_umol'] / 1000000
df_summary_sorted['ec50_mol']=df_summary_sorted['ec50_mol'].replace(0, 1e-10)
df_summary_sorted['ec50_molair'] = df_summary_sorted['ec50_mol']/ df_summary_sorted['MolWt']
df_summary_sorted['ec50_molair_transformed'] = -nm.log10(df_summary_sorted['ec50_molair'])
condition = (df_summary_sorted['ec50_molair_transformed'] < 2 ) | (df_summary_sorted['ec50_molair_transformed'] > 8)
df_summary_sorted=df_summary_sorted[~condition]


#SAVING RESULTS
df_summary_sorted.to_csv(f"{folder}v20.data.final_summary_{experiment_id}.txt", sep='\t', index=False)

  0%|          | 0/446 [00:00<?, ?it/s]


AttributeError: 'numpy.ndarray' object has no attribute 'TPSA'