In [1]:
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.Chem.ChemUtils import SDFToCSV
IPythonConsole.ipython_useSVG=True
from rdkit.Chem import AllChem
import csv
import pandas as pd

## Read file and filter experiment

In [2]:
# Directory that holds the CTRPv2.0 expanded-dataset files (machine-specific path).
folder = "C:\\Users\\vswen\\Documents\\1. Biomedische Technologie\\BMT JAAR 5\\Kwart 4\\4. Data\\CTRPv2.0_2015_ctd2_ExpandedDataset\\"
# Cached subset containing only the rows of experiment 419.
subset_path = f"{folder}v20.data.per_cpd_well_419.txt"
# Set to True to rebuild the subset from the full per-well file;
# leave False to load the subset written by an earlier run.
generate = False
if generate:
    df = pd.read_csv(f"{folder}v20.data.per_cpd_well.txt", sep='\t')
    # Keep experiment 419 only and renumber the rows from 0.
    filtered_df = df[df['experiment_id'] == 419].reset_index(drop=True)
    filtered_df.to_csv(subset_path, sep='\t', index=False)
else:
    filtered_df = pd.read_csv(subset_path, sep='\t')

In [3]:
# Display the per-well rows kept for experiment 419.
filtered_df

Unnamed: 0,experiment_id,assay_plate_barcode,raw_value_log2,bsub_value_log2,dmso_zscore_log2,cpd_conc_umol,master_cpd_id
0,419,AU053858,11.560,-0.044030,-0.12760,0.00030,1788
1,419,AU062968,11.420,-0.059780,-0.17330,0.00030,1788
2,419,AU053858,11.630,0.024330,0.07051,0.00061,1788
3,419,AU062968,11.430,-0.053900,-0.15620,0.00061,1788
4,419,AU053858,11.600,-0.000697,-0.00202,0.00120,1788
...,...,...,...,...,...,...,...
13574,419,AU062970,10.930,-0.346200,-1.00300,17.00000,710154
13575,419,AU062969,11.260,-0.068620,-0.19890,33.00000,710154
13576,419,AU062970,10.710,-0.558700,-1.62000,33.00000,710154
13577,419,AU062969,10.910,-0.417100,-1.20900,66.00000,710154


In [4]:
# Number of per-well rows for each compound id, most frequent first.
filtered_df['master_cpd_id'].value_counts()

290356    90
659993    60
595102    32
616408    32
632104    32
          ..
58472     15
580922    15
660890    14
411863    14
411724    13
Name: master_cpd_id, Length: 448, dtype: int64

In [None]:
from collections import defaultdict
# Full post-QC curve table; built from `folder` so the dataset directory is
# defined in exactly one place.
input_file = f"{folder}v20.data.curves_post_qc.txt"

def get_unique_numbers(input_file, column_index=16):
    """Count how often each value occurs in one column of a tab-separated file.

    Every line is tallied, so if the file starts with a header line, the
    header text of the column shows up as a key as well.

    Args:
        input_file: Path of the tab-separated text file to scan.
        column_index: Zero-based index of the column to tally (default 16).

    Returns:
        A ``defaultdict(int)`` mapping each value (kept as a string) to the
        number of lines it occurred on. Lines with too few columns are
        ignored.
    """
    numbers = defaultdict(int)
    with open(input_file, 'r') as f:
        for line in f:
            # Remove only the line terminator. A bare strip() would also eat
            # leading/trailing tabs, which shifts every column whenever the
            # first or last field of a row is empty.
            cols = line.rstrip('\r\n').split('\t')
            if len(cols) > column_index:
                numbers[cols[column_index]] += 1

    return numbers

# Tally the values of the counted column, then list each value with its count.
unique_numbers = get_unique_numbers(input_file)
for value, count in unique_numbers.items():
    print(value, count)


In [None]:
# Source and destination of the column extraction; both are built from `folder`
# so the dataset directory is defined in exactly one place.
input_file = f"{folder}v20.data.curves_post_qc_419.txt"
output_file = f"{folder}v20.data.curves_post_qc_419_summary.txt"

def copy_columns(input_file, output_file, column_indices):
    """Copy selected columns of a tab-separated file into a new file.

    Args:
        input_file: Path of the tab-separated file to read.
        output_file: Path of the file to write; it is overwritten if present.
        column_indices: Zero-based column positions to keep, in output order.

    Raises:
        IndexError: If a non-blank line has too few columns for one of the
            requested indices. The message names the offending line.
    """
    with open(input_file, 'r') as f_input, open(output_file, 'w') as f_output:
        for line_number, line in enumerate(f_input, start=1):
            # Blank lines (typically a trailing empty line) carry no data.
            if not line.strip():
                continue
            # Remove only the line terminator: strip() would also drop
            # leading/trailing tabs and shift the columns of rows whose first
            # or last field is empty.
            columns = line.rstrip('\r\n').split('\t')
            try:
                selected_columns = [columns[i] for i in column_indices]
            except IndexError:
                raise IndexError(
                    f"{input_file}: line {line_number} has {len(columns)} column(s); "
                    f"cannot select indices {list(column_indices)}"
                ) from None
            f_output.write('\t'.join(selected_columns) + '\n')
            
# Zero-based positions of the columns to keep in the summary file.
# NOTE(review): presumably these are experiment_id, p1_center, apparent_ec50_umol,
# area_under_curve and master_cpd_id — confirm against the file's header line.
column_indices = [0, 9, 13, 15, 16]  

copy_columns(input_file, output_file, column_indices)


In [None]:
# Load the complete `_419` curve table.
curves_path = f"{folder}v20.data.curves_post_qc_419.txt"
df_large = pd.read_csv(curves_path, sep='\t')

# Narrow it down to the identifier and curve-summary columns used below.
summary_columns = [
    'experiment_id',
    'p1_center',
    'apparent_ec50_umol',
    'area_under_curve',
    'master_cpd_id',
]
df_summary = df_large[summary_columns]

In [None]:
# Alternative (disabled): load the reduced summary file from disk instead of
# selecting the columns from df_large.
# df_summary = pd.read_csv(f"{folder}v20.data.curves_post_qc_419_summary.txt", sep='\t')
df_summary

In [None]:
# Show the 25 rows with the smallest apparent_ec50_umol (ascending sort).
df_summary.sort_values(by=['apparent_ec50_umol']).head(25)

In [None]:
# Load the per-compound metadata table and preview its first rows.
# It is joined on master_cpd_id and supplies the cpd_smiles column used later.
df_smiles = pd.read_csv(f"{folder}v20.meta.per_compound.txt", sep="\t")
df_smiles.head()

In [None]:
# Left join on master_cpd_id: every row of df_summary is kept, and the compound
# metadata columns are NaN where df_smiles has no matching master_cpd_id.
df_all = pd.merge(df_summary, df_smiles, on='master_cpd_id', how='left')

In [None]:
# Order the merged table by ascending apparent_ec50_umol; the original row index is kept.
df_all_sorted = df_all.sort_values(by=['apparent_ec50_umol'])

In [None]:
# Save the sorted, merged table as tab-separated text without the index column.
df_all_sorted.to_csv(f"{folder}v20.data.final_summary.txt", sep='\t', index=False)

In [None]:
import numpy as np
from tqdm.auto import tqdm
from rdkit.Chem.QED import qed
from rdkit.Chem import Descriptors, rdMolDescriptors
from sklearn import preprocessing as pre

def mol_descriptor(smiles: list[str], scale: bool = True) -> np.ndarray:
    """Compute 19 RDKit descriptors for each SMILES string.

    The columns are, in order: TPSA, MolLogP, MolWt, FpDensityMorgan2,
    HeavyAtomMolWt, MaxPartialCharge, MinPartialCharge, NumRadicalElectrons,
    NumValenceElectrons, FractionCSP3, NumRings, NumRotatableBonds,
    NumLipinskiHBD, NumLipinskiHBA, NumHeterocycles, NumHeavyAtoms,
    NumAromaticRings, NumAtoms and QED.

    Args:
        smiles: Iterable of SMILES strings (a list or a pandas Series works).
            Entries that are not strings (e.g. NaN from a left merge) or that
            RDKit cannot parse do not abort the run; they yield a row of NaN
            so the output stays aligned with the input.
        scale: If True, min-max scale every column over the supplied
            molecules with ``MinMaxScaler``; NaN entries are ignored when
            fitting and stay NaN in the result.

    Returns:
        Float array of shape ``(len(smiles), 19)``, one row per input entry
        in input order. Empty input gives an array of shape ``(0, 19)``.
    """
    import warnings  # local import: only needed to report unusable entries

    descriptor_funcs = (
        Descriptors.TPSA,
        Descriptors.MolLogP,
        Descriptors.MolWt,
        Descriptors.FpDensityMorgan2,
        Descriptors.HeavyAtomMolWt,
        Descriptors.MaxPartialCharge,
        Descriptors.MinPartialCharge,
        Descriptors.NumRadicalElectrons,
        Descriptors.NumValenceElectrons,
        rdMolDescriptors.CalcFractionCSP3,
        rdMolDescriptors.CalcNumRings,
        rdMolDescriptors.CalcNumRotatableBonds,
        rdMolDescriptors.CalcNumLipinskiHBD,
        rdMolDescriptors.CalcNumLipinskiHBA,
        rdMolDescriptors.CalcNumHeterocycles,
        rdMolDescriptors.CalcNumHeavyAtoms,
        rdMolDescriptors.CalcNumAromaticRings,
        rdMolDescriptors.CalcNumAtoms,
        qed,
    )
    n_features = len(descriptor_funcs)

    rows = []
    n_invalid = 0
    for smi in tqdm(smiles):
        # MolFromSmiles returns None for unparsable input and rejects
        # non-string values, so guard both cases before computing anything.
        m = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if m is None:
            n_invalid += 1
            rows.append(np.full(n_features, np.nan))
            continue
        rows.append(np.array([func(m) for func in descriptor_funcs], dtype=float))

    if n_invalid:
        warnings.warn(
            f"{n_invalid} SMILES entries were missing or could not be parsed; "
            "their descriptor rows are NaN",
            stacklevel=2,
        )

    # reshape keeps the result two-dimensional even when there are no rows.
    X = np.array(rows, dtype=float).reshape(-1, n_features)

    # The scaler cannot be fitted on zero samples, so skip it for empty input.
    if scale and X.shape[0] > 0:
        return pre.MinMaxScaler().fit_transform(X)

    return X

 
# Labels for the descriptor matrix, listed in the order mol_descriptor emits its columns.
descriptor_columns = [
    'TPSA', 'MolLogP', 'MolWt', 'FpDensityMorgan2', 'HeavyAtomMolWt',
    'MaxPartialCharge', 'MinPartialCharge', 'NumRadicalElectrons',
    'NumValenceElectrons', 'CalcFractionCSP3', 'CalcNumRings',
    'CalcNumRotatableBonds', 'CalcNumLipinskiHBD', 'CalcNumLipinskiHBA',
    'CalcNumHeterocycles', 'CalcNumHeavyAtoms', 'CalcNumAromaticRings',
    'CalcNumAtoms', 'qed',
]

# Compute the descriptors (min-max scaled by default) for every SMILES in the table.
smiles_column = df_all_sorted['cpd_smiles']
descriptors = mol_descriptor(smiles_column)

# Attach the descriptor matrix as new columns; rows are matched by position.
df_all_sorted[descriptor_columns] = descriptors

# Write the table, now including the descriptor columns, to the summary file.
df_all_sorted.to_csv(f"{folder}v20.data.final_summary.txt", sep='\t', index=False)

# Show the augmented table as the cell output.
df_all_sorted