In [1]:
from pathlib import Path

import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

import numbers

from sklearn.metrics import matthews_corrcoef

In [2]:
# Directory layout. Everything is resolved from the parent of the current
# working directory, so the notebook must be started from its own folder.
INITIAL_PATH = Path.cwd().parent
DATA_PATH = INITIAL_PATH.joinpath('data')
INITIAL_DATA_PATH = DATA_PATH.joinpath('initial')            # raw input CSVs
PREPROCESSED_DATA_PATH = DATA_PATH.joinpath('preprocessed')  # cleaned / combined CSVs

In [3]:
# Raw input files, one per fingerprint type.
INITIAL_KLEKOTA_ROTH_DATA = INITIAL_DATA_PATH.joinpath('cardiotoxicity_hERG_KlekFP_ready_set.csv')
INITIAL_MACCS_DATA = INITIAL_DATA_PATH.joinpath('cardiotoxicity_hERG_MACCSFP_ready_set.csv')
INITIAL_EXT_DATA = INITIAL_DATA_PATH.joinpath('cardiotoxicity_hERG_ExtFP_ready_set.csv')

# Cleaned output files, one per fingerprint type.
PREPROCESSED_KLEKOTA_ROTH_DATA = PREPROCESSED_DATA_PATH.joinpath('cardiotoxicity_hERG_KlekFP.csv')
PREPROCESSED_MACCS_DATA = PREPROCESSED_DATA_PATH.joinpath('cardiotoxicity_hERG_MACCSFP.csv')
PREPROCESSED_EXT_DATA = PREPROCESSED_DATA_PATH.joinpath('cardiotoxicity_hERG_ExtFP.csv')

# Output files for column-wise combinations of the cleaned files.
PREPROCESSED_KLEKOTA_ROTH__MACCS_DATA = PREPROCESSED_DATA_PATH.joinpath('cardiotoxicity_hERG_KlekFP-MACCSFP.csv')
PREPROCESSED_KLEKOTA_ROTH__EXT_DATA = PREPROCESSED_DATA_PATH.joinpath('cardiotoxicity_hERG_KlekFP-ExtFP.csv')
PREPROCESSED_MACCS__EXT_DATA = PREPROCESSED_DATA_PATH.joinpath('cardiotoxicity_hERG_MACCSFP-ExtFP.csv')
PREPROCESSED_KLEKOTA_ROTH__MACCS__EXT_DATA = PREPROCESSED_DATA_PATH.joinpath('cardiotoxicity_hERG_KlekFP-MACCSFP-ExtFP.csv')

In [4]:
# Output templates for the feature-selected files. Each template carries two
# `{}` placeholders that FeatureSplitter fills with the number of selected
# features: the first names the sub-directory, the second tags the file name.
SPLITTED_DATA_PATH = PREPROCESSED_DATA_PATH / 'splitted'
PREPROCESSED_KLEKOTA_ROTH_DATA_X = SPLITTED_DATA_PATH / '{}/cardiotoxicity_hERG_KlekFP[{}].csv'
PREPROCESSED_MACCS_DATA_X = SPLITTED_DATA_PATH / '{}/cardiotoxicity_hERG_MACCSFP[{}].csv'
PREPROCESSED_EXT_DATA_X = SPLITTED_DATA_PATH / '{}/cardiotoxicity_hERG_ExtFP[{}].csv'

PREPROCESSED_KLEKOTA_ROTH__MACCS_DATA_X = SPLITTED_DATA_PATH / '{}/cardiotoxicity_hERG_KlekFP-MACCSFP[{}].csv'
PREPROCESSED_KLEKOTA_ROTH__EXT_DATA_X = SPLITTED_DATA_PATH / '{}/cardiotoxicity_hERG_KlekFP-ExtFP[{}].csv'
PREPROCESSED_MACCS__EXT_DATA_X = SPLITTED_DATA_PATH / '{}/cardiotoxicity_hERG_MACCSFP-ExtFP[{}].csv'
# The `[{}]` tag goes before the `.csv` extension, like every other template
# above (it used to be appended after the extension, giving `...csv[500]`).
# NOTE(review): any consumer that opens the old `...ExtFP.csv[N]` name must be updated.
PREPROCESSED_KLEKOTA_ROTH__MACCS__EXT_DATA_X = SPLITTED_DATA_PATH / '{}/cardiotoxicity_hERG_KlekFP-MACCSFP-ExtFP[{}].csv'

In [5]:
class DataCleaner():
    """Clean a raw CSV that has an ``IC50`` column and add a binary ``toxic`` label.

    The cleaning steps are:

    1. drop rows whose ``IC50`` value is not a finite number,
    2. store ``IC50`` as ``float64`` and every other column as integer,
    3. drop non-``IC50`` columns that hold a single value in every row,
    4. insert a ``toxic`` column (1 when ``IC50`` is below
       ``TOXICITY_THRESHOLD``, otherwise 0) at position 1.
    """

    # IC50 values strictly below this threshold are labelled toxic.
    # NOTE(review): presumably nanomolar (10000 nM = 10 uM) - confirm the units.
    TOXICITY_THRESHOLD = 10000

    def apply(self, input_filepath, output_filepath):
        """Read ``input_filepath``, clean it, and write the result to ``output_filepath``.

        Args:
            input_filepath: CSV file containing an ``IC50`` column; every other
                column must be convertible to integer.
            output_filepath: destination CSV, written without the index.

        Raises:
            KeyError: if the input has no ``IC50`` column.
            ValueError: if a non-``IC50`` column cannot be converted to integer.
        """
        df = pd.read_csv(input_filepath)

        # Keep only rows whose IC50 parses as a number. `.copy()` makes the
        # following assignments operate on an independent frame instead of a
        # view of the filtered one.
        parseable = df['IC50'].map(self.__is_float).astype(bool)
        df = df[parseable].copy()
        df['IC50'] = df['IC50'].astype('float64')

        # NaN / inf parse as floats but carry no measurement; labelling them
        # would silently mark them as non-toxic, so they are removed.
        df = df[np.isfinite(df['IC50'])].copy()

        # Cast every column except IC50 to int. The previous `df[1:]` was a
        # ROW slice, so it truncated IC50 in all rows but the first.
        feature_columns = df.columns.drop('IC50')
        if len(feature_columns) > 0:
            df[feature_columns] = df[feature_columns].astype('int')

        # Columns with one distinct value carry no information. IC50 is
        # excluded from the check because the label below is derived from it.
        distinct_counts = df[feature_columns].nunique(dropna=False)
        columns_without_difference = list(distinct_counts[distinct_counts == 1].index)
        df = df.drop(columns=columns_without_difference)

        is_toxic = (df['IC50'] < self.TOXICITY_THRESHOLD).astype('int')
        df.insert(1, 'toxic', is_toxic)

        df.to_csv(output_filepath, index=False)

    @staticmethod
    def __is_float(value):
        """Return True when ``float(value)`` succeeds, False otherwise."""
        try:
            float(value)
        except (TypeError, ValueError, OverflowError):
            return False
        return True

In [6]:
# (raw input CSV, cleaned output CSV) pairs, one per input file.
data_to_process = [
    (INITIAL_KLEKOTA_ROTH_DATA, PREPROCESSED_KLEKOTA_ROTH_DATA),
    (INITIAL_MACCS_DATA, PREPROCESSED_MACCS_DATA),
    (INITIAL_EXT_DATA, PREPROCESSED_EXT_DATA),
]

data_cleaner = DataCleaner()

# Clean every raw file and write it to its preprocessed location.
for initial_filepath, preprocessed_filepath in data_to_process:
    data_cleaner.apply(initial_filepath, preprocessed_filepath)

  data_cleaner.apply(data[0], data[1])


In [7]:
class DataCombiner():
    """Combine several preprocessed CSV files column-wise into one file."""

    def apply(self, input_filepaths, output_filepath):
        """Join the feature columns of every input file onto the first one.

        The first file is kept whole. From each following file the ``IC50``
        and ``toxic`` columns are dropped and the remaining columns are
        joined by row position (the default index produced by ``read_csv``).

        Args:
            input_filepaths: sequence of CSV paths; each file must contain
                ``IC50`` and ``toxic`` columns.
            output_filepath: destination CSV, written without the index.

        Raises:
            ValueError: if a file has a different number of rows than the
                first one, or if column names overlap between files.
        """
        df = pd.read_csv(input_filepaths[0])
        expected_rows = len(df)

        for input_filepath in input_filepaths[1:]:
            df_temp = pd.read_csv(input_filepath)

            # The join below is a left join on the positional index. With a
            # different row count it would silently pad with NaN or discard
            # rows, so fail loudly instead.
            if len(df_temp) != expected_rows:
                raise ValueError(
                    f'Cannot combine {input_filepath} with {input_filepaths[0]}: '
                    f'row counts differ ({len(df_temp)} != {expected_rows})'
                )

            df_temp = df_temp.drop(columns=['IC50', 'toxic'])
            df = df.join(df_temp)

        df.to_csv(output_filepath, index=False)

In [8]:
data_to_process = [
    ([PREPROCESSED_KLEKOTA_ROTH_DATA, PREPROCESSED_MACCS_DATA], PREPROCESSED_KLEKOTA_ROTH__MACCS_DATA), 
    ([PREPROCESSED_KLEKOTA_ROTH_DATA, PREPROCESSED_EXT_DATA], PREPROCESSED_KLEKOTA_ROTH__EXT_DATA), 
    ([PREPROCESSED_MACCS_DATA, PREPROCESSED_EXT_DATA], PREPROCESSED_MACCS__EXT_DATA),
    ([PREPROCESSED_KLEKOTA_ROTH_DATA, PREPROCESSED_MACCS_DATA, PREPROCESSED_EXT_DATA], PREPROCESSED_KLEKOTA_ROTH__MACCS__EXT_DATA)
]

data_combiner = DataCombiner()

for data in data_to_process:
    data_combiner.apply(data[0], data[1])

In [9]:
class FeatureSplitter():
    """Keep the feature columns that score highest against the ``toxic`` label."""

    def apply(self, input_filepath, output_filepath, number_of_features):
        """Write a CSV holding ``IC50``, ``toxic`` and the top-scoring feature columns.

        Each feature column is scored with ``matthews_corrcoef`` against the
        ``toxic`` column, and the ``number_of_features`` highest-scoring
        columns are kept. Scores are ranked by signed value, so strongly
        negative scores rank last.

        Args:
            input_filepath: CSV file containing ``IC50``, ``toxic`` and the
                feature columns.
            output_filepath: path template with two ``{}`` placeholders, both
                filled with ``number_of_features``.
            number_of_features: how many feature columns to keep. If the file
                has fewer feature columns, a message is printed and nothing
                is written.
        """
        df = pd.read_csv(input_filepath)

        toxic_column = df['toxic']
        df_temp = df.drop(columns=['IC50', 'toxic'])
        if number_of_features > len(df_temp.columns):
            print(f"Skipping features splitting for {input_filepath}, because of not enough features: {number_of_features}")
            return

        results = [matthews_corrcoef(toxic_column, df_temp[column]) for column in df_temp.columns]

        # Stable ascending sort followed by a reversal: highest score first,
        # and columns with equal scores end up in descending column position.
        # `sorted(..., reverse=True)` would flip that tie order, so it is not used.
        sorted_results = sorted(enumerate(results), key=lambda pair: pair[1])[::-1]

        wanted_column_indexes = [column_index for column_index, _ in sorted_results[:number_of_features]]
        wanted_columns = df_temp.iloc[:, wanted_column_indexes]

        output = df[['IC50', 'toxic']].join(wanted_columns)

        output_path = Path(str(output_filepath).format(number_of_features, number_of_features))
        # The template puts every feature count in its own sub-directory;
        # create it so the write does not fail when it is missing.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        print(f'Saving splitted features to file {output_path}')

        output.to_csv(output_path, index=False)


In [10]:
# (preprocessed input CSV, output path template) pairs: the three single
# files first, then their combinations.
data_to_process = [
    (PREPROCESSED_KLEKOTA_ROTH_DATA, PREPROCESSED_KLEKOTA_ROTH_DATA_X),
    (PREPROCESSED_MACCS_DATA, PREPROCESSED_MACCS_DATA_X),
    (PREPROCESSED_EXT_DATA, PREPROCESSED_EXT_DATA_X),

    (PREPROCESSED_KLEKOTA_ROTH__MACCS_DATA, PREPROCESSED_KLEKOTA_ROTH__MACCS_DATA_X),
    (PREPROCESSED_KLEKOTA_ROTH__EXT_DATA, PREPROCESSED_KLEKOTA_ROTH__EXT_DATA_X),
    (PREPROCESSED_MACCS__EXT_DATA, PREPROCESSED_MACCS__EXT_DATA_X),
    (PREPROCESSED_KLEKOTA_ROTH__MACCS__EXT_DATA, PREPROCESSED_KLEKOTA_ROTH__MACCS__EXT_DATA_X),
]

feature_splitter = FeatureSplitter()

# Feature counts tried: 500, 1000, 1500 and 2000 (the range end is exclusive).
for number_of_features in range(500, 2500, 500):
    for preprocessed_filepath, splitted_filepath_template in data_to_process:
        feature_splitter.apply(preprocessed_filepath, splitted_filepath_template, number_of_features)

Saving splitted features to file C:\Users\SG0306249\Downloads\cardiotoxicity_prediction-main\cardiotoxicity_prediction-main\data\preprocessed\splitted\500\cardiotoxicity_hERG_KlekFP[500].csv
Skipping features splitting for C:\Users\SG0306249\Downloads\cardiotoxicity_prediction-main\cardiotoxicity_prediction-main\data\preprocessed\cardiotoxicity_hERG_MACCSFP.csv, because of not enough features: 500
Saving splitted features to file C:\Users\SG0306249\Downloads\cardiotoxicity_prediction-main\cardiotoxicity_prediction-main\data\preprocessed\splitted\500\cardiotoxicity_hERG_ExtFP[500].csv
Saving splitted features to file C:\Users\SG0306249\Downloads\cardiotoxicity_prediction-main\cardiotoxicity_prediction-main\data\preprocessed\splitted\500\cardiotoxicity_hERG_KlekFP-MACCSFP[500].csv
Saving splitted features to file C:\Users\SG0306249\Downloads\cardiotoxicity_prediction-main\cardiotoxicity_prediction-main\data\preprocessed\splitted\500\cardiotoxicity_hERG_KlekFP-ExtFP[500].csv
Saving splitt