# 1. Import libraries and data

In [None]:
!pip install duckdb
!pip install rdkit


In [None]:
import pandas as pd
import numpy as np
import duckdb

from pathlib import Path

from tqdm import tqdm_notebook
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Locations of the raw train/test files (CSV and parquet variants),
# all resolved against one shared raw-data directory.
_RAW_DIR = Path('../../data/raw')
TRAIN_CSV = _RAW_DIR / 'train.csv'
TEST_CSV = _RAW_DIR / 'test.csv'
TRAIN_PAR = _RAW_DIR / 'train.parquet'
TEST_PAR = _RAW_DIR / 'test.parquet'

# Open a DuckDB connection (no database file is passed); the cells below
# run their parquet_scan queries through this connection.
con = duckdb.connect()

In [None]:
# Rows sampled per class (binds = 0 and binds = 1) for the balanced training subset.
N_TRAIN_PER_CLASS = 300000
# Number of rows in each test chunk (test_1 .. test_3).
TEST_CHUNK_SIZE = 600000

# Balanced training sample: the same number of randomly chosen rows from each class.
# NOTE(review): random() is not seeded here, so every run draws a different
# sample; confirm whether that is intended.
train = con.query(f"""(SELECT *
                        FROM parquet_scan('{TRAIN_PAR}')
                        WHERE binds = 0
                        ORDER BY random()
                        LIMIT {N_TRAIN_PER_CLASS})
                        UNION ALL
                        (SELECT *
                        FROM parquet_scan('{TRAIN_PAR}')
                        WHERE binds = 1
                        ORDER BY random()
                        LIMIT {N_TRAIN_PER_CLASS})""").df()

# Full test set.
test = con.query(f"""(SELECT *
                        FROM parquet_scan('{TEST_PAR}'))""").df()

# Consecutive fixed-size chunks of the test set. They are sliced from the frame
# that is already in memory instead of issuing three more OFFSET/LIMIT queries:
# without an ORDER BY, SQL does not guarantee a stable row order between
# queries, so separately queried chunks are not guaranteed to be disjoint or to
# line up with `test`. Slicing also avoids re-scanning the parquet file.
# reset_index gives each chunk its own 0-based index, as a fresh query result has.
test_1, test_2, test_3 = (
    test.iloc[start:start + TEST_CHUNK_SIZE].reset_index(drop=True)
    for start in range(0, 3 * TEST_CHUNK_SIZE, TEST_CHUNK_SIZE)
)


# con.close()

In [None]:
# Preview the first rows of the training sample and of the first test chunk.
display(train.head(), test_1.head())

# Row count of the last test chunk.
print(test_3.shape[0])

# 2. Create ECFP from SMILES

Import

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold

Config

In [None]:
class CFG:
    """Notebook-wide settings read by the preprocessing code."""

    # Number of folds passed to KFold.
    N_FOLDS: int = 5
    # Seed passed to KFold as random_state.
    RANDOM_SEED: int = 42

In [None]:
class Preprocessing():
    """Feature-engineering pipeline for a frame of molecule/protein rows.

    The steps read the columns 'molecule_smiles' and 'protein_name', and for
    training data also 'binds'. Every step is skipped when the column(s) it
    produces already exist, so the pipeline can be re-run on its own output.

    Parameters
    ----------
    df : pd.DataFrame
        Input rows. The frame is copied, so the caller's object is never
        modified; use the frame returned by ``main()`` (or ``self.df``).
    output_path : str or path-like
        Destination of the pickle written by ``save_df_as_pickle``.
    is_output : bool
        When True, ``main()`` pickles the processed frame to ``output_path``.
    is_test : bool
        When True, ``main()`` skips the steps that need labels / folds
        (``split_fold`` and ``create_any_binds``).
    """

    def __init__(self,
                 df: pd.DataFrame,
                 output_path='/',
                 is_output: bool=False,
                 is_test: bool=False,
                 ):
        # Work on a private copy: the steps below add columns, and doing that on
        # the caller's object would partially mutate it (and triggers
        # SettingWithCopyWarning when the caller passes a filtered selection).
        self.df = df.copy()
        self.output_path = output_path
        self.is_output = is_output
        self.is_test = is_test

    def create_ecfp(self):
        """Add 'molecule' (RDKit molecule) and 'ecfp' (fingerprint bit list) columns."""
        if 'molecule' in self.df.columns:
            return

        def generate_ecfp(molecule, radius=2, bits=1024):
            # Rows without a molecule object get no fingerprint.
            if molecule is None:
                return None
            return list(AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=bits))

        # Convert SMILES to RDKit molecules
        self.df['molecule'] = self.df['molecule_smiles'].apply(Chem.MolFromSmiles)
        # Generate ECFPs (progress_apply requires tqdm.pandas() to have been called)
        self.df['ecfp'] = self.df['molecule'].progress_apply(generate_ecfp)

    def OneHotEncode_protein(self):
        """Append one 0/1 column 'protein_<name>' per value of 'protein_name'.

        Only the dummy columns that are not already present are added, so the
        step is idempotent whatever set of proteins the frame contains.
        """
        one_hot_encoded = pd.get_dummies(self.df['protein_name'], prefix='protein', dtype=int)
        missing = [col for col in one_hot_encoded.columns if col not in self.df.columns]
        if missing:
            self.df = pd.concat([self.df, one_hot_encoded[missing]], axis=1)

    def split_fold(self):
        """Add a 'fold' column assigning every row to one of CFG.N_FOLDS shuffled folds."""
        if 'fold' in self.df.columns:
            return

        kfold = KFold(n_splits=CFG.N_FOLDS, shuffle=True, random_state=CFG.RANDOM_SEED)
        # KFold yields *positional* indices, so the fold ids are collected in a
        # positional array and attached as a whole column; writing them through
        # label-based .loc would be wrong for any frame whose index is not 0..n-1.
        fold_ids = np.full(len(self.df), -1, dtype=np.int64)
        for fold, (_train_pos, holdout_pos) in enumerate(kfold.split(self.df)):
            fold_ids[holdout_pos] = fold
        self.df['fold'] = fold_ids

    def create_any_binds(self):
        """Add 'any_binds': 1 for every row whose molecule has binds == 1 in at least one row."""
        if 'any_binds' in self.df.columns:
            return

        # 1 where 'binds' is 1, else 0 ...
        is_binder = self.df['binds'].eq(1).astype('int64')
        # ... then propagate the maximum to all rows sharing the same 'molecule_smiles'.
        self.df['any_binds'] = is_binder.groupby(self.df['molecule_smiles']).transform('max')

    def save_df_as_pickle(self):
        """Pickle the current frame to ``self.output_path``."""
        self.df.to_pickle(self.output_path)

    def main(self):
        """Run all steps in order and return the processed frame.

        Fold assignment and 'any_binds' are skipped when ``is_test`` is True;
        the frame is pickled only when ``is_output`` is True.
        """
        print('Processing...')

        self.create_ecfp()
        self.OneHotEncode_protein()
        if not self.is_test:
            self.split_fold()
            self.create_any_binds()
        if self.is_output:
            self.save_df_as_pickle()

        print('complete!')
        return self.df

Apply one-hot encoding to the target column

In [None]:
%%capture

# Run the preprocessing pipeline on the training sample (roughly 30 s on the
# first run). is_output=False, so nothing is written to output_path.
Preprocess = Preprocessing(
    train,
    output_path='../../data/processed/ecfp_60000_50per.pkl',
    is_output=False,
)
df_processed = Preprocess.main()

In [None]:
# Sanity check: preview rows flagged by 'binds' and rows flagged by 'any_binds'.
display(
    df_processed.loc[df_processed['binds'].eq(1)].head(),
    df_processed.loc[df_processed['any_binds'].eq(1)].head(),
)


In [None]:
# %%capture

# # maybe 17m at first time
# Preprocess_test = Preprocessing(test_1, output_path='../../data/processed/ecfp_test_1_necessary.pkl', is_output=True, is_test=True)
# df_processed = Preprocess_test.main()
# Preprocess_test = Preprocessing(test_2, output_path='../../data/processed/ecfp_test_2_necessary.pkl', is_output=True, is_test=True)
# df_processed = Preprocess_test.main()
# Preprocess_test = Preprocessing(test_3, output_path='../../data/processed/ecfp_test_3_necessary.pkl', is_output=True, is_test=True)
# df_processed = Preprocess_test.main()

# display(test_1.head())

In [None]:
%%capture

# Preprocess the test rows of each protein separately and pickle each result.
# A single loop keeps the row filter and the output file name in sync; the
# copy-pasted version filtered 'BRD4' again for the file named ..._sEH_all.pkl.
# After the loop, Preprocess_test and df_processed refer to the last protein.
for protein in ('BRD4', 'HSA', 'sEH'):
    Preprocess_test = Preprocessing(
        test[test['protein_name'] == protein],
        output_path=f'../../data/processed/ecfp_test_{protein}_all.pkl',
        is_output=True,
        is_test=True,
    )
    df_processed = Preprocess_test.main()