In [None]:
import sys
import os
import gc
import copy
import yaml
import pickle
import random
import joblib
import shutil
from time import time
import typing as tp
from pathlib import Path

import numpy as np
import pandas as pd
import scipy

from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold
from sklearn.metrics import average_precision_score as APS
import duckdb


import torch
import torchvision
from torch import nn
from torch import optim
from torch.optim import lr_scheduler
from torch.cuda import amp
from torch.nn import BCELoss


import timm
from mamba_ssm import Mamba
from transformers import AutoModel, AutoTokenizer

import albumentations as A
from albumentations.pytorch import ToTensorV2


# Expose only CUDA device index 0 to this process.
os.environ.update(CUDA_VISIBLE_DEVICES="0")

# DuckDB connection opened with default arguments; the cells below use it to
# query the Parquet files.
con = duckdb.connect()

In [None]:
class CFG:
    """Notebook-wide constants: sample size, file locations and training knobs.

    Only ``NUM``, ``TRAIN_PATH`` and ``OUTPUT_PATH`` are read by the cells
    visible in this notebook; the remaining values look like training
    hyper-parameters shared with sibling notebooks (TODO confirm).
    """

    # --- sampling -----------------------------------------------------------
    # Total number of rows to draw from the training set.
    NUM = 20000

    # --- file locations -----------------------------------------------------
    TRAIN_PATH = Path('../../data/raw/train.parquet')
    TEST_PATH = Path('../../data/raw/test.parquet')
    TRAIN_ENC_PATH = Path('../../data/external/train_enc.parquet')
    TEST_ENC_PATH = Path('../../data/external/test_enc.parquet')
    # The output file name embeds NUM, so changing the sample size changes
    # the destination file.
    OUTPUT_PATH = Path(f'../../data/processed/{NUM}_50per_CLM.parquet')

    # --- cross-validation ---------------------------------------------------
    NBR_FOLDS = 5
    SELECTED_FOLDS = list(range(5))

    # --- optimisation / training loop ---------------------------------------
    # presumably learning rate and weight decay — verify against the consumer
    LR = 1e-3
    WD = 0.0001
    BATCH_SIZE = 128
    EPOCHS = 5
    PATIENCE = 5
    REDUCE_LR_PATIENCE = 3
    REDUCE_LR_FACTOR = 0.5

In [None]:
# train = con.query(f"""(SELECT *
#                         FROM parquet_scan('{CFG.TRAIN_PATH}')
                        
#                         LIMIT 60000)""").df()

In [None]:
# Draw a class-balanced random sample from the training Parquet file:
# the same number of rows with binds = 0 and with binds = 1.
#
# Integer division keeps the LIMIT an integer literal in the generated SQL;
# `CFG.NUM / 2` is true division and would be rendered as e.g. "10000.0".
# For an odd CFG.NUM the total is CFG.NUM - 1 rows.
#
# NOTE: `ORDER BY random()` is not seeded in the visible cells, so each run
# draws a different sample.
rows_per_class = CFG.NUM // 2
train = con.query(f"""(SELECT *
                        FROM parquet_scan('{CFG.TRAIN_PATH}')
                        WHERE binds = 0
                        ORDER BY random()
                        LIMIT {rows_per_class})
                        UNION ALL
                        (SELECT *
                        FROM parquet_scan('{CFG.TRAIN_PATH}')
                        WHERE binds = 1
                        ORDER BY random()
                        LIMIT {rows_per_class})""").df()

In [None]:
# Preview both ends of the sample: the binds = 0 half comes first in the
# UNION ALL query, the binds = 1 half second.
display(train.head(), train.tail())

In [None]:
# One SMILES string per sampled row; duplicates are kept (no `.unique()`),
# so the embeddings stay row-aligned with `train`.
smiles = train['molecule_smiles']
print(smiles.shape[0])

In [None]:
# Load the pre-trained ChemBERTa checkpoint and its tokenizer, and put the
# model in evaluation mode.
cb_tokenizer = AutoTokenizer.from_pretrained('DeepChem/ChemBERTa-10M-MLM')
cb_model = AutoModel.from_pretrained('DeepChem/ChemBERTa-10M-MLM')
cb_model.eval()

# Embed the SMILES in mini-batches of CFG.BATCH_SIZE instead of one forward
# pass over the whole sample: peak memory then depends on the batch size, not
# on CFG.NUM. Each batch is padded to its own longest sequence.
smiles_list = list(smiles)
pooled_batches = []
with torch.no_grad():
    for start in range(0, len(smiles_list), CFG.BATCH_SIZE):
        smiles_batch = smiles_list[start:start + CFG.BATCH_SIZE]

        # tokenize SMILES
        cb_encoded_inputs = cb_tokenizer(smiles_batch, padding=True, truncation=True, return_tensors="pt")

        # calculate embeddings
        outputs = cb_model(**cb_encoded_inputs)

        # extract pooled output
        # NOTE(review): this is an MLM checkpoint — confirm it ships trained
        # pooler weights. If it does not, the pooler layer would be freshly
        # initialised on load and `pooler_output` would differ between runs.
        pooled_batches.append(outputs.pooler_output)

# Stack the per-batch results back into one (rows x hidden-size) tensor, in
# the same row order as `smiles`.
cb_embeddings = torch.cat(pooled_batches, dim=0)

cb_embeddings_df = pd.DataFrame(cb_embeddings.numpy())
cb_embeddings_df.head()

In [None]:
# df_repeated = cb_embeddings_df.loc[cb_embeddings_df.index.repeat(3)].reset_index(drop=True)

In [None]:
# Build the label block: the raw `binds` / `protein_name` columns plus one
# label column per protein that carries `binds` on rows of that protein and 0
# on every other row.
# `.copy()` makes `binds` an independent frame, so adding columns does not
# trigger SettingWithCopyWarning on a slice of `train`.
binds = train[['binds', 'protein_name']].copy()
for label_column, protein in (('bind1', 'BRD4'), ('bind2', 'HSA'), ('bind3', 'sEH')):
    # Vectorised replacement for a row-wise apply: keep `binds` where the
    # protein matches, otherwise 0.
    binds[label_column] = train['binds'].where(train['protein_name'] == protein, 0)

# Assemble id | embedding columns | labels in one step, starting from the raw
# embedding tensor rather than from `cb_embeddings_df` itself, so re-running
# this cell does not stack duplicate id/label columns onto the frame.
cb_embeddings_df = pd.concat(
    [train['id'], pd.DataFrame(cb_embeddings.numpy()), binds],
    axis=1,
)


In [None]:
# Show the first and last rows of the combined id / embedding / label frame.
for preview in (cb_embeddings_df.head(), cb_embeddings_df.tail()):
    display(preview)

In [None]:
# Persist ids, embeddings and labels.
# Create the target directory first so the write does not fail when the
# processed-data folder does not exist yet.
CFG.OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

# The embedding columns carry integer labels (0, 1, ...) next to string
# labels such as 'id'. Labels are stringified for the write only (the
# in-memory frame is untouched), because some pandas versions reject
# non-string column names in `to_parquet`.
cb_embeddings_df.rename(columns=str).to_parquet(CFG.OUTPUT_PATH)