# Initialization

This notebook is forked from, and inspired by, the [BELKA 1DCNN Starter](https://www.kaggle.com/code/ahmedelfazouan/belka-1dcnn-starter-with-all-data/notebook).

The original notebook encodes the SMILES of the entire train & test sets and saves them locally; running it may take up to 1 hour on TPU for each fold. Due to limited computational power, the original notebook trains on only one fold. Therefore, I pre-trained one model for each fold and stored the model weights so that their predictions can be combined for a better LB score. The models are stored [here](https://www.kaggle.com/datasets/hugowjd/1dcnn-models-for-belka-competition).

The encoded data is stored [here](https://www.kaggle.com/datasets/ahmedelfazouan/belka-enc-dataset).

How to improve:
* Change the number of folds (a more powerful machine can handle a smaller number of folds).
* Try another model like Transformer, or LSTM.
* Train for more epochs on each fold.
* Add more features like a one hot encoding of bb2 or bb3.
* And, of course, ensemble with other models.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the pre-trained model dataset and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input/1dcnn-models-for-belka-competition'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/1dcnn-models-for-belka-competition/model-21.h5
/kaggle/input/1dcnn-models-for-belka-competition/model-19.h5
/kaggle/input/1dcnn-models-for-belka-competition/model-1.h5
/kaggle/input/1dcnn-models-for-belka-competition/model-24.h5
/kaggle/input/1dcnn-models-for-belka-competition/model-12.h5
/kaggle/input/1dcnn-models-for-belka-competition/model-9.h5
/kaggle/input/1dcnn-models-for-belka-competition/model-2.h5
/kaggle/input/1dcnn-models-for-belka-competition/model-13.h5
/kaggle/input/1dcnn-models-for-belka-competition/model-4.h5
/kaggle/input/1dcnn-models-for-belka-competition/model-22.h5
/kaggle/input/1dcnn-models-for-belka-competition/model-15.h5
/kaggle/input/1dcnn-models-for-belka-competition/model-7.h5
/kaggle/input/1dcnn-models-for-belka-competition/model-0.h5
/kaggle/input/1dcnn-models-for-belka-competition/model-14.h5
/kaggle/input/1dcnn-models-for-belka-competition/model-11.h5
/kaggle/input/1dcnn-models-for-belka-competition/model-8.h5
/kaggle/input/1dcnn-models-for-

In [2]:
!pip install fastparquet -q

[0m

In [3]:
import gc
import os
import pickle
import random
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score as APS

In [4]:
class CFG:
    """Notebook-wide settings: pipeline switches, training hyper-parameters, fold selection."""

    # --- pipeline switches ---
    PREPROCESS = False  # True: re-encode the raw SMILES; False: load the pre-encoded parquet files
    PRETRAINED = True   # True: skip training and only run inference with the stored weights

    # --- training hyper-parameters ---
    EPOCHS = 20
    BATCH_SIZE = 4096
    LR = 1e-3           # learning rate handed to the optimizer
    WD = 0.05           # weight decay handed to the optimizer

    # --- cross-validation / ensembling ---
    NBR_FOLDS = 15                    # number of StratifiedKFold splits
    SELECTED_FOLDS = list(range(5))   # folds 0-4 are the ones trained when PRETRAINED is False
    EXIST_MODELS = list(range(20))    # ids of the stored model-{id}.h5 weight files used at inference

    SEED = 2222

In [5]:
import tensorflow as tf
def set_seeds(seed):
    """Seed the random number generators used by the notebook.

    Args:
        seed: Seed value passed to Python's `random`, TensorFlow and NumPy;
            its string form is also stored in the PYTHONHASHSEED environment variable.
    """
    os.environ.update(PYTHONHASHSEED=str(seed))
    # Same three seeding calls as before, applied in the same order.
    for seeder in (random.seed, tf.random.set_seed, np.random.seed):
        seeder(seed)

set_seeds(seed=CFG.SEED)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [6]:
def get_strategy():
    """Pick a tf.distribute strategy for the best accelerator available.

    A TPU is tried first. If that raises ValueError, MirroredStrategy is used
    when at least one GPU is visible, otherwise the default strategy is returned.

    Returns:
        The selected tf.distribute strategy object.
    """
    try:
        # Resolve and initialise the TPU cluster (e.g. on Kaggle or Google Colab).
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
        print("Running on TPU:", resolver.master())
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        selected = tf.distribute.TPUStrategy(resolver)
    except ValueError:
        print("TPU not found. Looking for GPU.")
        gpus = tf.config.list_physical_devices("GPU")
        if not gpus:
            # Neither TPU nor GPU: fall back to the default strategy.
            print("Not on TPU or GPU. Defaulting to CPU.")
            selected = tf.distribute.get_strategy()
        else:
            selected = tf.distribute.MirroredStrategy()
            print("Running on GPU")

    print("REPLICAS:", selected.num_replicas_in_sync)
    return selected

# Usage
strategy = get_strategy()

TPU not found. Looking for GPU.
Running on GPU
REPLICAS: 1


# Preprocessing

In [7]:
if CFG.PREPROCESS:
    # Character-level vocabulary for SMILES strings; id 0 is left free for padding.
    enc = {'l': 1, 'y': 2, '@': 3, '3': 4, 'H': 5, 'S': 6, 'F': 7, 'C': 8, 'r': 9, 's': 10, '/': 11, 'c': 12, 'o': 13,
           '+': 14, 'I': 15, '5': 16, '(': 17, '2': 18, ')': 19, '9': 20, 'i': 21, '#': 22, '6': 23, '8': 24, '4': 25, '=': 26,
           '1': 27, 'O': 28, '[': 29, 'D': 30, 'B': 31, ']': 32, 'N': 33, '7': 34, 'n': 35, '-': 36}
    train_raw = pd.read_parquet('/kaggle/input/leash-BELKA/train.parquet')

    def protein_column(protein, column):
        """Return `column` of `train_raw` for the rows of one protein, as a NumPy array."""
        # .loc[mask, column] picks the single column directly; df[mask][column] would
        # first copy every column of the filtered frame.
        return train_raw.loc[train_raw['protein_name'] == protein, column].values

    smiles = protein_column('BRD4', 'molecule_smiles')
    # The per-protein labels are placed side by side below, so the three protein
    # slices must list the same molecules in the same order.
    assert (smiles != protein_column('HSA', 'molecule_smiles')).sum() == 0
    assert (smiles != protein_column('sEH', 'molecule_smiles')).sum() == 0

    def encode_smile(smile):
        """Encode one SMILES string as a uint8 vector, zero-padded on the right up to 142 entries.

        Strings longer than 142 characters are not truncated; a character missing
        from `enc` raises KeyError.
        """
        tmp = [enc[i] for i in smile]
        tmp = tmp + [0]*(142-len(tmp))
        return np.array(tmp).astype(np.uint8)

    def encode_all(smiles_to_encode):
        """Encode every SMILES string in parallel and stack the vectors into one 2-D array."""
        encoded = joblib.Parallel(n_jobs=96)(
            joblib.delayed(encode_smile)(smile) for smile in tqdm(smiles_to_encode)
        )
        return np.stack(encoded)

    enc_columns = [f'enc{i}' for i in range(142)]

    smiles_enc = encode_all(smiles)
    train = pd.DataFrame(smiles_enc, columns = enc_columns)
    # One binary target column per protein.
    train['bind1'] = protein_column('BRD4', 'binds')
    train['bind2'] = protein_column('HSA', 'binds')
    train['bind3'] = protein_column('sEH', 'binds')
    train.to_parquet('train_enc.parquet')

    test_raw = pd.read_parquet('/kaggle/input/leash-BELKA/test.parquet')
    smiles = test_raw['molecule_smiles'].values

    smiles_enc = encode_all(smiles)
    test = pd.DataFrame(smiles_enc, columns = enc_columns)
    test.to_parquet('test_enc.parquet')

else:
    # Load the pre-encoded frames published as a Kaggle dataset.
    train = pd.read_parquet('/kaggle/input/belka-enc-dataset/train_enc.parquet',engine='fastparquet')
    test = pd.read_parquet('/kaggle/input/belka-enc-dataset/test_enc.parquet',engine='fastparquet')

In [13]:
# Sanity check of the encoded training frame: its dimensions, then the first three rows.
print(f"Shape of Train set: {train.shape}")
train.head(n=3)

Shape of Train set: (98415610, 145)


Unnamed: 0,enc0,enc1,enc2,enc3,enc4,enc5,enc6,enc7,enc8,enc9,...,enc135,enc136,enc137,enc138,enc139,enc140,enc141,bind1,bind2,bind3
0,8,22,8,8,28,12,27,12,12,12,...,0,0,0,0,0,0,0,0,0,0
1,8,22,8,8,28,12,27,12,12,12,...,0,0,0,0,0,0,0,0,0,0
2,8,22,8,8,28,12,27,12,12,12,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Sanity check of the encoded test frame: its dimensions, then the first three rows.
print(f"Shape of Test set: {test.shape}")
test.head(n=3)

Shape of Test set: (1674896, 142)


Unnamed: 0,enc0,enc1,enc2,enc3,enc4,enc5,enc6,enc7,enc8,enc9,...,enc132,enc133,enc134,enc135,enc136,enc137,enc138,enc139,enc140,enc141
0,8,22,8,8,8,8,29,8,3,5,...,0,0,0,0,0,0,0,0,0,0
1,8,22,8,8,8,8,29,8,3,5,...,0,0,0,0,0,0,0,0,0,0
2,8,22,8,8,8,8,29,8,3,5,...,0,0,0,0,0,0,0,0,0,0


# Modeling

In [8]:
# 1D-CNN model
def OneDCNN_model():
    """Build and compile the 1D-CNN classifier inside `strategy.scope()`.

    Architecture: token embedding -> three Conv1D layers (32, 64 and 96 filters,
    kernel size 3) -> global max pooling -> Dense 1024 / 1024 / 512, each followed
    by Dropout(0.1) -> Dense(3) with sigmoid activation.

    NOTE(review): the SMILES vocabulary in the preprocessing cell assigns ids up
    to 36, while the embedding is created with input_dim=36 -- confirm that id 36
    is handled as intended. The size is kept as is because the stored weight
    files are loaded into this exact architecture.

    Returns:
        A compiled tf.keras Model taking int32 sequences of length 142.
    """
    layers = tf.keras.layers
    with strategy.scope():
        INP_LEN = 142
        NUM_FILTERS = 32
        hidden_dim = 128

        inputs = layers.Input(shape=(INP_LEN,), dtype='int32')
        x = layers.Embedding(input_dim=36, output_dim=hidden_dim, input_length=INP_LEN, mask_zero=True)(inputs)

        # Convolutional stack: the filter count grows as 1x, 2x, 3x NUM_FILTERS.
        for multiplier in (1, 2, 3):
            x = layers.Conv1D(filters=NUM_FILTERS*multiplier, kernel_size=3, activation='relu',
                              padding='valid', strides=1)(x)
        x = layers.GlobalMaxPooling1D()(x)

        # Fully connected head; layers are created in the same order as before so
        # that stored weights map onto the same layers.
        for units in (1024, 1024, 512):
            x = layers.Dense(units, activation='relu')(x)
            x = layers.Dropout(0.1)(x)

        outputs = layers.Dense(3, activation='sigmoid')(x)

        model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
        model.compile(
            loss='binary_crossentropy',
            optimizer=tf.keras.optimizers.Adam(learning_rate=CFG.LR, weight_decay=CFG.WD),
            weighted_metrics=[tf.keras.metrics.AUC(curve='PR', name='avg_precision')],
        )
        return model

# Train

In [9]:
%time
FEATURES = [f'enc{i}' for i in range(142)] # The first 142 encoded columns as features
TARGETS = ['bind1', 'bind2', 'bind3'] # Three types of binds as targets
# 15 fold -> only train 1/15 of the entire dataset
# If we got better machine, we may change NBR_FOLDS smaller
skf = StratifiedKFold(n_splits = CFG.NBR_FOLDS, shuffle = True, random_state = 42) 
                                                                                    
if CFG.PRETRAINED:
    print("Model Pretrained. Skip Training process.")
else:
    for fold,(train_idx, valid_idx) in enumerate(skf.split(train, train[TARGETS].sum(1))):
        if fold in CFG.SELECTED_FOLDS:
            print(f"Working on fold {fold}")
            X_train = train.loc[train_idx, FEATURES]
            y_train = train.loc[train_idx, TARGETS]
            X_val = train.loc[valid_idx, FEATURES]
            y_val = train.loc[valid_idx, TARGETS]

            es = tf.keras.callbacks.EarlyStopping(patience=5, monitor="val_loss", mode='min', verbose=1)
            checkpoint = tf.keras.callbacks.ModelCheckpoint(monitor='val_loss', filepath=f"model-{fold}.h5",
                                                                save_best_only=True, save_weights_only=True,
                                                            mode='min')
            reduce_lr_loss = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.05, patience=5, verbose=1)
            model = OneDCNN_model()
            history = model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=CFG.EPOCHS,
                callbacks=[checkpoint, reduce_lr_loss, es],
                batch_size=CFG.BATCH_SIZE,
                verbose=1,
            )


CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 6.91 µs
Model Pretrained. Skip Training process.


# Inference

In [10]:
# Every weight file belongs to the same architecture, so the model is built once
# and only its weights are swapped, instead of constructing a new model per file.
model = OneDCNN_model()
all_preds = []
for fold in CFG.EXIST_MODELS:
    print(f"loading weight model-{fold}.h5")
    model.load_weights(f"/kaggle/input/1dcnn-models-for-belka-competition/model-{fold}.h5")
    all_preds.append(model.predict(test, batch_size = 2*CFG.BATCH_SIZE))
# Ensemble: element-wise mean over the per-model prediction arrays.
preds = np.mean(all_preds, 0)

loading weight model-0.h5
loading weight model-1.h5
loading weight model-2.h5
loading weight model-3.h5
loading weight model-4.h5
loading weight model-5.h5
loading weight model-6.h5
loading weight model-7.h5
loading weight model-8.h5
loading weight model-9.h5
loading weight model-10.h5
loading weight model-11.h5
loading weight model-12.h5
loading weight model-13.h5
loading weight model-14.h5
loading weight model-15.h5
loading weight model-16.h5
loading weight model-17.h5
loading weight model-18.h5
loading weight model-19.h5


In [11]:
# Report the prediction matrix as "<rows> * <columns>".
n_rows, n_cols = len(preds), len(preds[0])
print(f"Shape of Predictions: {n_rows} * {n_cols}")

Shape of Predictions: 1674896 * 3


# Plot

In [14]:
import matplotlib.pyplot as plt
# `history` is the object returned by model.fit() in the training cell. It only
# exists when training actually ran (CFG.PRETRAINED is False) and then holds the
# curves of the last trained fold.
if CFG.PRETRAINED:
    print("Pretrained -> no plot")
else:
    train_loss = history.history['loss']
    val_loss = history.history.get('val_loss', [])  # empty list when no validation loss was recorded

    train_prec = history.history['avg_precision']
    val_prec = history.history.get('val_avg_precision', [])  # same fallback for the validation metric

    fig, (loss_ax, prec_ax) = plt.subplots(1, 2, figsize=(12, 6))

    # Left panel: training & validation loss.
    loss_ax.plot(train_loss, label='Train')
    if val_loss:
        loss_ax.plot(val_loss, label='Validation')
    loss_ax.set_title('Model Loss')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.legend(loc='upper right')

    # Right panel: the tracked metric is the PR-curve AUC named 'avg_precision',
    # not accuracy, so the panel is labelled accordingly.
    prec_ax.plot(train_prec, label='Train')
    if val_prec:
        prec_ax.plot(val_prec, label='Validation')
    prec_ax.set_title('Model Average Precision')
    prec_ax.set_ylabel('Average Precision (PR AUC)')
    prec_ax.set_xlabel('Epoch')
    prec_ax.legend(loc='lower right')

    fig.tight_layout()
    plt.show()

Pretrained -> no plot


# Submission

In [15]:
tst = pd.read_parquet('/kaggle/input/leash-BELKA/test.parquet')
# Start from a float column: the values written below are model outputs, and
# assigning floats into an integer column relies on silent upcasting, which
# recent pandas versions deprecate.
tst['binds'] = 0.0
# Column i of `preds` holds the prediction for the i-th protein listed here.
for col, protein in enumerate(['BRD4', 'HSA', 'sEH']):
    mask = (tst['protein_name'] == protein).values  # computed once per protein
    tst.loc[mask, 'binds'] = preds[mask, col]
tst[['id', 'binds']].to_csv('submission.csv', index = False)