# Install

In [1]:
!pip install fastparquet -q

# Import

In [2]:
import gc
import os
import pickle
import random
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score as APS
import torch

In [3]:
import sys
import os
import gc
import copy
import yaml
import random
import shutil
from time import time
import typing as tp
from pathlib import Path

import numpy as np
import pandas as pd
import scipy

from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedGroupKFold

import torch
import torchvision
from torch import nn
from torch import optim
from torch.optim import lr_scheduler
from torch.cuda import amp



import timm

import albumentations as A
from albumentations.pytorch import ToTensorV2

# use one device only
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [4]:
# class CFG:
    
#     TRAIN_ENC_PATH = Path('../../data/external/train_enc.parquet')
#     TEST_ENC_PATH = Path('../../data/external/test_enc.parquet')
#     TEST_PATH = Path('../../data/raw/test.parquet')
    
#     seed = 42
#     deterministic = True 

#     PREPROCESS = False
#     EPOCHS = 30 #20
#     BATCH_SIZE = 4096
#     LR = 1e-3
#     WD = 1e-6 

#     NBR_FOLDS = 15
#     SELECTED_FOLDS = [0]

#     SEED = 2024

# Model

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np

# Configuration (CFG) should be defined here
class CFG:
    LR = 0.001
    WD = 1e-4
    NBR_FOLDS = 5
    SELECTED_FOLDS = [0, 1, 2, 3, 4]
    TRAIN_ENC_PATH = Path('../../data/external/train_enc.parquet')
    TEST_ENC_PATH = Path('../../data/external/test_enc.parquet')
    TEST_PATH = Path('../../data/raw/test.parquet')
    BATCH_SIZE = 128
    EPOCHS = 50
    PATIENCE = 5
    REDUCE_LR_PATIENCE = 3
    REDUCE_LR_FACTOR = 0.5

# Define model
class MyModel(nn.Module):
    def __init__(self, inp_len=142, num_filters=32, hidden_dim=128, vocab_size=36, embedding_dim=128):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.conv1 = nn.Conv1d(embedding_dim, num_filters, kernel_size=3, padding=0)
        self.conv2 = nn.Conv1d(num_filters, num_filters*2, kernel_size=3, padding=0)
        self.conv3 = nn.Conv1d(num_filters*2, num_filters*3, kernel_size=3, padding=0)
        self.global_max_pool = nn.AdaptiveMaxPool1d(1)
        self.fc1 = nn.Linear(num_filters*3, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc3 = nn.Linear(1024, 512)
        self.output = nn.Linear(512, 3)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        x = self.embedding(x).permute(0, 2, 1)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.global_max_pool(x).squeeze(2)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = F.relu(self.fc3(x))
        x = self.dropout(x)
        x = torch.sigmoid(self.output(x))
        return x

def get_model():
    model = MyModel()
    optimizer = optim.Adam(model.parameters(), lr=CFG.LR, weight_decay=CFG.WD)
    criterion = nn.BCELoss()
    return model, optimizer, criterion

def create_dataloader(X, y=None, batch_size=32, shuffle=False):
    X_tensor = torch.tensor(X, dtype=torch.long)
    if y is not None:
        y_tensor = torch.tensor(y, dtype=torch.float32)
        dataset = TensorDataset(X_tensor, y_tensor)
    else:
        dataset = TensorDataset(X_tensor)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return dataloader

In [6]:
# setting seed in each env
def set_random_seed(seed: int = 42, deterministic: bool = False):
    """Set seeds"""
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = deterministic  # type: ignore

# function to set tensor to device
def to_device(
    tensors: tp.Union[tp.Tuple[torch.Tensor], tp.Dict[str, torch.Tensor]],
    device: torch.device, *args, **kwargs
):
    if isinstance(tensors, tuple):
        return (t.to(device, *args, **kwargs) for t in tensors)
    elif isinstance(tensors, dict):
        return {
            k: t.to(device, *args, **kwargs) for k, t in tensors.items()}
    else:
        return tensors.to(device, *args, **kwargs)

# Train / Infer

In [7]:
# Data preparation
FEATURES = [f'enc{i}' for i in range(142)]
TARGETS = ['bind1', 'bind2', 'bind3']
skf = StratifiedKFold(n_splits=CFG.NBR_FOLDS, shuffle=True, random_state=42)
train = pd.read_parquet(CFG.TRAIN_ENC_PATH)
test = pd.read_parquet(CFG.TEST_ENC_PATH)

all_preds = []
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train[TARGETS].sum(1))):
    
    if fold not in CFG.SELECTED_FOLDS:
        continue
    
    X_train = train.loc[train_idx, FEATURES].values
    y_train = train.loc[train_idx, TARGETS].values
    X_val = train.loc[valid_idx, FEATURES].values
    y_val = train.loc[valid_idx, TARGETS].values
    
    train_loader = create_dataloader(X_train, y_train, batch_size=CFG.BATCH_SIZE, shuffle=True)
    val_loader = create_dataloader(X_val, y_val, batch_size=CFG.BATCH_SIZE, shuffle=False)
    
    model, optimizer, criterion = get_model()
    model.to(device)
    
    best_val_loss = float('inf')
    patience_counter = 0
    
    for epoch in range(CFG.EPOCHS):
        model.train()
        train_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * X_batch.size(0)
        
        train_loss /= len(train_loader.dataset)
        
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item() * X_batch.size(0)
        
        val_loss /= len(val_loader.dataset)
        
        print(f'Epoch {epoch+1}/{CFG.EPOCHS}, Train Loss: {train_loss}, Val Loss: {val_loss}')
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), f'model-{fold}.pt')
        else:
            patience_counter += 1
            if patience_counter >= CFG.PATIENCE:
                print('Early stopping triggered')
                break
        
        if patience_counter >= CFG.REDUCE_LR_PATIENCE:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= CFG.REDUCE_LR_FACTOR
            patience_counter = 0
    
    model.load_state_dict(torch.load(f'model-{fold}.pt'))
    model.eval()
    
    val_preds = []
    with torch.no_grad():
        for X_batch, _ in val_loader:
            X_batch = X_batch.to(device)
            preds = model(X_batch)
            val_preds.append(preds.cpu().numpy())
    
    val_preds = np.concatenate(val_preds)
    print('fold:', fold, 'CV score =', APS(y_val, val_preds, average='micro'))
    
    test_loader = create_dataloader(test[FEATURES].values, batch_size=2*CFG.BATCH_SIZE, shuffle=False)
    
    test_preds = []
    with torch.no_grad():
        for X_batch, in test_loader:
            X_batch = X_batch.to(device)
            preds = model(X_batch)
            test_preds.append(preds.cpu().numpy())
    
    test_preds = np.concatenate(test_preds)
    all_preds.append(test_preds)

preds = np.mean(all_preds, axis=0)


RuntimeError: [enforce fail at alloc_cpu.cpp:117] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 89440106368 bytes. Error code 12 (Cannot allocate memory)

# Submission

In [None]:
tst = pd.read_parquet(CFG.TEST_PATH)
tst['binds'] = 0
tst.loc[tst['protein_name']=='BRD4', 'binds'] = preds[(tst['protein_name']=='BRD4').values, 0]
tst.loc[tst['protein_name']=='HSA', 'binds'] = preds[(tst['protein_name']=='HSA').values, 1]
tst.loc[tst['protein_name']=='sEH', 'binds'] = preds[(tst['protein_name']=='sEH').values, 2]
tst[['id', 'binds']].to_csv('submission.csv', index = False)