In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.model_selection import StratifiedKFold
import numpy as np
import os
import random
import sys
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
from tqdm import tqdm
from sklearn.metrics import log_loss

from sklearn.decomposition import PCA

import numpy as np # linear algebra
import pandas as pd 

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold


In [2]:
from sklearn.preprocessing import QuantileTransformer

In [3]:

# Directory that holds the raw CSV inputs, given relative to the notebook's working directory.
data_dir = '../data/01_raw'
# Last expression of the cell: shows the directory listing as a quick sanity check.
os.listdir(data_dir)

['train_targets_scored.csv',
 'sample_submission.csv',
 '.gitkeep',
 'train_drug.csv',
 'train_features.csv',
 'test_features.csv',
 'train_targets_nonscored.csv']

In [4]:
# Parameters
# Preprocessing switches read by the cells below.
no_ctl = True  # when True, rows with cp_type == "ctl_vehicle" are removed from train and test
scale = "rankgauss"  # NOTE(review): not read by any visible cell; the quantile transform runs unconditionally
decompo = "PCA"  # "PCA" appends PCA components to the features; any other value skips that step
ncompo_genes = 600  # number of PCA components computed from the "g-" columns
ncompo_cells = 50  # number of PCA components computed from the "c-" columns
encoding = "dummy"  # "dummy" one-hot encodes cp_time/cp_dose; "lb" label-encodes them instead

In [5]:
# Training inputs and the two target tables (scored and non-scored), read from data_dir.
train_features = pd.read_csv(data_dir+'/train_features.csv')
train_targets_scored = pd.read_csv(data_dir+'/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv(data_dir+'/train_targets_nonscored.csv')

# Test inputs, the sample submission file, and the train_drug table (merged on sig_id further down).
test_features = pd.read_csv(data_dir+'/test_features.csv')
sample_submission = pd.read_csv(data_dir+'/sample_submission.csv')
drug = pd.read_csv(data_dir+'/train_drug.csv')

In [6]:
# Labels of the scored targets: every column of the scored table except the first one.
targets_scored = train_targets_scored.columns[1:]
# Left join on sig_id so every scored row is kept and gains the columns of the drug table.
scored = pd.merge(train_targets_scored, drug, how='left', on='sig_id')

In [7]:
if no_ctl:
    # Remove control-vehicle rows (cp_type == "ctl_vehicle") from the features and,
    # for the training set, from both target tables as well.
    print("not_ctl")
    # The mask is built BEFORE train_features is filtered and is applied by position,
    # so the target tables stay aligned row-for-row with the features regardless of
    # what index labels the frames carry.
    train_keep = (train_features["cp_type"] != "ctl_vehicle").to_numpy()
    train_targets_scored = train_targets_scored[train_keep].reset_index(drop = True)
    train_targets_nonscored = train_targets_nonscored[train_keep].reset_index(drop = True)
    train_features = train_features[train_keep].reset_index(drop = True)
    # The original cell reset train_features twice and never reset test_features;
    # the test frame now gets a clean 0..n-1 index too.
    test_features = test_features[test_features["cp_type"] != "ctl_vehicle"].reset_index(drop = True)

not_ctl


In [8]:
seed = 42


def seed_everything(seed=42):
    """Seed every random number generator the notebook relies on.

    Sets PYTHONHASHSEED in the environment, seeds Python's ``random``, NumPy
    and PyTorch (CPU and CUDA generators), and switches cuDNN to
    deterministic mode with benchmarking disabled.

    Parameters
    ----------
    seed : int
        Value handed to each generator (default 42).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python and NumPy generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU, current CUDA device, then all CUDA devices.
    for torch_seeder in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        torch_seeder(seed)

    # cuDNN flags: request deterministic kernels and turn off the auto-tuner.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


seed_everything(seed)

In [9]:
# # Indicate whether the value falls within the [min, max] range

# import seaborn as sns
# data = pd.concat([train_features,test_features],axis=0)

# sns.distplot(data[data["cp_type"] == "ctl_vehicle"]["c-4"],label="normal")

# sns.distplot(data[data["cp_type"] == "trt_cp"]["c-4"],label="treated")
# plt.legend()
# plt.show()

In [10]:
# sum_targets = train_targets_scored[[c for c in train_targets_scored.columns if (c != "sig_id")]].sum().reset_index()

In [11]:
# sum_targets["index"] = sum_targets["index"].apply(lambda x : x.replace("_inhibitor",""))
# sum_targets["index"] = sum_targets["index"].apply(lambda x : x.replace("_activator",""))
# sum_targets["index"] = sum_targets["index"].apply(lambda x : x.replace("_agonist",""))
# sum_targets["index"] = sum_targets["index"].apply(lambda x : x.replace("_antagonist",""))

In [12]:
# Column names of the training features, split by prefix: "g-" columns and "c-" columns.
GENES = list(filter(lambda name: name.startswith('g-'), train_features.columns))
CELLS = list(filter(lambda name: name.startswith('c-'), train_features.columns))

In [13]:
# Stack the train rows on top of the test rows so the transforms below see both;
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
data_all = pd.concat([train_features, test_features], ignore_index=True)
# Disabled experiment, kept for reference: drop numeric columns with low variance.
# cols_numeric = [feat for feat in list(data_all.columns) if feat not in ["sig_id", "cp_type", "cp_time", "cp_dose"]]
# mask = (data_all[cols_numeric].var() >= variance_threshould).values
# tmp = data_all[cols_numeric].loc[:, mask]
# data_all = pd.concat([data_all[["sig_id", "cp_type", "cp_time", "cp_dose"]], tmp], axis = 1)
# Every column except the id and the three treatment descriptors counts as numeric.
cols_numeric = [
    feat
    for feat in data_all.columns
    if feat not in ("sig_id", "cp_type", "cp_time", "cp_dose")
]

In [14]:
# from scipy import stats

# train_features[GENES].apply(lambda x : stats.moment(x,moment=5),axis=1)

In [15]:
# Map every numeric column, independently, onto a normal distribution using a
# quantile transform fitted on that column of the combined train+test frame.
for col in cols_numeric:
    # scikit-learn expects a 2-D (n_samples, 1) array for a single feature.
    column_2d = data_all[col].values.reshape(-1, 1)
    transformer = QuantileTransformer(
        n_quantiles=100,
        output_distribution="normal",
        random_state=seed,
    )
    # Fit and transform in one call, then flatten back to 1-D for assignment.
    data_all[col] = transformer.fit_transform(column_2d).ravel()

In [16]:
# PCA: append principal components of the "g-" and "c-" column groups as new features.
# The decomposition is fitted on data_all, i.e. on train and test rows together.
if decompo == "PCA":
    print("PCA")
    GENES = [name for name in data_all if name.startswith("g-")]
    CELLS = [name for name in data_all if name.startswith("c-")]

    # One PCA per column group; component columns are named pca_g-<i> / pca_c-<i>.
    pca_genes = pd.DataFrame(
        PCA(n_components=ncompo_genes, random_state=seed).fit_transform(data_all[GENES]),
        columns=[f"pca_g-{i}" for i in range(ncompo_genes)],
    )
    pca_cells = pd.DataFrame(
        PCA(n_components=ncompo_cells, random_state=seed).fit_transform(data_all[CELLS]),
        columns=[f"pca_c-{i}" for i in range(ncompo_cells)],
    )

    # The original columns are kept; the components are added to their right.
    data_all = pd.concat([data_all, pca_genes, pca_cells], axis=1)

PCA


In [17]:
# Encoding of the categorical treatment descriptors cp_time and cp_dose.
if encoding == "lb":
    # LabelEncoder is not provided by the visible import cells, so the original
    # branch raised NameError; import it here, where it is actually needed.
    from sklearn.preprocessing import LabelEncoder

    print("Label Encoding")
    for feat in ["cp_time", "cp_dose"]:
        # Each column gets its own encoder, fitted on train and test rows together.
        data_all[feat] = LabelEncoder().fit_transform(data_all[feat])
elif encoding == "dummy":
    print("One-Hot")
    # Replaces each listed column with one indicator column per distinct value.
    data_all = pd.get_dummies(data_all, columns = ["cp_time", "cp_dose"])

One-Hot


In [18]:
# Only the raw "g-" / "c-" columns qualify; PCA columns start with "pca_" and are excluded.
GENES = [name for name in data_all.columns if name.startswith("g-")]
CELLS = [name for name in data_all.columns if name.startswith("c-")]

# Row-wise summary statistics per column group. The iteration order fixes the
# order in which the new columns are appended: g_<stat>, c_<stat> for each stat.
for stat_name in ("sum", "std", "mean", "kurt", "skew"):
    for prefix, group_cols in (("g_", GENES), ("c_", CELLS)):
        row_stat = getattr(data_all[group_cols], stat_name)
        data_all[prefix + stat_name] = row_stat(axis=1)

# The same statistics over both groups combined, stored as gc_<stat>.
for stat_name in ("sum", "mean", "std", "kurt", "skew"):
    combined_stat = getattr(data_all[GENES + CELLS], stat_name)
    data_all["gc_" + stat_name] = combined_stat(axis=1)

In [19]:
from sklearn.feature_selection import VarianceThreshold

# Candidate features: every column except the row id and the cp_type column.
feat_cols = [c for c in data_all.columns if c != "sig_id" and c != "cp_type"]
# Keep only features whose variance exceeds 0.8.
var_thresh = VarianceThreshold(threshold=0.8)
# fit_transform returns a bare array, so the resulting frame has integer column labels.
data_feats = pd.DataFrame(var_thresh.fit_transform(data_all[feat_cols]))

In [20]:
# Rebuild data_all as [sig_id | selected features]. data_feats carries integer column
# labels (it was built from a bare array), so only "sig_id" keeps its original name.
# NOTE(review): this cell overwrites data_all, so re-running it without re-running the
# cells above it stacks the features a second time.
data_all = pd.concat([data_all["sig_id"],data_feats],axis=1)

In [21]:
# from sklearn.cluster import KMeans
# distortion = []
# for k in range(1,10):
#     kmeans = KMeans(n_clusters = k, random_state = seed).fit(data_all)
#     distortion += [kmeans.inertia_]

In [22]:
# import matplotlib.pyplot as plt
# plt.plot(range(1,10),distortion)

In [23]:
#Kmeans
from sklearn.cluster import KMeans
def fe_cluster2(data, n_clusters = 3, seed = 42):
    """Add one-hot encoded KMeans cluster membership columns to ``data``.

    KMeans is fitted on every column except the first one (``data.iloc[:, 1:]``),
    so the first column is expected to be a non-feature column such as an id.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose columns from position 1 onwards are numeric features.
    n_clusters : int
        Number of clusters to fit (default 3).
    seed : int
        ``random_state`` passed to KMeans (default 42).

    Returns
    -------
    pd.DataFrame
        A new frame with the original columns plus one ``cluster_<k>``
        indicator column per cluster label. The input frame is left
        untouched (the original implementation added a ``cluster`` column
        to the caller's frame as a side effect).
    """
    kmeans = KMeans(n_clusters = n_clusters, random_state = seed).fit(data.iloc[:, 1:])
    # assign() works on a copy, so the caller's DataFrame is not mutated.
    clustered = data.assign(cluster = kmeans.labels_)
    return pd.get_dummies(clustered, columns = ['cluster'])
    
# data_all=fe_cluster2(data_all,n_clusters=3,seed=42)

In [24]:
# train_df and test_df
# Drop the id column from both target tables. errors="ignore" keeps the cell
# re-runnable once the column is gone, without the bare `except: pass` that
# used to hide every other failure as well (and skipped the second drop
# whenever the first one raised).
train_targets_scored.drop(columns = "sig_id", inplace = True, errors = "ignore")
train_targets_nonscored.drop(columns = "sig_id", inplace = True, errors = "ignore")

# data_all holds the train rows first, followed by the test rows, so a
# positional split at the number of training rows recovers both parts.
# reset_index returns fresh frames, avoiding in-place edits on slices of data_all.
n_train_rows = train_features.shape[0]
train_df = data_all.iloc[:n_train_rows].reset_index(drop = True)
# The following line it's a bad practice in my opinion, targets on train set
#train_df = pd.concat([train_df, targets], axis = 1)
test_df = data_all.iloc[n_train_rows:].reset_index(drop = True)

In [25]:
# Report (rows, columns) of the two frames produced by the split above.
print(f"train_df.shape: {train_df.shape}")
print(f"test_df.shape: {test_df.shape}")

train_df.shape: (21948, 1047)
test_df.shape: (3624, 1047)


In [26]:
# NumPy view of the whole test frame; every column of test_df is included.
# NOTE(review): test_df is sliced from data_all, whose first column is "sig_id" —
# confirm that the id column is removed before this array is cast to float.
X_test = test_df.values
print(f"X_test.shape: {X_test.shape}")

X_test.shape: (3624, 1047)


In [27]:
import torch
import numpy as np
from scipy.sparse import csc_matrix
import time
from abc import abstractmethod
#from pytorch_tabnet import tab_network
from pytorch_tabnet.multiclass_utils import unique_labels
from sklearn.metrics import roc_auc_score, mean_squared_error, accuracy_score
from torch.nn.utils import clip_grad_norm_
from pytorch_tabnet.utils import (PredictDataset,
                                  create_explain_matrix)
from sklearn.base import BaseEstimator
from torch.utils.data import DataLoader
from copy import deepcopy
import io
import json
from pathlib import Path
import shutil
import zipfile

In [28]:
import torch
from torch.nn.modules.loss import _WeightedLoss
import torch.nn.functional as F

class SmoothBCEwLogits(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Targets are pulled towards 0.5 before the loss is computed:
    ``t * (1 - smoothing) + 0.5 * smoothing``.

    Parameters
    ----------
    weight : torch.Tensor or None
        Optional rescaling weight forwarded to
        ``F.binary_cross_entropy_with_logits``.
    reduction : str
        ``'mean'`` (default) or ``'sum'`` reduce the element-wise loss to a
        scalar; any other value returns the unreduced element-wise loss.
    smoothing : float
        Smoothing factor in ``[0, 1)``; ``0.0`` leaves the targets unchanged.
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction

    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        """Return the smoothed targets.

        ``n_labels`` is accepted for interface compatibility but is not used.
        Raises ``AssertionError`` when ``smoothing`` is outside ``[0, 1)``.
        """
        assert 0 <= smoothing < 1
        # Smoothing is a constant target transformation; keep it out of autograd.
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        """Compute the smoothed BCE loss between logits ``inputs`` and ``targets``."""
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
            self.smoothing)
        # Ask for the element-wise loss: with the functional default ('mean') the
        # value was already a scalar, so 'sum' and 'none' silently returned the mean.
        loss = F.binary_cross_entropy_with_logits(inputs, targets,
                                                  weight=self.weight,
                                                  reduction='none')

        if self.reduction == 'sum':
            return loss.sum()
        if self.reduction == 'mean':
            return loss.mean()
        return loss

def log_loss_multi(y_true, y_pred):
    """Mean of the per-column log loss over all columns of ``y_true``.

    Column ``i`` of ``y_true`` is scored against column ``i`` of ``y_pred``
    with ``log_loss_score``; the unweighted average of those scores is returned.
    """
    n_targets = y_true.shape[1]
    per_target = np.array(
        [log_loss_score(y_true[:, col], y_pred[:, col]) for col in range(n_targets)],
        dtype=np.float64,
    )
    return per_target.mean()

def log_loss_score(actual, predicted,  eps=1e-15):
    """Binary log loss averaged over all elements.

    :param actual:      The binary labels. Either 0 or 1.
    :param predicted:   The predicted probabilities as floats between 0-1.
    :param eps:         Offset added inside both logarithms so that a
                        probability of exactly 0 or 1 never produces log(0).
    :return:            Negative mean of
                        ``actual*log(predicted+eps) + (1-actual)*log(1-predicted+eps)``.
    """
    # Contribution of the positive labels and of the negative labels, element-wise.
    positive_part = actual * np.log(predicted + eps)
    negative_part = (1 - actual) * np.log(1 - predicted + eps)
    return -(negative_part + positive_part).mean()

In [29]:
class TrainDataset(Dataset):
    """
    Training dataset over index-aligned containers (e.g. numpy arrays).
    Parameters
    ----------
    x : indexable
        The input matrix, one sample per index
    y1 : indexable
        First set of targets, aligned with ``x``
    y2 : indexable or None
        Optional second set of targets, aligned with ``x``
    """

    def __init__(self, x, y1,y2=None):
        self.x = x
        self.y1 = y1
        self.y2 = y2

    def __len__(self):
        # The number of samples is defined by the input container.
        return len(self.x)

    def __getitem__(self, index):
        """Return ``(x, y1, y2)`` for ``index``, or ``(x, y1)`` when no ``y2`` was given."""
        # y2 is optional in the constructor; indexing None used to raise TypeError.
        if self.y2 is None:
            return self.x[index], self.y1[index]
        return self.x[index], self.y1[index], self.y2[index]

class ValidDataset(Dataset):
    """
    Validation dataset pairing each input sample with its target.
    Parameters
    ----------
    x : indexable
        The input matrix, one sample per index
    y1 : indexable
        The targets, aligned with ``x``
    """

    def __init__(self, x, y1):
        self.x, self.y1 = x, y1

    def __len__(self):
        # Length follows the input container.
        return len(self.x)

    def __getitem__(self, index):
        """Return the ``(x, y1)`` pair stored at ``index``."""
        sample = self.x[index]
        target = self.y1[index]
        return sample, target

In [30]:
def create_dataloaders(
    X_train, y_scored_train,y_nscored_train, X_valid,y_valid, weights, batch_size, num_workers, drop_last, pin_memory=True
):
    """
    Create the training and validation dataloaders, with or without weighted
    sampling of the training set depending on ``weights``.
    Parameters
    ----------
    X_train : np.ndarray
        Training data (cast to float32 before being wrapped)
    y_scored_train : np.ndarray
        Scored training targets; also used to derive sampling weights
    y_nscored_train : np.ndarray
        Non-scored training targets, returned alongside the scored ones
    X_valid : np.ndarray
        Validation data (cast to float32 before being wrapped)
    y_valid : np.ndarray
        Validation targets
    weights : either 0, 1, dict or iterable
        if 0 : no weights will be applied, the training set is shuffled
        if 1 : classification only, will balanced class with inverse frequency
        if dict : keys are corresponding class values are sample weights
        if iterable : list or np array must be of length equal to nb elements
                      in the training set
    batch_size : int
        how many samples per batch to load
    num_workers : int
        how many subprocesses to use for data loading. 0 means that the data
        will be loaded in the main process
    drop_last : bool
        set to True to drop the last incomplete batch, if the dataset size is not
        divisible by the batch size. If False and the size of dataset is not
        divisible by the batch size, then the last batch will be smaller
        (applies to the training loader only)
    pin_memory : bool
        Forwarded to both DataLoaders as ``pin_memory``
    Returns
    -------
    train_dataloader, valid_dataloader : torch.DataLoader, torch.DataLoader
        Training and validation dataloaders
    Raises
    ------
    ValueError
        If ``weights`` is an int other than 0 or 1, or an iterable whose
        length differs from the number of training samples.
    """
    # The visible import cells bring in DataLoader and Dataset only, so the
    # sampler is imported here; every weighted branch used to fail with NameError.
    from torch.utils.data import WeightedRandomSampler

    if isinstance(weights, int):
        if weights == 0:
            need_shuffle = True
            sampler = None
        elif weights == 1:
            # NOTE(review): the per-class lookup below indexes `weights` with each
            # target value, which presumes 1-D integer class labels — confirm
            # before using this branch with 2-D multi-label targets.
            need_shuffle = False
            # Fixed: the original counted occurrences in an undefined `y_train`.
            class_sample_count = np.array(
                [len(np.where(y_scored_train == t)[0]) for t in np.unique(y_scored_train)]
            )

            weights = 1.0 / class_sample_count

            samples_weight = np.array([weights[t] for t in y_scored_train])

            samples_weight = torch.from_numpy(samples_weight)
            samples_weight = samples_weight.double()
            sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
        else:
            raise ValueError("Weights should be either 0, 1, dictionnary or list.")
    elif isinstance(weights, dict):
        # custom weights per class
        need_shuffle = False
        samples_weight = np.array([weights[t] for t in y_scored_train])
        sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
    else:
        # custom weights, one per training sample
        if len(weights) != len(y_scored_train):
            raise ValueError("Custom weights should match number of train samples.")
        need_shuffle = False
        samples_weight = np.array(weights)
        sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

    # A sampler and shuffle=True are mutually exclusive in DataLoader, hence
    # need_shuffle is only True in the unweighted case.
    train_dataloader = DataLoader(
        TrainDataset(X_train.astype(np.float32), y_scored_train, y_nscored_train),
        batch_size=batch_size,
        sampler=sampler,
        shuffle=need_shuffle,
        num_workers=num_workers,
        drop_last=drop_last,
        pin_memory=pin_memory
    )

    # Validation data is served in its original order and never drops a batch.
    valid_dataloaders = DataLoader(
        ValidDataset(X_valid.astype(np.float32), y_valid),
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=pin_memory
    )


    return train_dataloader, valid_dataloaders

In [31]:
import torch
from torch.nn import Linear, BatchNorm1d, ReLU, PReLU,LeakyReLU
import numpy as np
from pytorch_tabnet import sparsemax


def initialize_non_glu(module, input_dim, output_dim):
    """Re-initialise ``module.weight`` in place with a Xavier-normal draw.

    The gain is ``sqrt((input_dim + output_dim) / sqrt(4 * input_dim))``.
    The bias, if any, is left untouched. Returns ``None``.
    """
    fan_total = input_dim + output_dim
    gain_value = np.sqrt(fan_total / np.sqrt(4 * input_dim))
    torch.nn.init.xavier_normal_(module.weight, gain=gain_value)


def initialize_glu(module, input_dim, output_dim):
    """Re-initialise ``module.weight`` in place with a Xavier-normal draw.

    The gain is ``sqrt((input_dim + output_dim) / sqrt(input_dim))``.
    The bias, if any, is left untouched. Returns ``None``.
    """
    fan_total = input_dim + output_dim
    gain_value = np.sqrt(fan_total / np.sqrt(input_dim))
    torch.nn.init.xavier_normal_(module.weight, gain=gain_value)


class GBN(torch.nn.Module):
    """
        Ghost Batch Normalization
        https://arxiv.org/abs/1705.08741

        The incoming batch is split along dimension 0 into
        ceil(batch_size / virtual_batch_size) chunks and one shared
        BatchNorm1d layer is applied to each chunk separately.
    """

    def __init__(self, input_dim, virtual_batch_size=128, momentum=0.01):
        super(GBN, self).__init__()

        self.input_dim = input_dim
        self.virtual_batch_size = virtual_batch_size
        self.bn = BatchNorm1d(self.input_dim, momentum=momentum)

    def forward(self, x):
        """Normalise ``x`` chunk by chunk and return the re-assembled batch."""
        n_chunks = int(np.ceil(x.shape[0] / self.virtual_batch_size))
        normalised = []
        for ghost_batch in torch.chunk(x, n_chunks, dim=0):
            normalised.append(self.bn(ghost_batch))
        return torch.cat(normalised, dim=0)


class TabNetNoEmbeddings(torch.nn.Module):
    def __init__(self, input_dim, output_dim1,output_dim2,
                 n_d=8, n_a=8,
                 n_steps=3, gamma=1.3,
                 n_independent=2, n_shared=2, epsilon=1e-15,
                 virtual_batch_size=128, momentum=0.02,
                 mask_type="sparsemax"):
        """
        Defines main part of the TabNet network without the embedding layers.
        Parameters
        ----------
        input_dim : int
            Number of features
        output_dim1 : int or list of int for multi task classification
            Dimension of the network output. A list builds one linear head
            per entry; an int builds a single linear head.
        output_dim2 : int or list of int
            Stored on the instance only; the second head that would use it is
            commented out, so it does not affect the network.
        n_d : int
            Dimension of the prediction  layer (usually between 4 and 64)
        n_a : int
            Dimension of the attention  layer (usually between 4 and 64)
        n_steps : int
            Number of successive steps in the network (usually between 3 and 10)
        gamma : float
            Float above 1, scaling factor for attention updates (usually between 1.0 to 2.0)
        n_independent : int
            Number of independent GLU layer in each GLU block (default 2)
        n_shared : int
            Number of shared GLU layer in each GLU block (default 2)
        epsilon : float
            Avoid log(0), this should be kept very low
        virtual_batch_size : int
            Batch size for Ghost Batch Normalization
        momentum : float
            Float value between 0 and 1 which will be used for momentum in the
            feature and attentive transformers (the initial batch norm uses a
            fixed momentum of 0.01)
        mask_type : str
            Either "sparsemax" or "entmax" : this is the masking function to use
        """
        super(TabNetNoEmbeddings, self).__init__()
        self.input_dim = input_dim
        self.output_dim1 = output_dim1
        self.output_dim2 = output_dim2
        # A list of output sizes selects the multi-task path (one head per entry).
        self.is_multi_task = isinstance(output_dim1, list)
        self.n_d = n_d
        self.n_a = n_a
        self.n_steps = n_steps
        self.gamma = gamma
        self.epsilon = epsilon
        self.n_independent = n_independent
        self.n_shared = n_shared
        self.virtual_batch_size = virtual_batch_size
        self.mask_type = mask_type
        # Plain (non-ghost) batch norm applied to the raw input features.
        self.initial_bn = BatchNorm1d(self.input_dim, momentum=0.01)

        # Linear layers shared by every feature transformer; each outputs
        # 2*(n_d + n_a) units because a GLU layer halves its input width.
        if self.n_shared > 0:
            shared_feat_transform = torch.nn.ModuleList()
            for i in range(self.n_shared):
                if i == 0:
                    shared_feat_transform.append(Linear(self.input_dim,
                                                        2*(n_d + n_a),
                                                        bias=False))
                else:
                    shared_feat_transform.append(Linear(n_d + n_a, 2*(n_d + n_a), bias=False))

        else:
            shared_feat_transform = None

        # Transformer run once before the step loop; only its attention part
        # (columns n_d onwards) is used, see forward().
        self.initial_splitter = FeatTransformer(self.input_dim, n_d+n_a, shared_feat_transform,
                                                n_glu_independent=self.n_independent,
                                                virtual_batch_size=self.virtual_batch_size,
                                                momentum=momentum)

        self.feat_transformers = torch.nn.ModuleList()
        self.att_transformers = torch.nn.ModuleList()

        # One (feature transformer, attentive transformer) pair per decision step.
        for step in range(n_steps):
            transformer = FeatTransformer(self.input_dim, n_d+n_a, shared_feat_transform,
                                          n_glu_independent=self.n_independent,
                                          virtual_batch_size=self.virtual_batch_size,
                                          momentum=momentum)
            attention = AttentiveTransformer(n_a, self.input_dim,
                                             virtual_batch_size=self.virtual_batch_size,
                                             momentum=momentum,
                                             mask_type=self.mask_type)
            self.feat_transformers.append(transformer)
            self.att_transformers.append(attention)

        # Output heads map the aggregated n_d-dimensional decision vector to the targets.
        if self.is_multi_task:
            self.multi_task_mappings1 = torch.nn.ModuleList()
            for task_dim in output_dim1:
                task_mapping = Linear(n_d, task_dim, bias=False)
                initialize_non_glu(task_mapping, n_d, task_dim)
                self.multi_task_mappings1.append(task_mapping)

            # self.multi_task_mappings2 = torch.nn.ModuleList()
            # for task_dim in output_dim2:
            #     task_mapping = Linear(n_d, task_dim, bias=False)
            #     initialize_non_glu(task_mapping, n_d, task_dim)
            #     self.multi_task_mappings2.append(task_mapping)
        else:
            self.final_mapping1 = Linear(n_d, output_dim1, bias=False)
            initialize_non_glu(self.final_mapping1, n_d, output_dim1)
            # self.final_mapping2 = Linear(n_d, output_dim2, bias=False)
            # initialize_non_glu(self.final_mapping2, n_d, output_dim2)

    def forward(self, x):
        # Returns (out1, M_loss): out1 is the head output (a list of tensors in
        # the multi-task case) and M_loss is the mask term averaged over steps.
        res = 0
        x = self.initial_bn(x)

        # prior starts at 1 for every feature and is scaled down after each step
        # for the features that step's mask selected.
        prior = torch.ones(x.shape).to(x.device)
        M_loss = 0
        # Attention input for the first step: columns n_d onwards of the splitter output.
        att = self.initial_splitter(x)[:, self.n_d:]

        for step in range(self.n_steps):
            # Feature mask for this step, same shape as x.
            M = self.att_transformers[step](prior, att)
            # Batch mean of sum(M * log(M + epsilon)) over features; epsilon avoids log(0).
            M_loss += torch.mean(torch.sum(torch.mul(M, torch.log(M+self.epsilon)),
                                           dim=1))
            # update prior
            prior = torch.mul(self.gamma - M, prior)
            # output
            masked_x = torch.mul(M, x)
            out = self.feat_transformers[step](masked_x)
            # First n_d columns feed the decision output (after ReLU) ...
            d = ReLU()(out[:, :self.n_d])
            res = torch.add(res, d)
            # update attention
            # ... and the remaining columns drive the next step's mask.
            att = out[:, self.n_d:]

        M_loss /= self.n_steps

        if self.is_multi_task:
            # Result will be in list format
            out1 = []
            for task_mapping in self.multi_task_mappings1:
                out1.append(task_mapping(res))
            # out2 = []
            # for task_mapping in self.multi_task_mappings2:
            #     out2.append(task_mapping(res))
        else:
            out1 = self.final_mapping1(res)
            # out2 = self.final_mapping2(res)
        # return out1,out2, M_loss
        return out1, M_loss

    def forward_masks(self, x):
        # Same step loop as forward(), but instead of predictions it returns
        # (M_explain, masks): the importance-weighted sum of the step masks and
        # a dict mapping each step index to that step's mask.
        x = self.initial_bn(x)

        prior = torch.ones(x.shape).to(x.device)
        M_explain = torch.zeros(x.shape).to(x.device)
        att = self.initial_splitter(x)[:, self.n_d:]
        masks = {}

        for step in range(self.n_steps):
            M = self.att_transformers[step](prior, att)
            masks[step] = M
            # update prior
            prior = torch.mul(self.gamma - M, prior)
            # output
            masked_x = torch.mul(M, x)
            out = self.feat_transformers[step](masked_x)
            d = ReLU()(out[:, :self.n_d])
            # explain
            # Per-sample weight of this step: the sum of its ReLU'd decision outputs.
            step_importance = torch.sum(d, dim=1)
            M_explain += torch.mul(M, step_importance.unsqueeze(dim=1))
            # update attention
            att = out[:, self.n_d:]

        return M_explain, masks


class TabNet(torch.nn.Module):
    def __init__(self, input_dim, output_dim1,output_dim2, n_d=8, n_a=8,
                 n_steps=3, gamma=1.3, cat_idxs=[], cat_dims=[], cat_emb_dim=1,
                 n_independent=2, n_shared=2, epsilon=1e-15,
                 virtual_batch_size=128, momentum=0.02, device_name='auto',
                 mask_type="sparsemax"):
        """
        Defines TabNet network: an embedding stage followed by TabNetNoEmbeddings.
        Parameters
        ----------
        input_dim : int
            Initial number of features
        output_dim1 : int or list of int
            Dimension of network output, forwarded to TabNetNoEmbeddings
        output_dim2 : int or list of int
            Stored and forwarded to TabNetNoEmbeddings
        n_d : int
            Dimension of the prediction  layer (usually between 4 and 64)
        n_a : int
            Dimension of the attention  layer (usually between 4 and 64)
        n_steps : int
            Number of successive steps in the network (usually between 3 and 10)
        gamma : float
            Float above 1, scaling factor for attention updates (usually between 1.0 to 2.0)
        cat_idxs : list of int
            Index of each categorical column in the dataset
            (``None`` or an empty sequence means no categorical columns)
        cat_dims : list of int
            Number of categories in each categorical column
            (``None`` or an empty sequence means no categorical columns)
        cat_emb_dim : int or list of int
            Size of the embedding of categorical features
            if int, all categorical features will have same embedding size
            if list of int, every corresponding feature will have specific size
        n_independent : int
            Number of independent GLU layer in each GLU block (default 2)
        n_shared : int
            Number of shared GLU layer in each GLU block (default 2)
        epsilon : float
            Avoid log(0), this should be kept very low
        virtual_batch_size : int
            Batch size for Ghost Batch Normalization
        momentum : float
            Float value between 0 and 1 which will be used for momentum in all batch norm
        device_name : {'auto', 'cuda', 'cpu'}
            'auto' picks 'cuda' when available and 'cpu' otherwise
        mask_type : str
            Either "sparsemax" or "entmax" : this is the masking function to use
        Raises
        ------
        ValueError
            If ``n_steps`` is not positive, or if ``n_independent`` and
            ``n_shared`` are both zero.
        """
        super(TabNet, self).__init__()
        # Normalise falsy values (None, empty sequences) to a fresh empty list so
        # the default arguments are never shared or mutated.
        self.cat_idxs = cat_idxs or []
        self.cat_dims = cat_dims or []
        self.cat_emb_dim = cat_emb_dim

        self.input_dim = input_dim
        self.output_dim1 = output_dim1
        self.output_dim2 = output_dim2
        self.n_d = n_d
        self.n_a = n_a
        self.n_steps = n_steps
        self.gamma = gamma
        self.epsilon = epsilon
        self.n_independent = n_independent
        self.n_shared = n_shared
        self.mask_type = mask_type

        if self.n_steps <= 0:
            raise ValueError("n_steps should be a positive integer.")
        if self.n_independent == 0 and self.n_shared == 0:
            raise ValueError("n_shared and n_independant can't be both zero.")

        self.virtual_batch_size = virtual_batch_size
        # Pass the normalised lists: the raw arguments were forwarded before, so
        # cat_idxs=None / cat_dims=None bypassed the embedder's empty-list check
        # and failed inside EmbeddingGenerator.
        self.embedder = EmbeddingGenerator(input_dim, self.cat_dims, self.cat_idxs, cat_emb_dim)
        # Width of the feature vector once embeddings have been applied.
        self.post_embed_dim = self.embedder.post_embed_dim
        self.tabnet = TabNetNoEmbeddings(self.post_embed_dim, output_dim1, output_dim2,
                                         n_d=n_d, n_a=n_a, n_steps=n_steps,
                                         gamma=gamma, n_independent=n_independent,
                                         n_shared=n_shared, epsilon=epsilon,
                                         virtual_batch_size=virtual_batch_size,
                                         momentum=momentum, mask_type=mask_type)

        # Defining device
        if device_name == 'auto':
            device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device_name)
        # Move all parameters and buffers to the selected device right away.
        self.to(self.device)

    def forward(self, x):
        """Embed ``x`` and return what the wrapped TabNetNoEmbeddings returns."""
        x = self.embedder(x)
        return self.tabnet(x)

    def forward_masks(self, x):
        """Embed ``x`` and return the wrapped network's ``forward_masks`` result."""
        x = self.embedder(x)
        return self.tabnet.forward_masks(x)


class AttentiveTransformer(torch.nn.Module):
    def __init__(self, input_dim, output_dim,
                 virtual_batch_size=128,
                 momentum=0.02,
                 mask_type="sparsemax"):
        """
        Build an attentive transformer: linear layer, ghost batch norm, then a
        sparse selection function.
        Parameters
        ----------
        input_dim : int
            Width of the processed features fed to the linear layer
        output_dim : int
            Width of the produced mask
        virtual_batch_size : int
            Batch size for Ghost Batch Normalization
        momentum : float
            Float value between 0 and 1 which will be used for momentum in batch norm
        mask_type : str
            Either "sparsemax" or "entmax" : this is the masking function to use
        Raises
        ------
        NotImplementedError
            If ``mask_type`` is neither "sparsemax" nor "entmax".
        """
        super(AttentiveTransformer, self).__init__()
        self.fc = Linear(input_dim, output_dim, bias=False)
        initialize_non_glu(self.fc, input_dim, output_dim)
        self.bn = GBN(output_dim, virtual_batch_size=virtual_batch_size,
                      momentum=momentum)

        # Reject unknown mask types first, then pick the matching selector class.
        if mask_type not in ("sparsemax", "entmax"):
            raise NotImplementedError("Please choose either sparsemax" +
                                      "or entmax as masktype")
        if mask_type == "sparsemax":
            selector_cls = sparsemax.Sparsemax
        else:
            selector_cls = sparsemax.Entmax15
        self.selector = selector_cls(dim=-1)

    def forward(self, priors, processed_feat):
        """Return the mask: selector(bn(fc(processed_feat)) * priors)."""
        projected = self.bn(self.fc(processed_feat))
        scaled = torch.mul(projected, priors)
        return self.selector(scaled)


class FeatTransformer(torch.nn.Module):
    def __init__(self, input_dim, output_dim, shared_layers, n_glu_independent,
                 virtual_batch_size=128, momentum=0.02):
        """
        Build a feature transformer: an optional shared GLU block followed by
        an optional step-specific GLU block.
        Parameters
        ----------
        input_dim : int
            Input size
        output_dim : int
            Output size
        shared_layers : torch.nn.ModuleList or None
            The shared block that should be common to every step;
            None replaces the shared block with an identity
        n_glu_independent : int
            Number of independent GLU layers; 0 replaces the specific
            block with an identity
        virtual_batch_size : int
            Batch size for Ghost Batch Normalization within GLU block(s)
        momentum : float
            Float value between 0 and 1 which will be used for momentum in batch norm
        """
        super(FeatTransformer, self).__init__()

        has_shared = shared_layers is not None

        if has_shared:
            self.shared = GLU_Block(input_dim, output_dim,
                                    first=True,
                                    shared_layers=shared_layers,
                                    n_glu=len(shared_layers),
                                    virtual_batch_size=virtual_batch_size,
                                    momentum=momentum)
        else:
            # no shared layers
            self.shared = torch.nn.Identity()

        if n_glu_independent == 0:
            # no independent layers
            self.specifics = torch.nn.Identity()
        else:
            # Without a shared block the specific block is the first one: it sees
            # the raw input width and is built with first=True.
            self.specifics = GLU_Block(output_dim if has_shared else input_dim,
                                       output_dim,
                                       first=not has_shared,
                                       n_glu=n_glu_independent,
                                       virtual_batch_size=virtual_batch_size,
                                       momentum=momentum)

    def forward(self, x):
        """Apply the shared block, then the step-specific block."""
        return self.specifics(self.shared(x))


class GLU_Block(torch.nn.Module):
    """
        Stack of GLU layers. Layers after the first are applied residually
        (``x + layer(x)``) and scaled by sqrt(0.5). The block is built either
        from the given shared linear layers or from freshly created ones.
    """

    def __init__(self, input_dim, output_dim, n_glu=2, first=False, shared_layers=None,
                 virtual_batch_size=128, momentum=0.02):
        super(GLU_Block, self).__init__()
        self.first = first
        self.shared_layers = shared_layers
        self.n_glu = n_glu
        self.glu_layers = torch.nn.ModuleList()

        bn_kwargs = dict(virtual_batch_size=virtual_batch_size, momentum=momentum)

        # At least one layer is always created, even for n_glu < 1. Layer 0 maps
        # input_dim -> output_dim, every later layer maps output_dim -> output_dim.
        for glu_id in range(max(self.n_glu, 1)):
            layer_input_dim = input_dim if glu_id == 0 else output_dim
            fc = shared_layers[glu_id] if shared_layers else None
            self.glu_layers.append(GLU_Layer(layer_input_dim, output_dim,
                                             fc=fc,
                                             **bn_kwargs))

    def forward(self, x):
        """Run ``x`` through the block's GLU layers."""
        scale = torch.sqrt(torch.FloatTensor([0.5]).to(x.device))
        first_residual = 0
        if self.first:
            # the first layer of the block has no scale multiplication
            x = self.glu_layers[0](x)
            first_residual = 1

        for glu_id in range(first_residual, self.n_glu):
            x = torch.add(x, self.glu_layers[glu_id](x)) * scale
        return x


class GLU_Layer(torch.nn.Module):
    """
    Fully connected layer followed by ``GBN`` and a gated linear unit.

    The linear layer produces ``2 * output_dim`` columns: the first half is
    the value, the second half is passed through a sigmoid and used as gate.

    Parameters
    ----------
    input_dim : int
        Width of the incoming tensor.
    output_dim : int
        Width of the gated output.
    fc : module or None
        Existing linear layer to reuse; a bias-free ``Linear`` is created
        when nothing (or a falsy value) is given.
    virtual_batch_size, momentum
        Forwarded to ``GBN``.
    """

    def __init__(self, input_dim, output_dim, fc=None,
                 virtual_batch_size=128, momentum=0.02):
        super().__init__()

        doubled_dim = 2*output_dim
        self.output_dim = output_dim
        self.fc = fc if fc else Linear(input_dim, doubled_dim, bias=False)
        # The initialisation is applied to a reused ``fc`` as well.
        initialize_glu(self.fc, input_dim, doubled_dim)

        self.bn = GBN(doubled_dim, virtual_batch_size=virtual_batch_size,
                      momentum=momentum)

    def forward(self, x):
        """Return ``value * sigmoid(gate)`` for the normalised projection of ``x``."""
        projected = self.bn(self.fc(x))
        values = projected[:, :self.output_dim]
        gates = projected[:, self.output_dim:]
        return torch.mul(values, torch.sigmoid(gates))


class EmbeddingGenerator(torch.nn.Module):
    """
    Classical embeddings generator.

    Replaces every categorical column of the input by its learned embedding
    and leaves the remaining (continuous) columns untouched.
    """

    def __init__(self, input_dim, cat_dims, cat_idxs, cat_emb_dim):
        """ This is an embedding module for an entire set of features

        Parameters
        ----------
        input_dim : int
            Number of features coming as input (number of columns)
        cat_dims : list of int
            Number of modalities for each categorical feature
            If the list is empty, no embeddings will be done
        cat_idxs : list of int
            Positional index for each categorical feature in inputs
        cat_emb_dim : int or list of int
            Embedding dimension for each categorical feature
            If int, the same embedding dimension will be used for all categorical features

        Raises
        ------
        ValueError
            If the number of embedding dimensions differs from ``len(cat_dims)``.
        """
        super(EmbeddingGenerator, self).__init__()
        if cat_dims == [] or cat_idxs == []:
            # Nothing to embed: forward() becomes the identity.
            self.skip_embedding = True
            self.post_embed_dim = input_dim
            return

        self.skip_embedding = False
        if isinstance(cat_emb_dim, int):
            self.cat_emb_dims = [cat_emb_dim]*len(cat_idxs)
        else:
            self.cat_emb_dims = cat_emb_dim

        # check that all embeddings are provided
        if len(self.cat_emb_dims) != len(cat_dims):
            # f-string so the two lengths are actually interpolated
            msg = (f"cat_emb_dim and cat_dims must be lists of same length, "
                   f"got {len(self.cat_emb_dims)} and {len(cat_dims)}")
            raise ValueError(msg)
        # Each categorical column (width 1) is replaced by emb_dim columns.
        self.post_embed_dim = int(input_dim + np.sum(self.cat_emb_dims) - len(self.cat_emb_dims))

        self.embeddings = torch.nn.ModuleList()

        # Sort dims by cat_idx so that embeddings follow the column order
        # in which forward() meets the categorical features.
        sorted_idxs = np.argsort(cat_idxs)
        cat_dims = [cat_dims[i] for i in sorted_idxs]
        self.cat_emb_dims = [self.cat_emb_dims[i] for i in sorted_idxs]

        for cat_dim, emb_dim in zip(cat_dims, self.cat_emb_dims):
            self.embeddings.append(torch.nn.Embedding(cat_dim, emb_dim))

        # record continuous indices (True = continuous, False = categorical)
        self.continuous_idx = torch.ones(input_dim, dtype=torch.bool)
        self.continuous_idx[cat_idxs] = 0

    def forward(self, x):
        """
        Apply embeddings to inputs
        Inputs should be (batch_size, input_dim)
        Outputs will be of size (batch_size, self.post_embed_dim)
        """
        if self.skip_embedding:
            # no embeddings required
            return x

        cols = []
        cat_feat_counter = 0
        for feat_init_idx, is_continuous in enumerate(self.continuous_idx):
            # Enumerate through continuous idx boolean mask to apply embeddings
            if is_continuous:
                cols.append(x[:, feat_init_idx].float().view(-1, 1))
            else:
                cols.append(self.embeddings[cat_feat_counter](x[:, feat_init_idx].long()))
                cat_feat_counter += 1
        # concat
        post_embeddings = torch.cat(cols, dim=1)
        return post_embeddings

In [32]:
class TabModel(BaseEstimator):
    def __init__(self, n_d=8, n_a=8, n_steps=3, gamma=1.3, cat_idxs=[], cat_dims=[], cat_emb_dim=1,
                 n_independent=2, n_shared=2, epsilon=1e-15,  momentum=0.02,
                 lambda_sparse=1e-3, seed=0,
                 clip_value=1, verbose=1,
                 optimizer_fn=torch.optim.Adam,
                 optimizer_params=dict(lr=2e-2),
                 scheduler_params=None, scheduler_fn=None,
                 mask_type="sparsemax",
                 input_dim=None, output_dim1=None,output_dim2=None,
                 device_name='auto'):
        """ Class for TabNet model

        Every argument is stored on the instance under its own name. The
        architecture arguments are later forwarded to ``init_network``; the
        optimizer and scheduler arguments are used by ``fit``.

        Parameters
        ----------
            seed : int
                Passed to ``torch.manual_seed`` during construction.
            device_name: str
                'cuda' if running on GPU, 'cpu' if not, 'auto' to autodetect

        Notes
        -----
        The list/dict defaults (``cat_idxs``, ``cat_dims``,
        ``optimizer_params``) are shared between instances; do not mutate them.
        """
        # Architecture
        self.n_d = n_d
        self.n_a = n_a
        self.n_steps = n_steps
        self.gamma = gamma
        self.n_independent = n_independent
        self.n_shared = n_shared
        self.epsilon = epsilon
        self.momentum = momentum
        self.mask_type = mask_type
        self.input_dim = input_dim
        self.output_dim1 = output_dim1
        self.output_dim2 = output_dim2

        # Categorical feature description
        self.cat_idxs = cat_idxs
        self.cat_dims = cat_dims
        self.cat_emb_dim = cat_emb_dim

        # Optimisation
        self.lambda_sparse = lambda_sparse
        self.clip_value = clip_value
        self.optimizer_fn = optimizer_fn
        self.optimizer_params = optimizer_params
        self.scheduler_fn = scheduler_fn
        self.scheduler_params = scheduler_params

        # Misc
        self.verbose = verbose
        self.device_name = device_name  # kept as given, 'auto' included
        self.batch_size = 2048

        self.seed = seed
        torch.manual_seed(self.seed)

        # Defining device
        if device_name == 'auto':
            device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device_name)
        print(f"Device used : {self.device}")

    @abstractmethod
    def construct_loaders(self, X_train, y_scored_train,y_nscored_train, X_valid, y_valid,
                          weights, batch_size, num_workers, drop_last):
        """
        Build the dataloaders used by ``fit``; must be provided by subclasses.

        Returns
        -------
        train_dataloader, valid_dataloader : torch.DataLoader, torch.DataLoader
            Training and validation dataloaders
        """
        message = 'users must define construct_loaders to use this base class'
        raise NotImplementedError(message)

    def init_network(
                     self,
                     input_dim,
                     output_dim1,
                     output_dim2,
                     n_d,
                     n_a,
                     n_steps,
                     gamma,
                     cat_idxs,
                     cat_dims,
                     cat_emb_dim,
                     n_independent,
                     n_shared,
                     epsilon,
                     virtual_batch_size,
                     momentum,
                     device_name,
                     mask_type,
                     ):
        """
        Create ``self.network`` (a ``TabNet`` moved to ``self.device``) and
        ``self.reducing_matrix`` (built by ``create_explain_matrix`` from the
        network's own dimensions).
        """
        architecture = dict(
            n_d=n_d,
            n_a=n_a,
            n_steps=n_steps,
            gamma=gamma,
            cat_idxs=cat_idxs,
            cat_dims=cat_dims,
            cat_emb_dim=cat_emb_dim,
            n_independent=n_independent,
            n_shared=n_shared,
            epsilon=epsilon,
            virtual_batch_size=virtual_batch_size,
            momentum=momentum,
            device_name=device_name,
            mask_type=mask_type,
        )
        self.network = TabNet(input_dim, output_dim1, output_dim2,
                              **architecture).to(self.device)

        net = self.network
        self.reducing_matrix = create_explain_matrix(
            net.input_dim, net.cat_emb_dim, net.cat_idxs, net.post_embed_dim)

    def fit(self, X_train, y_scored_train, y_nscored_train, X_valid=None, y_valid=None, loss_fn=None,loss_tr=None,
            weights=0, max_epochs=100, patience=10, batch_size=1024,
            virtual_batch_size=128, num_workers=0, drop_last=False):
        """Train a new network, stored in self.network, with early stopping.

        Uses train_dataloader for training data and valid_dataloader for
        validation. The network with the lowest validation stopping loss is
        restored at the end and feature importances are computed from it.

        Parameters
        ----------
            X_train: np.ndarray
                Train set
            y_scored_train : np.array
                Scored train targets
            y_nscored_train : np.array
                Non-scored train targets
            X_valid: np.ndarray
                Validation set
            y_valid : np.array
                Validation targets
            loss_fn, loss_tr : callable or None
                Forwarded to ``update_fit_params``
            weights : bool or dictionary
                0 for no balancing
                1 for automated balancing
                dict for custom weights per class
            max_epochs : int
                Maximum number of epochs during training
            patience : int
                Number of consecutive non improving epoch before early stopping
            batch_size : int
                Training batch size
            virtual_batch_size : int
                Batch size for Ghost Batch Normalization (virtual_batch_size < batch_size)
            num_workers : int
                Number of workers used in torch.utils.data.DataLoader
            drop_last : bool
                Whether to drop last batch during training
        """
        # update model name

        self.update_fit_params(X_train, y_scored_train,y_nscored_train, X_valid, y_valid, loss_fn,loss_tr,
                               weights, max_epochs, patience, batch_size,virtual_batch_size, num_workers, drop_last)

        # Drop any snapshot left by a previous fit(): without this, a run in
        # which no epoch improves (NaN loss, max_epochs=0) would either fail
        # in load_best_model() or silently restore the previous fit's network.
        self.best_network = None

        train_dataloader, valid_dataloader = self.construct_loaders(X_train,
                                                                    y_scored_train,
                                                                    y_nscored_train,
                                                                    X_valid,
                                                                    y_valid,
                                                                    self.updated_weights,
                                                                    self.batch_size,
                                                                    self.num_workers,
                                                                    self.drop_last)

        self.init_network(
            input_dim=self.input_dim,
            output_dim1=self.output_dim1,
            output_dim2=self.output_dim2,
            n_d=self.n_d,
            n_a=self.n_a,
            n_steps=self.n_steps,
            gamma=self.gamma,
            cat_idxs=self.cat_idxs,
            cat_dims=self.cat_dims,
            cat_emb_dim=self.cat_emb_dim,
            n_independent=self.n_independent,
            n_shared=self.n_shared,
            epsilon=self.epsilon,
            virtual_batch_size=self.virtual_batch_size,
            momentum=self.momentum,
            device_name=self.device_name,
            mask_type=self.mask_type
        )

        self.optimizer = self.optimizer_fn(self.network.parameters(),
                                           **self.optimizer_params)

        if self.scheduler_fn:
            self.scheduler = self.scheduler_fn(self.optimizer, **self.scheduler_params)
        else:
            self.scheduler = None

        self.losses_train = []
        self.losses_valid = []
        self.learning_rates = []
        self.metrics_train = []
        self.metrics_valid = []

        if self.verbose > 0:
            print("Will train until validation stopping metric",
                  f"hasn't improved in {self.patience} rounds.")
            msg_epoch = f'| EPOCH |  train  |   valid  | total time (s)'
            print('---------------------------------------')
            print(msg_epoch)

        total_time = 0
        while (self.epoch < self.max_epochs and
               self.patience_counter < self.patience):
            starting_time = time.time()
            # updates learning rate history
            self.learning_rates.append(self.optimizer.param_groups[-1]["lr"])

            fit_metrics = self.fit_epoch(train_dataloader, valid_dataloader)

            # leaving it here, may be used for callbacks later
            self.losses_train.append(fit_metrics['train']['loss_avg'])
            self.losses_valid.append(fit_metrics['valid']['total_loss'])
            self.metrics_train.append(fit_metrics['train']['stopping_loss'])
            self.metrics_valid.append(fit_metrics['valid']['stopping_loss'])

            stopping_loss = fit_metrics['valid']['stopping_loss']
            if stopping_loss < self.best_cost:
                self.best_cost = stopping_loss
                self.patience_counter = 0
                # Saving model
                self.best_network = deepcopy(self.network)
                has_improved = True
            else:
                self.patience_counter += 1
                has_improved=False
            self.epoch += 1
            total_time += time.time() - starting_time
            if self.verbose > 0:
                # verbose doubles as the print period, in epochs
                if self.epoch % self.verbose == 0:
                    separator = "|"
                    msg_epoch = f"| {self.epoch:<5} | "
                    msg_epoch += f" {fit_metrics['train']['stopping_loss']:.5f}"
                    msg_epoch += f' {separator:<2} '
                    msg_epoch += f" {fit_metrics['valid']['stopping_loss']:.5f}"
                    msg_epoch += f' {separator:<2} '
                    msg_epoch += f" {np.round(total_time, 1):<10}"
                    msg_epoch += f" {has_improved}"
                    print(msg_epoch)

        if self.verbose > 0:
            if self.patience_counter == self.patience:
                print(f"Early stopping occured at epoch {self.epoch}")
            print(f"Training done in {total_time:.3f} seconds.")
            print('---------------------------------------')

        self.history = {"train": {"loss": self.losses_train,
                                  "metric": self.metrics_train,
                                  "lr": self.learning_rates},
                        "valid": {"loss": self.losses_valid,
                                  "metric": self.metrics_valid}}
        # load best models post training
        self.load_best_model()

        # compute feature importance once the best model is defined
        self._compute_feature_importances(train_dataloader)

    def save_model(self, path):
        """
        Saving model with two distinct files.

        ``model_params.json`` (constructor parameters) and ``network.pt``
        (network state_dict) are written to the folder ``path``, the folder
        is zipped to ``<path>.zip`` and then removed.

        Returns
        -------
        str
            Name of the archive, ``f"{path}.zip"``.
        """
        # Parameters whose value is a class are not stored
        # (Don't save torch specific params).
        saved_params = {
            key: val
            for key, val in self.get_params().items()
            if not isinstance(val, type)
        }

        # Create folder
        target_dir = Path(path)
        target_dir.mkdir(parents=True, exist_ok=True)

        # Save models params
        with open(target_dir.joinpath("model_params.json"), "w", encoding="utf8") as f:
            json.dump(saved_params, f)

        # Save state_dict
        torch.save(self.network.state_dict(), target_dir.joinpath("network.pt"))

        # Zip the folder, then delete it: only the archive remains.
        shutil.make_archive(path, 'zip', path)
        shutil.rmtree(path)

        archive_name = f"{path}.zip"
        print(f"Successfully saved model at {path}.zip")
        return archive_name

    def load_model(self, filepath):
        """
        Restore constructor parameters and network weights, then switch the
        network to eval mode.

        Parameters
        ----------
            filepath : str
                Either a zip archive or a directory; both must contain
                ``model_params.json`` and ``network.pt``.

        Raises
        ------
            KeyError
                If the zip archive lacks one of the two expected members.
        """
        try:
            try:
                with zipfile.ZipFile(filepath) as z:
                    with z.open("model_params.json") as f:
                        loaded_params = json.load(f)
                    with z.open("network.pt") as f:
                        # map_location="cpu" (as in the directory branch below)
                        # lets weights saved on a GPU be read on a CPU-only
                        # host; load_state_dict() then copies them into the
                        # network that lives on self.device.
                        try:
                            saved_state_dict = torch.load(f, map_location="cpu")
                        except io.UnsupportedOperation:
                            # In Python <3.7, the returned file object is not seekable (which at least
                            # some versions of PyTorch require) - so we'll try buffering it in to a
                            # BytesIO instead:
                            saved_state_dict = torch.load(io.BytesIO(f.read()),
                                                          map_location="cpu")
            except (zipfile.BadZipFile, OSError):
                # filepath is not a readable zip archive: treat it as a
                # directory holding the two files. A KeyError (missing archive
                # member) is deliberately NOT caught here, so that it reaches
                # the handler below instead of being swallowed.
                with open(os.path.join(filepath, "model_params.json")) as f:
                    loaded_params = json.load(f)

                saved_state_dict = torch.load(os.path.join(filepath, "network.pt"), map_location="cpu")

        except KeyError as err:
            raise KeyError("Your zip file is missing at least one component") from err

        # The device stored in the file is ignored; use what is available now.
        device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
        loaded_params["device_name"] = device_name
        self.__init__(**loaded_params)

        self.init_network(
            input_dim=self.input_dim,
            output_dim1=self.output_dim1,
            output_dim2=self.output_dim2,
            n_d=self.n_d,
            n_a=self.n_a,
            n_steps=self.n_steps,
            gamma=self.gamma,
            cat_idxs=self.cat_idxs,
            cat_dims=self.cat_dims,
            cat_emb_dim=self.cat_emb_dim,
            n_independent=self.n_independent,
            n_shared=self.n_shared,
            epsilon=self.epsilon,
            # NOTE(review): fixed value, not the one used at fit time (which
            # is not part of the saved parameters) - confirm this is intended.
            virtual_batch_size=1024,
            momentum=self.momentum,
            device_name=self.device_name,
            mask_type=self.mask_type
        )
        self.network.load_state_dict(saved_state_dict)
        self.network.eval()
        return

    def fit_epoch(self, train_dataloader, valid_dataloader):
        """
        Run one training pass followed by one validation pass.

        Parameters
        ----------
            train_dataloader: a :class: `torch.utils.data.Dataloader`
                DataLoader with train set
            valid_dataloader: a :class: `torch.utils.data.Dataloader`
                DataLoader with valid set

        Returns
        -------
            dict
                ``{'train': <train_epoch result>, 'valid': <predict_epoch result>}``
        """
        # Dict entries are evaluated in order: training runs before validation.
        return {
            'train': self.train_epoch(train_dataloader),
            'valid': self.predict_epoch(valid_dataloader),
        }

    @abstractmethod
    def train_epoch(self, train_loader):
        """
        Train self.network for one epoch; must be provided by subclasses.

        Parameters
        ----------
            train_loader: a :class: `torch.utils.data.Dataloader`
                DataLoader with train set
        """
        message = 'users must define train_epoch to use this base class'
        raise NotImplementedError(message)

    @abstractmethod
    def train_batch(self, data, targets):
        """
        Run one optimisation step on a batch; must be provided by subclasses.

        Parameters
        ----------
            data: a :tensor: `torch.tensor`
                Input data
            targets: a :tensor: `torch.tensor`
                Target data
        """
        message = 'users must define train_batch to use this base class'
        raise NotImplementedError(message)

    @abstractmethod
    def predict_epoch(self, loader):
        """
        Evaluate self.network over a validation loader; must be provided by
        subclasses.

        Parameters
        ----------
            loader: a :class: `torch.utils.data.Dataloader`
                    DataLoader with validation set
        """
        message = 'users must define predict_epoch to use this base class'
        raise NotImplementedError(message)

    @abstractmethod
    def predict_batch(self, data, targets):
        """
        Evaluate one validation batch; must be provided by subclasses.

        Parameters
        ----------
            data: a :tensor: `torch.Tensor`
                Input data
            targets: a :tensor: `torch.Tensor`
                Target data

        Returns
        -------
            batch_outs: dict
        """
        message = 'users must define predict_batch to use this base class'
        raise NotImplementedError(message)

    def load_best_model(self):
        """
        Make the best network snapshot the active network.

        Does nothing when no snapshot exists, i.e. when ``best_network`` is
        None or was never assigned because no epoch improved the stopping loss.
        """
        # getattr: the attribute is only created once an epoch has improved.
        best_network = getattr(self, "best_network", None)
        if best_network is not None:
            self.network = best_network

    @abstractmethod
    def predict(self, X):
        """
        Make predictions for ``X``; must be provided by subclasses.

        Parameters
        ----------
            X: input samples

        Returns
        -------
            predictions: np.array
                Predictions of the regression problem or the last class
        """
        message = 'users must define predict to use this base class'
        raise NotImplementedError(message)

    def explain(self, X):
        """
        Return local explanation

        Parameters
        ----------
            X: input samples, wrapped in ``PredictDataset`` and processed in
                batches of ``self.batch_size``

        Returns
        -------
            M_explain: matrix
                Importance per sample, per columns.
            masks: dict
                For every key returned by ``network.forward_masks``, the
                attention masks of all samples stacked row-wise.

        Raises
        ------
            ValueError
                If ``X`` yields no batch.
        """
        self.network.eval()

        dataloader = DataLoader(PredictDataset(X),
                                batch_size=self.batch_size, shuffle=False)

        # Per-batch results are collected and stacked once at the end;
        # stacking inside the loop re-copies every previous batch each time.
        explain_chunks = []
        mask_chunks = {}
        for data in dataloader:
            data = data.to(self.device).float()

            M_explain, masks = self.network.forward_masks(data)
            # Multiplying by reducing_matrix maps post-embedding columns back
            # to the original input columns.
            explain_chunks.append(csc_matrix.dot(M_explain.cpu().detach().numpy(),
                                                 self.reducing_matrix))
            for key, value in masks.items():
                mask_chunks.setdefault(key, []).append(
                    csc_matrix.dot(value.cpu().detach().numpy(),
                                   self.reducing_matrix))

        if not explain_chunks:
            raise ValueError("explain() needs at least one sample")

        res_explain = np.vstack(explain_chunks)
        res_masks = {key: np.vstack(chunks) for key, chunks in mask_chunks.items()}
        return res_explain, res_masks

    def _compute_feature_importances(self, loader):
        """
        Set ``self.feature_importances_`` from the explain matrices summed
        over ``loader``, reduced with ``self.reducing_matrix`` and normalised
        to sum to 1.

        ``loader`` must yield 3-tuples; only the first element (the inputs)
        is used.
        """
        self.network.eval()
        importance_sums = np.zeros((self.network.post_embed_dim))
        for batch in loader:
            data, targets, _ = batch
            inputs = data.to(self.device).float()
            M_explain, masks = self.network.forward_masks(inputs)
            importance_sums += M_explain.sum(dim=0).cpu().detach().numpy()

        reduced = csc_matrix.dot(importance_sums, self.reducing_matrix)
        self.feature_importances_ = reduced / np.sum(reduced)
        


class TabNetRegressor(TabModel):

    def construct_loaders(self, X_train, y_scored_train,y_nscored_train, X_valid, y_valid, weights,
                          batch_size, num_workers, drop_last):
        """
        Validate ``weights`` and delegate to ``create_dataloaders``.

        Returns
        -------
        train_dataloader, valid_dataloader : torch.DataLoader, torch.DataLoader
            Training and validation dataloaders

        Raises
        ------
        ValueError
            If ``weights`` is the integer 1 or a dict.
        """
        automated_balancing = isinstance(weights, int) and weights == 1
        if automated_balancing or isinstance(weights, dict):
            raise ValueError("Please provide a list of weights for regression.")

        loaders = create_dataloaders(
            X_train, y_scored_train, y_nscored_train,
            X_valid, y_valid,
            weights, batch_size, num_workers, drop_last,
        )
        train_dataloader, valid_dataloader = loaders
        return train_dataloader, valid_dataloader

    def update_fit_params(self, X_train, y_scored_train, y_nscored_train, X_valid, y_valid, loss_fn, loss_tr,
                          weights, max_epochs, patience,batch_size, virtual_batch_size, num_workers, drop_last):
        """
        Store the fit settings on the instance and reset the early-stopping
        counters.

        The input and output dimensions are read from the arrays:
        ``input_dim`` from ``X_train``, ``output_dim1`` from
        ``y_scored_train`` and ``output_dim2`` from ``y_nscored_train``.
        ``X_valid`` and ``y_valid`` are required here (their ``shape`` is
        read), although ``fit`` lets them default to None.

        Raises
        ------
        ValueError
            If ``y_scored_train`` is one-dimensional.
        AssertionError
            If train and validation column counts differ.
        """
        # mse_loss is the fallback when no loss function is given.
        self.loss_fn = torch.nn.functional.mse_loss if loss_fn is None else loss_fn
        # Always defined, so reading self.loss_tr cannot fail (or return a
        # value left by an earlier fit) when loss_fn was omitted.
        self.loss_tr = loss_tr

        assert X_train.shape[1] == X_valid.shape[1], "Dimension mismatch X_train X_valid"
        self.input_dim = X_train.shape[1]

        if len(y_scored_train.shape) == 1:
            raise ValueError("""Please apply reshape(-1, 1) to your targets
                                if doing single regression.""")
        assert y_scored_train.shape[1] == y_valid.shape[1], "Dimension mismatch y_train y_valid"
        self.output_dim1 = y_scored_train.shape[1]
        self.output_dim2 = y_nscored_train.shape[1]

        self.updated_weights = weights

        self.max_epochs = max_epochs
        self.patience = patience
        self.batch_size = batch_size
        self.virtual_batch_size = virtual_batch_size
        # Initialize counters and histories.
        self.patience_counter = 0
        self.epoch = 0
        self.best_cost = np.inf
        self.num_workers = num_workers
        self.drop_last = drop_last

    def train_epoch(self, train_loader):
        """
        Trains one epoch of the network in self.network

        Parameters
        ----------
            train_loader: a :class: `torch.utils.data.Dataloader`
                DataLoader with train set, yielding
                (data, targets_scored, targets_nscored) batches

        Returns
        -------
            dict
                ``'loss_avg'`` and ``'stopping_loss'``, both equal to the mean
                of the per-batch training losses.
        """
        self.network.train()
        total_loss = 0

        # Only the scalar loss is kept: the per-batch predictions and targets
        # were previously copied to host memory and stacked without being used.
        for data, targets_scored, targets_nscored in train_loader:
            batch_outs = self.train_batch(data, targets_scored, targets_nscored)
            total_loss += batch_outs["loss"]

        total_loss = total_loss / len(train_loader)

        epoch_metrics = {'loss_avg': total_loss,
                         'stopping_loss': total_loss,
                         }

        # The learning-rate scheduler is stepped in predict_epoch, not here.
        return epoch_metrics

    def train_batch(self, data, targets_scored, targets_nscored):
        """
        Run one optimisation step on a batch.

        Parameters
        ----------
            data: a :tensor: `torch.tensor`
                Input data
            targets_scored: a :tensor: `torch.tensor`
                Targets the loss is computed against
            targets_nscored: a :tensor: `torch.tensor`
                Moved to the device but not used in the loss

        Returns
        -------
            dict
                ``'loss'`` (float), ``'y_preds'`` (network output) and
                ``'y'`` (scored targets on the device)
        """
        self.network.train()

        device = self.device
        data = data.to(device).float()
        targets_scored = targets_scored.to(device).float()
        targets_nscored = targets_nscored.to(device).float()

        self.optimizer.zero_grad()

        output1, M_loss = self.network(data)

        # Loss on the scored targets, minus the weighted M_loss returned by
        # the network.
        loss = self.loss_fn(output1, targets_scored)
        loss -= self.lambda_sparse*M_loss

        loss.backward()
        if self.clip_value:
            clip_grad_norm_(self.network.parameters(), self.clip_value)
        self.optimizer.step()

        return {'loss': loss.item(),
                'y_preds': output1,
                'y': targets_scored}

    def predict_epoch(self, loader):
        """
        Evaluate self.network over a validation loader and step the scheduler.

        Parameters
        ----------
            loader: a :class: `torch.utils.data.Dataloader`
                    DataLoader with validation set, yielding (data, targets)

        Returns
        -------
            dict
                ``'total_loss'``: mean of the per-batch losses;
                ``'stopping_loss'``: ``log_loss_multi`` between the targets
                and the sigmoid of the stacked network outputs.
        """
        self.network.eval()
        running_loss = 0
        pred_chunks, target_chunks = [], []

        for data, targets in loader:
            outs = self.predict_batch(data, targets)
            running_loss += outs["loss"]
            pred_chunks.append(outs["y_preds"].cpu().detach().numpy())
            target_chunks.append(outs["y"].cpu().detach().numpy())

        raw_outputs = np.vstack(pred_chunks)
        truth = np.vstack(target_chunks)

        probabilities = torch.sigmoid(torch.as_tensor(raw_outputs)).numpy()
        stopping_loss = log_loss_multi(truth, probabilities)

        # The scheduler is stepped with the validation metric, once per epoch.
        if self.scheduler is not None:
            self.scheduler.step(stopping_loss)

        return {'total_loss': running_loss / len(loader),
                'stopping_loss': stopping_loss}

    def predict_batch(self, data, targets):
        """
        Make predictions on a batch (valid)

        Parameters
        ----------
            data: a :tensor: `torch.Tensor`
                Input data
            targets: a :tensor: `torch.Tensor`
                Target data

        Returns
        -------
            batch_outs: dict
                ``'loss'`` (float), ``'y_preds'`` (network output) and
                ``'y'`` (targets on the device)
        """
        self.network.eval()
        data = data.to(self.device).float()
        targets = targets.to(self.device).float()

        # Validation only: no gradients are needed, so avoid building the
        # autograd graph for the forward pass and the loss.
        with torch.no_grad():
            output, M_loss = self.network(data)

            loss = self.loss_fn(output, targets)
            loss -= self.lambda_sparse*M_loss

        loss_value = loss.item()
        batch_outs = {'loss': loss_value,
                      'y_preds': output,
                      'y': targets}
        return batch_outs

    def predict(self, X):
        """
        Make predictions for a set of samples.

        Parameters
        ----------
            X: input samples, wrapped in ``PredictDataset`` and processed in
                batches of ``self.batch_size``

        Returns
        -------
            predictions: np.array
                Network outputs for all samples, stacked row-wise
        """
        self.network.eval()
        dataloader = DataLoader(PredictDataset(X),
                                batch_size=self.batch_size, shuffle=False)

        results = []
        # Inference only: skip autograd bookkeeping for the forward passes.
        with torch.no_grad():
            for data in dataloader:
                data = data.to(self.device).float()

                output, M_loss = self.network(data)
                results.append(output.cpu().detach().numpy())
        res = np.vstack(results)
        return res

In [33]:
# Cross-validation layout
NFOLDS = 5

# Training schedule (trailing notes in the earlier version: 200 epochs, patience 20)
EPOCHS = 300
PATIENCE = 40
BATCH_SIZE = 1024

# Optimizer settings
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-5

# Use the GPU when torch can see one.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Path prefix for the saved model weights.
save_name = "../data/tabnet-weights-public/tabnet-raw-public-step1/tabnet_raw_step1"

In [34]:
# LOCATE DRUGS
# Count the rows per drug and split the drug ids at 18 occurrences.
# NOTE: `seed` must already be defined by an earlier cell.
vc = scored.drug_id.value_counts()
vc1 = vc.loc[vc<=18].index.sort_values()
vc2 = vc.loc[vc>18].index.sort_values()

# STRATIFY DRUGS 18X OR LESS
# These drugs are assigned to a fold as a whole (key = drug_id), stratified
# on the per-drug mean of the scored targets.
dct1 = {}; dct2 = {}
skf = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True,
          random_state=seed)
tmp = scored.groupby('drug_id')[targets_scored].mean().loc[vc1]
for fold,(idxT,idxV) in enumerate( skf.split(tmp,tmp[targets_scored])):
    dd = {k:fold for k in tmp.index[idxV].values}
    dct1.update(dd)

# STRATIFY DRUGS MORE THAN 18X
# These drugs are split row by row (key = sig_id).
skf = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True,
          random_state=seed)
tmp = scored.loc[scored.drug_id.isin(vc2)].reset_index(drop=True)
for fold,(idxT,idxV) in enumerate( skf.split(tmp,tmp[targets_scored])):
    dd = {k:fold for k in tmp.sig_id[idxV].values}
    dct2.update(dd)

# ASSIGN FOLDS
# Merge only once: re-running the cell would otherwise produce
# drug_id_x / drug_id_y columns and break the lookups below.
if "drug_id" not in train_df.columns:
    train_df = train_df.merge(drug,on="sig_id")
train_df['fold'] = train_df.drug_id.map(dct1)
train_df.loc[train_df.fold.isna(),'fold'] =\
    train_df.loc[train_df.fold.isna(),'sig_id'].map(dct2)
assert train_df.fold.notna().all(), "some rows were not assigned to any fold"
train_df.fold = train_df.fold.astype('int8')

In [35]:

# Model inputs: every column except the identifiers and the fold label.
feat_cols = [
    column
    for column in train_df.columns
    if column not in ("sig_id", "drug_id", "fold")
]

In [36]:

# # ASSIGN FOLDS
# scored['fold'] = scored.drug_id.map(dct1)
# scored.loc[scored.fold.isna(),'fold'] =\
#     scored.loc[scored.fold.isna(),'sig_id'].map(dct2)
# scored.fold = scored.fold.astype('int8')


# folds = train_df.copy()

# mskf = MultilabelStratifiedKFold(n_splits=NFOLDS)

# for f, (t_idx, v_idx) in enumerate(mskf.split(X=train_df, y=train_targets_scored)):
#     folds.loc[v_idx, 'kfold'] = int(f)

# folds['kfold'] = folds['kfold'].astype(int)
# folds

In [37]:
import math

def run_training(fold, seed):
    """Train one TabNet model with `fold` held out and predict valid + test rows.

    Reads the notebook globals train_df, test_df, feat_cols,
    train_targets_scored, train_targets_nonscored, EPOCHS, PATIENCE and
    BATCH_SIZE. The target frames are indexed by row position, so they are
    expected to be row-aligned with train_df.

    Args:
        fold: value of train_df['fold'] used as the validation split.
        seed: forwarded to seed_everything and to TabNetRegressor.

    Returns:
        (oof, predictions): `oof` has one row per train_df row and is zero
        everywhere except the validation rows, which hold the sigmoid of the
        model output; `predictions` holds the sigmoid of the model output for
        test_df[feat_cols].
    """
    seed_everything(seed)

    # Work with explicit row positions rather than index labels: the target
    # arrays and `oof` are plain NumPy arrays, so label-based indexing is only
    # correct while train_df happens to carry a 0..n-1 index.
    is_valid = (train_df['fold'] == fold).to_numpy()
    train_pos = np.flatnonzero(~is_valid)
    valid_pos = np.flatnonzero(is_valid)

    features = train_df[feat_cols].values
    scored_targets = train_targets_scored.values
    nonscored_targets = train_targets_nonscored.values

    X_train = features[train_pos]
    y_scored_train = scored_targets[train_pos, :]
    y_nscored_train = nonscored_targets[train_pos, :]
    X_val = features[valid_pos]
    y_val = scored_targets[valid_pos, :]

    # NOTE(review): lr / weight_decay are set here directly; the module-level
    # LEARNING_RATE and WEIGHT_DECAY constants are not used.
    model = TabNetRegressor(n_d = 32,
                            n_a = 32,
                            n_steps = 1,
                            gamma = 1.3,
                            lambda_sparse = 0,
                            optimizer_fn = optim.Adam,
                            optimizer_params = dict(lr = 2e-2, weight_decay = 1e-5),
                            mask_type = "entmax",
                            scheduler_params = dict(
                                mode = "min", patience = PATIENCE, min_lr = 1e-6, factor = 0.9),
                            scheduler_fn = ReduceLROnPlateau,
                            seed = seed,
                            verbose = 10)

    model.fit(
        X_train = X_train,
        y_scored_train = y_scored_train,
        y_nscored_train = y_nscored_train,
        X_valid = X_val,
        y_valid = y_val,
        max_epochs = EPOCHS,
        patience = PATIENCE,
        batch_size = BATCH_SIZE,
        virtual_batch_size = 16,
        num_workers = 1,
        drop_last = False,
        # To use binary cross entropy because this is not a regression problem
        loss_fn = F.binary_cross_entropy_with_logits
    )

    # Predict with the best checkpoint; the model output is passed through a
    # sigmoid to obtain probabilities.
    model.load_best_model()

    oof = np.zeros((train_df.shape[0], train_targets_scored.shape[1]))
    preds = model.predict(X_val)
    oof[valid_pos] = torch.sigmoid(torch.as_tensor(preds)).detach().cpu().numpy()

    X_test = test_df[feat_cols].values
    preds = model.predict(X_test)
    predictions = torch.sigmoid(torch.as_tensor(preds)).detach().cpu().numpy()

    return oof, predictions

In [38]:
def run_k_fold(NFOLDS, seed):
    """Run run_training for every fold of one seed and combine the results.

    Reads the notebook globals train_df, test_df and train_targets_scored for
    the output shapes.

    Args:
        NFOLDS: number of folds; folds 0..NFOLDS-1 are trained in order.
        seed: seed forwarded to run_training for every fold.

    Returns:
        (oof, predictions): `oof` is the sum of the per-fold OOF arrays (each
        fold only fills its own validation rows); `predictions` is the mean of
        the per-fold test predictions.
    """
    n_targets = train_targets_scored.shape[1]
    oof = np.zeros((train_df.shape[0], n_targets))
    predictions = np.zeros((test_df.shape[0], n_targets))

    for fold in range(NFOLDS):
        # Report the seed actually being trained (the argument), not the
        # global SEED list.
        print(f"SEED {seed} - FOLD {fold}")
        oof_, pred_ = run_training(fold, seed)

        predictions += pred_ / NFOLDS
        oof += oof_

    return oof, predictions


In [39]:
# Seed ensemble: run the whole K-fold procedure once per seed and average
# the out-of-fold and test predictions over the seeds.

# SEED = [0,1,2,3,4,5,6] #<-- Update
SEED = [0]

# Accumulators: one row per train / test sample, one column per scored target.
oof = np.zeros((len(train_df), len(train_targets_scored.columns)))
predictions = np.zeros((len(test_df), len(train_targets_scored.columns)))

for seed in SEED:
    oof_, predictions_ = run_k_fold(NFOLDS, seed)
    # Every seed contributes an equal share to the average.
    oof += oof_ / len(SEED)
    predictions += predictions_ / len(SEED)

# Store the averaged predictions on the frames under the target column names.
train_df[train_targets_scored.columns] = oof
test_df[train_targets_scored.columns] = predictions

SEED [0] - FOLD 0
Device used : cuda
Will train until validation stopping metric hasn't improved in 40 rounds.
---------------------------------------
| EPOCH |  train  |   valid  | total time (s)
| 10    |  0.02565 |   0.02093 |   30.0       True
| 20    |  0.02320 |   0.02025 |   60.0       False
| 30    |  0.02244 |   0.01881 |   90.1       False
| 40    |  0.02214 |   0.01925 |   119.9      False
| 50    |  0.02148 |   0.01821 |   149.8      False
| 60    |  0.02118 |   0.01821 |   179.7      False
| 70    |  0.02139 |   0.01866 |   209.4      False
| 80    |  0.02117 |   0.01806 |   239.3      False
| 90    |  0.02092 |   0.01807 |   269.1      False
| 100   |  0.02100 |   0.01811 |   298.9      False
Early stopping occured at epoch 107
Training done in 319.839 seconds.
---------------------------------------
SEED [0] - FOLD 1
Device used : cuda
Will train until validation stopping metric hasn't improved in 40 rounds.
---------------------------------------
| EPOCH |  train  |   v

In [40]:
# valid_results = train_targets_scored.drop(columns=train_targets_scored.columns).merge(train[['sig_id']+target_scored_cols], on='sig_id', how='left').fillna(0)



# Out-of-fold CV metric: the per-target log loss averaged over all scored targets.
y_true = train_targets_scored.values
y_pred = train_df[train_targets_scored.columns].values

n_targets = y_true.shape[1]
column_losses = [log_loss(y_true[:, i], y_pred[:, i]) for i in range(n_targets)]

# Accumulate term by term, left to right, so the floating-point result is unchanged.
score = 0
for column_loss in column_losses:
    score += column_loss / n_targets

print("CV log_loss: ", score)

CV log_loss:  0.017721322287197177


In [41]:
from sklearn.metrics import roc_auc_score
# ROC AUC of the out-of-fold predictions against the scored targets;
# y_true / y_pred are the arrays built in the CV log-loss cell above.
print(roc_auc_score(y_true,y_pred))

0.6236351560001452


In [42]:
#baseline CV log_loss:  0.01754342404326328 0.6248144785829379
#without kmeans : CV log_loss:  0.017544906477526573 0.6324338816269723
#g600 c50 : CV log_loss:  0.01758957586194886 - 0.629949728832278
#64/64 CV log_loss:  0.017583096074781448 - 0.6209159297054776

#with loss2 - CV log_loss:  0.017794634325947235 0.6275013092647985
#with loss2 PCA g600 c50 - CV log_loss:  0.01781988120869796 0.6298911856651602
#with loss2 patience 25 - CV log_loss:  0.01779312218380405 0.6281097324835089
#with loss2 patience 30 - CV log_loss:  0.017790775082051624 0.6293814639679891
#with loss2 patience 35 - CV log_loss:  0.01779312218380405 0.6281097324835089
#with loss2 patience 30 + virtual_batch_size 64 - CV log_loss:  0.01787414236838633 0.6346938458990472
#with loss2 patience 30 + virtual_batch_size 16 - CV log_loss:  0.017741189570392005 0.6226257296329173
#with loss2 patience 35 + virtual_batch_size 16 - CV log_loss:  0.017730623769838815 0.6249452053517877
#with loss2 patience 40/ EPOCH 300 + virtual_batch_size 16 - CV log_loss:  0.017715413609244578 0.6266322089415273

#pca g 100 CV log_loss:  0.017574289640172935 0.6241280400069048
#pca g 140 CV log_loss:  0.017551766904420348 0.6303389793487976
#relu -> prelu : CV log_loss:  0.017567823483257198 0.6293113851846648

In [43]:

#baseline model              CV log_loss:  0.016687406017433053 0.7492731581216698
# patience 30                CV log_loss:  0.01690850658900589  0.735966961676482
# stats g,c (sum,std)        CV log_loss:  0.01682518316741461  0.7404482804848934
# stats g,c,gc (sum,std)     CV log_loss:  0.016810316073916132 0.7439597605459504
# stats g,c,gc (sum,std,mean)CV log_loss:  0.016802178301951694 0.7393447505525768
# stats g,c (sum,std,mean) , gc (sum,std) CV log_loss:  0.01680768146509414 0.7398723816282889
# stats g,c (sum,std,mean,kurt), gc (sum,std) CV log_loss:  0.016789419505802778 0.7452451122810315
# stats gc,c (+skew)                         CV log_loss:  0.01676525710038393 0.738088542786878

#modif PCA
#CV log_loss :  0.016842789818741998 - 0.7398728478185493
#g 80 -> 100 : CV log_loss:  0.0168098197740287 0.7426392383610864
#g80 -> 100 + all stats : CV log_loss:  0.01680981977402873 0.7426392383610864
#g100 c10 -> c20 CV log_loss:  0.016816606944252275 0.746173280900466
#g100 -> 150 CV log_loss:  0.01675937045748948 - 0.747606042991269
#g150 -> 200 CV log_loss:  0.016843896450527557- 0.7365906276733302
#c20 -> 30 CV log_loss:  0.016823623179551828 0.7407075622326613
#relu -> prelu  CV log_loss:  0.016814056825770442 0.7449464736120271