In [1]:
import os
import os.path
import random
import gc
import numpy as np
import pandas as pd
import scipy.sparse
from tqdm import tqdm

In [2]:
import warnings 
# Install a global 'ignore' filter: every warning raised after this point is
# suppressed, which keeps cell output short but also hides deprecation notices.
warnings.filterwarnings('ignore')

## Load Model

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset, random_split

In [4]:
# Directory that contains the saved model file (`mlp_multi.pt`) loaded further down.
DATA_DIR = '../input/msci-multi-mlp-sparse'
# IPython line magic: list the directory; `$DATA_DIR` expands the Python variable.
%ls $DATA_DIR -lh

total 124M
-rw-r--r-- 1 nobody nogroup  72K Nov  6 19:06 __notebook__.ipynb
-rw-r--r-- 1 nobody nogroup  33K Nov  6 19:06 __output__.json
-rw-r--r-- 1 nobody nogroup 330K Nov  6 19:06 __results__.html
drwxr-xr-x 2 nobody nogroup    0 Nov  6 19:06 __results___files/
-rw-r--r-- 1 nobody nogroup    0 Nov  6 19:06 custom.css
-rw-r--r-- 1 nobody nogroup 124M Nov  6 19:06 mlp_multi.pt


In [5]:
class Net(nn.Module):
    """Multi-layer perceptron with batch normalisation and a non-negative output.

    The network is three ``Linear -> BatchNorm1d -> ReLU`` stages followed by a
    final ``Linear -> ReLU`` output stage, so every predicted value is >= 0.

    Args:
        in_features: Width of the input layer (default 228942, the value that
            was previously hard-coded).
        hidden_features: Width of each of the three hidden layers (default 128).
        out_features: Width of the output layer (default 23418, the value that
            was previously hard-coded).
    """

    def __init__(self, in_features=228942, hidden_features=128, out_features=23418):
        super().__init__()
        # The attribute names below are deliberately unchanged: torch registers
        # sub-modules under these names, so a model saved with the earlier
        # version of this class still matches.
        self.linear1 = nn.Linear(in_features, hidden_features)
        self.bn1 = nn.BatchNorm1d(hidden_features)
        self.linear2 = nn.Linear(hidden_features, hidden_features)
        self.bn2 = nn.BatchNorm1d(hidden_features)
        self.linear3 = nn.Linear(hidden_features, hidden_features)
        self.bn3 = nn.BatchNorm1d(hidden_features)
        self.linear4 = nn.Linear(hidden_features, out_features)

    def forward(self, x):
        """Run a batch through the network.

        Args:
            x: Batch of inputs whose last dimension equals ``in_features``.

        Returns:
            Tensor with ``out_features`` values per input row, clamped at zero
            by the final ReLU.
        """
        x = F.relu(self.bn1(self.linear1(x)))
        x = F.relu(self.bn2(self.linear2(x)))
        x = F.relu(self.bn3(self.linear3(x)))
        # The output layer is also passed through ReLU, exactly as before.
        return F.relu(self.linear4(x))

In [6]:
# torch.load unpickles the file, so only load checkpoints from a trusted source.
# The loaded object is used directly as the model below (net.eval(), net(x));
# map_location places all of its tensors on the CPU.
net = torch.load(
    f'{DATA_DIR}/mlp_multi.pt',
    map_location=torch.device('cpu'),
)

## Prediction

In [7]:
# DATA_DIR is rebound here: from this cell on it points at the dataset holding
# the test inputs, the index/column id files and the evaluation ids.
DATA_DIR = '../input/msci-h5-sparse-transform'
# IPython line magic: list the directory; `$DATA_DIR` expands the Python variable.
%ls $DATA_DIR -lh

total 7.1G
-rw-r--r-- 1 nobody nogroup  25K Nov  6 19:06 __notebook__.ipynb
-rw-r--r-- 1 nobody nogroup  25K Nov  6 19:06 __output__.json
-rw-r--r-- 1 nobody nogroup 293K Nov  6 19:06 __results__.html
-rw-r--r-- 1 nobody nogroup    0 Nov  6 19:06 custom.css
-rw-r--r-- 1 nobody nogroup 359M Nov  6 19:07 evaluation_ids.parquet
-rw-r--r-- 1 nobody nogroup 3.8M Nov  6 19:06 metadata.parquet
-rw-r--r-- 1 nobody nogroup 108K Nov  6 19:06 metadata_cite_day_2_donor_27678.parquet
-rw-r--r-- 1 nobody nogroup 252M Nov  6 19:06 sample_submission.parquet
-rw-r--r-- 1 nobody nogroup 856K Nov  6 19:06 test_cite_inputs_day_2_donor_27678_idx.npz
-rw-r--r-- 1 nobody nogroup  78M Nov  6 19:06 test_cite_inputs_day_2_donor_27678_val.sparse.npz
-rw-r--r-- 1 nobody nogroup 1.8M Nov  6 19:06 test_cite_inputs_idx.npz
-rw-r--r-- 1 nobody nogroup 488M Nov  6 19:07 test_cite_inputs_val.sparse.npz
-rw-r--r-- 1 nobody nogroup 8.4M Nov  6 19:06 test_multi_inputs_idx.npz
-rw-r--r-- 1 nobody nogroup 1.7G

In [8]:
%%time
# Read the test inputs as a SciPy sparse matrix (kept sparse; never densified
# as a whole) and display its shape.
test_inp = scipy.sparse.load_npz(f'{DATA_DIR}/test_multi_inputs_val.sparse.npz')
test_inp.shape

CPU times: user 21 s, sys: 2.49 s, total: 23.5 s
Wall time: 37.4 s


(55935, 228942)

In [9]:
class SparsePredDataset(Dataset):
    """Map-style dataset that serves one row of a row-indexable matrix per item.

    Derives from the plain ``Dataset`` base class: nothing of ``TensorDataset``
    is used here (its ``__init__`` was never called, so its ``tensors``
    attribute did not exist), and only ``__getitem__`` / ``__len__`` are needed
    by ``DataLoader``.

    Args:
        inputs: Object supporting ``inputs[index]`` and ``inputs.shape[0]``,
            e.g. a SciPy sparse matrix.
    """
    # https://discuss.pytorch.org/t/dataloader-loads-data-very-slow-on-sparse-tensor/117391/2

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, index):
        """Return ``inputs[index]`` unchanged (no conversion to a tensor here)."""
        return self.inputs[index]

    def __len__(self):
        """Number of items, i.e. the size of the first dimension of ``inputs``."""
        return self.inputs.shape[0]

In [10]:
def sparse_coo_to_tensor(coo):
    """Convert a SciPy COO matrix into a float32 torch sparse COO tensor.

    Uses ``torch.sparse_coo_tensor`` instead of the deprecated
    ``torch.sparse.FloatTensor`` / legacy ``LongTensor`` / ``FloatTensor``
    constructors; the resulting indices, values and shape are the same.

    Args:
        coo: Matrix exposing ``row``, ``col``, ``data`` and ``shape``
            (``scipy.sparse.coo_matrix``).

    Returns:
        Sparse COO tensor of shape ``coo.shape`` with int64 indices and
        float32 values.
    """
    # Row indices on the first line, column indices on the second: the
    # (2, nnz) layout that torch expects for a 2-D sparse tensor.
    indices = torch.tensor(np.vstack((coo.row, coo.col)), dtype=torch.long)
    values = torch.tensor(coo.data, dtype=torch.float32)
    return torch.sparse_coo_tensor(indices, values, torch.Size(coo.shape))

def sparse_batch_collate_pred(batch):
    """Collate a list of sparse rows into a single torch sparse tensor.

    The items are stacked vertically with SciPy, converted to COO format and
    handed to ``sparse_coo_to_tensor``.
    """
    stacked_rows = scipy.sparse.vstack(batch)
    stacked_coo = stacked_rows.tocoo()
    return sparse_coo_to_tensor(stacked_coo)

In [11]:
# Serve the sparse test matrix in fixed-order batches; shuffling stays off so
# that prediction rows keep the order of the input rows.
batch_size = 1024
test_ds = SparsePredDataset(test_inp)
test_loader = DataLoader(
    test_ds,
    collate_fn=sparse_batch_collate_pred,
    shuffle=False,
    batch_size=batch_size,
)

In [12]:
%%time
with torch.no_grad():
    net.eval()
    test_tar_preds = []
    
    for i, x in enumerate(test_loader):
#         x = x.to(device)
        batch_preds = net(x).detach().numpy()
        test_tar_preds.append(batch_preds)

CPU times: user 10min 6s, sys: 9.78 s, total: 10min 16s
Wall time: 8min 56s


In [13]:
# Stack the per-batch prediction arrays row-wise into one 2-D array
# (one row per test input row, one column per network output).
test_tar_preds = np.vstack(test_tar_preds)
test_tar_preds.shape

(55935, 23418)

In [14]:
# Predictions are materialised, so the loader, the dataset wrapper and the
# input matrix can be released before the large submission frame is built.
del test_loader
del test_ds
del test_inp
gc.collect()

42

## Creating Submission

In [15]:
# SECURITY NOTE: allow_pickle=True lets NumPy unpickle object arrays, which can
# execute code embedded in the file -- only use it on trusted files.
# Each archive is opened in a `with` block so its file handle is closed as soon
# as the needed array has been read (a bare np.load(...)[key] leaves it open).
with np.load(f'{DATA_DIR}/train_multi_targets_idx.npz',
             allow_pickle=True) as targets_npz:
    # Column ids of the prediction array.
    test_tar_cols = targets_npz['columns']
with np.load(f'{DATA_DIR}/test_multi_inputs_idx.npz',
             allow_pickle=True) as inputs_npz:
    # Row ids of the prediction array.
    test_tar_idx = inputs_npz['index']
test_tar_cols.shape, test_tar_idx.shape, test_tar_preds.shape

((23418,), (55935,), (55935, 23418))

In [16]:
%%time
print('Start Eval...')
eval_ids = pd.read_parquet(f'{DATA_DIR}/evaluation_ids.parquet')
eval_ids.cell_id = eval_ids.cell_id.astype(pd.CategoricalDtype())
eval_ids.gene_id = eval_ids.gene_id.astype(pd.CategoricalDtype())

Start Eval...
CPU times: user 39.9 s, sys: 21.2 s, total: 1min 1s
Wall time: 49.8 s


In [17]:
%%time
sub = pd.Series(name='target',
                index=pd.MultiIndex.from_frame(eval_ids), 
                dtype=np.float32)
sub

CPU times: user 23.5 s, sys: 7.04 s, total: 30.5 s
Wall time: 30.6 s


row_id    cell_id       gene_id        
0         c2150f55becb  CD86              NaN
1         c2150f55becb  CD274             NaN
2         c2150f55becb  CD270             NaN
3         c2150f55becb  CD155             NaN
4         c2150f55becb  CD112             NaN
                                           ..
65744175  2c53aa67933d  ENSG00000134419   NaN
65744176  2c53aa67933d  ENSG00000186862   NaN
65744177  2c53aa67933d  ENSG00000170959   NaN
65744178  2c53aa67933d  ENSG00000107874   NaN
65744179  2c53aa67933d  ENSG00000166012   NaN
Name: target, Length: 65744180, dtype: float32

In [18]:
# Lookup tables from an id to its 0-based position: cell ids give the row of
# the prediction array, gene ids give the column.
cell_id_dict = dict(zip(test_tar_idx, range(len(test_tar_idx))))
gene_id_dict = dict(zip(test_tar_cols, range(len(test_tar_cols))))

In [19]:
# For every evaluation row, look up the position of its cell id and of its
# gene id in the lookup dictionaries; ids that are not present map to -1.
eid_cid_idx = eval_ids['cell_id']\
              .apply(lambda x: cell_id_dict.get(x, -1))
eid_gid_idx = eval_ids['gene_id']\
              .apply(lambda x: gene_id_dict.get(x, -1))
# A row can be filled from the predictions only when both lookups succeeded.
valid_multi_rows = (eid_cid_idx != -1) & (eid_gid_idx != -1)

In [20]:
%%time
sub.iloc[valid_multi_rows] = test_tar_preds\
                             [eid_cid_idx[valid_multi_rows].to_numpy(),
                              eid_gid_idx[valid_multi_rows].to_numpy()]

CPU times: user 1.74 s, sys: 1.5 s, total: 3.24 s
Wall time: 3.25 s


In [21]:
# The predictions have been written into `sub`; release the id tables and the
# positional lookups that were only needed for that assignment.
del (eval_ids, test_tar_idx, test_tar_cols,
     eid_cid_idx, eid_gid_idx, valid_multi_rows)
gc.collect()

42

In [22]:
# Rows that received no prediction are still NaN; write them out as 0.
sub = pd.DataFrame(sub).fillna(0)
# Move the index levels (row_id, cell_id, gene_id) back into ordinary columns.
sub = sub.reset_index()
# The CSV holds only the remaining columns once the two id columns are dropped.
sub.drop(columns=['cell_id', 'gene_id']).to_csv('multi_sub.csv', index=False)

In [23]:
# Display the last rows of the filled frame as a quick visual check.
sub.tail()

Unnamed: 0,row_id,cell_id,gene_id,target
65744175,65744175,2c53aa67933d,ENSG00000134419,0.028947
65744176,65744176,2c53aa67933d,ENSG00000186862,0.0
65744177,65744177,2c53aa67933d,ENSG00000170959,0.0
65744178,65744178,2c53aa67933d,ENSG00000107874,0.005447
65744179,65744179,2c53aa67933d,ENSG00000166012,0.027957
