In [1]:
import os
import os.path
import gc
import numpy as np
import pandas as pd
import scipy.sparse
from tqdm import tqdm

In [2]:
import warnings 
# Silence every warning for the rest of the notebook.
# NOTE(review): this is global, so deprecation warnings from the libraries used below are hidden too.
warnings.filterwarnings('ignore')

In [3]:
import pickle

def dump_pickle(file, filename):
    """Serialize an object to disk with pickle.

    Args:
        file: The Python object to store (despite the name, this is the
            payload, not a file handle).
        filename: Destination path. It is opened in binary write mode, so an
            existing file at that path is overwritten.

    Raises:
        OSError: If the destination cannot be opened for writing.
        Exception: Whatever ``pickle.dump`` raises for an unpicklable object.
    """
    # The context manager closes the handle even when pickling fails.
    with open(filename, 'wb') as outfile:
        pickle.dump(file, outfile)

def load_pickle(filename):
    """Load and return the object stored in a pickle file.

    Args:
        filename: Path of the pickle file; opened in binary read mode.

    Returns:
        The unpickled Python object.

    Raises:
        OSError: If the file cannot be opened.
    """
    # SECURITY: unpickling can execute arbitrary code, so only use this on
    # files that come from a trusted source.
    # The context manager closes the handle even when unpickling fails.
    with open(filename, 'rb') as infile:
        return pickle.load(infile)

In [4]:
# Directory holding the input files (.npz / .parquet) that every later cell reads from.
DATA_DIR = '../input/msci-h5-sparse-transform'
# List the directory with sizes so the available files are visible in the output.
%ls $DATA_DIR -lh

total 7.1G
-rw-r--r-- 1 nobody nogroup  25K Oct 25 09:30 __notebook__.ipynb
-rw-r--r-- 1 nobody nogroup  25K Oct 25 09:30 __output__.json
-rw-r--r-- 1 nobody nogroup 293K Oct 25 09:30 __results__.html
-rw-r--r-- 1 nobody nogroup    0 Oct 25 09:30 custom.css
-rw-r--r-- 1 nobody nogroup 359M Oct 25 09:30 evaluation_ids.parquet
-rw-r--r-- 1 nobody nogroup 3.8M Oct 25 09:30 metadata.parquet
-rw-r--r-- 1 nobody nogroup 108K Oct 25 09:30 metadata_cite_day_2_donor_27678.parquet
-rw-r--r-- 1 nobody nogroup 252M Oct 25 09:30 sample_submission.parquet
-rw-r--r-- 1 nobody nogroup 856K Oct 25 09:30 test_cite_inputs_day_2_donor_27678_idx.npz
-rw-r--r-- 1 nobody nogroup  78M Oct 25 09:30 test_cite_inputs_day_2_donor_27678_val.sparse.npz
-rw-r--r-- 1 nobody nogroup 1.8M Oct 25 09:30 test_cite_inputs_idx.npz
-rw-r--r-- 1 nobody nogroup 488M Oct 25 09:31 test_cite_inputs_val.sparse.npz
-rw-r--r-- 1 nobody nogroup 8.4M Oct 25 09:30 test_multi_inputs_idx.npz
-rw-r--r-- 1 nobody nogroup 1.7G

## Read Data

In [5]:
%%time
# Load the training inputs as a SciPy sparse matrix via scipy.sparse.load_npz.
train_inp_val_path = f'{DATA_DIR}/train_multi_inputs_val.sparse.npz'
train_inp_val = scipy.sparse.load_npz(train_inp_val_path)
# Last expression: shows the (rows, columns) shape as the cell output.
train_inp_val.shape

CPU times: user 33.9 s, sys: 3.11 s, total: 37 s
Wall time: 1min


(105942, 228942)

In [6]:
%%time
# Load the training targets as a SciPy sparse matrix via scipy.sparse.load_npz.
train_tar_val_path = f'{DATA_DIR}/train_multi_targets_val.sparse.npz'
train_tar_val = scipy.sparse.load_npz(train_tar_val_path)
# Last expression: shows the (rows, columns) shape as the cell output.
train_tar_val.shape

CPU times: user 15.4 s, sys: 1.38 s, total: 16.8 s
Wall time: 23.5 s


(105942, 23418)

## Dimension Reduction

In [7]:
from sklearn.decomposition import PCA, TruncatedSVD

In [8]:
# %%time
# ## Baseline
# ## https://www.kaggle.com/code/fabiencrom/msci-multiome-quickstart-w-sparse-matrices
# ## Sum of explained variance ratio = 0.00765151
# tsvd = TruncatedSVD(n_components=16, 
#                     random_state=42)
# np.sum(tsvd.fit(train_inp_val).explained_variance_ratio_)
# gc.collect()

In [9]:
%%time
## Reduce the sparse inputs to 128 components with TruncatedSVD.
## The fitted `train_inp_tsvd` is kept because the test inputs are projected with it later.
train_inp_tsvd = TruncatedSVD(n_components=128, 
                              random_state=42)

# NOTE(review): `train_inp_val` is overwritten with the reduced matrix, so this
# cell is not idempotent -- re-running it would fit the SVD on the already
# reduced data instead of the loaded sparse matrix.
train_inp_val = train_inp_tsvd.fit_transform(train_inp_val)
# Total fraction of variance captured by the retained components.
print('Sum of Explained Variance: ',
      np.sum(train_inp_tsvd.explained_variance_ratio_))

# Last expression: the number returned by gc.collect() is shown as the cell output.
gc.collect()

Sum of Explained Variance:  0.0109976595
CPU times: user 17min 36s, sys: 9.74 s, total: 17min 46s
Wall time: 17min 27s


42

In [10]:
%%time
train_tar_val = train_tar_val.todense()
gc.collect()

CPU times: user 4.81 s, sys: 10.8 s, total: 15.6 s
Wall time: 15.6 s


21

## Modeling

### Evaluation Metric

In [11]:
def correlation_score(y_true, y_pred):
    """Scores the predictions according to the competition rules.

    It is assumed that the predictions are not constant: a constant row has
    zero variance, so its Pearson correlation is undefined.

    Args:
        y_true: 2-D array-like (ndarray, DataFrame, np.matrix, ...) of
            ground-truth values, one sample per row.
        y_pred: 2-D array-like of predictions with the same shape as y_true.

    Returns:
        The average of each sample's (row's) Pearson correlation coefficient.

    Raises:
        ValueError: If the two inputs do not have the same shape.
        ZeroDivisionError: If the inputs contain no rows.
    """
    # np.asarray unwraps DataFrames *and their subclasses* (an exact
    # `type(...) ==` comparison misses subclasses, after which integer
    # indexing would select columns instead of rows) and also turns np.matrix
    # into a regular ndarray so that each row is 1-D.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape: raise ValueError("Shapes are different.")
    corrsum = 0
    for true_row, pred_row in zip(y_true, y_pred):
        # corrcoef returns the 2x2 correlation matrix; [1, 0] is the
        # off-diagonal entry, i.e. the correlation between the two rows.
        corrsum += np.corrcoef(true_row, pred_row)[1, 0]
    return corrsum / len(y_true)

### Train/Validation Split and Ridge Regression

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

In [13]:
%%time
# Hold out 20% of the rows for validation; the fixed random_state keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(train_inp_val,
                                                  train_tar_val,
                                                  test_size=0.2,
                                                  random_state=42)

CPU times: user 1.28 s, sys: 2.78 s, total: 4.06 s
Wall time: 4.06 s


In [14]:
# The split copies are used from here on; drop the full arrays to free memory.
del train_inp_val, train_tar_val
gc.collect()

42

In [15]:
%%time
# Fit a Ridge regressor from the reduced inputs to the targets.
# NOTE(review): copy_X=False presumably permits scikit-learn to modify X_train
# in place to save memory -- confirm before reusing X_train after this cell.
ridge = Ridge(copy_X=False, random_state=42)
ridge.fit(X_train, y_train)

CPU times: user 22.5 s, sys: 4.53 s, total: 27 s
Wall time: 12.3 s


Ridge(copy_X=False, random_state=42)

In [16]:
# %%time
# correlation_score(ridge.predict(X_train), y_train)

In [17]:
%%time
# Mean per-row Pearson correlation on the held-out validation split.
correlation_score(y_val, ridge.predict(X_val))

CPU times: user 10.7 s, sys: 1.87 s, total: 12.6 s
Wall time: 8.51 s


0.664523262485194

In [18]:
# The fitted `ridge` model is all that is needed from here on; release the split arrays.
del X_train, X_val, y_train, y_val
gc.collect()

84

## Prediction

In [19]:
%%time
# Load the test inputs as a SciPy sparse matrix via scipy.sparse.load_npz.
test_inp_val_path = f'{DATA_DIR}/test_multi_inputs_val.sparse.npz'
test_inp_val = scipy.sparse.load_npz(test_inp_val_path)
# Last expression: shows the (rows, columns) shape as the cell output.
test_inp_val.shape

CPU times: user 19.6 s, sys: 1.35 s, total: 21 s
Wall time: 36.2 s


(55935, 228942)

In [20]:
%%time
# Project the test inputs with the SVD already fitted on the training inputs
# (transform only -- the decomposition is not refitted on test data).
# NOTE(review): `test_inp_val` is overwritten, so re-running this cell alone
# would feed the reduced matrix back into the transform.
test_inp_val = train_inp_tsvd.transform(test_inp_val)
test_inp_val.shape

CPU times: user 34.6 s, sys: 68.2 ms, total: 34.7 s
Wall time: 34.7 s


(55935, 128)

In [21]:
%%time
# Predict the targets for the test rows, then release the reduced inputs.
test_tar_preds = ridge.predict(test_inp_val)
del test_inp_val
gc.collect()

CPU times: user 11 s, sys: 4.11 s, total: 15.1 s
Wall time: 5.02 s


103

## Creating Submission

In [22]:
# Column labels of the training targets and row labels of the test inputs;
# together they label the two axes of `test_tar_preds`.
# The context managers close the underlying .npz archives once the arrays
# have been read instead of leaving the file handles open.
# SECURITY: allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code -- only use it on trusted files.
with np.load(f'{DATA_DIR}/train_multi_targets_idx.npz',
             allow_pickle=True) as train_tar_idx_npz:
    test_tar_cols = train_tar_idx_npz['columns']
with np.load(f'{DATA_DIR}/test_multi_inputs_idx.npz',
             allow_pickle=True) as test_inp_idx_npz:
    test_tar_idx = test_inp_idx_npz['index']
# Sanity check: both label arrays should match the axes of the predictions.
test_tar_cols.shape, test_tar_idx.shape, test_tar_preds.shape

((23418,), (55935,), (55935, 23418))

In [23]:
%%time
print('Start Eval...')
eval_ids = pd.read_parquet(f'{DATA_DIR}/evaluation_ids.parquet')
eval_ids.head()

eval_ids.cell_id = eval_ids.cell_id.astype(pd.CategoricalDtype())
eval_ids.gene_id = eval_ids.gene_id.astype(pd.CategoricalDtype())

Start Eval...
CPU times: user 26.7 s, sys: 8.22 s, total: 34.9 s
Wall time: 30.4 s


In [24]:
%%time
# Empty (all-NaN) float32 series named 'target', indexed by every column of
# `eval_ids`; predictions are written into it by position in a later cell.
sub = pd.Series(name='target',
                index=pd.MultiIndex.from_frame(eval_ids), 
                dtype=np.float32)
sub

CPU times: user 18.1 s, sys: 3.33 s, total: 21.4 s
Wall time: 21.5 s


row_id    cell_id       gene_id        
0         c2150f55becb  CD86              NaN
1         c2150f55becb  CD274             NaN
2         c2150f55becb  CD270             NaN
3         c2150f55becb  CD155             NaN
4         c2150f55becb  CD112             NaN
                                           ..
65744175  2c53aa67933d  ENSG00000134419   NaN
65744176  2c53aa67933d  ENSG00000186862   NaN
65744177  2c53aa67933d  ENSG00000170959   NaN
65744178  2c53aa67933d  ENSG00000107874   NaN
65744179  2c53aa67933d  ENSG00000166012   NaN
Name: target, Length: 65744180, dtype: float32

In [25]:
# Lookup tables from an id to its position in the corresponding label array:
# cell ids index the rows of the predictions, gene ids index the columns.
cell_id_dict = dict(zip(test_tar_idx, range(len(test_tar_idx))))
gene_id_dict = dict(zip(test_tar_cols, range(len(test_tar_cols))))

In [26]:
# Translate every evaluation row into (row, column) positions of the
# prediction matrix; ids missing from the lookup tables get the sentinel -1.
eid_cid_idx = eval_ids['cell_id'].apply(
    lambda cell: cell_id_dict.get(cell, -1)
)
eid_gid_idx = eval_ids['gene_id'].apply(
    lambda gene: gene_id_dict.get(gene, -1)
)
# Rows for which both the cell and the gene were found.
valid_multi_rows = (eid_cid_idx != -1) & (eid_gid_idx != -1)

In [27]:
%%time
sub.iloc[valid_multi_rows] = test_tar_preds\
                             [eid_cid_idx[valid_multi_rows].to_numpy(),
                              eid_gid_idx[valid_multi_rows].to_numpy()]

CPU times: user 1.63 s, sys: 484 ms, total: 2.11 s
Wall time: 2.11 s


In [28]:
# Everything needed is now stored in `sub`; release the lookup inputs and index arrays.
del eval_ids, test_tar_idx, test_tar_cols
del eid_cid_idx, eid_gid_idx, valid_multi_rows
gc.collect()

0

In [29]:
# Rows that received no prediction are still NaN; write them as 0, then move
# the index levels back into ordinary columns.
sub = pd.DataFrame(sub).fillna(0).reset_index()
# The id columns were only needed for the lookup; leave them out of the CSV.
(
    sub
    .drop(columns=['cell_id', 'gene_id'])
    .to_csv('multi_sub.csv', index=False)
)