In [1]:
import os
import os.path
import gc
import numpy as np
import pandas as pd
import scipy.sparse
from tqdm import tqdm

In [2]:
import warnings 
# Blanket 'ignore' filter: suppresses every warning category for the rest of
# the session, including deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [3]:
import pickle

def dump_pickle(file, filename):
    """Serialize an object to disk with pickle.

    Parameters
    ----------
    file : object
        The Python object to serialize (despite the name, not a file handle).
    filename : str or path-like
        Destination path; it is opened in binary write mode and overwritten.

    Raises
    ------
    OSError
        If the destination cannot be opened for writing.
    pickle.PicklingError, AttributeError, TypeError
        If ``file`` cannot be pickled.
    """
    # The context manager closes the handle even when pickling fails, so a
    # failed dump no longer leaks an open file descriptor.
    with open(filename, 'wb') as outfile:
        pickle.dump(file, outfile)

def load_pickle(filename):
    """Load and return a pickled object from disk.

    Parameters
    ----------
    filename : str or path-like
        Path of the pickle file; it is opened in binary read mode.

    Returns
    -------
    object
        The deserialized object.

    Raises
    ------
    OSError
        If the file cannot be opened.
    pickle.UnpicklingError, EOFError
        If the file content is not a valid pickle.

    Notes
    -----
    Unpickling can execute arbitrary code; only call this on files you
    created yourself or otherwise trust.
    """
    # The context manager closes the handle even when unpickling fails.
    with open(filename, 'rb') as infile:
        return pickle.load(infile)

In [4]:
# Directory holding the input files; every input path in this notebook is
# built from it.
DATA_DIR = '../input/msci-h5-sparse-transform'
# List the directory contents with human-readable sizes.
%ls $DATA_DIR -lh

total 7.1G
-rw-r--r-- 1 nobody nogroup  25K Oct 26 15:46 __notebook__.ipynb
-rw-r--r-- 1 nobody nogroup  25K Oct 26 15:46 __output__.json
-rw-r--r-- 1 nobody nogroup 293K Oct 26 15:46 __results__.html
-rw-r--r-- 1 nobody nogroup    0 Oct 26 15:46 custom.css
-rw-r--r-- 1 nobody nogroup 359M Oct 26 15:46 evaluation_ids.parquet
-rw-r--r-- 1 nobody nogroup 3.8M Oct 26 15:46 metadata.parquet
-rw-r--r-- 1 nobody nogroup 108K Oct 26 15:46 metadata_cite_day_2_donor_27678.parquet
-rw-r--r-- 1 nobody nogroup 252M Oct 26 15:46 sample_submission.parquet
-rw-r--r-- 1 nobody nogroup 856K Oct 26 15:46 test_cite_inputs_day_2_donor_27678_idx.npz
-rw-r--r-- 1 nobody nogroup  78M Oct 26 15:46 test_cite_inputs_day_2_donor_27678_val.sparse.npz
-rw-r--r-- 1 nobody nogroup 1.8M Oct 26 15:46 test_cite_inputs_idx.npz
-rw-r--r-- 1 nobody nogroup 488M Oct 26 15:46 test_cite_inputs_val.sparse.npz
-rw-r--r-- 1 nobody nogroup 8.4M Oct 26 15:46 test_multi_inputs_idx.npz
-rw-r--r-- 1 nobody nogroup 1.7G

## Read Data

In [5]:
%%time
# Load the training input matrix from its SciPy sparse .npz file.
train_inp_val_path = f'{DATA_DIR}/train_cite_inputs_val.sparse.npz'
train_inp_val = scipy.sparse.load_npz(train_inp_val_path)
# Trailing expression: display (n_rows, n_features) as the cell output.
train_inp_val.shape

CPU times: user 13.9 s, sys: 1.58 s, total: 15.4 s
Wall time: 21.1 s


(70988, 22050)

In [6]:
%%time
# Load the training target matrix from its SciPy sparse .npz file.
train_tar_val_path = f'{DATA_DIR}/train_cite_targets_val.sparse.npz'
train_tar_val = scipy.sparse.load_npz(train_tar_val_path)
# Trailing expression: display (n_rows, n_targets) as the cell output.
train_tar_val.shape

CPU times: user 363 ms, sys: 56.2 ms, total: 420 ms
Wall time: 1.08 s


(70988, 140)

## Dimension Reduction

In [7]:
from sklearn.decomposition import PCA, TruncatedSVD

In [8]:
%%time
## Baseline
train_inp_tsvd = TruncatedSVD(n_components=64, 
                              random_state=42)

train_inp_val = train_inp_tsvd.fit_transform(train_inp_val)
print('Sum of Explained Variance: ',
      np.sum(train_inp_tsvd.explained_variance_ratio_))

gc.collect()

Sum of Explained Variance:  0.12513325
CPU times: user 2min 25s, sys: 3.47 s, total: 2min 28s
Wall time: 2min 23s


21

In [9]:
%%time
train_tar_val = train_tar_val.todense()
gc.collect()

CPU times: user 160 ms, sys: 42 ms, total: 202 ms
Wall time: 200 ms


21

## Modeling

### Evaluation Metric

In [10]:
def correlation_score(y_true, y_pred):
    """Score predictions as the mean per-row Pearson correlation.

    Scores the predictions according to the competition rules: for every
    sample (row) the Pearson correlation between the true and predicted
    values is computed, and the correlations are averaged over all rows.

    Parameters
    ----------
    y_true, y_pred : pandas.DataFrame or 2-D array-like
        Ground truth and predictions with identical shapes. DataFrames
        (including subclasses) are converted to their underlying arrays so
        rows are addressed by position.

    Returns
    -------
    float
        Average of the row-wise correlation coefficients.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.

    Notes
    -----
    It is assumed that no row is constant; a constant row has an undefined
    correlation and will presumably propagate NaN into the result.
    """
    # isinstance (not an exact type comparison) so DataFrame subclasses are
    # also converted; otherwise integer indexing below would select a
    # *column* by label instead of a row by position.
    if isinstance(y_true, pd.DataFrame): y_true = y_true.values
    if isinstance(y_pred, pd.DataFrame): y_pred = y_pred.values
    if y_true.shape != y_pred.shape: raise ValueError("Shapes are different.")
    corrsum = 0
    for i in range(len(y_true)):
        # Off-diagonal entry of the 2x2 correlation matrix of the two rows.
        corrsum += np.corrcoef(y_true[i], y_pred[i])[1, 0]
    return corrsum / len(y_true)

### Ridge Regression Baseline

In [11]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

In [12]:
%%time
X_train, X_val, y_train, y_val = train_test_split(train_inp_val,
                                                  train_tar_val,
                                                  test_size=0.2,
                                                  random_state=42)

CPU times: user 27.5 ms, sys: 979 µs, total: 28.4 ms
Wall time: 28.5 ms


In [13]:
# The unsplit matrices are not used after the train/validation split; drop
# the references and run the garbage collector to reclaim memory.
del train_inp_val, train_tar_val
gc.collect()

84

In [14]:
%%time
ridge = Ridge(copy_X=False, random_state=42)
ridge.fit(X_train, y_train)

CPU times: user 130 ms, sys: 34.1 ms, total: 165 ms
Wall time: 74.6 ms


Ridge(copy_X=False, random_state=42)

In [15]:
%%time
# Mean per-row Pearson correlation of the fitted model on the training split.
correlation_score(y_train, ridge.predict(X_train))

CPU times: user 6.41 s, sys: 237 ms, total: 6.65 s
Wall time: 6.24 s


0.20340553575278125

In [16]:
%%time
# Mean per-row Pearson correlation of the fitted model on the held-out split.
correlation_score(y_val, ridge.predict(X_val))

CPU times: user 1.75 s, sys: 224 ms, total: 1.98 s
Wall time: 1.6 s


0.8903543005715396

In [17]:
# The split arrays are not used again below; drop them and reclaim memory
# before the test inputs are loaded.
del X_train, X_val, y_train, y_val
gc.collect()

84

## Prediction 

In [18]:
%%time
# Load the test input matrix from its SciPy sparse .npz file.
test_inp_val_path = f'{DATA_DIR}/test_cite_inputs_val.sparse.npz'
test_inp_val = scipy.sparse.load_npz(test_inp_val_path)
# Trailing expression: display (n_rows, n_features) as the cell output.
test_inp_val.shape

CPU times: user 9.56 s, sys: 658 ms, total: 10.2 s
Wall time: 14 s


(48663, 22050)

In [19]:
%%time
# Project the test inputs with the SVD that was fitted on the training inputs
# (transform only, no refit) so both live in the same reduced space.
# Note: this rebinds test_inp_val to the reduced dense matrix.
test_inp_val = train_inp_tsvd.transform(test_inp_val)
test_inp_val.shape

CPU times: user 5.75 s, sys: 5.04 ms, total: 5.76 s
Wall time: 5.76 s


(48663, 64)

In [20]:
%%time
# Predict all targets for every test row with the fitted ridge model, then
# release the reduced test inputs, which are not used again.
test_tar_preds = ridge.predict(test_inp_val)
del test_inp_val
gc.collect()

CPU times: user 388 ms, sys: 212 ms, total: 600 ms
Wall time: 191 ms


103

## Creating Submission

### Original Dataset

In [21]:
# Labels for the prediction matrix: column names are read from the *training*
# targets index file, row ids from the test inputs index file.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which
# can execute arbitrary code; acceptable only for trusted files.
test_tar_cols = np.load(f'{DATA_DIR}/train_cite_targets_idx.npz',
                        allow_pickle=True)['columns']
test_tar_idx = np.load(f'{DATA_DIR}/test_cite_inputs_idx.npz',
                       allow_pickle=True)['index']
# Sanity check: the label lengths should match the two prediction-matrix axes.
test_tar_cols.shape, test_tar_idx.shape, test_tar_preds.shape

((140,), (48663,), (48663, 140))

In [22]:
# Path of a scratch directory, relative to the notebook's working directory.
TMP_DIR = '../tmp'

In [23]:
# Create the scratch directory named by TMP_DIR instead of repeating the
# literal path. exist_ok=True makes the cell safe to re-run when the
# directory already exists (a plain `mkdir` errors in that case).
os.makedirs(TMP_DIR, exist_ok=True)

In [24]:
%%time
test_preds = {cell_id: {gene_id: test_tar_preds[i][j]
                       for j, gene_id in enumerate(test_tar_cols, 0)}
              for i, cell_id in enumerate(test_tar_idx, 0)}


CPU times: user 2.58 s, sys: 246 ms, total: 2.83 s
Wall time: 2.84 s


In [25]:
# The predictions and their labels now live in test_preds; drop the source
# arrays and reclaim memory before the evaluation ids are loaded.
del test_tar_cols, test_tar_idx, test_tar_preds
gc.collect()

21

In [26]:
%%time
# Load the table listing every (cell_id, gene_id) pair that must be
# predicted, keyed by row_id.
eval_ids = pd.read_parquet(f'{DATA_DIR}/evaluation_ids.parquet')
eval_ids.head()

CPU times: user 13.3 s, sys: 5.9 s, total: 19.2 s
Wall time: 14.5 s


Unnamed: 0,row_id,cell_id,gene_id
0,0,c2150f55becb,CD86
1,1,c2150f55becb,CD274
2,2,c2150f55becb,CD270
3,3,c2150f55becb,CD155
4,4,c2150f55becb,CD112


In [27]:
%%time
eval_ids['target'] = eval_ids.apply(lambda x: test_preds[x['cell_id']][x['gene_id']]
                                    if x['cell_id'] in test_preds.keys() else 0,
                                    axis=1)

CPU times: user 12min 24s, sys: 14.6 s, total: 12min 39s
Wall time: 12min 38s


In [28]:
# Write the submission file: the id columns used for the lookup are dropped,
# so the remaining columns (row_id, target) are written, without the
# DataFrame index.
eval_ids.drop(columns=['cell_id', 'gene_id']).to_csv('cite_sub.csv', index=False)