In [1]:
import os
import os.path
import gc
import numpy as np
import pandas as pd
import scipy.sparse
from tqdm import tqdm

In [2]:
import warnings 
warnings.filterwarnings('ignore')

In [3]:
import pickle

def dump_pickle(file, filename):
    outfile = open(filename, 'wb')
    pickle.dump(file, outfile)
    outfile.close()

def load_pickle(filename):
    infile = open(filename, 'rb')
    file = pickle.load(infile)
    infile.close()
    return file

In [4]:
DATA_DIR = '../input/open-problems-multimodal'
%ls $DATA_DIR -lh

total 27G
-rw-r--r-- 1 nobody nogroup 2.3G Sep  7 19:44 evaluation_ids.csv
-rw-r--r-- 1 nobody nogroup 9.4M Sep  7 19:43 metadata.csv
-rw-r--r-- 1 nobody nogroup 230K Sep  7 19:43 metadata_cite_day_2_donor_27678.csv
-rw-r--r-- 1 nobody nogroup 805M Sep  7 19:43 sample_submission.csv
-rw-r--r-- 1 nobody nogroup 1.6G Sep  7 19:44 test_cite_inputs.h5
-rw-r--r-- 1 nobody nogroup 294M Sep  7 19:43 test_cite_inputs_day_2_donor_27678.h5
-rw-r--r-- 1 nobody nogroup 6.1G Sep  7 19:45 test_multi_inputs.h5
-rw-r--r-- 1 nobody nogroup 2.4G Sep  7 19:44 train_cite_inputs.h5
-rw-r--r-- 1 nobody nogroup  37M Sep  7 19:43 train_cite_targets.h5
-rw-r--r-- 1 nobody nogroup  11G Sep  7 19:46 train_multi_inputs.h5
-rw-r--r-- 1 nobody nogroup 3.0G Sep  7 19:45 train_multi_targets.h5


## Read Data

In [5]:
%%time
train_inp = pd.read_hdf(f'{DATA_DIR}/train_cite_inputs.h5')
train_inp_cols = train_inp.columns

CPU times: user 30.7 s, sys: 7.85 s, total: 38.5 s
Wall time: 58.5 s


In [6]:
%%time
test_inp = pd.read_hdf(f'{DATA_DIR}/test_cite_inputs.h5')

CPU times: user 21.4 s, sys: 4.6 s, total: 26 s
Wall time: 40.7 s


In [7]:
%%time
train_tar = pd.read_hdf(f'{DATA_DIR}/train_cite_targets.h5')
train_tar_cols = train_tar.columns

CPU times: user 233 ms, sys: 81.8 ms, total: 315 ms
Wall time: 702 ms


Find columns with all zeroes

In [8]:
%%time
zero_cols = []
for idx, col in enumerate(train_inp_cols, 0):
    if idx % 1000 == 0:
        print(idx)
    if len(train_inp[col].unique()) == 1 or len(test_inp[col].unique()) == 1:
        zero_cols.append(col)
print('Number of columns with zero values only (Train or Test):', 
      len(zero_cols))

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
Number of columns with zero values only (Train or Test): 1194
CPU times: user 41.2 s, sys: 418 ms, total: 41.7 s
Wall time: 41.7 s


In [9]:
%%time
train_inp = train_inp.drop(zero_cols, axis=1)
train_inp_cols = train_inp.columns
test_inp = test_inp.drop(zero_cols, axis=1)
train_inp.shape, test_inp.shape

CPU times: user 1.67 s, sys: 3.19 s, total: 4.87 s
Wall time: 4.84 s


((70988, 20856), (48663, 20856))

## Dimension Reduction

Find features in target columns that are related to input columns due to identical names

In [10]:
%%time
same_name_cols = []
for tar_col in train_tar_cols:
    for inp_col in train_inp_cols:
        if tar_col in inp_col:
            same_name_cols.append(inp_col)
print('Number of target columns shown in training columns:', 
      len(same_name_cols))

Number of target columns shown in training columns: 144
CPU times: user 553 ms, sys: 0 ns, total: 553 ms
Wall time: 554 ms


Keep identical name columns and transform the rest to sparse matrices

In [11]:
train_inp_sn = train_inp[same_name_cols].values
test_inp_sn = test_inp[same_name_cols].values

In [12]:
train_inp = scipy.sparse.csr_matrix(train_inp.values)
test_inp = scipy.sparse.csr_matrix(test_inp.values)
gc.collect()

63

Reduce dimension

In [13]:
from sklearn.decomposition import PCA, TruncatedSVD

In [14]:
%%time
## Baseline
train_inp_tsvd = TruncatedSVD(n_components=512, 
                              random_state=42)

train_inp = train_inp_tsvd.fit_transform(train_inp)
print('Sum of Explained Variance: ',
      np.sum(train_inp_tsvd.explained_variance_ratio_))
train_inp = np.hstack([train_inp, train_inp_sn])
del train_inp_sn
gc.collect()

Sum of Explained Variance:  0.19551416
CPU times: user 25min 55s, sys: 15.2 s, total: 26min 11s
Wall time: 25min 47s


21

In [15]:
test_inp = train_inp_tsvd.transform(test_inp)
test_inp = np.hstack([test_inp, test_inp_sn])
del test_inp_sn
gc.collect()

21

## Modeling

### Loss Function

In [16]:
def correlation_score(y_true, y_pred):
    """Scores the predictions according to the competition rules. 
    It is assumed that the predictions are not constant.
    Returns the average of each sample's Pearson correlation coefficient"""
    if type(y_true) == pd.DataFrame: y_true = y_true.values
    if type(y_pred) == pd.DataFrame: y_pred = y_pred.values
    if y_true.shape != y_pred.shape: raise ValueError("Shapes are different.")
    corrsum = 0
    for i in range(len(y_true)):
        corrsum += np.corrcoef(y_true[i], y_pred[i])[1, 0]
    return corrsum / len(y_true)

### Modeling

In [17]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

In [18]:
%%time
X_train, X_val, y_train, y_val = train_test_split(train_inp,
                                                  train_tar,
                                                  test_size=0.2,
                                                  random_state=42)

CPU times: user 104 ms, sys: 38 ms, total: 142 ms
Wall time: 144 ms


In [19]:
del train_inp, train_tar
gc.collect()

84

In [20]:
%%time
ridge = Ridge(copy_X=False, random_state=42)
ridge.fit(X_train, y_train)

CPU times: user 1.53 s, sys: 175 ms, total: 1.71 s
Wall time: 522 ms


Ridge(copy_X=False, random_state=42)

In [21]:
%%time
correlation_score(y_train, ridge.predict(X_train))

CPU times: user 6.64 s, sys: 236 ms, total: 6.87 s
Wall time: 6.17 s


0.22166283035863046

In [22]:
%%time
correlation_score(y_val, ridge.predict(X_val))

CPU times: user 1.79 s, sys: 207 ms, total: 2 s
Wall time: 1.55 s


0.8928016074163969

In [23]:
del X_train, X_val, y_train, y_val
gc.collect()

84

## Prediction 

In [24]:
%%time
test_tar_preds = ridge.predict(test_inp)
test_tar_preds.shape

CPU times: user 436 ms, sys: 44.9 ms, total: 481 ms
Wall time: 142 ms


(48663, 140)

In [25]:
del test_inp
gc.collect()

42

## Creating Submission

In [26]:
DATA_DIR = '../input/msci-h5-sparse-transform'
%ls $DATA_DIR -lh

total 7.1G
-rw-r--r-- 1 nobody nogroup  25K Oct 25 09:30 __notebook__.ipynb
-rw-r--r-- 1 nobody nogroup  25K Oct 25 09:30 __output__.json
-rw-r--r-- 1 nobody nogroup 293K Oct 25 09:30 __results__.html
-rw-r--r-- 1 nobody nogroup    0 Oct 25 09:30 custom.css
-rw-r--r-- 1 nobody nogroup 359M Oct 25 09:30 evaluation_ids.parquet
-rw-r--r-- 1 nobody nogroup 3.8M Oct 25 09:30 metadata.parquet
-rw-r--r-- 1 nobody nogroup 108K Oct 25 09:30 metadata_cite_day_2_donor_27678.parquet
-rw-r--r-- 1 nobody nogroup 252M Oct 25 09:30 sample_submission.parquet
-rw-r--r-- 1 nobody nogroup 856K Oct 25 09:30 test_cite_inputs_day_2_donor_27678_idx.npz
-rw-r--r-- 1 nobody nogroup  78M Oct 25 09:30 test_cite_inputs_day_2_donor_27678_val.sparse.npz
-rw-r--r-- 1 nobody nogroup 1.8M Oct 25 09:30 test_cite_inputs_idx.npz
-rw-r--r-- 1 nobody nogroup 488M Oct 25 09:31 test_cite_inputs_val.sparse.npz
-rw-r--r-- 1 nobody nogroup 8.4M Oct 25 09:30 test_multi_inputs_idx.npz
-rw-r--r-- 1 nobody nogroup 1.7G

In [27]:
test_tar_cols = np.load(f'{DATA_DIR}/train_cite_targets_idx.npz',
                        allow_pickle=True)['columns']
test_tar_idx = np.load(f'{DATA_DIR}/test_cite_inputs_idx.npz',
                       allow_pickle=True)['index']
test_tar_cols.shape, test_tar_idx.shape, test_tar_preds.shape

((140,), (48663,), (48663, 140))

In [28]:
%%time
print('Start Eval...')
eval_ids = pd.read_parquet(f'{DATA_DIR}/evaluation_ids.parquet')
eval_ids.cell_id = eval_ids.cell_id.astype(pd.CategoricalDtype())
eval_ids.gene_id = eval_ids.gene_id.astype(pd.CategoricalDtype())

Start Eval...
CPU times: user 29.6 s, sys: 13.5 s, total: 43.1 s
Wall time: 36 s


In [29]:
%%time
sub = pd.Series(name='target',
                index=pd.MultiIndex.from_frame(eval_ids), 
                dtype=np.float32)
sub

CPU times: user 22.3 s, sys: 3.65 s, total: 25.9 s
Wall time: 25.9 s


row_id    cell_id       gene_id        
0         c2150f55becb  CD86              NaN
1         c2150f55becb  CD274             NaN
2         c2150f55becb  CD270             NaN
3         c2150f55becb  CD155             NaN
4         c2150f55becb  CD112             NaN
                                           ..
65744175  2c53aa67933d  ENSG00000134419   NaN
65744176  2c53aa67933d  ENSG00000186862   NaN
65744177  2c53aa67933d  ENSG00000170959   NaN
65744178  2c53aa67933d  ENSG00000107874   NaN
65744179  2c53aa67933d  ENSG00000166012   NaN
Name: target, Length: 65744180, dtype: float32

In [30]:
cell_id_dict = {cell_id: idx 
                for idx, cell_id in enumerate(test_tar_idx, 0)}
gene_id_dict = {gene_id: idx 
                for idx, gene_id in enumerate(test_tar_cols, 0)}

In [31]:
eid_cid_idx = eval_ids['cell_id']\
              .apply(lambda x: cell_id_dict.get(x, -1))
eid_gid_idx = eval_ids['gene_id']\
              .apply(lambda x: gene_id_dict.get(x, -1))
valid_cite_rows = (eid_cid_idx != -1) & (eid_gid_idx != -1)

In [32]:
%%time
sub.iloc[valid_cite_rows] = test_tar_preds\
                             [eid_cid_idx[valid_cite_rows].to_numpy(),
                              eid_gid_idx[valid_cite_rows].to_numpy()]

CPU times: user 383 ms, sys: 230 ms, total: 613 ms
Wall time: 613 ms


In [33]:
del eval_ids, test_tar_idx, test_tar_cols
del eid_cid_idx, eid_gid_idx, valid_cite_rows
gc.collect()

97

In [34]:
sub = pd.DataFrame(sub).fillna(0).reset_index()
sub.drop(['cell_id', 'gene_id'], axis=1)\
   .to_csv('cite_sub.csv', index=False)