In [1]:
import os
import os.path
import gc
import numpy as np
import pandas as pd
import scipy.sparse
from tqdm import tqdm

In [2]:
import warnings 
warnings.filterwarnings('ignore')

In [3]:
import pickle

def dump_pickle(file, filename):
    outfile = open(filename, 'wb')
    pickle.dump(file, outfile)
    outfile.close()

def load_pickle(filename):
    infile = open(filename, 'rb')
    file = pickle.load(infile)
    infile.close()
    return file

In [4]:
DATA_DIR = '../input/open-problems-multimodal'
%ls $DATA_DIR -lh

total 27G
-rw-r--r-- 1 nobody nogroup 2.3G Sep  7 19:51 evaluation_ids.csv
-rw-r--r-- 1 nobody nogroup 9.4M Sep  7 19:50 metadata.csv
-rw-r--r-- 1 nobody nogroup 230K Sep  7 19:50 metadata_cite_day_2_donor_27678.csv
-rw-r--r-- 1 nobody nogroup 805M Sep  7 19:50 sample_submission.csv
-rw-r--r-- 1 nobody nogroup 1.6G Sep  7 19:51 test_cite_inputs.h5
-rw-r--r-- 1 nobody nogroup 294M Sep  7 19:50 test_cite_inputs_day_2_donor_27678.h5
-rw-r--r-- 1 nobody nogroup 6.1G Sep  7 19:52 test_multi_inputs.h5
-rw-r--r-- 1 nobody nogroup 2.4G Sep  7 19:51 train_cite_inputs.h5
-rw-r--r-- 1 nobody nogroup  37M Sep  7 19:50 train_cite_targets.h5
-rw-r--r-- 1 nobody nogroup  11G Sep  7 19:52 train_multi_inputs.h5
-rw-r--r-- 1 nobody nogroup 3.0G Sep  7 19:51 train_multi_targets.h5


## Read Data

In [5]:
%%time
train_inp = pd.read_hdf(f'{DATA_DIR}/train_cite_inputs.h5')

CPU times: user 18.2 s, sys: 2.93 s, total: 21.1 s
Wall time: 42.8 s


In [6]:
%%time
train_tar = pd.read_hdf(f'{DATA_DIR}/train_cite_targets.h5')
train_tar_cols = train_tar.columns

CPU times: user 143 ms, sys: 28.4 ms, total: 172 ms
Wall time: 587 ms


## Modeling

### Loss Function

In [7]:
def correlation_score(y_true, y_pred):
    """Scores the predictions according to the competition rules. 
    It is assumed that the predictions are not constant.
    Returns the average of each sample's Pearson correlation coefficient"""
    if type(y_true) == pd.DataFrame: y_true = y_true.values
    if type(y_pred) == pd.DataFrame: y_pred = y_pred.values
    if y_true.shape != y_pred.shape: raise ValueError("Shapes are different.")
    corrsum = 0
    for i in range(len(y_true)):
        corrsum += np.corrcoef(y_true[i], y_pred[i])[1, 0]
    return corrsum / len(y_true)

### Modeling

In [8]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split

In [9]:
%%time
X_train, X_val, y_train, y_val = train_test_split(train_inp,
                                                  train_tar,
                                                  test_size=0.2,
                                                  random_state=42)

CPU times: user 6.97 s, sys: 566 ms, total: 7.53 s
Wall time: 7.48 s


In [10]:
del train_inp, train_tar
gc.collect()

42

In [11]:
%%time
knn = KNeighborsRegressor(n_neighbors=9)
knn.fit(X_train, y_train)

CPU times: user 746 ms, sys: 0 ns, total: 746 ms
Wall time: 746 ms


KNeighborsRegressor(n_neighbors=9)

In [12]:
%%time
correlation_score(y_train, knn.predict(X_train))

CPU times: user 1h 49min 38s, sys: 5min 1s, total: 1h 54min 39s
Wall time: 30min 6s


0.8829513861350354

In [13]:
%%time
correlation_score(y_val, knn.predict(X_val))

CPU times: user 27min 30s, sys: 1min 16s, total: 28min 47s
Wall time: 7min 33s


0.8604450560781888

In [14]:
del X_train, X_val, y_train, y_val
gc.collect()

21

## Prediction 

In [15]:
%%time
test_inp = pd.read_hdf(f'{DATA_DIR}/test_cite_inputs.h5')
test_inp_idx = test_inp.index

CPU times: user 12.8 s, sys: 1.84 s, total: 14.6 s
Wall time: 29.4 s


In [16]:
%%time
test_tar_preds = knn.predict(test_inp)
test_tar_preds.shape

CPU times: user 1h 33min 59s, sys: 4min 12s, total: 1h 38min 12s
Wall time: 25min 45s


(48663, 140)

In [17]:
del test_inp
gc.collect()

121

## Creating Submission

In [18]:
%%time
test_preds = {cell_id: {gene_id: test_tar_preds[i][j]
                       for j, gene_id in enumerate(train_tar_cols, 0)}
              for i, cell_id in enumerate(test_inp_idx, 0)}

CPU times: user 2.64 s, sys: 440 ms, total: 3.08 s
Wall time: 3.08 s


In [19]:
del test_tar_preds
gc.collect()

21

In [20]:
%%time
eval_ids = pd.read_parquet(f'../input/msci-h5-sparse-transform/evaluation_ids.parquet')
eval_ids.head()

CPU times: user 10.3 s, sys: 3.2 s, total: 13.5 s
Wall time: 11.5 s


Unnamed: 0,row_id,cell_id,gene_id
0,0,c2150f55becb,CD86
1,1,c2150f55becb,CD274
2,2,c2150f55becb,CD270
3,3,c2150f55becb,CD155
4,4,c2150f55becb,CD112


In [21]:
%%time
eval_ids['target'] = eval_ids.apply(lambda x: test_preds[x['cell_id']][x['gene_id']]
                                    if x['cell_id'] in test_preds.keys() else 0,
                                    axis=1)

CPU times: user 11min 19s, sys: 7.1 s, total: 11min 26s
Wall time: 11min 26s


In [22]:
eval_ids.drop(['cell_id', 'gene_id'], axis=1)\
        .to_csv('cite_sub.csv', index=False)