In [1]:
import os
import os.path
import gc
import numpy as np
import pandas as pd
import scipy.sparse
from tqdm import tqdm

In [2]:
import warnings 
# Suppress every warning for the rest of the notebook.
# NOTE(review): this also hides deprecation/performance warnings raised by
# the libraries used below - consider narrowing the filter.
warnings.filterwarnings('ignore')

In [3]:
import pickle

def dump_pickle(file, filename):
    """Serialize an object to disk with pickle.

    Args:
        file: The picklable Python object to store (despite the name this
            is the payload, not a file handle).
        filename: Path of the output file; it is opened in ``'wb'`` mode,
            so an existing file is overwritten.

    Returns:
        None.
    """
    # The context manager closes the handle even when pickle.dump raises,
    # which the manual open()/close() pair did not guarantee.
    with open(filename, 'wb') as outfile:
        pickle.dump(file, outfile)

def load_pickle(filename):
    """Load and return the object stored in a pickle file.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The unpickled Python object.

    Note:
        Unpickling can execute arbitrary code; only call this on files
        that come from a trusted source.
    """
    # The context manager closes the handle even when pickle.load raises.
    with open(filename, 'rb') as infile:
        return pickle.load(infile)

In [4]:
# Folder the raw .h5 inputs/targets are read from in the "Read Data" cells.
# NOTE(review): DATA_DIR is re-pointed to another dataset further down.
DATA_DIR = '../input/open-problems-multimodal'
%ls $DATA_DIR -lh

total 27G
-rw-r--r-- 1 nobody nogroup 2.3G Sep  7 19:51 evaluation_ids.csv
-rw-r--r-- 1 nobody nogroup 9.4M Sep  7 19:50 metadata.csv
-rw-r--r-- 1 nobody nogroup 230K Sep  7 19:50 metadata_cite_day_2_donor_27678.csv
-rw-r--r-- 1 nobody nogroup 805M Sep  7 19:50 sample_submission.csv
-rw-r--r-- 1 nobody nogroup 1.6G Sep  7 19:51 test_cite_inputs.h5
-rw-r--r-- 1 nobody nogroup 294M Sep  7 19:50 test_cite_inputs_day_2_donor_27678.h5
-rw-r--r-- 1 nobody nogroup 6.1G Sep  7 19:52 test_multi_inputs.h5
-rw-r--r-- 1 nobody nogroup 2.4G Sep  7 19:51 train_cite_inputs.h5
-rw-r--r-- 1 nobody nogroup  37M Sep  7 19:50 train_cite_targets.h5
-rw-r--r-- 1 nobody nogroup  11G Sep  7 19:52 train_multi_inputs.h5
-rw-r--r-- 1 nobody nogroup 3.0G Sep  7 19:51 train_multi_targets.h5


## Read Data

In [5]:
%%time
# Load the training inputs and remember their column labels.
train_inp = pd.read_hdf(f'{DATA_DIR}/train_cite_inputs.h5')
train_inp_cols = train_inp.columns

CPU times: user 28.6 s, sys: 5.7 s, total: 34.3 s
Wall time: 52.8 s


In [6]:
%%time
# Load the test inputs (same file family as the training inputs).
test_inp = pd.read_hdf(f'{DATA_DIR}/test_cite_inputs.h5')

CPU times: user 20.4 s, sys: 3.66 s, total: 24.1 s
Wall time: 38.3 s


In [7]:
%%time
# Load the training targets and remember their column labels.
train_tar = pd.read_hdf(f'{DATA_DIR}/train_cite_targets.h5')
train_tar_cols = train_tar.columns

CPU times: user 178 ms, sys: 50.2 ms, total: 229 ms
Wall time: 714 ms


## Feature Engineering

### Remove Features

Find and remove columns that hold a single constant value (e.g. all zeroes) in either the train or the test inputs

In [8]:
%%time
# Collect every input column that holds a single distinct value in either
# the train or the test frame.
# NOTE: the message printed below says "zero values only", but the check is
# "constant column" - it does not look at which value is repeated.
zero_cols = []
# tqdm (imported at the top) replaces the hand-rolled "print every 1000"
# progress counter.
for col in tqdm(train_inp_cols, desc='Scanning columns'):
    # nunique(dropna=False) == len(unique()): NaN counts as a value.
    constant_in_train = train_inp[col].nunique(dropna=False) == 1
    if constant_in_train or test_inp[col].nunique(dropna=False) == 1:
        zero_cols.append(col)
print('Number of columns with zero values only (Train or Test):',
      len(zero_cols))

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
Number of columns with zero values only (Train or Test): 1194
CPU times: user 40.6 s, sys: 422 ms, total: 41 s
Wall time: 41 s


In [9]:
%%time
# Remove the constant columns from both frames, then refresh the stored
# column labels so later cells see the reduced feature set.
train_inp = train_inp.drop(columns=zero_cols)
test_inp = test_inp.drop(columns=zero_cols)
train_inp_cols = train_inp.columns
(train_inp.shape, test_inp.shape)

CPU times: user 1.29 s, sys: 1.98 s, total: 3.27 s
Wall time: 3.25 s


((70988, 20856), (48663, 20856))

### Introduce Metadata

In [10]:
# Re-point DATA_DIR at the dataset holding the parquet/npz files; the
# metadata below is read from here.
# NOTE(review): this rebinds the DATA_DIR used by the "Read Data" cells,
# so re-running those cells afterwards would look in the wrong folder.
DATA_DIR = '../input/msci-h5-sparse-transform'
%ls $DATA_DIR -lh

total 7.1G
-rw-r--r-- 1 nobody nogroup  25K Oct 26 15:46 __notebook__.ipynb
-rw-r--r-- 1 nobody nogroup  25K Oct 26 15:46 __output__.json
-rw-r--r-- 1 nobody nogroup 293K Oct 26 15:46 __results__.html
-rw-r--r-- 1 nobody nogroup    0 Oct 26 15:46 custom.css
-rw-r--r-- 1 nobody nogroup 359M Oct 26 15:46 evaluation_ids.parquet
-rw-r--r-- 1 nobody nogroup 3.8M Oct 26 15:46 metadata.parquet
-rw-r--r-- 1 nobody nogroup 108K Oct 26 15:46 metadata_cite_day_2_donor_27678.parquet
-rw-r--r-- 1 nobody nogroup 252M Oct 26 15:46 sample_submission.parquet
-rw-r--r-- 1 nobody nogroup 856K Oct 26 15:46 test_cite_inputs_day_2_donor_27678_idx.npz
-rw-r--r-- 1 nobody nogroup  78M Oct 26 15:46 test_cite_inputs_day_2_donor_27678_val.sparse.npz
-rw-r--r-- 1 nobody nogroup 1.8M Oct 26 15:46 test_cite_inputs_idx.npz
-rw-r--r-- 1 nobody nogroup 488M Oct 26 15:46 test_cite_inputs_val.sparse.npz
-rw-r--r-- 1 nobody nogroup 8.4M Oct 26 15:46 test_multi_inputs_idx.npz
-rw-r--r-- 1 nobody nogroup 1.7G

In [11]:
%%time
# Read the full metadata table and keep only the rows whose technology is
# 'citeseq'.
meta_data = pd.read_parquet(f'{DATA_DIR}/metadata.parquet')
meta_data_cite = meta_data.loc[meta_data['technology'].eq('citeseq')]
print(meta_data_cite.shape)
meta_data.head()

(119651, 5)
CPU times: user 211 ms, sys: 109 ms, total: 320 ms
Wall time: 505 ms


Unnamed: 0,cell_id,day,donor,cell_type,technology
0,c2150f55becb,2,27678,HSC,citeseq
1,65b7edf8a4da,2,27678,HSC,citeseq
2,c1b26cb1057b,2,27678,EryP,citeseq
3,917168fa6f83,2,27678,NeuP,citeseq
4,2b29feeca86d,2,27678,EryP,citeseq


In [12]:
# Only the filtered meta_data_cite frame is used from here on.
del meta_data
gc.collect()

21

In [13]:
from sklearn.preprocessing import OneHotEncoder

In [14]:
# One-hot encode the cell type of every row in meta_data_cite and append
# the indicator columns to that frame.
ohe = OneHotEncoder()
cell_type_df = pd.DataFrame(
    ohe.fit_transform(meta_data_cite[['cell_type']]).toarray(),
    # Reuse the metadata index: pd.concat(axis=1) aligns on index labels,
    # and meta_data_cite is a filtered frame whose labels are only 0..n-1
    # if the kept rows happen to be the first n rows of the source table.
    index=meta_data_cite.index,
    # Same 'x0_<category>' labels that OneHotEncoder.get_feature_names()
    # produced, built from categories_ because that method was deprecated
    # and later removed from scikit-learn.
    columns=[f'x0_{category}' for category in ohe.categories_[0]],
)
meta_data_cite = pd.concat([meta_data_cite, cell_type_df], axis=1)
meta_data_cite.head()

Unnamed: 0,cell_id,day,donor,cell_type,technology,x0_BP,x0_EryP,x0_HSC,x0_MasP,x0_MkP,x0_MoP,x0_NeuP
0,c2150f55becb,2,27678,HSC,citeseq,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,65b7edf8a4da,2,27678,HSC,citeseq,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,c1b26cb1057b,2,27678,EryP,citeseq,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,917168fa6f83,2,27678,NeuP,citeseq,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,2b29feeca86d,2,27678,EryP,citeseq,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# The indicator columns now live in meta_data_cite.
del cell_type_df
gc.collect()

21

In [16]:
# Keep only the one-hot cell-type columns and key the frame by cell id.
# cell_ids stays a separate name because a later cell deletes it.
cell_ids = meta_data_cite['cell_id']
meta_data_cite = meta_data_cite.drop(
    columns=['day', 'donor', 'cell_type', 'technology', 'cell_id'])
meta_data_cite.index = cell_ids

In [17]:
# Names of the metadata feature columns (everything except 'cell_id').
meta_data_cols = list(filter(lambda name: name != 'cell_id',
                             meta_data_cite.columns))
meta_data_cols

['x0_BP', 'x0_EryP', 'x0_HSC', 'x0_MasP', 'x0_MkP', 'x0_MoP', 'x0_NeuP']

In [18]:
# Nested lookup keyed by the frame's index labels (the cell ids assigned
# as index above): {cell_id: {column_name: value}}.
meta_data_cite_dict = meta_data_cite.to_dict('index')
# The frame and the id series are replaced by the dict from here on.
del meta_data_cite, cell_ids
gc.collect()

0

In [19]:
# Shapes before the metadata columns are appended.
train_inp.shape, test_inp.shape

((70988, 20856), (48663, 20856))

In [20]:
# Append each one-hot cell-type column to both input frames by looking up
# every row label (cell id) in the metadata dict; stored as float32.
for col in meta_data_cols:
    train_inp[col] = pd.Series(
        [meta_data_cite_dict[cell_id][col] for cell_id in train_inp.index],
        index=train_inp.index,
        dtype='float32',
    )
    test_inp[col] = pd.Series(
        [meta_data_cite_dict[cell_id][col] for cell_id in test_inp.index],
        index=test_inp.index,
        dtype='float32',
    )

In [21]:
# The lookup dict has been copied into the input frames.
del meta_data_cite_dict
gc.collect()

63

### Dimension Reduction

Find input columns whose name contains the name of a target column; these inputs are presumably related to the corresponding targets

In [22]:
%%time
# For every target column, collect each input column whose name contains
# the target name as a substring (target-major order, duplicates kept).
# NOTE(review): a substring test can also hit unrelated names (a short
# target name inside a longer input name) and can list one input column
# more than once - confirm this is intended.
same_name_cols = [
    inp_col
    for tar_col in train_tar_cols
    for inp_col in train_inp_cols
    if tar_col in inp_col
]
print('Number of target columns shown in training columns:',
      len(same_name_cols))

Number of target columns shown in training columns: 144
CPU times: user 516 ms, sys: 0 ns, total: 516 ms
Wall time: 516 ms


Keep identical name columns and transform the rest to sparse matrices

In [23]:
# Dense copies of the name-matched columns; they bypass the SVD and are
# stacked back onto the reduced features later.
train_inp_sn = train_inp[same_name_cols].to_numpy()
test_inp_sn = test_inp[same_name_cols].to_numpy()
(train_inp_sn.shape, test_inp_sn.shape)

((70988, 144), (48663, 144))

In [24]:
# Dense copies of the one-hot cell-type columns; they also bypass the SVD.
train_inp_ct = train_inp[meta_data_cols].to_numpy()
test_inp_ct = test_inp[meta_data_cols].to_numpy()
(train_inp_ct.shape, test_inp_ct.shape)

((70988, 7), (48663, 7))

In [25]:
%%time
# Everything that was extracted above is removed, and what is left of each
# frame is converted to a CSR matrix for the SVD.
extracted_cols = list(same_name_cols) + list(meta_data_cols)
train_inp = scipy.sparse.csr_matrix(
    train_inp.drop(columns=extracted_cols).values)
test_inp = scipy.sparse.csr_matrix(
    test_inp.drop(columns=extracted_cols).values)
gc.collect()

CPU times: user 1min 15s, sys: 7.46 s, total: 1min 23s
Wall time: 1min 23s


84

Reduce dimension

In [26]:
from sklearn.decomposition import PCA, TruncatedSVD

In [27]:
%%time
## Baseline
# Fit a 512-component truncated SVD on the sparse training matrix only;
# the fitted model is reused on the test matrix in the next cell.
train_inp_tsvd = TruncatedSVD(n_components=512, 
                              random_state=42)

# train_inp is rebound from the sparse matrix to its dense SVD projection.
train_inp = train_inp_tsvd.fit_transform(train_inp)
print('Sum of Explained Variance: ',
      np.sum(train_inp_tsvd.explained_variance_ratio_))
# Final feature layout: [SVD components | name-matched columns | cell type].
train_inp = np.hstack([train_inp, train_inp_sn, train_inp_ct])
del train_inp_sn
# del train_inp_ct
gc.collect()

Sum of Explained Variance:  0.19528116
CPU times: user 21min 25s, sys: 14.1 s, total: 21min 39s
Wall time: 21min 14s


21

In [28]:
# Project the test matrix with the SVD fitted on the training data, then
# stack the same extra blocks in the same order as for the training set.
test_inp = train_inp_tsvd.transform(test_inp)
test_inp = np.hstack([test_inp, test_inp_sn, test_inp_ct])
del test_inp_sn
# del test_inp_ct
gc.collect()

21

In [29]:
# Both matrices must end up with the same number of feature columns.
train_inp.shape, test_inp.shape

((70988, 663), (48663, 663))

## Modeling

### Loss Function

In [30]:
def correlation_score(y_true, y_pred):
    """Score predictions as the mean row-wise Pearson correlation.

    Args:
        y_true: 2-D array-like (ndarray, DataFrame or nested sequence) of
            ground-truth values, one sample per row.
        y_pred: 2-D array-like of predictions with the same shape.

    Returns:
        The average, over all rows, of the Pearson correlation coefficient
        between the true row and the predicted row.

    Raises:
        ValueError: If the two inputs do not have the same shape.

    Note:
        Rows are assumed to be non-constant; np.corrcoef yields NaN for a
        constant row, which would propagate into the mean.
    """
    # np.asarray accepts DataFrames (including subclasses), ndarrays and
    # plain nested lists, unlike the former exact type() comparison.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape: raise ValueError("Shapes are different.")
    corrsum = 0
    for true_row, pred_row in zip(y_true, y_pred):
        corrsum += np.corrcoef(true_row, pred_row)[1, 0]
    return corrsum / len(y_true)

### Modeling

In [31]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler
from sklearn.model_selection import train_test_split

In [32]:
%%time
# Hold out 20% of the training rows for validation; the fixed random_state
# keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(train_inp,
                                                  train_tar,
                                                  test_size=0.2,
                                                  random_state=42)

CPU times: user 103 ms, sys: 38 ms, total: 141 ms
Wall time: 140 ms


In [33]:
# Drop the unsplit names now that the train/validation parts exist.
del train_inp, train_tar
gc.collect()

42

In [34]:
# Learn the scaling statistics from the training split only, then apply
# the same transform to both splits (and later to the test set).
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_val = sc.transform(X_val)

### KNN

In [35]:
# %%time
# knn = KNeighborsRegressor(n_neighbors=9)
# knn.fit(norm.transform(X_train), y_train)

In [36]:
# %%time
# # dim=128; w/o cell type; Res: .9080 
# # dim=128; w/o cell type; Normalizer; Res: .9080
# # dim=128; w/o cell type; Standard Scaler; Res: .8946
# # dim=128; w/o cell type; MinMax Scaler; Res: .8885
# # dim=128; w/ cell type; Res: .9079

# # correlation_score(y_train, knn.predict(X_train))
# correlation_score(y_train, knn.predict(norm.transform(X_train)))

In [37]:
# %%time
# # dim=128; w/o cell type; Res: .8858 
# # dim=128; w/o cell type; Normalizer; Res: .8863
# # dim=128; w/o cell type; Standard Scaler; Res: .8713
# # dim=128; w/o cell type; MinMax Scaler; Res: .8624
# # dim=128; w/ cell type; Res: .8857

# # correlation_score(y_val, knn.predict(X_val))
# correlation_score(y_val, knn.predict(norm.transform(X_val)))

### Ridge

In [38]:
%%time
# Ridge regression with default regularisation, fitted on the scaled
# training split; the bare name keeps the estimator repr as cell output.
ridge = Ridge(random_state=42).fit(X_train, y_train)
ridge

CPU times: user 1.66 s, sys: 270 ms, total: 1.93 s
Wall time: 625 ms


Ridge(random_state=42)

In [39]:
%%time
# Score on the training split.
# Logged result: dim=128; w/ cell type; SC; Res .8933
# NOTE(review): the logged line presumably comes from an earlier run - the
# SVD above is configured with n_components=512; confirm before citing it.
correlation_score(y_train, ridge.predict(X_train))

CPU times: user 6.55 s, sys: 237 ms, total: 6.79 s
Wall time: 6.12 s


0.8947031991231116

In [40]:
%%time
# Score on the held-out validation split.
# Logged result: dim=128; w/ cell type; SC; Res: .8930
# NOTE(review): the logged line presumably comes from an earlier run - the
# SVD above is configured with n_components=512; confirm before citing it.

correlation_score(y_val, ridge.predict(X_val))

CPU times: user 1.8 s, sys: 227 ms, total: 2.03 s
Wall time: 1.57 s


0.8929819672008326

In [41]:
# The splits are not needed for prediction; only ridge and sc are kept.
del X_train, X_val, y_train, y_val
gc.collect()

168

## Prediction 

In [42]:
%%time
# Scale the test features with the scaler fitted on the training split,
# then predict all target columns at once.
test_tar_preds = ridge.predict(sc.transform(test_inp))
test_tar_preds.shape

CPU times: user 613 ms, sys: 82 ms, total: 695 ms
Wall time: 337 ms


(48663, 140)

In [43]:
# Predictions are stored in test_tar_preds; the features can be released.
del test_inp
gc.collect()

42

## Creating Submission

In [44]:
# Folder with the npz/parquet files used to build the submission.
# NOTE(review): same value as the assignment in the "Introduce Metadata"
# section; kept so this section can be read on its own.
DATA_DIR = '../input/msci-h5-sparse-transform'
%ls $DATA_DIR -lh

total 7.1G
-rw-r--r-- 1 nobody nogroup  25K Oct 26 15:46 __notebook__.ipynb
-rw-r--r-- 1 nobody nogroup  25K Oct 26 15:46 __output__.json
-rw-r--r-- 1 nobody nogroup 293K Oct 26 15:46 __results__.html
-rw-r--r-- 1 nobody nogroup    0 Oct 26 15:46 custom.css
-rw-r--r-- 1 nobody nogroup 359M Oct 26 15:46 evaluation_ids.parquet
-rw-r--r-- 1 nobody nogroup 3.8M Oct 26 15:46 metadata.parquet
-rw-r--r-- 1 nobody nogroup 108K Oct 26 15:46 metadata_cite_day_2_donor_27678.parquet
-rw-r--r-- 1 nobody nogroup 252M Oct 26 15:46 sample_submission.parquet
-rw-r--r-- 1 nobody nogroup 856K Oct 26 15:46 test_cite_inputs_day_2_donor_27678_idx.npz
-rw-r--r-- 1 nobody nogroup  78M Oct 26 15:46 test_cite_inputs_day_2_donor_27678_val.sparse.npz
-rw-r--r-- 1 nobody nogroup 1.8M Oct 26 15:46 test_cite_inputs_idx.npz
-rw-r--r-- 1 nobody nogroup 488M Oct 26 15:46 test_cite_inputs_val.sparse.npz
-rw-r--r-- 1 nobody nogroup 8.4M Oct 26 15:46 test_multi_inputs_idx.npz
-rw-r--r-- 1 nobody nogroup 1.7G

In [45]:
# Load the target column labels and the test row labels (cell ids) that
# give meaning to the axes of test_tar_preds.
# NOTE: allow_pickle=True unpickles object arrays, which can run arbitrary
# code - only use it on files from a trusted source.
# The context managers close the underlying .npz archives; a bare
# np.load(...)[key] leaves the file handle open.
with np.load(f'{DATA_DIR}/train_cite_targets_idx.npz',
             allow_pickle=True) as targets_idx_file:
    test_tar_cols = targets_idx_file['columns']
with np.load(f'{DATA_DIR}/test_cite_inputs_idx.npz',
             allow_pickle=True) as inputs_idx_file:
    test_tar_idx = inputs_idx_file['index']

# The submission maps (cell_id, gene_id) to a position in test_tar_preds,
# so the loaded labels must match the prediction matrix exactly.
assert test_tar_preds.shape == (len(test_tar_idx), len(test_tar_cols)), \
    'Loaded row/column labels do not match the shape of the predictions.'
assert list(test_tar_cols) == list(train_tar_cols), \
    'Loaded target columns differ from the columns the model was fit on.'
test_tar_cols.shape, test_tar_idx.shape, test_tar_preds.shape

((140,), (48663,), (48663, 140))

In [46]:
%%time
# Read the evaluation id table and store both id columns as categoricals.
print('Start Eval...')
eval_ids = pd.read_parquet(f'{DATA_DIR}/evaluation_ids.parquet')
for id_col in ('cell_id', 'gene_id'):
    eval_ids[id_col] = eval_ids[id_col].astype(pd.CategoricalDtype())

Start Eval...
CPU times: user 28.8 s, sys: 10.6 s, total: 39.4 s
Wall time: 35.1 s


In [47]:
%%time
# Empty float32 series named 'target' with one entry per evaluation row,
# indexed by a MultiIndex built from all columns of eval_ids. No data is
# passed, so every value starts out as NaN and is filled in below.
sub = pd.Series(name='target',
                index=pd.MultiIndex.from_frame(eval_ids), 
                dtype=np.float32)
sub

CPU times: user 19.7 s, sys: 3.49 s, total: 23.2 s
Wall time: 23.2 s


row_id    cell_id       gene_id        
0         c2150f55becb  CD86              NaN
1         c2150f55becb  CD274             NaN
2         c2150f55becb  CD270             NaN
3         c2150f55becb  CD155             NaN
4         c2150f55becb  CD112             NaN
                                           ..
65744175  2c53aa67933d  ENSG00000134419   NaN
65744176  2c53aa67933d  ENSG00000186862   NaN
65744177  2c53aa67933d  ENSG00000170959   NaN
65744178  2c53aa67933d  ENSG00000107874   NaN
65744179  2c53aa67933d  ENSG00000166012   NaN
Name: target, Length: 65744180, dtype: float32

In [48]:
# Label -> position lookups for the rows (cell ids) and the columns
# (gene ids) of the prediction matrix.
cell_id_dict = dict(zip(test_tar_idx, range(len(test_tar_idx))))
gene_id_dict = dict(zip(test_tar_cols, range(len(test_tar_cols))))

In [49]:
# Translate every evaluation row into (row, column) positions of the
# prediction matrix; -1 marks an id the matrix does not contain.
eid_cid_idx = eval_ids['cell_id'].apply(
    lambda cell: cell_id_dict.get(cell, -1))
eid_gid_idx = eval_ids['gene_id'].apply(
    lambda gene: gene_id_dict.get(gene, -1))
# A row is usable only when both of its ids were found.
valid_cite_rows = eid_cid_idx.ne(-1) & eid_gid_idx.ne(-1)

In [50]:
%%time
# Copy the prediction for every evaluation row whose cell id and gene id
# were both found; all other rows keep their NaN placeholder.
# iloc is documented to take a boolean *array*; a boolean Series mask is
# not part of that contract, so the mask is converted to NumPy first.
sub.iloc[valid_cite_rows.to_numpy()] = test_tar_preds\
                             [eid_cid_idx[valid_cite_rows].to_numpy(),
                              eid_gid_idx[valid_cite_rows].to_numpy()]

CPU times: user 282 ms, sys: 162 ms, total: 444 ms
Wall time: 443 ms


In [51]:
# The lookup helpers are no longer needed once sub has been filled.
del eval_ids, test_tar_idx, test_tar_cols
del eid_cid_idx, eid_gid_idx, valid_cite_rows
gc.collect()

97

In [52]:
# Rows that received no prediction are written as 0. The index levels
# become ordinary columns, and the two id columns are left out of the CSV.
sub = sub.to_frame().fillna(0).reset_index()
sub.drop(columns=['cell_id', 'gene_id']).to_csv('cite_sub.csv', index=False)
sub.head()

Unnamed: 0,row_id,cell_id,gene_id,target
0,0,c2150f55becb,CD86,0.352822
1,1,c2150f55becb,CD274,0.506273
2,2,c2150f55becb,CD270,0.85715
3,3,c2150f55becb,CD155,4.582359
4,4,c2150f55becb,CD112,5.47989
