In [1]:
import os
import os.path
import gc
import numpy as np
import pandas as pd
import scipy.sparse
from tqdm import tqdm

In [2]:
import warnings 
warnings.filterwarnings('ignore')

In [3]:
import pickle

def dump_pickle(file, filename):
    outfile = open(filename, 'wb')
    pickle.dump(file, outfile)
    outfile.close()

def load_pickle(filename):
    infile = open(filename, 'rb')
    file = pickle.load(infile)
    infile.close()
    return file

In [4]:
DATA_DIR = '../input/open-problems-multimodal'
%ls $DATA_DIR -lh

total 27G
-rw-r--r-- 1 nobody nogroup 2.3G Sep  7 19:44 evaluation_ids.csv
-rw-r--r-- 1 nobody nogroup 9.4M Sep  7 19:43 metadata.csv
-rw-r--r-- 1 nobody nogroup 230K Sep  7 19:43 metadata_cite_day_2_donor_27678.csv
-rw-r--r-- 1 nobody nogroup 805M Sep  7 19:43 sample_submission.csv
-rw-r--r-- 1 nobody nogroup 1.6G Sep  7 19:44 test_cite_inputs.h5
-rw-r--r-- 1 nobody nogroup 294M Sep  7 19:43 test_cite_inputs_day_2_donor_27678.h5
-rw-r--r-- 1 nobody nogroup 6.1G Sep  7 19:45 test_multi_inputs.h5
-rw-r--r-- 1 nobody nogroup 2.4G Sep  7 19:44 train_cite_inputs.h5
-rw-r--r-- 1 nobody nogroup  37M Sep  7 19:43 train_cite_targets.h5
-rw-r--r-- 1 nobody nogroup  11G Sep  7 19:46 train_multi_inputs.h5
-rw-r--r-- 1 nobody nogroup 3.0G Sep  7 19:45 train_multi_targets.h5


## Read Data

In [5]:
%%time
train_inp = pd.read_hdf(f'{DATA_DIR}/train_cite_inputs.h5')
train_inp_cols = train_inp.columns

CPU times: user 29.3 s, sys: 6.92 s, total: 36.3 s
Wall time: 56.3 s


In [6]:
%%time
test_inp = pd.read_hdf(f'{DATA_DIR}/test_cite_inputs.h5')

CPU times: user 20.3 s, sys: 4.05 s, total: 24.4 s
Wall time: 38.6 s


In [7]:
%%time
train_tar = pd.read_hdf(f'{DATA_DIR}/train_cite_targets.h5')
train_tar_cols = train_tar.columns

CPU times: user 183 ms, sys: 43.2 ms, total: 226 ms
Wall time: 760 ms


## Feature Engineering

### Remove Features

Find and remove columns with all zeroes

In [8]:
%%time
zero_cols = []
for idx, col in enumerate(train_inp_cols, 0):
    if idx % 1000 == 0:
        print(idx)
    if len(train_inp[col].unique()) == 1 or len(test_inp[col].unique()) == 1:
        zero_cols.append(col)
print('Number of columns with zero values only (Train or Test):', 
      len(zero_cols))

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
Number of columns with zero values only (Train or Test): 1194
CPU times: user 40.1 s, sys: 442 ms, total: 40.6 s
Wall time: 40.6 s


In [9]:
%%time
train_inp = train_inp.drop(zero_cols, axis=1)
train_inp_cols = train_inp.columns
test_inp = test_inp.drop(zero_cols, axis=1)
train_inp.shape, test_inp.shape

CPU times: user 1.38 s, sys: 2.01 s, total: 3.38 s
Wall time: 3.36 s


((70988, 20856), (48663, 20856))

### Introduce Metadata

In [10]:
DATA_DIR = '../input/msci-h5-sparse-transform'
%ls $DATA_DIR -lh

total 7.1G
-rw-r--r-- 1 nobody nogroup  25K Oct 25 09:30 __notebook__.ipynb
-rw-r--r-- 1 nobody nogroup  25K Oct 25 09:30 __output__.json
-rw-r--r-- 1 nobody nogroup 293K Oct 25 09:30 __results__.html
-rw-r--r-- 1 nobody nogroup    0 Oct 25 09:30 custom.css
-rw-r--r-- 1 nobody nogroup 359M Oct 25 09:30 evaluation_ids.parquet
-rw-r--r-- 1 nobody nogroup 3.8M Oct 25 09:30 metadata.parquet
-rw-r--r-- 1 nobody nogroup 108K Oct 25 09:30 metadata_cite_day_2_donor_27678.parquet
-rw-r--r-- 1 nobody nogroup 252M Oct 25 09:30 sample_submission.parquet
-rw-r--r-- 1 nobody nogroup 856K Oct 25 09:30 test_cite_inputs_day_2_donor_27678_idx.npz
-rw-r--r-- 1 nobody nogroup  78M Oct 25 09:30 test_cite_inputs_day_2_donor_27678_val.sparse.npz
-rw-r--r-- 1 nobody nogroup 1.8M Oct 25 09:30 test_cite_inputs_idx.npz
-rw-r--r-- 1 nobody nogroup 488M Oct 25 09:31 test_cite_inputs_val.sparse.npz
-rw-r--r-- 1 nobody nogroup 8.4M Oct 25 09:30 test_multi_inputs_idx.npz
-rw-r--r-- 1 nobody nogroup 1.7G

In [11]:
%%time
meta_data = pd.read_parquet(f'{DATA_DIR}/metadata.parquet')
meta_data_cite = meta_data[meta_data['technology'] == 'citeseq']
print(meta_data_cite.shape)
meta_data.head()

(119651, 5)
CPU times: user 220 ms, sys: 135 ms, total: 355 ms
Wall time: 438 ms


Unnamed: 0,cell_id,day,donor,cell_type,technology
0,c2150f55becb,2,27678,HSC,citeseq
1,65b7edf8a4da,2,27678,HSC,citeseq
2,c1b26cb1057b,2,27678,EryP,citeseq
3,917168fa6f83,2,27678,NeuP,citeseq
4,2b29feeca86d,2,27678,EryP,citeseq


In [12]:
del meta_data
gc.collect()

21

In [13]:
from sklearn.preprocessing import OneHotEncoder

In [14]:
cell_type_df = meta_data_cite[['cell_type']].copy()
ohe = OneHotEncoder()
cell_type_df = ohe.fit_transform(cell_type_df)
cell_type_df = pd.DataFrame(cell_type_df.todense(), 
                            columns=ohe.get_feature_names())
meta_data_cite = pd.concat([meta_data_cite, cell_type_df], axis=1)
meta_data_cite.head()

Unnamed: 0,cell_id,day,donor,cell_type,technology,x0_BP,x0_EryP,x0_HSC,x0_MasP,x0_MkP,x0_MoP,x0_NeuP
0,c2150f55becb,2,27678,HSC,citeseq,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,65b7edf8a4da,2,27678,HSC,citeseq,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,c1b26cb1057b,2,27678,EryP,citeseq,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,917168fa6f83,2,27678,NeuP,citeseq,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,2b29feeca86d,2,27678,EryP,citeseq,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [15]:
del cell_type_df
gc.collect()

21

In [16]:
meta_data_cite = meta_data_cite.drop(['day', 'donor', 
                                      'cell_type', 'technology'],
                                     axis=1)
cell_ids = meta_data_cite['cell_id']
meta_data_cite = meta_data_cite.drop('cell_id', axis=1)
meta_data_cite.index = cell_ids

In [17]:
meta_data_cols = [col for col in meta_data_cite.columns
                  if col != 'cell_id']
meta_data_cols

['x0_BP', 'x0_EryP', 'x0_HSC', 'x0_MasP', 'x0_MkP', 'x0_MoP', 'x0_NeuP']

In [18]:
meta_data_cite_dict = meta_data_cite.to_dict('index')
del meta_data_cite, cell_ids
gc.collect()

0

In [19]:
train_inp.shape, test_inp.shape

((70988, 20856), (48663, 20856))

In [20]:
for col in meta_data_cols:
    train_inp[col] = train_inp.index.to_series()\
                              .apply(lambda x: meta_data_cite_dict[x][col])\
                              .astype('float32')
    test_inp[col] = test_inp.index.to_series()\
                            .apply(lambda x: meta_data_cite_dict[x][col])\
                            .astype('float32')

In [21]:
del meta_data_cite_dict
gc.collect()

63

### Dimension Reduction

Find features in target columns that are related to input columns due to identical names

In [22]:
%%time
same_name_cols = []
for tar_col in train_tar_cols:
    for inp_col in train_inp_cols:
        if tar_col in inp_col:
            same_name_cols.append(inp_col)
print('Number of target columns shown in training columns:', 
      len(same_name_cols))

Number of target columns shown in training columns: 144
CPU times: user 614 ms, sys: 0 ns, total: 614 ms
Wall time: 616 ms


Keep identical name columns and transform the rest to sparse matrices

In [23]:
train_inp_sn = train_inp[same_name_cols].values
test_inp_sn = test_inp[same_name_cols].values
train_inp_sn.shape, test_inp_sn.shape

((70988, 144), (48663, 144))

In [24]:
train_inp_ct = train_inp[meta_data_cols].values
test_inp_ct = test_inp[meta_data_cols].values
train_inp_ct.shape, test_inp_ct.shape

((70988, 7), (48663, 7))

In [25]:
%%time
train_inp = train_inp.drop(same_name_cols, axis=1)\
                     .drop(meta_data_cols, axis=1)
train_inp = scipy.sparse.csr_matrix(train_inp.values)
test_inp = test_inp.drop(same_name_cols, axis=1)\
                   .drop(meta_data_cols, axis=1)
test_inp = scipy.sparse.csr_matrix(test_inp.values)
gc.collect()

CPU times: user 1min 17s, sys: 8.85 s, total: 1min 26s
Wall time: 1min 26s


84

Reduce dimension

In [26]:
from sklearn.decomposition import TruncatedSVD

In [27]:
%%time
## Baseline
train_inp_tsvd = TruncatedSVD(n_components=512, 
                              random_state=42)

train_inp = train_inp_tsvd.fit_transform(train_inp)
print('Sum of Explained Variance: ',
      np.sum(train_inp_tsvd.explained_variance_ratio_))
test_inp = train_inp_tsvd.transform(test_inp)

Sum of Explained Variance:  0.19528116
CPU times: user 23min 20s, sys: 16.8 s, total: 23min 36s
Wall time: 23min 6s


### Standardization

In [28]:
from sklearn.preprocessing import StandardScaler

In [29]:
train_inp = np.hstack([train_inp, train_inp_sn])
test_inp = np.hstack([test_inp, test_inp_sn])

In [30]:
del train_inp_sn, test_inp_sn
gc.collect()

84

In [31]:
%%time
sc = StandardScaler()
train_inp = sc.fit_transform(train_inp)
test_inp = sc.transform(test_inp)

CPU times: user 720 ms, sys: 151 ms, total: 871 ms
Wall time: 870 ms


In [32]:
train_inp = np.hstack([train_inp, train_inp_ct])
test_inp = np.hstack([test_inp, test_inp_ct])

In [33]:
del train_inp_ct, test_inp_ct
gc.collect()

63

In [34]:
train_inp.shape, test_inp.shape

((70988, 663), (48663, 663))

In [35]:
%%time
dump_pickle(train_inp, 'train_inp')
dump_pickle(test_inp, 'test_inp')

CPU times: user 121 ms, sys: 683 ms, total: 803 ms
Wall time: 802 ms
