In [1]:
import os
import os.path
import gc
import numpy as np
import pandas as pd
import scipy.sparse
from tqdm import tqdm

In [2]:
import warnings 
# NOTE(review): this silences every warning category for the rest of the
# notebook, including deprecation and numerical warnings raised by the
# libraries used in later cells.
warnings.filterwarnings('ignore')

In [3]:
import pickle

def dump_pickle(file, filename):
    """Serialize an object to disk with pickle.

    Parameters
    ----------
    file : object
        The picklable Python object to store (despite the name, this is the
        payload, not a file handle).
    filename : str or path-like
        Destination path. It is opened in binary write mode, so an existing
        file at that path is overwritten.

    Raises
    ------
    OSError
        If the destination cannot be opened for writing.
    pickle.PicklingError, AttributeError, TypeError
        If ``file`` cannot be pickled.
    """
    # The context manager guarantees the handle is closed even when
    # pickle.dump raises, instead of leaking it as a bare open()/close() would.
    with open(filename, 'wb') as outfile:
        pickle.dump(file, outfile)

def load_pickle(filename):
    """Load and return the object stored in a pickle file.

    Parameters
    ----------
    filename : str or path-like
        Path of the pickle file; opened in binary read mode.

    Returns
    -------
    object
        Whatever object was pickled into ``filename``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    EOFError, pickle.UnpicklingError
        If the file is empty or does not contain a valid pickle stream.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # were produced by a trusted source (e.g. by dump_pickle in this notebook).
    # The context manager closes the handle even when pickle.load raises.
    with open(filename, 'rb') as infile:
        return pickle.load(infile)

In [4]:
# Directory holding the raw competition files; the read_hdf cells below build
# their paths from it. It is rebound to a different dataset later in the
# notebook (before the metadata is read).
DATA_DIR = '../input/open-problems-multimodal'
%ls $DATA_DIR -lh

total 27G
-rw-r--r-- 1 nobody nogroup 2.3G Sep  7 19:44 evaluation_ids.csv
-rw-r--r-- 1 nobody nogroup 9.4M Sep  7 19:43 metadata.csv
-rw-r--r-- 1 nobody nogroup 230K Sep  7 19:43 metadata_cite_day_2_donor_27678.csv
-rw-r--r-- 1 nobody nogroup 805M Sep  7 19:43 sample_submission.csv
-rw-r--r-- 1 nobody nogroup 1.6G Sep  7 19:44 test_cite_inputs.h5
-rw-r--r-- 1 nobody nogroup 294M Sep  7 19:43 test_cite_inputs_day_2_donor_27678.h5
-rw-r--r-- 1 nobody nogroup 6.1G Sep  7 19:45 test_multi_inputs.h5
-rw-r--r-- 1 nobody nogroup 2.4G Sep  7 19:44 train_cite_inputs.h5
-rw-r--r-- 1 nobody nogroup  37M Sep  7 19:43 train_cite_targets.h5
-rw-r--r-- 1 nobody nogroup  11G Sep  7 19:46 train_multi_inputs.h5
-rw-r--r-- 1 nobody nogroup 3.0G Sep  7 19:45 train_multi_targets.h5


## Read Data

In [5]:
%%time
# Load the training inputs from HDF5 into a DataFrame.
train_inp = pd.read_hdf(f'{DATA_DIR}/train_cite_inputs.h5')
# Keep the input column labels; they drive the constant-column scan and are
# refreshed after those columns are dropped.
train_inp_cols = train_inp.columns

CPU times: user 32.8 s, sys: 9.46 s, total: 42.3 s
Wall time: 1min 2s


In [6]:
%%time
# Load the test inputs from HDF5 into a DataFrame.
test_inp = pd.read_hdf(f'{DATA_DIR}/test_cite_inputs.h5')

CPU times: user 21.9 s, sys: 4.72 s, total: 26.7 s
Wall time: 40.5 s


In [7]:
%%time
# Load the training targets from HDF5 into a DataFrame.
train_tar = pd.read_hdf(f'{DATA_DIR}/train_cite_targets.h5')
# Target column labels, used later to find input columns with matching names.
train_tar_cols = train_tar.columns

CPU times: user 202 ms, sys: 65.8 ms, total: 268 ms
Wall time: 781 ms


## Feature Engineering

### Remove Features

Find and remove columns that contain only a single value (e.g. all zeroes) in either the train or the test inputs

In [8]:
%%time
def _has_single_value(series):
    """Return True when the series holds exactly one distinct value."""
    return len(series.unique()) == 1


# Collect every input column that is constant in the train frame or in the
# test frame; such columns carry no usable signal for at least one split.
zero_cols = []
for idx, col in enumerate(train_inp_cols):
    # Progress marker every 1000 columns.
    if not idx % 1000:
        print(idx)
    # `or` short-circuits, so the test frame is only scanned when needed.
    if _has_single_value(train_inp[col]) or _has_single_value(test_inp[col]):
        zero_cols.append(col)
print('Number of columns with zero values only (Train or Test):', len(zero_cols))

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
Number of columns with zero values only (Train or Test): 1194
CPU times: user 42.1 s, sys: 190 ms, total: 42.3 s
Wall time: 42.3 s


In [9]:
%%time
# Remove the constant columns from both splits so they keep identical layouts.
train_inp = train_inp.drop(columns=zero_cols)
test_inp = test_inp.drop(columns=zero_cols)
# Refresh the cached column labels to reflect the reduced frame.
train_inp_cols = train_inp.columns
(train_inp.shape, test_inp.shape)

CPU times: user 1.82 s, sys: 2.75 s, total: 4.58 s
Wall time: 4.54 s


((70988, 20856), (48663, 20856))

### Introduce Metadata

In [10]:
# Point DATA_DIR at the pre-converted dataset that holds metadata.parquet.
# NOTE(review): this rebinds the same name the read_hdf cells above rely on,
# so re-running those cells after this one (without restarting the kernel)
# would make them look for the .h5 files under this directory instead.
DATA_DIR = '../input/msci-h5-sparse-transform'
%ls $DATA_DIR -lh

total 7.1G
-rw-r--r-- 1 nobody nogroup  25K Nov 14 16:09 __notebook__.ipynb
-rw-r--r-- 1 nobody nogroup  25K Nov 14 16:09 __output__.json
-rw-r--r-- 1 nobody nogroup 293K Nov 14 16:09 __results__.html
-rw-r--r-- 1 nobody nogroup    0 Nov 14 16:09 custom.css
-rw-r--r-- 1 nobody nogroup 359M Nov 14 16:09 evaluation_ids.parquet
-rw-r--r-- 1 nobody nogroup 3.8M Nov 14 16:09 metadata.parquet
-rw-r--r-- 1 nobody nogroup 108K Nov 14 16:09 metadata_cite_day_2_donor_27678.parquet
-rw-r--r-- 1 nobody nogroup 252M Nov 14 16:09 sample_submission.parquet
-rw-r--r-- 1 nobody nogroup 856K Nov 14 16:09 test_cite_inputs_day_2_donor_27678_idx.npz
-rw-r--r-- 1 nobody nogroup  78M Nov 14 16:09 test_cite_inputs_day_2_donor_27678_val.sparse.npz
-rw-r--r-- 1 nobody nogroup 1.8M Nov 14 16:09 test_cite_inputs_idx.npz
-rw-r--r-- 1 nobody nogroup 488M Nov 14 16:09 test_cite_inputs_val.sparse.npz
-rw-r--r-- 1 nobody nogroup 8.4M Nov 14 16:09 test_multi_inputs_idx.npz
-rw-r--r-- 1 nobody nogroup 1.7G

In [11]:
%%time
# Read the full metadata table, then keep only the rows whose technology is
# 'citeseq' for the cells below.
meta_data = pd.read_parquet(f'{DATA_DIR}/metadata.parquet')
meta_data_cite = meta_data.loc[meta_data['technology'].eq('citeseq')]
print(meta_data_cite.shape)
meta_data.head()

(119651, 5)
CPU times: user 233 ms, sys: 143 ms, total: 376 ms
Wall time: 518 ms


Unnamed: 0,cell_id,day,donor,cell_type,technology
0,c2150f55becb,2,27678,HSC,citeseq
1,65b7edf8a4da,2,27678,HSC,citeseq
2,c1b26cb1057b,2,27678,EryP,citeseq
3,917168fa6f83,2,27678,NeuP,citeseq
4,2b29feeca86d,2,27678,EryP,citeseq


In [12]:
# Drop the unfiltered metadata name; only meta_data_cite is used from here on.
del meta_data
# Trigger a garbage-collection pass (the cell displays its return value).
gc.collect()

21

In [13]:
# from sklearn.preprocessing import OneHotEncoder

In [14]:
# cell_type_df = meta_data_cite[['cell_type']].copy()
# ohe = OneHotEncoder()
# cell_type_df = ohe.fit_transform(cell_type_df)
# cell_type_df = pd.DataFrame(cell_type_df.todense(), 
#                             columns=ohe.get_feature_names())
# meta_data_cite = pd.concat([meta_data_cite, cell_type_df], axis=1)
# meta_data_cite.head()

In [15]:
# del cell_type_df
# gc.collect()

In [16]:
# meta_data_cite = meta_data_cite.drop(['day', 'donor',
#                                       'cell_type', 'technology'],
#                                      axis=1)
# cell_ids = meta_data_cite['cell_id']
# meta_data_cite = meta_data_cite.drop('cell_id', axis=1)
# meta_data_cite.index = cell_ids

In [17]:
# meta_data_cols = [col for col in meta_data_cite.columns
#                   if col != 'cell_id']
# meta_data_cols

In [18]:
# meta_data_cite_dict = meta_data_cite.to_dict('index')
# del meta_data_cite, cell_ids
# gc.collect()

In [19]:
# Display the current shapes of both input frames as a sanity check.
train_inp.shape, test_inp.shape

((70988, 20856), (48663, 20856))

In [20]:
# Lookup table from cell_id to donor, built from the filtered metadata.
# If a cell_id appears more than once, the last row wins.
donor_dict = dict(zip(meta_data_cite['cell_id'], meta_data_cite['donor']))

In [21]:
def _donor_for_index(index, mapping):
    """Map each index label through ``mapping`` and return a float32 Series.

    The result keeps ``index`` as its own index, so assigning it to a frame
    with that index aligns row by row. Labels missing from ``mapping``
    become NaN.
    """
    return index.to_series().map(mapping).astype('float32')


# Attach the donor of every cell (the frame index is looked up in donor_dict).
train_inp['donor'] = _donor_for_index(train_inp.index, donor_dict)
test_inp['donor'] = _donor_for_index(test_inp.index, donor_dict)

In [22]:
# The lookup table is no longer needed once the donor columns are attached.
del donor_dict
# Trigger a garbage-collection pass (the cell displays its return value).
gc.collect()

210

In [23]:
# for col in meta_data_cols:
#     train_inp[col] = train_inp.index.to_series()\
#                               .apply(lambda x: meta_data_cite_dict[x][col])\
#                               .astype('float32')
#     test_inp[col] = test_inp.index.to_series()\
#                             .apply(lambda x: meta_data_cite_dict[x][col])\
#                             .astype('float32')

In [24]:
# del meta_data_cite_dict
# gc.collect()

### Dimension Reduction

Find the input columns whose name contains the name of a target column

In [25]:
%%time
# Input columns whose label contains a target column label as a substring.
# train_inp_cols was captured before the 'donor' column was added, so 'donor'
# is never part of the search.
#
# An input column can contain more than one target name (the test is a
# substring match), which previously appended it once per match and produced
# duplicated feature columns downstream. dict.fromkeys keeps the first
# occurrence of each label and preserves the original ordering.
# NOTE(review): substring matching may also be over-inclusive (a short target
# name can match inside a longer, unrelated label) - confirm this is intended.
same_name_cols = list(dict.fromkeys(
    inp_col
    for tar_col in train_tar_cols
    for inp_col in train_inp_cols
    if tar_col in inp_col
))
print('Number of target columns shown in training columns:',
      len(same_name_cols))

Number of target columns shown in training columns: 144
CPU times: user 572 ms, sys: 3.82 ms, total: 575 ms
Wall time: 572 ms


Keep the name-matched columns as dense arrays and convert the remaining columns to sparse matrices

In [26]:
# Pull the name-matched columns out as plain NumPy arrays; they bypass the
# SVD step and are concatenated back afterwards.
train_inp_sn = train_inp.loc[:, same_name_cols].to_numpy()
test_inp_sn = test_inp.loc[:, same_name_cols].to_numpy()
(train_inp_sn.shape, test_inp_sn.shape)

((70988, 144), (48663, 144))

In [27]:
# Extract the donor column as an (n_rows, 1) array; the list selector keeps
# the result two-dimensional so it can be concatenated column-wise later.
train_inp_dn = train_inp.loc[:, ['donor']].to_numpy()
test_inp_dn = test_inp.loc[:, ['donor']].to_numpy()
(train_inp_dn.shape, test_inp_dn.shape)

((70988, 1), (48663, 1))

In [28]:
# train_inp_ct = train_inp[meta_data_cols].values
# test_inp_ct = test_inp[meta_data_cols].values
# train_inp_ct.shape, test_inp_ct.shape

In [29]:
%%time
# Remove the columns that were extracted separately (name-matched + donor),
# then convert what is left to CSR. Rebinding the name in two steps releases
# the wide frame before the sparse matrix is built.
train_inp = train_inp.drop(columns=[*same_name_cols, 'donor'])
train_inp = scipy.sparse.csr_matrix(train_inp.to_numpy())

test_inp = test_inp.drop(columns=[*same_name_cols, 'donor'])
test_inp = scipy.sparse.csr_matrix(test_inp.to_numpy())

gc.collect()

CPU times: user 1min 28s, sys: 13.7 s, total: 1min 42s
Wall time: 1min 42s


147

Reduce the dimensionality of the sparse inputs with truncated SVD

In [30]:
from sklearn.decomposition import TruncatedSVD

In [31]:
%%time
## Baseline
# Baseline reduction: 512 components with a fixed seed for reproducibility.
train_inp_tsvd = TruncatedSVD(n_components=512, random_state=42)

# Fit on the training matrix only; the test matrix is projected with the
# components learned from train.
train_inp = train_inp_tsvd.fit_transform(train_inp)
test_inp = train_inp_tsvd.transform(test_inp)

print('Sum of Explained Variance: ',
      train_inp_tsvd.explained_variance_ratio_.sum())

Sum of Explained Variance:  0.19528116
CPU times: user 29min 25s, sys: 17 s, total: 29min 42s
Wall time: 29min 18s


### Standardization

In [32]:
from sklearn.preprocessing import StandardScaler

In [33]:
# Append the name-matched columns to the right of the SVD components so they
# are standardized together in the next step.
train_inp = np.concatenate([train_inp, train_inp_sn], axis=1)
test_inp = np.concatenate([test_inp, test_inp_sn], axis=1)

In [34]:
# The name-matched arrays now live inside train_inp / test_inp.
del train_inp_sn, test_inp_sn
# Trigger a garbage-collection pass (the cell displays its return value).
gc.collect()

84

In [35]:
%%time
# Learn the per-column mean and scale from the training matrix only, then
# apply the same transformation to both splits.
sc = StandardScaler().fit(train_inp)
train_inp = sc.transform(train_inp)
test_inp = sc.transform(test_inp)

CPU times: user 716 ms, sys: 145 ms, total: 861 ms
Wall time: 861 ms


In [36]:
# Append the donor column last; it is added after scaling, so it keeps its
# raw values.
train_inp = np.concatenate([train_inp, train_inp_dn], axis=1)
test_inp = np.concatenate([test_inp, test_inp_dn], axis=1)

In [37]:
# The donor arrays now live inside train_inp / test_inp.
del train_inp_dn, test_inp_dn
# Trigger a garbage-collection pass (the cell displays its return value).
gc.collect()

63

In [38]:
# Display the final feature-matrix shapes: SVD components, name-matched
# columns and the donor column.
train_inp.shape, test_inp.shape

((70988, 657), (48663, 657))

In [39]:
%%time
# Persist both feature matrices to the working directory (the file names
# carry no extension).
dump_pickle(file=train_inp, filename='train_inp')
dump_pickle(file=test_inp, filename='test_inp')

CPU times: user 155 ms, sys: 657 ms, total: 811 ms
Wall time: 812 ms
