In [1]:
import os
import os.path
import gc
import numpy as np
import pandas as pd
import scipy.sparse
from tqdm import tqdm

In [2]:
import warnings 
# Install a global "ignore" filter: no category or module is given, so every
# warning raised after this point is suppressed.
warnings.filterwarnings('ignore')

In [3]:
import pickle

def dump_pickle(file, filename):
    """Serialize an object to disk with pickle.

    Parameters
    ----------
    file : object
        The picklable object to store (despite the name, this is the
        payload, not a file handle).
    filename : str or path-like
        Destination path. It is opened in binary write mode, so an existing
        file at that path is overwritten.

    Returns
    -------
    None
    """
    # The context manager closes the handle even when pickle.dump raises,
    # which a bare open()/close() pair does not guarantee.
    with open(filename, 'wb') as outfile:
        pickle.dump(file, outfile)

def load_pickle(filename):
    """Load and return the object stored in a pickle file.

    Parameters
    ----------
    filename : str or path-like
        Path of the pickle file; opened in binary read mode.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Raises
    ------
    FileNotFoundError
        If ``filename`` does not exist (raised by ``open``).
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only call this on files produced by a trusted source.
    # The context manager closes the handle even when unpickling raises.
    with open(filename, 'rb') as infile:
        return pickle.load(infile)

In [4]:
# Directory containing the raw .h5 / .csv input files; the pd.read_hdf cells
# in the "Read Data" section build their paths from it.
DATA_DIR = '../input/open-problems-multimodal'
%ls $DATA_DIR -lh

total 27G
-rw-r--r-- 1 nobody nogroup 2.3G Sep  7 20:28 evaluation_ids.csv
-rw-r--r-- 1 nobody nogroup 9.4M Sep  7 20:28 metadata.csv
-rw-r--r-- 1 nobody nogroup 230K Sep  7 20:28 metadata_cite_day_2_donor_27678.csv
-rw-r--r-- 1 nobody nogroup 805M Sep  7 20:28 sample_submission.csv
-rw-r--r-- 1 nobody nogroup 1.6G Sep  7 20:28 test_cite_inputs.h5
-rw-r--r-- 1 nobody nogroup 294M Sep  7 20:28 test_cite_inputs_day_2_donor_27678.h5
-rw-r--r-- 1 nobody nogroup 6.1G Sep  7 20:29 test_multi_inputs.h5
-rw-r--r-- 1 nobody nogroup 2.4G Sep  7 20:29 train_cite_inputs.h5
-rw-r--r-- 1 nobody nogroup  37M Sep  7 20:28 train_cite_targets.h5
-rw-r--r-- 1 nobody nogroup  11G Sep  7 20:30 train_multi_inputs.h5
-rw-r--r-- 1 nobody nogroup 3.0G Sep  7 20:29 train_multi_targets.h5


## Read Data

In [5]:
%%time
# Load train_cite_inputs.h5 into a DataFrame and keep a reference to its
# column labels (used later when scanning for constant columns).
train_inp = pd.read_hdf(f'{DATA_DIR}/train_cite_inputs.h5')
train_inp_cols = train_inp.columns

CPU times: user 25.2 s, sys: 5.02 s, total: 30.2 s
Wall time: 49.7 s


In [6]:
%%time
# Load test_cite_inputs.h5; it is column-filtered and transformed in lockstep
# with train_inp in the cells below.
test_inp = pd.read_hdf(f'{DATA_DIR}/test_cite_inputs.h5')

CPU times: user 17.2 s, sys: 2.76 s, total: 20 s
Wall time: 33.8 s


In [7]:
%%time
# Load train_cite_targets.h5 and keep a reference to the target column labels.
train_tar = pd.read_hdf(f'{DATA_DIR}/train_cite_targets.h5')
train_tar_cols = train_tar.columns

CPU times: user 160 ms, sys: 35.9 ms, total: 196 ms
Wall time: 574 ms


## Feature Engineering

### Remove Features

Find and remove columns that contain a single constant value (e.g. all zeroes) in either the training or the test inputs

In [8]:
%%time
# Gather every input column that holds exactly one distinct value (NaN counts
# as a value) in the training frame or in the test frame.
zero_cols = []
for idx, col in enumerate(train_inp_cols):
    # Progress trace: one line per 1000 columns inspected.
    if not idx % 1000:
        print(idx)
    if (train_inp[col].nunique(dropna=False) == 1
            or test_inp[col].nunique(dropna=False) == 1):
        zero_cols.append(col)
print('Number of columns with zero values only (Train or Test):',
      len(zero_cols))

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
Number of columns with zero values only (Train or Test): 1194
CPU times: user 36.2 s, sys: 154 ms, total: 36.3 s
Wall time: 36.3 s


In [9]:
%%time
# Remove the constant columns from both frames so they keep identical column
# sets, then refresh the cached training column labels.
train_inp = train_inp.drop(columns=zero_cols)
test_inp = test_inp.drop(columns=zero_cols)
train_inp_cols = train_inp.columns
train_inp.shape, test_inp.shape

CPU times: user 1.17 s, sys: 1.97 s, total: 3.13 s
Wall time: 3.12 s


((70988, 20856), (48663, 20856))

### Introduce Metadata

In [10]:
# NOTE: this rebinds DATA_DIR, which earlier pointed at
# '../input/open-problems-multimodal'. Any cell that builds a path from
# DATA_DIR after this line reads from the directory below instead.
DATA_DIR = '../input/msci-h5-sparse-transform'
%ls $DATA_DIR -lh

total 7.1G
-rw-r--r-- 1 nobody nogroup  25K Nov  2 12:50 __notebook__.ipynb
-rw-r--r-- 1 nobody nogroup  25K Nov  2 12:50 __output__.json
-rw-r--r-- 1 nobody nogroup 293K Nov  2 12:50 __results__.html
-rw-r--r-- 1 nobody nogroup    0 Nov  2 12:50 custom.css
-rw-r--r-- 1 nobody nogroup 359M Nov  2 12:50 evaluation_ids.parquet
-rw-r--r-- 1 nobody nogroup 3.8M Nov  2 12:50 metadata.parquet
-rw-r--r-- 1 nobody nogroup 108K Nov  2 12:50 metadata_cite_day_2_donor_27678.parquet
-rw-r--r-- 1 nobody nogroup 252M Nov  2 12:50 sample_submission.parquet
-rw-r--r-- 1 nobody nogroup 856K Nov  2 12:50 test_cite_inputs_day_2_donor_27678_idx.npz
-rw-r--r-- 1 nobody nogroup  78M Nov  2 12:50 test_cite_inputs_day_2_donor_27678_val.sparse.npz
-rw-r--r-- 1 nobody nogroup 1.8M Nov  2 12:50 test_cite_inputs_idx.npz
-rw-r--r-- 1 nobody nogroup 488M Nov  2 12:51 test_cite_inputs_val.sparse.npz
-rw-r--r-- 1 nobody nogroup 8.4M Nov  2 12:50 test_multi_inputs_idx.npz
-rw-r--r-- 1 nobody nogroup 1.7G

In [11]:
%%time
# Read the full metadata table, then keep only the rows whose technology is
# 'citeseq'. The filtered shape is printed and the unfiltered head displayed.
meta_data = pd.read_parquet(f'{DATA_DIR}/metadata.parquet')
meta_data_cite = meta_data.loc[meta_data['technology'] == 'citeseq']
print(meta_data_cite.shape)
meta_data.head()

(119651, 5)
CPU times: user 170 ms, sys: 90.8 ms, total: 260 ms
Wall time: 347 ms


Unnamed: 0,cell_id,day,donor,cell_type,technology
0,c2150f55becb,2,27678,HSC,citeseq
1,65b7edf8a4da,2,27678,HSC,citeseq
2,c1b26cb1057b,2,27678,EryP,citeseq
3,917168fa6f83,2,27678,NeuP,citeseq
4,2b29feeca86d,2,27678,EryP,citeseq


In [12]:
# Drop the unfiltered metadata frame (meta_data_cite is kept) and trigger a
# garbage-collection pass; the displayed number is gc.collect()'s return value.
del meta_data
gc.collect()

21

In [13]:
# from sklearn.preprocessing import OneHotEncoder

In [14]:
# cell_type_df = meta_data_cite[['cell_type']].copy()
# ohe = OneHotEncoder()
# cell_type_df = ohe.fit_transform(cell_type_df)
# cell_type_df = pd.DataFrame(cell_type_df.todense(), 
#                             columns=ohe.get_feature_names())
# meta_data_cite = pd.concat([meta_data_cite, cell_type_df], axis=1)
# meta_data_cite.head()

In [15]:
# del cell_type_df
# gc.collect()

In [16]:
# meta_data_cite = meta_data_cite.drop(['day', 'donor',
#                                       'cell_type', 'technology'],
#                                      axis=1)
# cell_ids = meta_data_cite['cell_id']
# meta_data_cite = meta_data_cite.drop('cell_id', axis=1)
# meta_data_cite.index = cell_ids

In [17]:
# meta_data_cols = [col for col in meta_data_cite.columns
#                   if col != 'cell_id']
# meta_data_cols

In [18]:
# meta_data_cite_dict = meta_data_cite.to_dict('index')
# del meta_data_cite, cell_ids
# gc.collect()

In [19]:
# Display both shapes as a quick check that the two frames still have the
# same number of columns.
train_inp.shape, test_inp.shape

((70988, 20856), (48663, 20856))

In [20]:
# Lookup table cell_id -> donor, built from the CITE-seq metadata rows.
donor_dict = dict(zip(meta_data_cite['cell_id'], meta_data_cite['donor']))

In [21]:
# Attach a float32 'donor' column to each frame by looking up every index
# label in donor_dict (labels missing from the dict map to NaN).
train_inp['donor'] = (
    train_inp.index.to_series().map(donor_dict).astype('float32')
)
test_inp['donor'] = (
    test_inp.index.to_series().map(donor_dict).astype('float32')
)

In [22]:
# The lookup table is no longer needed once the 'donor' columns exist;
# release it and run a garbage-collection pass.
del donor_dict
gc.collect()

210

In [23]:
# for col in meta_data_cols:
#     train_inp[col] = train_inp.index.to_series()\
#                               .apply(lambda x: meta_data_cite_dict[x][col])\
#                               .astype('float32')
#     test_inp[col] = test_inp.index.to_series()\
#                             .apply(lambda x: meta_data_cite_dict[x][col])\
#                             .astype('float32')

In [24]:
# del meta_data_cite_dict
# gc.collect()

### Dimension Reduction

Find input columns whose names contain the name of a target column

In [25]:
# %%time
# same_name_cols = []
# for tar_col in train_tar_cols:
#     for inp_col in train_inp_cols:
#         if tar_col in inp_col:
#             same_name_cols.append(inp_col)
# print('Number of target columns shown in training columns:', 
#       len(same_name_cols))

Keep the columns listed in `same_name_cols` as dense features and convert the remaining columns to sparse matrices

In [26]:
# Input columns that are pulled out as dense features before the remaining
# columns are converted to a sparse matrix and reduced with SVD.
# List source:
# https://www.kaggle.com/code/pourchot/all-in-one-citeseq-multiome-with-keras/notebook
same_name_cols = [
    'ENSG00000135218_CD36',
    'ENSG00000010278_CD9',
    'ENSG00000204287_HLA-DRA',
    'ENSG00000117091_CD48',
    'ENSG00000004468_CD38',
    'ENSG00000173762_CD7',
    'ENSG00000137101_CD72',
    'ENSG00000019582_CD74',
    'ENSG00000169442_CD52',
    'ENSG00000170458_CD14',
    'ENSG00000272398_CD24',
    'ENSG00000026508_CD44',
    'ENSG00000114013_CD86',
    'ENSG00000174059_CD34',
    'ENSG00000139193_CD27',
    'ENSG00000105383_CD33',
    'ENSG00000085117_CD82',
    'ENSG00000177455_CD19',
    'ENSG00000002586_CD99',
    'ENSG00000196126_HLA-DRB1',
    'ENSG00000135404_CD63',
    'ENSG00000012124_CD22',
    'ENSG00000134061_CD180',
    'ENSG00000105369_CD79A',
    'ENSG00000116824_CD2',
    'ENSG00000010610_CD4',
    'ENSG00000139187_KLRG1',
    'ENSG00000204592_HLA-E',
    'ENSG00000090470_PDCD7',
    'ENSG00000206531_CD200R1L',
    'ENSG00000166710_B2M',
    'ENSG00000198034_RPS4X',
    'ENSG00000188404_SELL',
    'ENSG00000130303_BST2',
    'ENSG00000128040_SPINK2',
    'ENSG00000206503_HLA-A',
    'ENSG00000108107_RPL28',
    'ENSG00000143226_FCGR2A',
    'ENSG00000133112_TPT1',
    'ENSG00000166091_CMTM5',
    'ENSG00000026025_VIM',
    'ENSG00000205542_TMSB4X',
    'ENSG00000109099_PMP22',
    'ENSG00000145425_RPS3A',
    'ENSG00000172247_C1QTNF4',
    'ENSG00000072274_TFRC',
    'ENSG00000234745_HLA-B',
    'ENSG00000075340_ADD2',
    'ENSG00000119865_CNRIP1',
    'ENSG00000198938_MT-CO3',
    'ENSG00000135046_ANXA1',
    'ENSG00000235169_SMIM1',
    'ENSG00000101200_AVP',
    'ENSG00000167996_FTH1',
    'ENSG00000163565_IFI16',
    'ENSG00000117450_PRDX1',
    'ENSG00000124570_SERPINB6',
    'ENSG00000112077_RHAG',
    'ENSG00000051523_CYBA',
    'ENSG00000107130_NCS1',
    'ENSG00000055118_KCNH2',
    'ENSG00000029534_ANK1',
    'ENSG00000169567_HINT1',
    'ENSG00000142089_IFITM3',
    'ENSG00000139278_GLIPR1',
    'ENSG00000142227_EMP3',
    'ENSG00000076662_ICAM3',
    'ENSG00000143627_PKLR',
    'ENSG00000130755_GMFG',
    'ENSG00000160593_JAML',
    'ENSG00000095932_SMIM24',
    'ENSG00000197956_S100A6',
    'ENSG00000171476_HOPX',
    'ENSG00000116675_DNAJC6',
    'ENSG00000100448_CTSG',
    'ENSG00000100368_CSF2RB',
    'ENSG00000047648_ARHGAP6',
    'ENSG00000198918_RPL39',
    'ENSG00000196154_S100A4',
    'ENSG00000233968_AL157895.1',
    'ENSG00000137642_SORL1',
    'ENSG00000133816_MICAL2',
    'ENSG00000130208_APOC1',
    'ENSG00000105610_KLF1',
]
len(same_name_cols)

84

In [27]:
# Copy the selected columns out as plain NumPy arrays so they can be
# re-attached after the rest of the frame has been reduced.
train_inp_sn = train_inp[same_name_cols].to_numpy()
test_inp_sn = test_inp[same_name_cols].to_numpy()
train_inp_sn.shape, test_inp_sn.shape

((70988, 84), (48663, 84))

In [28]:
# Pull the donor column out as a 2-D (n_rows, 1) array; the list selector
# keeps the second axis so it can be stacked column-wise later.
train_inp_dn = train_inp[['donor']].to_numpy()
test_inp_dn = test_inp[['donor']].to_numpy()
train_inp_dn.shape, test_inp_dn.shape

((70988, 1), (48663, 1))

In [29]:
# train_inp_ct = train_inp[meta_data_cols].values
# test_inp_ct = test_inp[meta_data_cols].values
# train_inp_ct.shape, test_inp_ct.shape

In [30]:
%%time
# Strip the columns that were already extracted as dense arrays (the selected
# columns and 'donor'), then store what remains as CSR sparse matrices.
# Each frame is rebound before conversion so the wider frame can be freed.
train_inp = train_inp.drop(columns=same_name_cols + ['donor'])
train_inp = scipy.sparse.csr_matrix(train_inp.values)
test_inp = test_inp.drop(columns=same_name_cols + ['donor'])
test_inp = scipy.sparse.csr_matrix(test_inp.values)
gc.collect()

CPU times: user 1min 5s, sys: 7.13 s, total: 1min 12s
Wall time: 1min 12s


84

Reduce dimension

In [31]:
from sklearn.decomposition import TruncatedSVD

In [32]:
%%time
## Baseline
# Fit a 64-component truncated SVD on the training matrix only, report how
# much variance those components explain, and project the test matrix with
# the same fitted transformer.
train_inp_tsvd = TruncatedSVD(n_components=64, random_state=42)
train_inp = train_inp_tsvd.fit_transform(train_inp)
print('Sum of Explained Variance: ',
      train_inp_tsvd.explained_variance_ratio_.sum())
test_inp = train_inp_tsvd.transform(test_inp)

Sum of Explained Variance:  0.123098195
CPU times: user 2min 15s, sys: 3.41 s, total: 2min 18s
Wall time: 2min 12s


### Standardization

In [33]:
from sklearn.preprocessing import StandardScaler

In [34]:
# Place the selected dense columns to the right of the SVD components.
train_inp = np.concatenate([train_inp, train_inp_sn], axis=1)
test_inp = np.concatenate([test_inp, test_inp_sn], axis=1)

In [35]:
# The selected-column arrays are now part of train_inp / test_inp; release
# the standalone copies and run a garbage-collection pass.
del train_inp_sn, test_inp_sn
gc.collect()

84

In [36]:
%%time
# Fit the scaler on the training matrix only and reuse the fitted scaler for
# the test matrix, so both are scaled with the same statistics.
sc = StandardScaler()
train_inp = sc.fit_transform(train_inp)
test_inp = sc.transform(test_inp)

CPU times: user 157 ms, sys: 27.1 ms, total: 184 ms
Wall time: 183 ms


In [37]:
# Append the donor column as the last feature. This happens after the
# StandardScaler step, so the donor values are left unscaled.
train_inp = np.concatenate([train_inp, train_inp_dn], axis=1)
test_inp = np.concatenate([test_inp, test_inp_dn], axis=1)

In [38]:
# The donor arrays are now part of train_inp / test_inp; release the
# standalone copies and run a garbage-collection pass.
del train_inp_dn, test_inp_dn
gc.collect()

63

In [39]:
# Display the final feature-matrix shapes before they are written to disk.
train_inp.shape, test_inp.shape

((70988, 149), (48663, 149))

In [40]:
%%time
# Persist both feature matrices with pickle; the relative file names (no
# extension) are resolved against the current working directory.
dump_pickle(train_inp, 'train_inp')
dump_pickle(test_inp, 'test_inp')

CPU times: user 21.8 ms, sys: 99 ms, total: 121 ms
Wall time: 120 ms
