In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
import warnings 
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation and dtype warnings
# raised by the pandas / scipy calls in later cells are hidden as well.
warnings.filterwarnings('ignore')

In [3]:
import pickle

def dump_pickle(file, filename):
    """Serialize an object to disk with pickle.

    Parameters
    ----------
    file : object
        The Python object to serialize (despite the name, this is the
        payload, not a file handle).
    filename : str or path-like
        Destination path; opened in binary write mode, so an existing file
        is overwritten.

    Returns
    -------
    None
    """
    # The context manager closes the handle even when pickle.dump raises
    # (e.g. on an unpicklable object), which the manual open/close did not.
    with open(filename, 'wb') as outfile:
        pickle.dump(file, outfile)

def load_pickle(filename):
    """Load and return a pickled object from disk.

    Parameters
    ----------
    filename : str or path-like
        Path of the pickle file; opened in binary read mode.

    Returns
    -------
    object
        Whatever object was stored in the file.

    Notes
    -----
    Unpickling can execute arbitrary code, so only load files that come
    from a trusted source.
    """
    # The context manager closes the handle even when pickle.load raises
    # (e.g. on a truncated or corrupt file).
    with open(filename, 'rb') as infile:
        return pickle.load(infile)

In [4]:
# Directory holding the competition input files; the conversion functions
# below use it as their default source directory.
DATA_DIR = '../input/open-problems-multimodal'
# List the input files with human-readable sizes.
%ls $DATA_DIR -lh

total 27G
-rw-r--r-- 1 nobody nogroup 2.3G Sep  7 21:12 evaluation_ids.csv
-rw-r--r-- 1 nobody nogroup 9.4M Sep  7 21:10 metadata.csv
-rw-r--r-- 1 nobody nogroup 230K Sep  7 21:10 metadata_cite_day_2_donor_27678.csv
-rw-r--r-- 1 nobody nogroup 805M Sep  7 21:11 sample_submission.csv
-rw-r--r-- 1 nobody nogroup 1.6G Sep  7 21:12 test_cite_inputs.h5
-rw-r--r-- 1 nobody nogroup 294M Sep  7 21:10 test_cite_inputs_day_2_donor_27678.h5
-rw-r--r-- 1 nobody nogroup 6.1G Sep  7 21:13 test_multi_inputs.h5
-rw-r--r-- 1 nobody nogroup 2.4G Sep  7 21:12 train_cite_inputs.h5
-rw-r--r-- 1 nobody nogroup  37M Sep  7 21:10 train_cite_targets.h5
-rw-r--r-- 1 nobody nogroup  11G Sep  7 21:13 train_multi_inputs.h5
-rw-r--r-- 1 nobody nogroup 3.0G Sep  7 21:12 train_multi_targets.h5


## CSV --> Parquet

In [5]:
def convert_csv_to_parquet(filename, DATA_DIR=DATA_DIR):
    """Read one CSV from ``DATA_DIR`` and write it back out as Parquet.

    Parameters
    ----------
    filename : str
        Name of the CSV file inside ``DATA_DIR``.
    DATA_DIR : str
        Directory containing the input file.

    Returns
    -------
    None
        The Parquet file is written to the current working directory under
        ``filename`` with ``.csv`` replaced by ``.parquet``. Start and end
        markers are printed to stdout.
    """
    # Adapted from the following notebook:
    # https://www.kaggle.com/code/fabiencrom/multimodal-single-cell-creating-sparse-data/

    print(f'### Start | {filename} ###')

    source_path = f'{DATA_DIR}/{filename}'
    target_name = filename.replace('.csv', '.parquet')

    # The whole CSV is loaded into memory before being written out.
    frame = pd.read_csv(source_path)
    frame.to_parquet(target_name)

    print('### End ###')
    print()

In [6]:
# Collect the CSV inputs. Match on the file extension rather than a
# substring so names such as 'x.csv.bak' or 'x.csv.gz' are not picked up
# and handed to the CSV -> Parquet converter.
csv_files = [f for f in os.listdir(DATA_DIR)
             if f.endswith('.csv')]
csv_files

['sample_submission.csv',
 'metadata_cite_day_2_donor_27678.csv',
 'evaluation_ids.csv',
 'metadata.csv']

In [7]:
%%time
# Convert every CSV listed in csv_files; each Parquet file is written to the
# current working directory (the output name carries no directory prefix).
for filename in csv_files:
    convert_csv_to_parquet(filename)

### Start | sample_submission.csv ###
### End ###

### Start | metadata_cite_day_2_donor_27678.csv ###
### End ###

### Start | evaluation_ids.csv ###
### End ###

### Start | metadata.csv ###
### End ###

CPU times: user 1min, sys: 10.9 s, total: 1min 10s
Wall time: 1min 46s


## H5 --> Sparse 

In [8]:
import scipy.sparse

In [9]:
def convert_h5_to_sparse_csr(filename, chunksize=2500, DATA_DIR=DATA_DIR):
    """Convert one HDF5 table into a sparse CSR matrix plus its labels.

    The file is read with ``pd.read_hdf`` in windows of ``chunksize`` rows so
    the dense table is never held in memory at once. Each window is turned
    into a ``scipy.sparse.csr_matrix``; the windows are stacked vertically
    when saving.

    Parameters
    ----------
    filename : str
        Name of the ``.h5`` file inside ``DATA_DIR``.
    chunksize : int
        Number of rows requested per read.
    DATA_DIR : str
        Directory containing the input file.

    Returns
    -------
    None
        Two files are written to the current working directory, named after
        ``filename`` with ``.h5`` removed:

        * ``<name>_val.sparse`` passed to ``scipy.sparse.save_npz`` (which
          appends ``.npz``) - the stacked values;
        * ``<name>_idx.npz`` - arrays ``index`` (row labels of all chunks)
          and ``columns`` (column labels taken from the first chunk).

    Raises
    ------
    AssertionError
        If a later chunk's column labels differ from the first chunk's.
    """
    # Adapted from the following notebook:
    # https://www.kaggle.com/code/fabiencrom/multimodal-single-cell-creating-sparse-data/

    source_path = f'{DATA_DIR}/{filename}'

    value_blocks, index_blocks = [], []
    column_labels = None
    offset = 0
    rows_seen = 0

    print(f'### Start | {filename} ###')
    print('Start Reading...')

    while True:
        frame = pd.read_hdf(source_path, start=offset, stop=offset + chunksize)
        n_rows = len(frame)

        # An empty window means the previous one ended exactly on the last row.
        if n_rows == 0:
            break

        value_blocks.append(scipy.sparse.csr_matrix(frame.to_numpy()))
        index_blocks.append(frame.index.to_numpy())

        # Every window must expose the same columns as the first one.
        if column_labels is not None:
            assert np.all(column_labels == frame.columns.to_numpy())
        else:
            column_labels = frame.columns.to_numpy()

        rows_seen += n_rows
        print(f'We are at row {rows_seen}')

        # Drop the dense window before the next one is read.
        del frame

        # A short window is necessarily the final one.
        if n_rows < chunksize:
            break

        offset += chunksize

    print('Done.')
    print('Start saving files...')

    stem = filename.replace('.h5', '')

    # The stacked matrix is built inline so it is released right after saving.
    scipy.sparse.save_npz(f'{stem}_val.sparse', scipy.sparse.vstack(value_blocks))
    del value_blocks

    np.savez(f'{stem}_idx.npz', index=np.hstack(index_blocks), columns=column_labels)
    del index_blocks

    print('Done.')
    print('### End ###')
    print()
    

In [10]:
# Collect the HDF5 inputs. Match on the file extension rather than a
# substring so names such as 'x.h5.bak' or 'x.h5ad' are not picked up and
# handed to the H5 -> sparse converter.
h5_files = [f for f in os.listdir(DATA_DIR)
            if f.endswith('.h5')]
h5_files

['train_cite_targets.h5',
 'test_multi_inputs.h5',
 'train_cite_inputs.h5',
 'train_multi_targets.h5',
 'train_multi_inputs.h5',
 'test_cite_inputs_day_2_donor_27678.h5',
 'test_cite_inputs.h5']

In [11]:
# Convert every HDF5 file listed in h5_files; each call writes a sparse
# values file and an index/columns .npz file to the current working directory.
for filename in h5_files:
    convert_h5_to_sparse_csr(filename)

### Start | train_cite_targets.h5 ###
Start Reading...
We are at row 2500
We are at row 5000
We are at row 7500
We are at row 10000
We are at row 12500
We are at row 15000
We are at row 17500
We are at row 20000
We are at row 22500
We are at row 25000
We are at row 27500
We are at row 30000
We are at row 32500
We are at row 35000
We are at row 37500
We are at row 40000
We are at row 42500
We are at row 45000
We are at row 47500
We are at row 50000
We are at row 52500
We are at row 55000
We are at row 57500
We are at row 60000
We are at row 62500
We are at row 65000
We are at row 67500
We are at row 70000
We are at row 70988
Done.
Start saving files...
Done.
### End ###

### Start | test_multi_inputs.h5 ###
Start Reading...
We are at row 2500
We are at row 5000
We are at row 7500
We are at row 10000
We are at row 12500
We are at row 15000
We are at row 17500
We are at row 20000
We are at row 22500
We are at row 25000
We are at row 27500
We are at row 30000
We are at row 32500
We are at 

## Summary

In [12]:
# List the files in the current working directory with human-readable sizes.
%ls -lh

total 7.1G
---------- 1 root root  38K Oct 24 17:40 __notebook__.ipynb
-rw-r--r-- 1 root root 359M Oct 24 16:44 evaluation_ids.parquet
-rw-r--r-- 1 root root 3.8M Oct 24 16:44 metadata.parquet
-rw-r--r-- 1 root root 108K Oct 24 16:43 metadata_cite_day_2_donor_27678.parquet
-rw-r--r-- 1 root root 252M Oct 24 16:43 sample_submission.parquet
-rw-r--r-- 1 root root 856K Oct 24 17:36 test_cite_inputs_day_2_donor_27678_idx.npz
-rw-r--r-- 1 root root  78M Oct 24 17:36 test_cite_inputs_day_2_donor_27678_val.sparse.npz
-rw-r--r-- 1 root root 1.8M Oct 24 17:40 test_cite_inputs_idx.npz
-rw-r--r-- 1 root root 488M Oct 24 17:40 test_cite_inputs_val.sparse.npz
-rw-r--r-- 1 root root 8.4M Oct 24 16:57 test_multi_inputs_idx.npz
-rw-r--r-- 1 root root 1.7G Oct 24 16:57 test_multi_inputs_val.sparse.npz
-rw-r--r-- 1 root root 2.2M Oct 24 17:02 train_cite_inputs_idx.npz
-rw-r--r-- 1 root root 712M Oct 24 17:02 train_cite_inputs_val.sparse.npz
-rw-r--r-- 1 root root 1.5M Oct 24 16:44 train_ci