In [3]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

In [4]:
import warnings 
warnings.filterwarnings('ignore')

In [5]:
DATA_DIR = '../input/open-problems-multimodal'
%ls $DATA_DIR

evaluation_ids.csv                     test_multi_inputs.h5
metadata.csv                           train_cite_inputs.h5
metadata_cite_day_2_donor_27678.csv    train_cite_targets.h5
sample_submission.csv                  train_multi_inputs.h5
test_cite_inputs.h5                    train_multi_targets.h5
test_cite_inputs_day_2_donor_27678.h5


## metadata.csv

In [6]:
metadata = pd.read_csv(f'{DATA_DIR}/metadata.csv')
print(metadata.shape)
metadata.head()

(281528, 5)


Unnamed: 0,cell_id,day,donor,cell_type,technology
0,c2150f55becb,2,27678,HSC,citeseq
1,65b7edf8a4da,2,27678,HSC,citeseq
2,c1b26cb1057b,2,27678,EryP,citeseq
3,917168fa6f83,2,27678,NeuP,citeseq
4,2b29feeca86d,2,27678,EryP,citeseq


- `cell_id`: A unique identifier for each observed cell.
- `donor`: An identifier for the four cell donors.
- `day`: The day of the experiment the observation was made.
- `technology`: Either `citeseq` or `multiome`.
- `cell_type`: One of the above cell types or else `hidden`.

In [7]:
metadata.day.unique()

array([ 2,  3,  4,  7, 10])

In [8]:
metadata.donor.unique()

array([27678, 32606, 13176, 31800])

In [9]:
metadata_citeseq = metadata[metadata['technology'] == 'citeseq']
metadata_citeseq.shape

(119651, 5)

In [10]:
metadata_citeseq.groupby(['day'])[['cell_id']].count()

Unnamed: 0_level_0,cell_id
day,Unnamed: 1_level_1
2,29418
3,27389
4,35977
7,26867


In [16]:
metadata_citeseq.groupby(['donor'])[['cell_id']].count()

Unnamed: 0_level_0,cell_id
donor,Unnamed: 1_level_1
13176,29394
27678,28043
31800,30974
32606,31240


In [11]:
metadata_citeseq.groupby(['day', 'cell_type'])[['cell_id']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,cell_id
day,cell_type,Unnamed: 2_level_1
2,BP,154
2,EryP,4885
2,HSC,14296
2,MasP,2617
2,MkP,2834
2,MoP,80
2,NeuP,4552
3,BP,17
3,EryP,5925
3,HSC,11773


In [12]:
metadata_multiome = metadata[metadata['technology'] == 'multiome']
metadata_multiome.shape

(161877, 5)

In [13]:
metadata_multiome.groupby(['day'])[['cell_id']].count()

Unnamed: 0_level_0,cell_id
day,Unnamed: 1_level_1
2,32832
3,36765
4,31134
7,29373
10,31773


In [17]:
metadata_multiome.groupby(['donor'])[['cell_id']].count()

Unnamed: 0_level_0,cell_id
donor,Unnamed: 1_level_1
13176,43817
27678,32952
31800,43989
32606,41119


In [14]:
metadata_multiome.groupby(['day', 'cell_type'])[['cell_id']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,cell_id
day,cell_type,Unnamed: 2_level_1
2,BP,262
2,EryP,3402
2,HSC,10757
2,MasP,2175
2,MkP,3394
2,MoP,258
2,NeuP,3663
2,hidden,8921
3,BP,121
3,EryP,4699


## Multiome
### test_multi_inputs.h5
### train_multi_inputs.h5
### train_multi_targets.h5

In [13]:
train_multi_5 = pd.read_hdf(f'{DATA_DIR}/train_multi_inputs.h5',
                            stop=5)
print(train_multi_5.shape)
train_multi_5

(5, 228942)


gene_id,GL000194.1:114519-115365,GL000194.1:55758-56597,GL000194.1:58217-58957,GL000194.1:59535-60431,GL000195.1:119766-120427,GL000195.1:120736-121603,GL000195.1:137437-138345,GL000195.1:15901-16653,GL000195.1:22357-23209,GL000195.1:23751-24619,...,chrY:7722278-7723128,chrY:7723971-7724880,chrY:7729854-7730772,chrY:7731785-7732664,chrY:7810142-7811040,chrY:7814107-7815018,chrY:7818751-7819626,chrY:7836768-7837671,chrY:7869454-7870371,chrY:7873814-7874709
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
56390cf1b95e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.428336,0.0,0.0,0.0,0.0
fc0c60183c33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9b4a87e22ad0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
81cccad8cd81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15cb3d85c232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
train_multi_targets_5 = pd.read_hdf(f'{DATA_DIR}/train_multi_targets.h5',
                                    stop=5)
print(train_multi_targets_5.shape)
train_multi_targets_5

(5, 23418)


gene_id,ENSG00000121410,ENSG00000268895,ENSG00000175899,ENSG00000245105,ENSG00000166535,ENSG00000256661,ENSG00000184389,ENSG00000128274,ENSG00000094914,ENSG00000081760,...,ENSG00000086827,ENSG00000174442,ENSG00000122952,ENSG00000198205,ENSG00000198455,ENSG00000070476,ENSG00000203995,ENSG00000162378,ENSG00000159840,ENSG00000074755
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
56390cf1b95e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.893861,0.0,0.0,0.0,0.0,5.583255,0.0,4.893861
fc0c60183c33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9b4a87e22ad0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.107832,0.0,0.0,0.0,0.0,0.0,0.0,5.107832
81cccad8cd81,0.0,4.507936,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.195558,4.507936,0.0,0.0,0.0,0.0,0.0,0.0,5.195558
15cb3d85c232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.531572,0.0,0.0,4.842377,0.0


In [42]:
train_multi_targets_5.columns.shape

(23418,)

In [43]:
len(set(train_multi_targets_5.columns))

23418

## CITEseq
### test_cite_inputs.h5
### train_cite_inputs.h5
### train_cite_targets.h5

In [15]:
train_cite_5 = pd.read_hdf(f'{DATA_DIR}/train_cite_inputs.h5',
                           stop=5)
print(train_cite_5.shape)
train_cite_5

(5, 22050)


gene_id,ENSG00000121410_A1BG,ENSG00000268895_A1BG-AS1,ENSG00000175899_A2M,ENSG00000245105_A2M-AS1,ENSG00000166535_A2ML1,ENSG00000128274_A4GALT,ENSG00000094914_AAAS,ENSG00000081760_AACS,ENSG00000109576_AADAT,ENSG00000103591_AAGAB,...,ENSG00000153975_ZUP1,ENSG00000086827_ZW10,ENSG00000174442_ZWILCH,ENSG00000122952_ZWINT,ENSG00000198205_ZXDA,ENSG00000198455_ZXDB,ENSG00000070476_ZXDC,ENSG00000162378_ZYG11B,ENSG00000159840_ZYX,ENSG00000074755_ZZEF1
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.090185,0.0
d02759a80ba2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.039545,0.0,0.0,0.0,0.0,0.0,0.0
c016c6b0efa5,0.0,0.0,0.0,0.0,0.0,3.847321,0.0,3.847321,3.847321,0.0,...,0.0,0.0,3.847321,4.529743,0.0,0.0,0.0,3.847321,3.847321,0.0
ba7f733a4f75,0.0,0.0,0.0,0.0,0.0,0.0,3.436846,3.436846,0.0,0.0,...,3.436846,0.0,4.11378,5.020215,0.0,0.0,0.0,3.436846,4.11378,0.0
fbcf2443ffb2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.196826,0.0,0.0,...,0.0,4.196826,4.196826,4.196826,0.0,0.0,3.51861,4.196826,3.51861,0.0


In [16]:
train_cite_targets_5 = pd.read_hdf(f'{DATA_DIR}/train_cite_targets.h5',
                                   stop=5)
print(train_cite_targets_5.shape)
train_cite_targets_5

(5, 140)


gene_id,CD86,CD274,CD270,CD155,CD112,CD47,CD48,CD40,CD154,CD52,...,CD94,CD162,CD85j,CD23,CD328,HLA-E,CD82,CD101,CD88,CD224
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,1.167804,0.62253,0.106959,0.324989,3.331674,6.426002,1.480766,-0.728392,-0.468851,-0.073285,...,-0.44839,3.220174,-0.533004,0.674956,-0.006187,0.682148,1.398105,0.414292,1.780314,0.54807
d02759a80ba2,0.81897,0.506009,1.078682,6.848758,3.524885,5.279456,4.930438,2.069372,0.333652,-0.468088,...,0.323613,8.407108,0.131301,0.047607,-0.243628,0.547864,1.832587,0.982308,2.736507,2.184063
c016c6b0efa5,-0.356703,-0.422261,-0.824493,1.137495,0.518924,7.221962,-0.375034,1.738071,0.142919,-0.97146,...,1.348692,4.888579,-0.279483,-0.131097,-0.177604,-0.689188,9.013709,-1.182975,3.958148,2.8686
ba7f733a4f75,-1.201507,0.149115,2.022468,6.021595,7.25867,2.792436,21.708519,-0.137913,1.649969,-0.75468,...,1.504426,12.391979,0.511394,0.587863,-0.752638,1.714851,3.893782,1.799661,1.537249,4.407671
fbcf2443ffb2,-0.100404,0.697461,0.625836,-0.298404,1.369898,3.254521,-1.65938,0.643531,0.90271,1.291877,...,0.777023,6.496499,0.279898,-0.84195,-0.869419,0.675092,5.259685,-0.835379,9.631781,1.765445


In [17]:
from sklearn.neighbors import KNeighborsRegressor

In [18]:
stop = 100
train_X = pd.read_hdf(f'{DATA_DIR}/train_cite_inputs.h5',
                      stop=stop)
train_y = pd.read_hdf(f'{DATA_DIR}/train_cite_targets.h5',
                      stop=stop)
test_X = pd.read_hdf(f'{DATA_DIR}/test_cite_inputs.h5',
                     stop=stop)

neigh = KNeighborsRegressor(n_neighbors=9)
neigh.fit(train_X, train_y) 
test_y = pd.DataFrame(neigh.predict(test_X))
test_y.head()



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,130,131,132,133,134,135,136,137,138,139
0,0.602595,0.147015,0.402926,3.66307,4.117172,6.091884,7.268775,0.324506,0.095805,-0.040123,...,0.027835,7.570658,0.151437,0.542508,0.131107,0.18803,3.154746,0.503254,2.026004,1.735024
1,0.893137,0.221735,0.502002,3.16167,4.068516,6.480602,6.271758,0.222567,0.302694,-0.016387,...,0.135401,6.746157,0.267966,0.501037,0.153128,0.200685,2.637025,0.504245,2.139507,1.818086
2,0.327479,0.257544,1.055558,3.879193,3.665216,5.543172,3.791518,0.19392,0.593375,-0.062817,...,0.324022,9.450126,0.972078,0.310853,0.040096,0.507648,5.909796,0.355101,5.368031,2.261901
3,-0.14273,0.17038,0.761028,5.337038,5.050567,6.363215,8.327069,-0.398113,0.518615,-0.123817,...,0.005739,9.392337,0.644449,0.206117,0.000414,0.388694,4.828023,0.489559,2.740404,2.192419
4,-0.097478,0.501916,0.805733,3.734261,3.906002,4.813427,2.497361,-0.019202,0.448496,0.106849,...,-0.156334,8.950379,0.955797,-0.030218,-0.056655,0.392656,4.999793,-0.037914,4.667086,2.218417


## evaluation_ids.csv

In [24]:
evaluation_ids_200 = pd.read_csv(f'{DATA_DIR}/evaluation_ids.csv',
                                 nrows=200)
evaluation_ids_200.head()

Unnamed: 0,row_id,cell_id,gene_id
0,0,c2150f55becb,CD86
1,1,c2150f55becb,CD274
2,2,c2150f55becb,CD270
3,3,c2150f55becb,CD155
4,4,c2150f55becb,CD112


In [25]:
evaluation_ids_200[evaluation_ids_200.cell_id == 'c2150f55becb'].shape

(140, 3)

In [32]:
ps = evaluation_ids_200[evaluation_ids_200.cell_id == 'c2150f55becb'].gene_id.values

In [28]:
eval_ids = pd.read_csv(f'{DATA_DIR}/evaluation_ids.csv')
len(eval_ids.gene_id.unique())

23558

In [30]:
eval_ids.head()

Unnamed: 0,row_id,cell_id,gene_id
0,0,c2150f55becb,CD86
1,1,c2150f55becb,CD274
2,2,c2150f55becb,CD270
3,3,c2150f55becb,CD155
4,4,c2150f55becb,CD112


In [29]:
eval_ids.tail()

Unnamed: 0,row_id,cell_id,gene_id
65744175,65744175,2c53aa67933d,ENSG00000134419
65744176,65744176,2c53aa67933d,ENSG00000186862
65744177,65744177,2c53aa67933d,ENSG00000170959
65744178,65744178,2c53aa67933d,ENSG00000107874
65744179,65744179,2c53aa67933d,ENSG00000166012


In [33]:
multi = eval_ids[~eval_ids['gene_id'].isin(ps)]
multi.groupby(['cell_id'])[['gene_id']].count().head()

Unnamed: 0_level_0,gene_id
cell_id,Unnamed: 1_level_1
00038e5227c6,3512
00074a4b8e79,3512
000d989e046d,3512
00116a3f5ca8,3512
0016247b891d,3512


In [34]:
len(multi.cell_id.unique()) * 3512

58931360

In [36]:
len(eval_ids)

65744180

In [37]:
len(eval_ids[eval_ids['gene_id'].isin(ps)].cell_id.unique()) * 140

6812820

In [38]:
6812820 + 58931360

65744180

In [39]:
58931360 / 3512

16780.0

In [40]:
6812820 / 140

48663.0

## submission.csv

In [26]:
sample_sub_200 = pd.read_csv(f'{DATA_DIR}/sample_submission.csv',
                             nrows=200)
sample_sub_200.head()

Unnamed: 0,row_id,target
0,0,0.0
1,1,0.0
2,2,0.0
3,3,0.0
4,4,0.0
