1. Get sample data with 212 columns (128 svd + 84 handselected genes)
- get 5/50/n samples per cell type: get_samples()
- save resulting df: X_test_shap_....h5ad
- prepare private test data:
 - svd
 - also apply get_samples()
 - save private_test_input_sample.h5ad

 2. Same steps for sample data with 148 columns (64 svd + 84 handselected genes)

In [1]:
%%capture output
!pip install ipywidgets
!pip install --upgrade jupyter
!pip install IProgress
!pip install anndata

In [2]:
import numpy as np
import pandas as pd
import os

import pickle

import scanpy as sc
import anndata as ad
import scipy

## create samples representing each cell type for public test data - model #16

### add cell_ids and cell_type to train and test data

cell_type is contained in the metadata. That's why the test data will be annotated with the metadata. Afterwards, it is possible to sample n cells from each cell_type.

In [3]:
lrz_path = '/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di93zoj/'
index_path = lrz_path + 'open-problems-multimodal-3rd-solution/input/preprocess/cite/'
cite_feature_path = lrz_path + 'open-problems-multimodal-3rd-solution/input/features/cite/'   # '../../../input/features/cite/'

private_data_path = lrz_path + 'kaggle/full_data'

In [4]:
# train_ids = np.load(index_path + "train_cite_raw_inputs_idxcol.npz", allow_pickle=True)
# test_ids = np.load(index_path + "test_cite_raw_inputs_idxcol.npz", allow_pickle=True)

# load saved indices
train_index = np.load(index_path + "train_cite_raw_inputs_idxcol.npz", allow_pickle=True)["index"]
# train_column = np.load(index_path + "train_cite_raw_inputs_idxcol.npz", allow_pickle=True)["columns"]  # not used
test_index = np.load(index_path + "test_cite_raw_inputs_idxcol.npz", allow_pickle=True)["index"]
print(len(list(train_index)))
print(len(list(test_index)))

70988
48203


In [5]:
# load metadata and filter for cell_id present in test_index
metadata = pd.read_csv(lrz_path + 'neurips_competition_data/metadata.csv')
metadata_filtered = metadata[metadata['cell_id'].isin(test_index)]
metadata_filtered.index=metadata_filtered['cell_id']
metadata_filtered = metadata_filtered.drop('cell_id', axis=1)
metadata_filtered.shape   # somehow only 41187 matching cell_ids instead of 48203

(41187, 4)

In [6]:
# X_train_cell_ids and X_test_cell_ids are the train and test sets corresponding to model #16, annotated with cell_ids
# X_test_cell_ids is additionally annotated with metadata which stores the cell type

X_train_cell_ids = pd.read_pickle(cite_feature_path  + 'X_svd_128.pickle')   # == X_svd_128 in make-features second to last cell == train set for model #16
X_train_cell_ids = ad.AnnData(X=X_train_cell_ids)
X_train_cell_ids.obs_names = train_index
# X_train_cell_ids.to_df().head()

# cell type from metadata
X_test_cell_ids = pd.read_pickle(cite_feature_path  + 'X_test_svd_128.pickle')   # == test set for model #16
X_test_cell_ids = ad.AnnData(X=X_test_cell_ids)
X_test_cell_ids.obs_names = test_index
X_test_cell_ids = X_test_cell_ids[X_test_cell_ids.obs_names.isin(metadata_filtered.index), :]
X_test_cell_ids.obs = metadata_filtered
X_test_cell_ids



AnnData object with n_obs × n_vars = 41187 × 212
    obs: 'day', 'donor', 'cell_type', 'technology'

In [7]:
X_test_cell_ids.obs['cell_type'].value_counts()

cell_type
HSC     9451
MasP    9064
EryP    8788
NeuP    7719
MkP     4844
MoP     1215
BP       106
Name: count, dtype: int64

### create dataset of n samples per cell type for SHAP beeswarm plot

In [8]:
os.chdir('../..')

In [9]:
# get_samples() returns a subset of the data which contains samples_per_cell_type samples per cell type
def get_samples(samples_per_cell_type, data):
    ''' data: AnnData with obs 'cell_type' '''
    
    unique_types = np.unique(data.obs['cell_type'])

    sample_obs = []

    # get indices of n samples per cell type
    for t in unique_types:
        sampled_rows = data[data.obs.cell_type == t].obs.sample(n=samples_per_cell_type, random_state=42)
        sample_obs.append(sampled_rows.index) 

    # select rows in data with matching indices
    X_test_shap = data[[elem for index_obj in sample_obs for elem in index_obj.tolist()]]

    return X_test_shap

X_test_shap = get_samples(50, X_test_cell_ids)

In [15]:
# rename imp_ columns to gene ids stored in handselected_84_gene_ids
handselected_84_gene_ids = np.loadtxt('2.preprocess_to_feature/cite/handselected_84_gene_ids.txt', dtype=str)

In [16]:
# X_test_shap currently has base_svd and imp columns -> rename imp (handselected genes)
X_test_shap.var_names = X_test_shap.var_names[:-84].tolist() + handselected_84_gene_ids.tolist()
print(X_test_shap.shape)
X_test_shap.to_df().head(1)

(350, 212)


Unnamed: 0_level_0,base_svd_0,base_svd_1,base_svd_2,base_svd_3,base_svd_4,base_svd_5,base_svd_6,base_svd_7,base_svd_8,base_svd_9,...,ENSG00000188404_SELL,ENSG00000124570_SERPINB6,ENSG00000235169_SMIM1,ENSG00000095932_SMIM24,ENSG00000137642_SORL1,ENSG00000128040_SPINK2,ENSG00000072274_TFRC,ENSG00000205542_TMSB4X,ENSG00000133112_TPT1,ENSG00000026025_VIM
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5cb9daaca7ac,71.003983,13.2008,-6.168917,12.436371,-0.578827,0.227373,1.382157,0.969637,-1.477851,6.248293,...,0.0,0.0,0.0,0.0,0.0,0.947536,0.0,5.313975,4.545222,3.069516


In [17]:
X_test_shap.write("4.model/pred/X_test_shap_16_50_samples.h5ad")

## create samples representing each cell type for public test data - model #17

In [18]:
# X_train_cell_ids and X_test_cell_ids are the train and test sets corresponding to model #17, annotated with cell_ids
# X_test_cell_ids is additionally annotated with metadata which stores the cell type

X_train_cell_ids = pd.read_pickle(cite_feature_path  + 'X_svd_64.pickle')   # == X_svd_64 in make-features second to last cell == train set for model #17
X_train_cell_ids = ad.AnnData(X=X_train_cell_ids)
X_train_cell_ids.obs_names = train_index

# cell type from metadata
X_test_cell_ids = pd.read_pickle(cite_feature_path  + 'X_test_svd_64.pickle')   # == test set for model #17
X_test_cell_ids = ad.AnnData(X=X_test_cell_ids)
X_test_cell_ids.obs_names = test_index
X_test_cell_ids = X_test_cell_ids[X_test_cell_ids.obs_names.isin(metadata_filtered.index), :]
X_test_cell_ids.obs = metadata_filtered
X_test_cell_ids



AnnData object with n_obs × n_vars = 41187 × 148
    obs: 'day', 'donor', 'cell_type', 'technology'

In [19]:
# use same sampled rows from model 16 (X_test_shap) for model 17:
X_test_shap_17 = X_test_cell_ids[X_test_shap.obs_names]   

# rename imp_ columns to gene ids stored in handselected_84_gene_ids
X_test_shap_17.var_names = X_test_shap_17.var_names[:-84].tolist() + handselected_84_gene_ids.tolist()

In [20]:
X_test_shap_17.write("4.model/pred/X_test_shap_17_50_samples.h5ad")

## create samples representing each cell type for private test data
In this case, also need to apply SVD transform.\
First: 128 svd components -> use resulting sample dataset together with model #16.

In [10]:
metadata = pd.read_pickle(private_data_path + '/20220830_citeseq_metadata_cells.pkl')
metadata.shape

(119191, 4)

In [11]:
private_train_input = pd.read_pickle(private_data_path + '/20220830_citeseq_rna_count_train.pkl')
private_train_input.shape

(70988, 22085)

In [12]:
private_test_input = pd.read_pickle(private_data_path + '/20220830_citeseq_rna_count_test_input_private.pkl') # already has gene names (ordered alphabetically)
private_test_input.shape

(26867, 22085)

In [13]:
private_test_target = pd.read_pickle(private_data_path + '/20220830_citeseq_prot_count_test_labels_private.pkl')
private_test_target.shape

(26867, 140)

In [14]:
# filter metadata to get matching cells  -> then use as observation to annotate data with cell_type
metadata_filtered = metadata[metadata.index.isin(private_test_input.index)]
metadata_filtered.head()

Unnamed: 0,kaggle_dataset,day,donor,cell_type
"CATAAGCAGCATGATA-1-('27678', 7)",test_private,7,27678,MasP
"GGTGATTGTATGAAAC-1-('27678', 7)",test_private,7,27678,HSC
"ATAGGCTAGCTCTGTA-1-('27678', 7)",test_private,7,27678,MasP
"TCATGAGGTGGATTTC-1-('27678', 7)",test_private,7,27678,MasP
"TAAGCGTTCGGCTGGT-1-('27678', 7)",test_private,7,27678,HSC


In [15]:
private_test_input = ad.AnnData(private_test_input, obs=metadata_filtered)
private_test_target = ad.AnnData(private_test_target, obs=metadata_filtered)
private_test_input

AnnData object with n_obs × n_vars = 26867 × 22085
    obs: 'kaggle_dataset', 'day', 'donor', 'cell_type'

In [27]:
private_test_input.write(lrz_path + "/large_preprocessed_files/private_test_input.h5ad")
private_test_target.write(lrz_path + "/large_preprocessed_files/private_test_target.h5ad")

### preprocess input data - similar to make-base-feature.py from codebase

In [23]:
# all_genes = np.loadtxt('2.preprocess_to_feature/cite/all_genes_names.txt', dtype=str)  # not used
# all_genes   # 22001 genes sorted alphabetically

In [16]:
# indices of 84 handselected genes
use_imp_cols_84 = np.loadtxt('2.preprocess_to_feature/cite/use_imp_cols_84.txt')
use_imp_cols_84

array([ 3290.,  4010.,  4743.,  4841.,  5172.,  5260.,  5619.,  5632.,
        5939.,  6120.,  6699.,  6704.,  6705.,  6710.,  6713.,  6714.,
        6716.,  6720.,  6733.,  6734.,  6735.,  6737.,  6741.,  6744.,
        6749.,  6750.,  6756.,  6759.,  6761.,  6762.,  6763.,  6768.,
        6771.,  6774.,  6777.,  7292.,  7338.,  7616.,  7688.,  7772.,
        8238.,  8747.,  9331.,  9602.,  9908.,  9948., 10500., 10588.,
       10589., 10602., 10603., 10606., 10732., 10904., 10937., 10956.,
       11309., 11398., 11597., 11662., 13011., 13390., 13801., 14785.,
       15124., 15266., 15571., 16503., 17087., 17138., 17323., 17339.,
       17512., 17514., 17750., 17826., 18436., 18449., 18660., 18762.,
       19495., 19923., 20095., 20883.])

In [17]:
# indices of other 22001 genes
use_cols_84 = np.loadtxt('2.preprocess_to_feature/cite/use_cols_84.txt')
len(use_cols_84)

22001

In [18]:
# combine private train and test data to apply svd jointly
all_inputs = scipy.sparse.vstack([private_train_input.to_numpy(), private_test_input.X]).tocsr()
all_inputs.shape

(97855, 22085)

In [40]:
# normalize, log, svd (fitted in codebase) applied on 22001 columns of the data

df_use = all_inputs[:, use_cols_84]
# df_use = private_test_input_raw[:, use_cols_84]  # _raw
#print(df_use.X)

sc.pp.normalize_per_cell(df_use, copy=True)
sc.pp.log1p(df_use)

with open('2.preprocess_to_feature/cite/svd_model_fitted_128.pkl', 'rb') as file:
    svd = pickle.load(file)
    
result_svd = svd.transform(df_use)

train_cite_pp = pd.DataFrame(result_svd[:private_train_input.shape[0]], index = private_train_input.index)   # has cell_ids
test_cite_pp = pd.DataFrame(result_svd[private_train_input.shape[0]:], index = private_test_input.obs_names)     # has cell_ids   # _raw

# important protein cols ---------------------------------------------------------------------
# normalize, log applied on 84 handselected columns of the data
df_important = all_inputs[:, use_imp_cols_84]   # _raw
df_important = sc.AnnData(X = df_important)
sc.pp.normalize_per_cell(df_important)
sc.pp.log1p(df_important)

train_imp = pd.DataFrame(df_important.X[:private_train_input.shape[0]].toarray(), index = private_train_input.index).add_prefix('imp_')
test_imp = pd.DataFrame(df_important.X[private_train_input.shape[0]:].toarray(), index = private_test_input.obs_names).add_prefix('imp_')   # _raw

In [41]:
train_cite_pp = train_cite_pp.add_prefix('base_svd_')
test_cite_pp = test_cite_pp.add_prefix('base_svd_')
test_cite_pp.shape

(26867, 128)

In [42]:
test_imp.shape

(26867, 84)

### preprocess input data - similar to make-features.py from codebase

In [43]:
# concatenate all 128+84 columns
X_train_svd_128 = pd.concat([train_cite_pp.reset_index(drop=True), 
                            train_imp.reset_index(drop=True),
                           ], axis = 1)

X_test_svd_128 = pd.concat([test_cite_pp.reset_index(drop=True), 
                            test_imp.reset_index(drop=True),
                           ], axis = 1)
X_test_svd_128.head(3)   # final X_test

Unnamed: 0,base_svd_0,base_svd_1,base_svd_2,base_svd_3,base_svd_4,base_svd_5,base_svd_6,base_svd_7,base_svd_8,base_svd_9,...,imp_74,imp_75,imp_76,imp_77,imp_78,imp_79,imp_80,imp_81,imp_82,imp_83
0,78.589668,8.756832,29.644707,-4.698128,-2.073666,-9.323292,-6.305599,2.682963,-2.61493,5.461427,...,1.715979,1.715979,0.0,0.0,1.590178,0.0,1.715979,2.275457,2.260236,2.199701
1,81.775703,-4.406732,9.736597,-12.13558,5.873678,-10.958516,3.955266,-4.741723,1.645253,4.144031,...,1.672662,1.851859,0.0,1.851859,1.82418,1.82418,1.541757,2.059803,2.233138,2.059803
2,49.658005,19.822294,13.08451,-3.894362,5.864494,-2.685749,-1.049638,-6.572547,-5.336941,1.771537,...,2.032617,2.032617,0.0,0.0,0.0,0.0,0.0,2.402486,2.50879,2.355022


In [45]:
# save prepared private X_train
with open('4.model/pred/private_X_train_svd_128.pkl', 'wb') as f:   # _raw
    pickle.dump(X_train_svd_128, f)

In [46]:
# save prepared private X_test 
with open('4.model/pred/private_X_test_svd_128.pkl', 'wb') as f:   # _raw
    pickle.dump(X_test_svd_128, f)

### sample equally distributed cell types in train data

In [37]:
# sample same cell type distribution from X_train to compute median on and use for shap explainer
metadata_filtered = metadata[metadata.index.isin(private_train_input.index)]
private_train_input = ad.AnnData(private_train_input, obs=metadata_filtered)
private_train_input.obs['cell_type'].value_counts()

cell_type
HSC     29879
EryP    14241
NeuP    12493
MasP     8242
MkP      5382
MoP       591
BP        160
Name: count, dtype: int64

In [38]:
# Create numbered ID column
num_ids = list(range(0, len(private_train_input.obs)))
# Add the numbered ID column to the AnnData object
private_train_input.obs['ID'] = num_ids

In [39]:
# 160 = highest number of rows per cell type possible to have uniform distribution
private_train_input_sample_max = get_samples(160, private_train_input)
private_train_input_sample_max

View of AnnData object with n_obs × n_vars = 1120 × 22085
    obs: 'kaggle_dataset', 'day', 'donor', 'cell_type', 'ID'

In [40]:
private_train_input_sample_max.write("4.model/pred/private_train_input_max_samples.h5ad")

  df[key] = c
  df[key] = c
  df[key] = c


### get sample per cell type for SHAP

In [47]:
X_test_svd_128_ann = ad.AnnData(X=X_test_svd_128, obs=metadata_filtered)
X_test_svd_128_ann

AnnData object with n_obs × n_vars = 26867 × 212
    obs: 'kaggle_dataset', 'day', 'donor', 'cell_type'

In [65]:
private_test_input_sample = get_samples(50, X_test_svd_128_ann)

In [66]:
private_test_input_sample.obs

Unnamed: 0,kaggle_dataset,day,donor,cell_type
"CTCCCTCGTATGAAAC-1-('32606', 7)",test_private,7,32606,BP
"CATGCAATCGAAATCC-1-('27678', 7)",test_private,7,27678,BP
"AAGCGAGGTTCATCGA-1-('31800', 7)",test_private,7,31800,BP
"TTCGCTGAGAAAGTCT-1-('27678', 7)",test_private,7,27678,BP
"TCCAGAACATCTCATT-1-('31800', 7)",test_private,7,31800,BP
...,...,...,...,...
"CAGAGCCGTACCGGAA-1-('32606', 7)",test_private,7,32606,NeuP
"TTGGGTAGTTACCTTT-1-('31800', 7)",test_private,7,31800,NeuP
"AGACAGGCATGCTGCG-1-('13176', 7)",test_private,7,13176,NeuP
"CGAGGAATCGCGAAGA-1-('32606', 7)",test_private,7,32606,NeuP


In [67]:
private_test_input_sample.write("4.model/pred/private_test_input_128_svd_50_samples.h5ad")

  df[key] = c
  df[key] = c
  df[key] = c


## sample of whole private_test_input

In [19]:
private_test_input_sample = get_samples(50, private_test_input)
private_test_input_sample

View of AnnData object with n_obs × n_vars = 350 × 22085
    obs: 'kaggle_dataset', 'day', 'donor', 'cell_type'

In [20]:
os.chdir('open-problems-multimodal-3rd-solution/code')

In [21]:
private_test_input_sample.write("4.model/pred/private_test_input_50_samples.h5ad")

  df[key] = c
  df[key] = c
  df[key] = c


## create samples representing each cell type for private test data
Again, need to apply SVD transform.\
Now: 64 svd components -> use resulting sample dataset together with model #17.

### preprocess input data - similar to make-base-feature.py

In [50]:
# normalize, log, svd (fitted in codebase) applied on 22001 columns of the data
df_use = all_inputs[:, use_cols_84]
# df_use = private_test_input_raw[:, use_cols_84]  # _raw
#print(df_use.X)

sc.pp.normalize_per_cell(df_use, copy=True)
sc.pp.log1p(df_use)

with open('2.preprocess_to_feature/cite/svd_model_fitted_64.pkl', 'rb') as file:
    svd = pickle.load(file)
    
result_svd = svd.transform(df_use)

train_cite_pp = pd.DataFrame(result_svd[:private_train_input.shape[0]], index = private_train_input.index)   # has cell_ids
test_cite_pp = pd.DataFrame(result_svd[private_train_input.shape[0]:], index = private_test_input.obs_names)     # has cell_ids   # _raw

# important protein cols ---------------------------------------------------------------------
# normalize, log applied on 84 handselected columns of the data
df_important = all_inputs[:, use_imp_cols_84]   # _raw
df_important = sc.AnnData(X = df_important)
sc.pp.normalize_per_cell(df_important)
sc.pp.log1p(df_important)

train_imp = pd.DataFrame(df_important.X[:private_train_input.shape[0]].toarray(), index = private_train_input.index).add_prefix('imp_')
test_imp = pd.DataFrame(df_important.X[private_train_input.shape[0]:].toarray(), index = private_test_input.obs_names).add_prefix('imp_')   # _raw

In [51]:
train_cite_pp = train_cite_pp.add_prefix('base_svd_')
test_cite_pp = test_cite_pp.add_prefix('base_svd_')
test_cite_pp.shape

(26867, 64)

In [52]:
test_imp.shape

(26867, 84)

### preprocess input data - similar to make-features.py

In [19]:
# concatenate all 128+84 columns
X_train_svd_64 = pd.concat([train_cite_pp.reset_index(drop=True), 
                            train_imp.reset_index(drop=True),
                           ], axis = 1)

X_test_svd_64 = pd.concat([test_cite_pp.reset_index(drop=True), 
                            test_imp.reset_index(drop=True),
                           ], axis = 1)
X_test_svd_64.head(3)   # final X_test

NameError: name 'train_cite_pp' is not defined

In [54]:
# save prepared private X_test 
with open('4.model/pred/private_X_train_svd_64.pkl', 'wb') as f:   # _raw
    pickle.dump(X_train_svd_64, f)

In [55]:
# save prepared private X_test 
with open('4.model/pred/private_X_test_svd_64.pkl', 'wb') as f:   # _raw
    pickle.dump(X_test_svd_64, f)

### get sample per cell type for SHAP

In [21]:
X_test_svd_64 = pd.read_pickle('4.model/pred/private_X_test_svd_64.pkl')

In [22]:
X_test_svd_64_ann = ad.AnnData(X=X_test_svd_64, obs=metadata_filtered)
X_test_svd_64_ann

AnnData object with n_obs × n_vars = 26867 × 148
    obs: 'kaggle_dataset', 'day', 'donor', 'cell_type'

In [29]:
private_test_input_sample = ad.read_h5ad('4.model/pred/private_test_input_128_svd_5_samples.h5ad')

In [30]:
private_test_input_sample_64 = X_test_svd_64_ann[private_test_input_sample.obs_names]      # use same sampled rows from model 16 (private_test_input_sample) for model 17
private_test_input_sample_64

View of AnnData object with n_obs × n_vars = 35 × 148
    obs: 'kaggle_dataset', 'day', 'donor', 'cell_type'

In [31]:
private_test_input_sample_64.write("4.model/pred/private_test_input_64_svd_5_samples.h5ad")

  df[key] = c
  df[key] = c
  df[key] = c
