In [3]:
import os
# Run from the project root so that the relative 'data' directory used by
# the loading cells resolves. The location can be overridden through the
# KAGGLE_PERTURBATION_DIR environment variable; when it is unset the original
# hard-coded checkout path is used, so existing behaviour is unchanged.
os.chdir(os.environ.get('KAGGLE_PERTURBATION_DIR', '/home/yz979/code/kaggle-perturbation'))
# Expose only CUDA device index 3 to this process.
# NOTE(review): nothing in the visible cells uses a GPU — confirm this
# setting is still needed by later cells.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import anndata as ad
import scanpy as sc
import scipy

## Loading expression data

Here we load the expression data (long format) and convert it into an AnnData object (wide sparse format).

You'll need to increase your instance RAM to at least 64 GB.

In [19]:
# --- Read the long-format expression table and the per-observation metadata ---
data_dir = 'data'
adata_train_df = pd.read_parquet(os.path.join(data_dir, 'adata_train.parquet'))
adata_obs_meta_df = pd.read_csv(os.path.join(data_dir, 'adata_obs_meta.csv'))

# Store both key columns as categoricals before deriving integer positions.
for key_column in ('obs_id', 'gene'):
    adata_train_df[key_column] = adata_train_df[key_column].astype('category')

# --- Assign each observation / gene a position (row / column of the matrix) ---
# Positions follow the order in which unique() returns the values.
obs_ids = adata_train_df['obs_id'].unique()
genes = adata_train_df['gene'].unique()
obs_id_map = {obs_id: position for position, obs_id in enumerate(obs_ids)}
gene_map = {gene: position for position, gene in enumerate(genes)}

adata_train_df['obs_index'] = adata_train_df['obs_id'].map(obs_id_map)
adata_train_df['gene_index'] = adata_train_df['gene'].map(gene_map)

# --- Pull the value and coordinate columns out as NumPy arrays ---
# normalized_counts_values is not consumed in this cell; only the raw counts
# go into the matrix below.
normalized_counts_values = adata_train_df['normalized_count'].to_numpy()
counts_values = adata_train_df['count'].to_numpy()
row_indices = adata_train_df['obs_index'].to_numpy()
col_indices = adata_train_df['gene_index'].to_numpy()

# Wide sparse matrix: one row per observation, one column per gene.
counts = scipy.sparse.csr_matrix((counts_values, (row_indices, col_indices)))

# --- Build the obs / var frames: no columns yet, string index only ---
obs_df = pd.Series(obs_ids, name='obs_id').to_frame().set_index('obs_id')
obs_df.index = obs_df.index.astype('str')

var_df = pd.Series(genes, name='gene').to_frame().set_index('gene')
var_df.index = var_df.index.astype('str')

counts_adata = ad.AnnData(X=counts, obs=obs_df, var=var_df, dtype=np.uint32)

# --- Attach the observation metadata and check the row order is untouched ---
obs_meta_by_id = adata_obs_meta_df.set_index('obs_id')
index_ordering_before_join = counts_adata.obs.index
counts_adata.obs = counts_adata.obs.join(obs_meta_by_id)
index_ordering_after_join = counts_adata.obs.index
assert (index_ordering_before_join == index_ordering_after_join).all()

# Persist the assembled object next to the source files.
counts_adata.write_h5ad(os.path.join(data_dir, 'adata_train.h5ad'))

  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c


## Loading multi-omics data

Here we load the multi-omics data (long format) and convert it into an AnnData object (wide sparse format).

You'll need to increase your instance RAM to at least 64 GB.

In [16]:
# Read the long-format multiome table plus its observation and feature metadata.
data_dir = 'data'
multiome_train_df = pd.read_parquet(os.path.join(data_dir, 'multiome_train.parquet'))
multiome_obs_meta_df = pd.read_csv(os.path.join(data_dir, 'multiome_obs_meta.csv'))
multiome_var_meta_df = pd.read_csv(os.path.join(data_dir, 'multiome_var_meta.csv'))

# Store both key columns as categoricals before deriving integer positions.
multiome_train_df['obs_id'] = multiome_train_df['obs_id'].astype('category')
multiome_train_df['location'] = multiome_train_df['location'].astype('category')

# Map every observation / location to an integer position; positions follow
# the order in which unique() returns the values.
obs_ids = multiome_train_df['obs_id'].unique()
obs_id_map = dict(zip(obs_ids, range(len(obs_ids))))

locations = multiome_train_df['location'].unique()
location_map = dict(zip(locations, range(len(locations))))

multiome_train_df['obs_index'] = multiome_train_df['obs_id'].map(obs_id_map)
multiome_train_df['location_index'] = multiome_train_df['location'].map(location_map)

# normalized_counts_values is not consumed in this cell; only the raw counts
# go into the matrix below.
normalized_counts_values = multiome_train_df['normalized_count'].to_numpy()
counts_values = multiome_train_df['count'].to_numpy()

row_indices = multiome_train_df['obs_index'].to_numpy()
col_indices = multiome_train_df['location_index'].to_numpy()

# Wide sparse matrix: one row per observation, one column per location.
counts = scipy.sparse.csr_matrix((counts_values, (row_indices, col_indices)))

# obs / var frames start with no columns and a string index only.
obs_df = pd.Series(obs_ids, name='obs_id').to_frame()
var_df = pd.Series(locations, name='location').to_frame()

obs_df = obs_df.set_index('obs_id')
var_df = var_df.set_index('location')

obs_df.index = obs_df.index.astype('str')
var_df.index = var_df.index.astype('str')

counts_adata = ad.AnnData(
    X=counts,
    obs=obs_df,
    var=var_df,
    dtype=np.uint32,
)

# Attach observation metadata and check the row order is untouched.
index_ordering_before_join = counts_adata.obs.index
counts_adata.obs = counts_adata.obs.join(multiome_obs_meta_df.set_index('obs_id'))
index_ordering_after_join = counts_adata.obs.index
assert (index_ordering_before_join == index_ordering_after_join).all()

# Attach feature metadata and check the column order is untouched.
index_ordering_before_join = counts_adata.var.index
counts_adata.var = counts_adata.var.join(multiome_var_meta_df.set_index('location'))
index_ordering_after_join = counts_adata.var.index
assert (index_ordering_before_join == index_ordering_after_join).all()

# Split by modality, then concatenate along the feature axis so that all
# 'Gene Expression' features come first, followed by all 'Peaks' features.
# Features with any other feature_type are not carried into the result.
# Subset first and copy second: copying the whole object before slicing (as
# was done previously) duplicates the full matrix once per modality and
# leaves the subsets as views of throwaway copies.
feature_type = counts_adata.var['feature_type']
rna_adata = counts_adata[:, feature_type == 'Gene Expression'].copy()
atac_adata = counts_adata[:, feature_type == 'Peaks'].copy()
counts_adata = ad.concat([rna_adata, atac_adata], axis=1, merge="same")

# Persist the assembled object next to the source files.
counts_adata.write_h5ad(os.path.join(data_dir, 'multiome_train.h5ad'))




## Loading differential expression data

Here we load the differential expression data (long format) and convert it into an AnnData object (wide sparse format).

In [None]:
# Read the differential-expression training table from the data directory.
data_dir = 'data'
de_train_df = pd.read_parquet(os.path.join(data_dir, 'de_train.parquet'))

# Split the table by column position: the first five columns are kept
# separately as metadata, and de_train_df is rebound to the remaining columns
# (presumably the per-gene values — TODO confirm against the parquet schema).
de_train_obs_meta_df = de_train_df.iloc[:, :5]
de_train_df = de_train_df.iloc[:, 5:]