In [1]:
import scanpy as sc
import pandas as pd
import pickle
import numpy as np
from os.path import join
import dask.dataframe as dd
import os
from self_supervision.paths import MULTIMODAL_FOLDER, DATA_DIR

# Only check adata

### Create an adata with the genes available in the large dataset

In [2]:
adata = sc.read_h5ad(os.path.join(MULTIMODAL_FOLDER, 'GSE194122_openproblems_neurips2021_cite_BMMC_processed.h5ad'))

In [3]:
adata

AnnData object with n_obs × n_vars = 90261 × 14087
    obs: 'GEX_n_genes_by_counts', 'GEX_pct_counts_mt', 'GEX_size_factors', 'GEX_phase', 'ADT_n_antibodies_by_counts', 'ADT_total_counts', 'ADT_iso_count', 'cell_type', 'batch', 'ADT_pseudotime_order', 'GEX_pseudotime_order', 'Samplename', 'Site', 'DonorNumber', 'Modality', 'VendorLot', 'DonorID', 'DonorAge', 'DonorBMI', 'DonorBloodType', 'DonorRace', 'Ethnicity', 'DonorGender', 'QCMeds', 'DonorSmoker', 'is_train'
    var: 'feature_types', 'gene_id'
    uns: 'dataset_id', 'genome', 'organism'
    obsm: 'ADT_X_pca', 'ADT_X_umap', 'ADT_isotype_controls', 'GEX_X_pca', 'GEX_X_umap'
    layers: 'counts'

In [3]:
adata.var['gene_id']

AL627309.5    ENSG00000241860
LINC01409     ENSG00000237491
LINC01128     ENSG00000228794
LINC00115     ENSG00000225880
FAM41C        ENSG00000230368
                   ...       
HLA-E         ENSG00000204592
CD82          ENSG00000085117
CD101         ENSG00000134256
CD88                      NaN
CD224                     NaN
Name: gene_id, Length: 14087, dtype: category
Categories (13953, object): ['ENSG00000000419', 'ENSG00000000457', 'ENSG00000000460', 'ENSG00000000938', ..., 'ENSG00000288253', 'ENSG00000288302', 'ENSG00000288380', 'ENSG00000288398']

### Genes in cell x gene store

In [2]:
STORE_PATH = os.path.join(DATA_DIR, 'merlin_cxg_2023_05_15_sf-log1p')
var = list(pd.read_parquet(join(STORE_PATH, 'var.parquet'))['feature_id'])

### Cell types in cell x gene store

In [8]:
cell_type_mapping = pd.read_parquet(os.path.join(STORE_PATH, 'categorical_lookup/cell_type.parquet'))
y_true = dd.read_parquet(join(STORE_PATH, 'test'), columns='cell_type').compute().to_numpy()
y_true

array([140, 129,  14, ...,  19, 129, 160])

In [9]:
cell_type_mapping

Unnamed: 0,label
0,B cell
1,Bergmann glial cell
2,"CD14-low, CD16-positive monocyte"
3,CD14-positive monocyte
4,"CD14-positive, CD16-negative classical monocyte"
...,...
159,type I pneumocyte
160,type II pneumocyte
161,vascular associated smooth muscle cell
162,vein endothelial cell


In [11]:
# Step 1: Convert the Pandas DataFrame to a dictionary
cell_type_mapping_dict = cell_type_mapping['label'].to_dict()

# Step 2: Map each integer in y_true to its string representation
y_true_strings = [cell_type_mapping_dict.get(i, 'Unknown') for i in y_true]

# Step 3: Find unique string values
unique_strings = list(set(y_true_strings))
unique_strings

['CD4-positive, alpha-beta memory T cell',
 'ciliated columnar cell of tracheobronchial tree',
 'endothelial cell of lymphatic vessel',
 'CD8-positive, alpha-beta cytotoxic T cell',
 'sncg GABAergic cortical interneuron',
 'erythroblast',
 'alveolar macrophage',
 'lung pericyte',
 'alternatively activated macrophage',
 'effector memory CD4-positive, alpha-beta T cell',
 'hematopoietic stem cell',
 'microglial cell',
 'transitional stage B cell',
 'keratinocyte',
 'alveolar type 1 fibroblast cell',
 'plasma cell',
 'respiratory hillock cell',
 'precursor B cell',
 'neuron',
 'endothelial cell',
 'memory T cell',
 'mucosal invariant T cell',
 'glutamatergic neuron',
 'Schwann cell',
 'pvalb GABAergic cortical interneuron',
 'dendritic cell',
 'effector memory CD8-positive, alpha-beta T cell',
 'nasal mucosa goblet cell',
 'plasmacytoid dendritic cell',
 'luminal epithelial cell of mammary gland',
 'lamp5 GABAergic cortical interneuron',
 'CD8-alpha-alpha-positive, alpha-beta intraepithel

### Filter NeurIPS adata

In [5]:
# Store protein counts seperately
df = pd.DataFrame(adata.layers["counts"].todense()[:,-134:])  # 134 can be found from adata.vars['feature_types']
df.columns = adata.var["feature_types"][-134:].index
df.index = adata.obs.index
adata.obsm["protein_counts"] = df

In [6]:
# This filtering step also filters out the protein counts, so only GEX left

filtered_genes = [gene for gene in adata.var['gene_id'].values if gene in var]

# Filter the AnnData object
adata_filtered = adata[:, adata.var['gene_id'].isin(filtered_genes)]

# Save the filtered AnnData object
adata_filtered.write(os.path.join(MULTIMODAL_FOLDER, 'NeurIPS_filtered_adata.h5ad'))

### Genes in Multiomics adata

In [7]:
adata = sc.read_h5ad(os.path.join(MULTIMODAL_FOLDER, 'NeurIPS_filtered_adata.h5ad'))

In [8]:
unique_batches = adata.obs['batch'].unique()

# Randomly choose one batch for out-of-distribution (OOD) testing
ood_batch = np.random.choice(unique_batches)
print(f"Batch chosen for OOD testing: {ood_batch}")

# Identify indices for OOD
ood_indices = adata.obs['batch'] == ood_batch

# Identify indices for in-distribution batches
ind_indices = ~ood_indices

# Create 'split' column in adata_filtered.obs
adata_filtered.obs['split'] = 'unassigned'

# Assign OOD test set
adata_filtered.obs.loc[ood_indices, 'split'] = 'ood_test'

# Generate random split (80/10/10) for train/val/test for in-distribution batches
random_assignments = np.random.choice(
    ['train', 'val', 'test'],
    size=ind_indices.sum(),
    p=[0.8, 0.1, 0.1]
)

# Assign train/val/test sets
adata_filtered.obs.loc[ind_indices, 'split'] = random_assignments

# Ensure the split assignment is stored back in the original AnnData object
adata_filtered.obs['split'].astype(pd.CategoricalDtype())
adata_filtered.write(os.path.join(MULTIMODAL_FOLDER, 'NeurIPS_filtered_adata.h5ad'))

Batch chosen for OOD testing: s2d1


In [9]:
multiomics_gene_names = list(adata.var['gene_id'])
len(multiomics_gene_names)

11986

### HVG Selection

In [10]:
# General preprocessing
sc.pp.normalize_total(adata_filtered, target_sum=1e4)

sc.pp.log1p(adata_filtered)


In [11]:
sc.pp.highly_variable_genes(adata_filtered, n_top_genes=2000)

# Filter the data to only include highly variable genes
adata_hvg = adata_filtered[:, adata_filtered.var['highly_variable']]

# Save the filtered AnnData object with highly variable genes
adata_hvg.write(os.path.join(MULTIMODAL_FOLDER, 'NeurIPS_filtered_hvg_adata.h5ad'))

In [12]:
multiomics_gene_names = list(adata_hvg.var['gene_id'])
len(multiomics_gene_names)

2000

### Indices list

In [13]:
def find_indices(a, b):
    """
    Finds the indices of the elements of b in a.

    Args:
    a: The list of strings.
    b: The list of strings to find.

    Returns:
    A list of integers, where each integer is the index of the corresponding
    element of b in a.
    """
    c = []
    for i in range(len(b)):
        j = 0
        while j < len(a):
            if a[j] == b[i]:
                c.append(j)
                break
            j += 1
    
    return c

In [14]:
multiomics_indices = find_indices(var, multiomics_gene_names)

In [15]:
len(multiomics_indices)

2000

### Store indices list to disk

In [16]:
with open('multiomics_indices.pickle', 'wb') as f:
    pickle.dump(list(multiomics_indices), f)

### Check adata

In [17]:
adata = sc.read_h5ad(os.path.join(MULTIMODAL_FOLDER, 'NeurIPS_filtered_hvg_adata.h5ad'))

In [18]:
adata

AnnData object with n_obs × n_vars = 90261 × 2000
    obs: 'GEX_n_genes_by_counts', 'GEX_pct_counts_mt', 'GEX_size_factors', 'GEX_phase', 'ADT_n_antibodies_by_counts', 'ADT_total_counts', 'ADT_iso_count', 'cell_type', 'batch', 'ADT_pseudotime_order', 'GEX_pseudotime_order', 'Samplename', 'Site', 'DonorNumber', 'Modality', 'VendorLot', 'DonorID', 'DonorAge', 'DonorBMI', 'DonorBloodType', 'DonorRace', 'Ethnicity', 'DonorGender', 'QCMeds', 'DonorSmoker', 'is_train', 'split'
    var: 'feature_types', 'gene_id', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'dataset_id', 'genome', 'hvg', 'log1p', 'organism'
    obsm: 'ADT_X_pca', 'ADT_X_umap', 'ADT_isotype_controls', 'GEX_X_pca', 'GEX_X_umap', 'protein_counts'
    layers: 'counts'