In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import scipy.sparse as sp
import anndata as ad

In [2]:
# Load the reference gene vocabulary: the first column of the TSV, in file order.
# The file is read with header=None, so its first row is treated as data.
# NOTE(review): the `- 1` in the gene-alignment printout suggests that first row may
# actually be a column header that ends up inside `genes_list` — confirm against the file.
genes_list = (
    pd.read_csv("OS_scRNA_gene_index.19264.tsv", sep="\t", header=None)
    .iloc[:, 0]
    .tolist()
)

In [3]:
# Notebook-wide settings.

# Threshold passed as `min_genes` to sc.pp.filter_cells in the filtering step.
n_genes_filter = 200

# Name of the label column; presumably a key of `adata.obs` — verify against downstream use.
label_obs = "cell_type"

# ORGANOID ATLAS DATASET

In [4]:
# Read the organoid single-cell atlas from disk into an AnnData object.
adata = sc.read_h5ad(
    filename="dataset/organoid_single_cell_atlas.h5ad",
)

In [5]:
# Display the AnnData summary (dimensions plus the available obs / var columns).
adata

AnnData object with n_obs × n_vars = 1920782 × 36720
    obs: 'assay_differentiation', 'assay_type_differentiation', 'bio_sample', 'cell_line', 'cell_type_original', 'gm', 'id', 'individual', 'state_exact', 'suspension_type', 'tech_sample', 'treatment', 'organoid_age_days', 'publication', 'doi', 'batch', 'annot_level_1', 'annot_level_2', 'annot_level_3_rev2', 'annot_level_4_rev2', 'annot_region_rev2', 'annot_ntt_rev2', 'Hallmark_Glycolysis', 'hnoca_core', 'annot_level_2_extended', 'tissue_type', 'sex_ontology_term_id', 'donor_id', 'assay_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'tissue_ontology_term_id', 'disease_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type_ontology_term_id', 'is_primary_data', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'gene_length', 'highly_variable', 'highly_variable_rank', 'highly_variable_nbatches', 'feature_is_filtered', 'feature_name'

## Gene Alignment

In [6]:
# Preview the gene annotation table: the index holds Ensembl IDs and the
# `feature_name` column holds the gene symbols used for alignment below.
adata.var.head()


Unnamed: 0_level_0,gene_length,highly_variable,highly_variable_rank,highly_variable_nbatches,feature_is_filtered,feature_name,feature_reference,feature_biotype,feature_length,feature_type
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ENSG00000000003,3796,False,2243.0,118,False,TSPAN6,NCBITaxon:9606,gene,2396,protein_coding
ENSG00000000005,1205,True,738.5,114,False,TNMD,NCBITaxon:9606,gene,873,protein_coding
ENSG00000000419,3004,False,2032.0,9,False,DPM1,NCBITaxon:9606,gene,1262,protein_coding
ENSG00000000457,6308,False,2532.0,15,False,SCYL3,NCBITaxon:9606,gene,2916,protein_coding
ENSG00000000460,4355,False,2444.0,37,False,C1orf112,NCBITaxon:9606,gene,2661,protein_coding


In [None]:
# Align `adata` to the reference gene vocabulary in `genes_list`:
#   * genes present in the dataset are selected via their `feature_name` (gene symbol),
#   * genes absent from the dataset are added as all-zero columns,
#   * the result is reordered to follow `genes_list` exactly.

# Map gene symbol -> var name (Ensembl ID) of `adata`.
# `feature_name` is cast to str so lookups behave like plain string lookups
# regardless of the column's dtype.
gene_to_index = pd.Series(
    adata.var_names.to_numpy(),
    index=adata.var['feature_name'].astype(str).to_numpy(),
)
# If a symbol maps to several Ensembl IDs keep only the first one; otherwise
# `gene_to_index[common_genes]` would select more than one column for that gene.
gene_to_index = gene_to_index[~gene_to_index.index.duplicated(keep='first')]

common_genes = [g for g in genes_list if g in gene_to_index.index]
missing_genes = [g for g in genes_list if g not in gene_to_index.index]

# NOTE(review): the `- 1` presumably compensates for a header row that ends up in
# `genes_list` because the TSV is read with header=None — confirm; if so, that header
# string is also treated as a (missing) gene below.
print(f"Genes in the dataset: {len(common_genes)} on {len(genes_list) - 1}")

# Select present data
adata_present = adata[:, gene_to_index[common_genes].to_numpy()].copy()
# Rename the selected columns from Ensembl IDs to gene symbols. Without this the
# concatenated object mixes Ensembl IDs (present genes) with symbols (missing genes)
# and the final selection by `genes_list` fails with a KeyError.
adata_present.var_names = common_genes

# Create anndata for missing genes with zero values
n_obs = adata.n_obs
# Match the dtype of the real data so concatenation does not upcast the matrix.
X_missing = sp.csr_matrix((n_obs, len(missing_genes)), dtype=adata_present.X.dtype)  # all zero
adata_missing = ad.AnnData(
    X_missing,
    obs=adata.obs.copy(),
    var=pd.DataFrame(index=missing_genes)
)

# Concatenate along the variables (genes).
# merge="first" keeps the `.obs` columns: with the default merge=None, annotations on
# the axis that is not being concatenated are dropped from the result.
adata_aligned = ad.concat([adata_present, adata_missing], axis=1, merge="first")
adata_aligned = adata_aligned[:, genes_list].copy()

# Sanity check: columns now follow the reference vocabulary one-to-one.
assert list(adata_aligned.var_names) == list(genes_list), "gene order does not match genes_list"

Genes in the dataset: 19045 on 19264


## Filtering Data

In [None]:
# Drop cells expressing fewer than `n_genes_filter` genes (in place on `adata_aligned`),
# then report how many cells were removed relative to the unfiltered `adata`.
n_cells_before = adata.n_obs
sc.pp.filter_cells(adata_aligned, min_genes=n_genes_filter)
n_cells_removed = n_cells_before - adata_aligned.n_obs

print(f"Filterd {n_cells_removed} on original total {n_cells_before}")

## Verify the Normalization

In [None]:
# Inspect the largest expression value to judge whether the matrix holds raw counts
# (large integers) or already-normalized / log-transformed values (small floats).
#
# The original cell referenced `adata_eye_aligned`, which is never defined in this
# notebook; the aligned object built in the gene-alignment step is `adata_aligned`.
# `.raw` is used when it exists, otherwise `.X`: an object produced by `ad.concat`
# only carries `.raw` if every input had one, so `.raw` is expected to be None here.
# The `*_eye` variable names and the message text are kept unchanged for
# compatibility with any later cells.
X_eye = adata_aligned.raw.X if adata_aligned.raw is not None else adata_aligned.X

# `.max()` works for both dense arrays and scipy sparse matrices.
max_val_eye = X_eye.max()

print(f"Max value in eye dataset: {max_val_eye}")