In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import scipy.sparse as sp
import anndata as ad

In [None]:
# Load the reference gene list: first column of the header-less TSV, as a plain Python list.
genes_list = (
    pd.read_csv("OS_scRNA_gene_index.19264.tsv", sep="\t", header=None)
    .iloc[:, 0]
    .tolist()
)

In [None]:
# Name of an obs column ('cell_type' is one of the obs fields of the loaded dataset).
# NOTE(review): presumably the label column used by later cells — confirm against its users.
label_obs = 'cell_type'
# Threshold passed as `min_genes` to sc.pp.filter_cells in the filtering section.
n_genes_filter = 200

# NERVOUS SYSTEM DATASET

In [4]:
# Load the atlas from its .h5ad file into an AnnData object.
adata = sc.read_h5ad("dataset/nervous_system_single_cell_atlas.h5ad")

In [5]:
# Show the AnnData summary (n_obs × n_vars and the obs / var / uns keys).
adata

AnnData object with n_obs × n_vars = 2480956 × 59236
    obs: 'ROIGroup', 'ROIGroupCoarse', 'ROIGroupFine', 'roi', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'assay_ontology_term_id', 'sex_ontology_term_id', 'development_stage_ontology_term_id', 'donor_id', 'suspension_type', 'dissection', 'fraction_mitochondrial', 'fraction_unspliced', 'cell_cycle_score', 'total_genes', 'total_UMIs', 'sample_id', 'supercluster_term', 'cluster_id', 'subcluster_id', 'cell_type_ontology_term_id', 'tissue_ontology_term_id', 'is_primary_data', 'tissue_type', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'Biotype', 'Chromosome', 'End', 'Gene', 'Start', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    uns: 'batch_condition', 'citation', 'organism', 'organism_ontology_term_id', 'schema_reference', 'schema_version', 'title'
    obsm: '

## Gene Alignment

In [6]:
# Peek at the gene annotation table: its index holds the var_names and
# 'feature_name' holds the gene symbols used for alignment below.
adata.var.head()


Unnamed: 0,Biotype,Chromosome,End,Gene,Start,feature_is_filtered,feature_name,feature_reference,feature_biotype,feature_length,feature_type
ENSG00000000003,,chrX,100639991,TSPAN6,100627108,False,TSPAN6,NCBITaxon:9606,gene,2396,protein_coding
ENSG00000000005,,chrX,100599885,TNMD,100584936,False,TNMD,NCBITaxon:9606,gene,873,protein_coding
ENSG00000000419,,chr20,50958555,DPM1,50934867,False,DPM1,NCBITaxon:9606,gene,1262,protein_coding
ENSG00000000457,,chr1,169894267,SCYL3,169849631,False,SCYL3,NCBITaxon:9606,gene,2916,protein_coding
ENSG00000000460,,chr1,169854080,C1orf112,169662007,False,C1orf112,NCBITaxon:9606,gene,2661,protein_coding


In [None]:
# Align the dataset to the reference gene list: keep the reference genes that are
# present, pad the absent ones with all-zero columns, and order the columns exactly
# as in `genes_list`.

# Map gene symbol (var['feature_name']) -> var_names entry (Ensembl IDs in this dataset).
# Symbols are cast to str so lookups do not depend on the column's dtype, and repeated
# symbols keep only their first occurrence so every symbol selects exactly one column.
gene_to_index = pd.Series(
    adata.var_names.to_numpy(),
    index=adata.var['feature_name'].astype(str).to_numpy(),
)
gene_to_index = gene_to_index[~gene_to_index.index.duplicated(keep="first")]

known_symbols = set(gene_to_index.index)
common_genes = [g for g in genes_list if g in known_symbols]
missing_genes = [g for g in genes_list if g not in known_symbols]

# NOTE(review): the "- 1" suggests genes_list carries one non-gene entry (e.g. a header
# row read with header=None); if so, that entry also ends up as a zero column — confirm.
print(f"Genes in the dataset: {len(common_genes)} on {len(genes_list) - 1}")

# Select present data
adata_present = adata[:, gene_to_index[common_genes].tolist()].copy()
# Rename the selected columns from their original var_names to gene symbols; without
# this the final `[:, genes_list]` lookup (by symbol) cannot find the present genes.
adata_present.var_names = common_genes

# Create anndata for missing genes with zero values
n_obs = adata.n_obs
# Match the dtype of the real matrix so concatenation does not upcast it.
X_missing = sp.csr_matrix((n_obs, len(missing_genes)), dtype=adata.X.dtype)  # all zero
adata_missing = ad.AnnData(
    X_missing,
    # Only the cell index is needed here; obs columns come from adata_present below.
    obs=pd.DataFrame(index=adata.obs_names),
    var=pd.DataFrame(index=missing_genes),
)

# Concatenate along the variables (genes).
# merge="first" keeps the obs annotations (e.g. 'cell_type'); with the default
# merge=None, ad.concat discards everything aligned to the non-concatenated axis.
adata_aligned = ad.concat([adata_present, adata_missing], axis=1, merge="first")
adata_aligned = adata_aligned[:, genes_list].copy()

Genes in the dataset: 18862 on 19264


## Filtering Data

In [None]:
# Remove cells with fewer than `n_genes_filter` detected genes, then report how many
# cells were dropped relative to the unfiltered dataset.
n_cells_before = adata.n_obs
sc.pp.filter_cells(adata_aligned, min_genes=n_genes_filter)  # modifies adata_aligned in place
n_cells_removed = n_cells_before - adata_aligned.n_obs

print(f"Filterd {n_cells_removed} on original total {n_cells_before}")

## Verify the Normalization

In [None]:
# Inspect the largest expression value as a quick check of whether the matrix holds
# raw counts or already-normalized values.
# The aligned object built in this notebook is `adata_aligned` (`adata_eye_aligned` is
# never defined here). `.raw` may be unset on it, so fall back to `.X` in that case.
X_eye = adata_aligned.raw.X if adata_aligned.raw is not None else adata_aligned.X

# `.max()` works for both dense arrays and scipy sparse matrices.
max_val_eye = X_eye.max()

print(f"Max value in eye dataset: {max_val_eye}")