In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import scipy.sparse as sp
import anndata as ad

In [2]:
# Reference gene vocabulary: first column of the header-less, tab-separated index file.
gene_index_path = "OS_scRNA_gene_index.19264.tsv"
genes_list = pd.read_csv(gene_index_path, sep="\t", header=None).iloc[:, 0].tolist()

In [3]:
# Key of the cell-type annotation. Not used by the cells visible here;
# presumably an `.obs` column name — confirm against downstream cells.
label_obs = 'cell_type'
# Passed as `min_genes` to `sc.pp.filter_cells` for every dataset below.
n_genes_filter = 200

In [4]:
# Directory that holds the `.h5ad` atlas files loaded below.
root_dir = "/equilibrium/datasets/TCGA-histological-data/scDataset"

# Eye Dataset

In [None]:
# Load the eye atlas from `root_dir` as an AnnData object.
adata_eye = sc.read_h5ad(f"{root_dir}/eye_sc_atlas.h5ad")

In [None]:
# Show the AnnData summary (dimensions and available annotation fields).
adata_eye

AnnData object with n_obs × n_vars = 3177310 × 36406
    obs: 'reference_genome', 'gene_annotation_version', 'alignment_software', 'intronic_reads_counted', 'donor_id', 'donor_age', 'self_reported_ethnicity_ontology_term_id', 'donor_cause_of_death', 'donor_living_at_sample_collection', 'sample_id', 'sample_preservation_method', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'sample_collection_method', 'tissue_source', 'tissue_type', 'suspension_derivation_process', 'suspension_dissociation_reagent', 'suspension_enriched_cell_types', 'suspension_enrichment_factors', 'suspension_uuid', 'suspension_type', 'tissue_handling_interval', 'library_id', 'assay_ontology_term_id', 'sequenced_fragment', 'institute', 'library_id_repository', 'sequencing_platform', 'is_primary_data', 'cell_type_ontology_term_id', 'author_cell_type', 'disease_ontology_term_id', 'sex_ontology_term_id', 'majorclass', 'AC_subclass', 'AC_cluster', 'AC_celltype_number', 'BC_subclass', 'RGC_cluster', 'RGC_

## Gene Alignment

In [None]:
# Preview the first rows of the per-gene (`.var`) annotation table.
adata_eye.var.head()


Unnamed: 0,feature_is_filtered,feature_name,feature_reference,feature_biotype,feature_length,feature_type
ENSG00000243485,False,MIR1302-2HG,NCBITaxon:9606,gene,623,lncRNA
ENSG00000237613,False,FAM138A,NCBITaxon:9606,gene,888,lncRNA
ENSG00000186092,False,OR4F5,NCBITaxon:9606,gene,2618,protein_coding
ENSG00000238009,False,ENSG00000238009,NCBITaxon:9606,gene,629,lncRNA
ENSG00000239945,False,ENSG00000239945,NCBITaxon:9606,gene,1319,lncRNA


In [None]:
# Align the eye atlas to the reference gene list (`genes_list`).
#
# The `.var` preview shows gene symbols in `feature_name`, so the atlas is first
# re-indexed by symbol; `var_names_make_unique` keeps the plain symbol for the first
# occurrence of a duplicated name and suffixes the later ones.
# `.to_numpy()` drops the Series name so the new index is not named after a `.var` column.
adata_eye.var_names = adata_eye.var['feature_name'].astype(str).to_numpy()
adata_eye.var_names_make_unique()

# Source column of every reference gene in the atlas (-1 when the gene is absent).
src_cols = adata_eye.var_names.get_indexer(genes_list)
is_common = src_cols >= 0
# De-duplicated, in reference order: same contents as the set intersection of both gene lists.
common_genes = list(dict.fromkeys(
    gene for gene, present in zip(genes_list, is_common) if present
))

# An AnnData created without `X` has no matrix to assign into, and assigning through a
# temporary view does not populate it. Build the aligned matrix explicitly instead:
# multiplying by a 0/1 selection matrix copies each common gene's column to its position
# in `genes_list` and leaves the columns of missing genes as all-zero, while staying sparse.
X_src = sp.csr_matrix(adata_eye.X)
selector = sp.csr_matrix(
    (
        np.ones(int(is_common.sum()), dtype=X_src.dtype),
        (src_cols[is_common], np.flatnonzero(is_common)),
    ),
    shape=(adata_eye.n_vars, len(genes_list)),
)

adata_eye_aligned = ad.AnnData(
    X=(X_src @ selector).tocsr(),
    obs=adata_eye.obs.copy(),
    var=pd.DataFrame(index=genes_list),
)

print(f"Aligned dataset created with {adata_eye_aligned.n_vars} genes for {adata_eye_aligned.n_obs} cells.")
print(f"Found and copied data for {len(common_genes)} common genes.")

Genes in the dataset: 19041 on 19264


## Filtering Data

In [None]:
# Drop, in place, the cells that express fewer than `n_genes_filter` of the aligned genes,
# then report how many cells were removed relative to the unfiltered atlas.
sc.pp.filter_cells(adata_eye_aligned, min_genes=n_genes_filter)

print(f"Filtered {adata_eye.n_obs - adata_eye_aligned.n_obs} on original total {adata_eye.n_obs}")

## Verify the Normalization

In [None]:
# Inspect the largest expression value as a quick check of whether the matrix holds
# raw counts or normalized values.
# `.raw` is None unless it was explicitly set on this object, so fall back to `.X`
# instead of dereferencing None.
eye_raw = adata_eye_aligned.raw
X_eye = eye_raw.X if eye_raw is not None else adata_eye_aligned.X

# `.max()` is available on both dense arrays and scipy sparse matrices.
max_val_eye = X_eye.max()

print(f"Max value in eye dataset: {max_val_eye}")

# Nervous System Dataset

In [None]:
# Load the nervous-system atlas from `root_dir` as an AnnData object.
adata_ns = sc.read_h5ad(f"{root_dir}/nervous_system_sc_atlas.h5ad")

In [None]:
# Show the AnnData summary (dimensions and available annotation fields).
adata_ns

## Gene Alignment

In [None]:
# Preview the first rows of the per-gene (`.var`) annotation table.
adata_ns.var.head()


In [None]:
# Align the nervous-system atlas to the reference gene list (`genes_list`):
# keep the genes present in the atlas, add all-zero columns for the missing ones,
# and order the columns exactly like `genes_list`.

# Lookup: gene symbol (`feature_name`) -> original var_name of the atlas.
gene_to_index = pd.Series(
    adata_ns.var_names.to_numpy(),
    index=adata_ns.var['feature_name'].astype(str).to_numpy(),
)
# A symbol may appear more than once; keep its first occurrence so that each
# requested symbol selects exactly one column.
gene_to_index = gene_to_index[~gene_to_index.index.duplicated(keep='first')]

common_genes = [g for g in genes_list if g in gene_to_index.index]
missing_genes = [g for g in genes_list if g not in gene_to_index.index]

# NOTE(review): the `- 1` looks like it compensates for one non-gene entry in
# `genes_list` (e.g. a header row read with header=None) — confirm or drop it.
print(f"Genes in the dataset: {len(common_genes)} on {len(genes_list) - 1}")

# Select present data
adata_present = adata_ns[:, gene_to_index[common_genes].to_numpy()].copy()
# Rename the selected columns to their symbols. Without this, when the atlas var_names
# are not gene symbols (e.g. Ensembl IDs), the final `[:, genes_list]` lookup cannot
# find them.
adata_present.var_names = common_genes

# Create anndata for missing genes with zero values
n_obs = adata_ns.n_obs
# Same dtype as the real data so the concatenation does not upcast the whole matrix.
X_missing = sp.csr_matrix((n_obs, len(missing_genes)), dtype=adata_present.X.dtype)  # all zero
adata_missing = ad.AnnData(
    X_missing,
    obs=adata_ns.obs.copy(),
    var=pd.DataFrame(index=missing_genes)
)

# Concatenate along the variables (genes).
# With the default `merge=None`, annotations on the other axis (`.obs`) are discarded;
# `merge="first"` keeps the cell metadata of the first object.
adata_ns_aligned = ad.concat([adata_present, adata_missing], axis=1, merge="first")
adata_ns_aligned = adata_ns_aligned[:, genes_list].copy()

## Filtering Data

In [None]:
# Remove, in place, the cells expressing fewer than `n_genes_filter` aligned genes
# and report how many were dropped.
n_cells_total = adata_ns.n_obs
sc.pp.filter_cells(adata_ns_aligned, min_genes=n_genes_filter)
n_cells_removed = n_cells_total - adata_ns_aligned.n_obs

print(f"Filtered {n_cells_removed} on original total {n_cells_total}")

## Verify the Normalization

In [None]:
# Inspect the largest expression value as a quick check of whether the matrix holds
# raw counts or normalized values.
# `.raw` is None unless it was explicitly set on this object, so fall back to `.X`
# instead of dereferencing None.
ns_raw = adata_ns_aligned.raw
# Dataset-specific names: reusing the eye variables here would overwrite the eye results.
X_ns = ns_raw.X if ns_raw is not None else adata_ns_aligned.X

# `.max()` is available on both dense arrays and scipy sparse matrices.
max_val_ns = X_ns.max()

print(f"Max value in nervous system dataset: {max_val_ns}")

# Organoid Dataset

In [None]:
# Load the organoid atlas from `root_dir` as an AnnData object.
adata_organoid = sc.read_h5ad(f"{root_dir}/organoids_sc_atlas.h5ad")

In [None]:
# Show the AnnData summary (dimensions and available annotation fields).
adata_organoid

## Gene Alignment

In [None]:
# Preview the first rows of the per-gene (`.var`) annotation table.
adata_organoid.var.head()


In [None]:
# Align the organoid atlas to the reference gene list (`genes_list`):
# keep the genes present in the atlas, add all-zero columns for the missing ones,
# and order the columns exactly like `genes_list`.

# Lookup: gene symbol (`feature_name`) -> original var_name of the atlas.
gene_to_index = pd.Series(
    adata_organoid.var_names.to_numpy(),
    index=adata_organoid.var['feature_name'].astype(str).to_numpy(),
)
# A symbol may appear more than once; keep its first occurrence so that each
# requested symbol selects exactly one column.
gene_to_index = gene_to_index[~gene_to_index.index.duplicated(keep='first')]

common_genes = [g for g in genes_list if g in gene_to_index.index]
missing_genes = [g for g in genes_list if g not in gene_to_index.index]

# NOTE(review): the `- 1` looks like it compensates for one non-gene entry in
# `genes_list` (e.g. a header row read with header=None) — confirm or drop it.
print(f"Genes in the dataset: {len(common_genes)} on {len(genes_list) - 1}")

# Select present data
adata_present = adata_organoid[:, gene_to_index[common_genes].to_numpy()].copy()
# Rename the selected columns to their symbols. Without this, when the atlas var_names
# are not gene symbols (e.g. Ensembl IDs), the final `[:, genes_list]` lookup cannot
# find them.
adata_present.var_names = common_genes

# Create anndata for missing genes with zero values
n_obs = adata_organoid.n_obs
# Same dtype as the real data so the concatenation does not upcast the whole matrix.
X_missing = sp.csr_matrix((n_obs, len(missing_genes)), dtype=adata_present.X.dtype)  # all zero
adata_missing = ad.AnnData(
    X_missing,
    obs=adata_organoid.obs.copy(),
    var=pd.DataFrame(index=missing_genes)
)

# Concatenate along the variables (genes).
# With the default `merge=None`, annotations on the other axis (`.obs`) are discarded;
# `merge="first"` keeps the cell metadata of the first object.
adata_organoid_aligned = ad.concat([adata_present, adata_missing], axis=1, merge="first")
adata_organoid_aligned = adata_organoid_aligned[:, genes_list].copy()