In [1]:
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt

# GEX

In [2]:
# Load the gene-expression (GEX) AnnData object.
# NOTE(review): absolute, user-specific path — consider a configurable DATA_DIR
# so the notebook can run on another machine.
adata = sc.read_h5ad("/Users/dominik.klein/data/pancreas_2022.h5ad")

In [3]:
adata

AnnData object with n_obs × n_vars = 16918 × 14663
    obs: 'n_counts', 'sample', 'int_id', 'reporter', 'log_cell_probs', 'cell_barcodes_du', 'cell_barcodes_cr', 'cell_barcodes', 'log_counts', 'n_genes', 'log_genes', 'mt_frac', 'rp_frac', 'ambi_frac', 'total_counts_rank', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'leiden', 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_ATAC', 'nFeature_ATAC', 'nucleosome_signal', 'nucleosome_percentile', 'TSS.enrichment', 'TSS.percentile', 'log_nCount_ATAC', 'log_nFeature_ATAC', 'scrublet_scores', 'scrublet_doublets', 'scrublet_doublets_cat', 'dd_doublets', 'dd_scores', 'dd_log_p_values', 'dd_voting_average', 'dd_-log_p_values', 'dd_doublets_cat', 'pANN', 'DF_classifications_1', 'DF_classifications_2', 'pANN_SCT', 'DF_SCT_classifications_1', 'DF_SCT_classifications_2', 'df_doublets',

In [4]:
# "sample" and "batch" are 1-to-1
# Concretely: the cells labelled sample "E14.5" must be exactly the cells of batch "0".
(adata.obs["sample"].eq("E14.5") == adata.obs["batch"].eq("0")).all()

True

## ATAC

In [5]:
# Load the ATAC AnnData object.
# NOTE(review): the write cell further down saves `bdata_filtered` to this very
# same path, so after one full run this file no longer holds the raw input.
bdata = sc.read_h5ad("/Users/dominik.klein/data/E14-E15_lsi_peaks_to_adata.h5ad")

In [6]:
adata.obs

Unnamed: 0,n_counts,sample,int_id,reporter,log_cell_probs,cell_barcodes_du,cell_barcodes_cr,cell_barcodes,log_counts,n_genes,...,leiden_sub12,tmp,leiden_sub13,leiden_sub14,leiden_sub15,leiden_sub16,leiden_sub17,leiden_sub18,leiden_sub19,celltype
AAACAGCCAACAGCCT-1-0,8457.0,E14.5,4xx,Neurog3-Venus-Fusion,-12538.516611,1,1,2,9.042749,3651,...,17,0,17,17,17,17,17,17,170,Fev+ Alpha
AAACAGCCAACCCTCC-1-0,7841.0,E14.5,4xx,Neurog3-Venus-Fusion,-10665.635853,1,1,2,8.967122,3120,...,60,0,60,60,60,60,60,60,60,Fev+
AAACAGCCACCTGTAA-1-0,9492.0,E14.5,4xx,Neurog3-Venus-Fusion,-15545.684333,1,1,2,9.158205,3938,...,8,0,8,80,80,80,80,80,80,Imm. Acinar
AAACAGCCACTAAGCC-1-0,7915.0,E14.5,4xx,Neurog3-Venus-Fusion,-12149.266909,1,1,2,8.976515,3318,...,60,0,60,60,60,60,60,60,60,Fev+
AAACAGCCAGGATAAC-1-0,9193.0,E14.5,4xx,Neurog3-Venus-Fusion,-11208.509422,1,1,2,9.126198,3666,...,41,0,41,41,41,41,41,41,41,Ngn3 high
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGTTCTTGTTCG-1-1,9644.0,E15.5,4xx,Neurog3-Venus-Fusion,-17196.982412,1,1,2,9.174091,4703,...,130,0,130,130,130,130,130,130,130,Ductal
TTTGTTGGTATTGAGT-1-1,8914.0,E15.5,4xx,Neurog3-Venus-Fusion,-13303.741618,1,1,2,9.095378,3772,...,50,0,50,50,50,50,50,50,50,Ngn3 low
TTTGTTGGTCCTAAGA-1-1,9689.0,E15.5,4xx,Neurog3-Venus-Fusion,-16358.359709,1,1,2,9.178746,4540,...,130,0,130,130,130,130,130,130,130,Ductal
TTTGTTGGTTAACAGT-1-1,9570.0,E15.5,4xx,Neurog3-Venus-Fusion,-16296.427054,1,1,2,9.166389,4502,...,14,0,14,14,140,140,140,140,140,Mat. Acinar


In [7]:
bdata.obs

E14-5_AAACAGCCAACAGCCT-1
E14-5_AAACAGCCAACCCTCC-1
E14-5_AAACAGCCACCTGTAA-1
E14-5_AAACAGCCACTAAGCC-1
E14-5_AAACAGCCAGGATAAC-1
...
E15-5_TTTGTGTTCTTGTTCG-1
E15-5_TTTGTTGGTATTGAGT-1
E15-5_TTTGTTGGTCCTAAGA-1
E15-5_TTTGTTGGTTAACAGT-1
E15-5_TTTGTTGGTTGTTGCT-1


In [8]:
import re 
def rename_index(x):
    """Translate an ATAC barcode into the GEX barcode convention.

    ATAC barcodes look like ``"E14-5_AAACAGCCAACAGCCT-1"`` (sample prefix,
    underscore, cell tag, dash, suffix); GEX barcodes look like
    ``"AAACAGCCAACAGCCT-1-0"`` (cell tag, ``-1``, batch number).

    Parameters
    ----------
    x : mapping or pandas.Series
        A row exposing the original ATAC barcode under the key ``"old_index"``.

    Returns
    -------
    str
        The cell tag followed by ``"-1-0"`` for sample ``E14-5`` or ``"-1-1"``
        for sample ``E15-5``.

    Raises
    ------
    ValueError
        If the barcode has no ``<sample>_<barcode>`` structure or the sample
        prefix is unknown. Previously every prefix other than ``E14-5`` was
        silently assigned to batch 1.
    """
    # Explicit mapping instead of an if/else so that unexpected samples are
    # rejected rather than silently merged into batch 1.
    batch_suffix = {"E14-5": "-1-0", "E15-5": "-1-1"}

    old_index = x["old_index"]
    parts = old_index.split("_")
    if len(parts) < 2 or parts[0] not in batch_suffix:
        raise ValueError(
            f"Cannot adapt barcode {old_index!r}: expected '<sample>_<barcode>' "
            f"with sample in {sorted(batch_suffix)}"
        )

    # Keep only the cell tag; the GEX-style suffix is appended below, so the
    # original trailing "-<n>" is dropped.
    cell_tag = parts[1].split("-")[0]
    return cell_tag + batch_suffix[parts[0]]

# Keep the original ATAC barcodes as a regular column so they survive the
# later replacement of the index.
bdata.obs["old_index"] = bdata.obs.index
# Row-wise apply: rename_index reads the "old_index" field of each row and
# returns the barcode in the GEX naming scheme.
bdata.obs["index_adapted"] = bdata.obs.apply(rename_index, axis=1)

In [11]:
# make sure every GEX cell has a matching (renamed) ATAC barcode.
# This is a set-membership test only; it says nothing about row order.
set(adata.obs.index).issubset(set(bdata.obs["index_adapted"].values))


True

In [12]:
# we want to have the same indices in both anndata objects
# NOTE(review): in-place and not idempotent — set_index moves the
# "index_adapted" column into the index, so running this cell a second time
# fails because the column no longer exists.
bdata.obs.set_index("index_adapted", inplace=True)

In [13]:
# Restrict the ATAC dataset to the cells present in GEX.
# Selecting by the GEX obs names (instead of a boolean `isin` mask, which keeps
# the ATAC row order) returns the rows in the GEX order, so the two objects are
# aligned by construction; a GEX cell missing from ATAC raises instead of being
# silently dropped.
bdata_filtered = bdata[adata.obs.index].copy()

In [14]:
# The obsm embeddings are copied positionally further down, so identical cell
# order is a hard requirement: enforce it rather than only displaying the result.
# Index.equals also copes with a length mismatch, where an element-wise `==`
# would raise.
assert bdata_filtered.obs.index.equals(adata.obs.index), (
    "ATAC and GEX objects do not contain the same cells in the same order"
)

True

In [24]:
# Names of the adata.obs columns selected for transfer, grouped by theme.
atac_cols = [
    # labels
    "sample", "celltype",
    # ATAC-named metrics
    "nCount_ATAC", "nFeature_ATAC",
    "nucleosome_signal", "nucleosome_percentile",
    "TSS.enrichment", "TSS.percentile",
    "log_nCount_ATAC", "log_nFeature_ATAC",
    # cell-cycle related columns
    "S_score", "G2M_score", "phase", "proliferation",
]

In [25]:
# Copy the selected GEX obs columns onto the ATAC object (pandas matches rows
# by index label when a DataFrame is assigned).
# NOTE(review): the saved repr of bdata_filtered further down does not list
# S_score / G2M_score / phase / proliferation, so that output appears to
# predate the current column list — re-run to refresh it.
bdata_filtered.obs[atac_cols] = adata.obs[atac_cols]

In [27]:
# Carry the three UMAP embeddings over from the GEX object. obsm entries are
# plain arrays assigned by position, so both objects must list the cells in
# the same order.
for embedding_key in ("X_umap", "X_umap_GEX", "X_umap_ATAC"):
    bdata_filtered.obsm[embedding_key] = adata.obsm[embedding_key]

In [28]:
bdata_filtered

AnnData object with n_obs × n_vars = 16918 × 228259
    obs: 'old_index', 'celltype', 'nCount_ATAC', 'nFeature_ATAC', 'nucleosome_signal', 'nucleosome_percentile', 'TSS.enrichment', 'TSS.percentile', 'log_nCount_ATAC', 'log_nFeature_ATAC', 'sample'
    obsm: 'lsi_full', 'lsi_red', 'lsi_red2', 'X_umap', 'X_umap_GEX', 'X_umap_ATAC'

In [31]:
# Persist the filtered, annotated ATAC object.
# NOTE(review): this is the same path `bdata` was read from at the top of the
# ATAC section, so the raw input is overwritten. On a second run the stored
# barcodes are already in GEX format and the renaming step no longer applies.
# Consider writing to a separate file name.
bdata_filtered.write("/Users/dominik.klein/data/E14-E15_lsi_peaks_to_adata.h5ad")


# Construct a joint dataset

In [22]:
#import anndata
#cdata = anndata.concat([adata, bdata], axis=1, join="inner")

In [23]:
#gex_set = adata.var.index
#def get_feature_type(x):
#    return "GEX" if x["ind"] in gex_set else "ATAC"

#cdata.var["ind"] = cdata.var.index
#cdata.var["feature_type"] = cdata.var.apply(get_feature_type, axis=1)

#del cdata.var["ind"]

In [37]:
#cdata.obs = adata.obs

In [133]:
#cdata.obsm["GEX_pca"] = adata.obsm["X_pca"]
#cdata.obsm["ATAC_pca"] = bdata.obsm["X_pca"]
#cdata.obsm["GEX_umap"] = adata.obsm["X_umap"]
#cdata.obsm["ATAC_umap"] = bdata.obsm["X_umap"]
#cdata.uns["GEX_neighbors"] = adata.uns["neighbors"]
#cdata.uns["ATAC_neighbors"] = bdata.uns["neighbors"]
#cdata.obsm[""]

In [137]:
#sc.pp.pca(cdata)

In [140]:
#cdata.write("/storage/groups/ml01/workspace/dominik.klein/scMultiome_NVF/scMultiome_NVF_E14-E15/adata_preprocessed.h5ad")

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'celltype' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'feature_type' as categorical
