### Combine scRNA-seq and snRNA-seq datasets together and downsample snRNA-seq so that the number of cells/nuclei are the same

In [1]:
import scanpy as sc 
from collections import Counter

In [2]:
%%time
scRNA_adata = sc.read_h5ad("03_scRNA_post_scvi.h5ad")
scRNA_adata.X = scRNA_adata.layers['counts']

CPU times: user 1.3 s, sys: 12.9 s, total: 14.2 s
Wall time: 14.2 s


In [3]:
num_donors = scRNA_adata.obs.donor_id.unique()
print(f"Number of scRNA-seq donors: {num_donors}")

Number of scRNA-seq donors: ['D3-Cell_3prime-v2', 'D5-Cell_3prime-v2', 'D6-Cell_3prime-v2', 'D7-Cell_3prime-v2', 'D6-Cell_3prime-v3', ..., 'HDCM3', 'HDCM5', 'HDCM4', 'HDCM7', 'HDCM6']
Length: 12
Categories (12, object): ['D3-Cell_3prime-v2', 'D5-Cell_3prime-v2', 'D6-Cell_3prime-v2', 'D6-Cell_3prime-v3', ..., 'HDCM4', 'HDCM5', 'HDCM6', 'HDCM7']


In [4]:
%%time
subsampled_snRNA_adata = sc.read_h5ad("../aggregated_analysis/07_subsampled_adata.h5ad")
subsampled_snRNA_adata.X = subsampled_snRNA_adata.layers['counts']

CPU times: user 1.91 s, sys: 7.91 s, total: 9.82 s
Wall time: 14.8 s


In [5]:
# downsample snRNA to same number of cells
further_subsampled_snRNA_adata = sc.pp.subsample(subsampled_snRNA_adata, n_obs = scRNA_adata.shape[0], copy=True)

In [6]:
adata = sc.concat([scRNA_adata, further_subsampled_snRNA_adata])

In [7]:
Counter(adata.obs.cell_or_nuclei)

Counter({'Cell': 60814, 'Nuclei': 60814})

### Save the adata

In [9]:
adata.write("03B_sc_sn_adata.h5ad")
adata

AnnData object with n_obs × n_vars = 121628 × 16115
    obs: 'age', 'donor_id', 'sex', 'region', 'cell_type', 'consistent_cell_type', 'study', 'technology', 'cell_or_nuclei', 'tech_plus_study', '_scvi_batch', '_scvi_labels', 'leiden_scVI', 'scvi_cell_type'
    obsm: 'X_scVI', 'X_umap', '_scvi_extra_categorical_covs'
    layers: 'counts'