In [1]:
# I followed this vignette:
# https://tanaylab.github.io/metacells-vignettes/one-pass.html

In [1]:
import anndata as ad             # For reading/writing AnnData files
import matplotlib.pyplot as plt  # For plotting
import metacells as mc           # The Metacells package
import numpy as np               # For array/matrix operations
import pandas as pd              # For data frames
import os                        # For filesystem operations
import seaborn as sb             # For plotting
import scipy.sparse as sp        # For sparse matrices
import shutil                    # for filesystem operations
from math import hypot           # For plotting
import scanpy as sc

In [2]:
# Render inline figures as SVG, which suits scalable diagrams with a low element count.
%config InlineBackend.figure_formats = ["svg"]

# Plain white seaborn style; a matter of personal preference.
sb.set_style("white")

# Running operations on an inefficient layout can make code **much** slower
# (for example, summing the columns of a row-major matrix).
# By default the Metacells package only warns about this.
# Here it is set to be an error, to make sure the analysis does not silently take the slow path.
#
# Note that this only affects the Metacells package.
# Numpy will happily and silently take 100x longer when running such inefficient operations;
# there is no known way to make it warn or fail instead,
# and its implementation of the "inefficient" operations could be *much* faster.
#
# The workaround in either case is to explicitly re-layout the 2D matrix before the operations.
# This turns out to be much faster, especially when the re-laid-out matrix can be reused.
# Numpy is also very slow at re-layout itself,
# so the metacells package provides a function for doing it more efficiently.
#
# The call is the last expression of the cell, so its return value is displayed as the cell output.
mc.ut.allow_inefficient_layout(False)

True

In [3]:
# Sample to process. The input is read from "<SAMPLE>_full.h5ad"; the samples that were
# previously toggled here by (un)commenting lines are "E115", "E105" and "E95".
# NOTE: the pickle written at the end of the notebook is named after the sample as well,
# so keep the two in sync when switching samples.
SAMPLE = "E95"

# Full AnnData object for the chosen sample.
cells = sc.read_h5ad(f"{SAMPLE}_full.h5ad")

In [4]:
# Give the AnnData a name; the metacells log lines ("set <name>.var[...]: ...") use it to
# identify which data set was modified.
# NOTE(review): the "hca_bm" prefix presumably comes from the vignette's example data rather
# than from this data set -- it is kept unchanged so logs and stored names stay the same.
mc.ut.set_name(cells, "hca_bm.one-pass.preliminary.cells")

# Report the size of the input as a quick sanity check.
print(f"Input: {cells.n_obs} cells, {cells.n_vars} genes")

Input: 111078 cells, 24552 genes


In [5]:
# Genes to mark as "lateral"; passed to mc.pl.mark_lateral_genes as lateral_gene_names.
#
# Human vs. mouse gene symbols: the names below are upper-case, human-style symbols, whereas
# the names reported as marked in the recorded output of the marking cell are mouse-style
# (e.g. 'Acsm3', 'Anp32b'). Name matching therefore appears to be case-insensitive --
# TODO confirm against the metacells documentation.
LATERAL_GENE_NAMES = [
    "ACSM3", "ANP32B", "APOE", "AURKA", "B2M", "BIRC5", "BTG2", "CALM1", "CD63", "CD69", "CDK4",
    "CENPF", "CENPU", "CENPW", "CH17-373J23.1", "CKS1B", "CKS2", "COX4I1", "CXCR4", "DNAJB1",
    "DONSON", "DUSP1", "DUT", "EEF1A1", "EEF1B2", "EIF3E", "EMP3", "FKBP4", "FOS", "FOSB", "FTH1",
    "G0S2", "GGH", "GLTSCR2", "GMNN", "GNB2L1", "GPR183", "H2AFZ", "H3F3B", "HBM", "HIST1H1C",
    "HIST1H2AC", "HIST1H2BG", "HIST1H4C", "HLA-A", "HLA-B", "HLA-C", "HLA-DMA", "HLA-DMB",
    "HLA-DPA1", "HLA-DPB1", "HLA-DQA1", "HLA-DQB1", "HLA-DRA", "HLA-DRB1", "HLA-E", "HLA-F", "HMGA1",
    "HMGB1", "HMGB2", "HMGB3", "HMGN2", "HNRNPAB", "HSP90AA1", "HSP90AB1", "HSPA1A", "HSPA1B",
    "HSPA6", "HSPD1", "HSPE1", "HSPH1", "ID2", "IER2", "IGHA1", "IGHA2", "IGHD", "IGHG1", "IGHG2",
    "IGHG3", "IGHG4", "IGHM", "IGKC", "IGKV1-12", "IGKV1-39", "IGKV1-5", "IGKV3-15", "IGKV4-1",
    "IGLC2", "IGLC3", "IGLC6", "IGLC7", "IGLL1", "IGLL5", "IGLV2-34", "JUN", "JUNB", "KIAA0101",
    "LEPROTL1", "LGALS1", "LINC01206", "LTB", "MCM3", "MCM4", "MCM7", "MKI67", "MT2A", "MYL12A",
    "MYL6", "NASP", "NFKBIA", "NUSAP1", "PA2G4", "PCNA", "PDLIM1", "PLK3", "PPP1R15A", "PTMA",
    "PTTG1", "RAN", "RANBP1", "RGCC", "RGS1", "RGS2", "RGS3", "RP11-1143G9.4", "RP11-160E2.6",
    "RP11-53B5.1", "RP11-620J15.3", "RP5-1025A1.3", "RP5-1171I10.5", "RPS10", "RPS10-NUDT3", "RPS11",
    "RPS12", "RPS13", "RPS14", "RPS15", "RPS15A", "RPS16", "RPS17", "RPS18", "RPS19", "RPS19BP1",
    "RPS2", "RPS20", "RPS21", "RPS23", "RPS24", "RPS25", "RPS26", "RPS27", "RPS27A", "RPS27L",
    "RPS28", "RPS29", "RPS3", "RPS3A", "RPS4X", "RPS4Y1", "RPS4Y2", "RPS5", "RPS6", "RPS6KA1",
    "RPS6KA2", "RPS6KA2-AS1", "RPS6KA3", "RPS6KA4", "RPS6KA5", "RPS6KA6", "RPS6KB1", "RPS6KB2",
    "RPS6KC1", "RPS6KL1", "RPS7", "RPS8", "RPS9", "RPSA", "RRM2", "SMC4", "SRGN", "SRSF7", "STMN1",
    "TK1", "TMSB4X", "TOP2A", "TPX2", "TSC22D3", "TUBA1A", "TUBA1B", "TUBB", "TUBB4B", "TXN", "TYMS",
    "UBA52", "UBC", "UBE2C", "UHRF1", "YBX1", "YPEL5", "ZFP36", "ZWINT"
]
# Regular-expression patterns for lateral genes; passed to mc.pl.mark_lateral_genes as
# lateral_gene_patterns. Unlike the names above, the pattern is written with mouse-style
# capitalization ("Rp..." rather than "RP...").
LATERAL_GENE_PATTERNS = ["Rp[ls].*"]  # Ribosomal

In [6]:
# Flag as "lateral_gene" every gene of `cells` that matches one of the configured names or
# patterns (entries that do not occur in the data are simply not marked).
mc.pl.mark_lateral_genes(
    cells,
    lateral_gene_names=LATERAL_GENE_NAMES,
    lateral_gene_patterns=LATERAL_GENE_PATTERNS,
)

# Read the resulting per-gene boolean mask back and collect the names of the marked genes.
lateral_gene_mask = mc.ut.get_v_numpy(cells, "lateral_gene")
lateral_gene_names = set(cells.var_names[lateral_gene_mask])

# Split the marked genes into the Rpl*/Rps* ones (only counted) and all others (listed by name).
ribosomal_names = {name for name in lateral_gene_names if name.startswith(("Rpl", "Rps"))}
print(sorted(lateral_gene_names - ribosomal_names))
print(f"and {len(ribosomal_names)} Rp[ls].* genes")

set hca_bm.one-pass.preliminary.cells.var[lateral_gene]: 205 true (0.835%) out of 24552 bools


['Acsm3', 'Anp32b', 'Apoe', 'Aurka', 'B2m', 'Birc5', 'Btg2', 'Calm1', 'Cd63', 'Cd69', 'Cdk4', 'Cenpf', 'Cenpu', 'Cenpw', 'Cks1b', 'Cks2', 'Cox4i1', 'Cxcr4', 'Dnajb1', 'Donson', 'Dusp1', 'Dut', 'Eef1a1', 'Eef1b2', 'Eif3e', 'Emp3', 'Fkbp4', 'Fos', 'Fosb', 'Fth1', 'G0s2', 'Ggh', 'Gmnn', 'Gpr183', 'H3f3b', 'Hmga1', 'Hmgb1', 'Hmgb2', 'Hmgn2', 'Hnrnpab', 'Hsp90aa1', 'Hsp90ab1', 'Hspa1a', 'Hspa1b', 'Hspd1', 'Hspe1', 'Hsph1', 'Id2', 'Ier2', 'Igll1', 'Jun', 'Junb', 'Leprotl1', 'Lgals1', 'Ltb', 'Mcm3', 'Mcm4', 'Mcm7', 'Mki67', 'Myl12a', 'Myl6', 'Nasp', 'Nfkbia', 'Nusap1', 'Pa2g4', 'Pcna', 'Pdlim1', 'Plk3', 'Ppp1r15a', 'Ptma', 'Pttg1', 'Ran', 'Ranbp1', 'Rgcc', 'Rgs1', 'Rgs2', 'Rgs3', 'Rrm2', 'Smc4', 'Srgn', 'Srsf7', 'Stmn1', 'Tk1', 'Top2a', 'Tpx2', 'Tuba1a', 'Tuba1b', 'Tubb4b', 'Tyms', 'Uba52', 'Ubc', 'Ube2c', 'Uhrf1', 'Ybx1', 'Ypel5', 'Zfp36', 'Zwint']
and 108 Rp[ls].* genes


In [7]:
# Genes to mark as "noisy"; passed to mc.pl.mark_noisy_genes as noisy_gene_names.
# NOTE(review): these are upper-case, human-style symbols. The recorded output of the marking
# cell reports 12 genes marked, i.e. fewer than the 21 names listed here.
NOISY_GENE_NAMES = [
    "CCL3", "CCL4", "CCL5", "CXCL8", "DUSP1", "FOS", "G0S2", "HBB", "HIST1H4C", "IER2", "IGKC",
    "IGLC2", "JUN", "JUNB", "KLRB1", "MT2A", "RPS26", "RPS4Y1", "TRBC1", "TUBA1B", "TUBB"
]

In [8]:
# Flag as "noisy_gene" every gene of `cells` whose name is listed in NOISY_GENE_NAMES
# (listed names that do not occur in the data are simply not marked).
mc.pl.mark_noisy_genes(
    cells,
    noisy_gene_names=NOISY_GENE_NAMES,
)

set hca_bm.one-pass.preliminary.cells.var[noisy_gene]: 12 true (0.04888%) out of 24552 bools


In [9]:
# Number of piles the pipeline is allowed to process in parallel.
# Set the override to None to let metacells guess a value for this data set, or to an explicit
# number if the guessed value makes the run exhaust the available memory.
MAX_PARALLEL_PILES_OVERRIDE = 10

if MAX_PARALLEL_PILES_OVERRIDE is None:
    # Only ask the guesstimator when its answer is actually going to be used.
    max_parallel_piles = mc.pl.guess_max_parallel_piles(cells)
else:
    max_parallel_piles = MAX_PARALLEL_PILES_OVERRIDE

print(max_parallel_piles)
mc.pl.set_max_parallel_piles(max_parallel_piles)

10


In [10]:
# Compute the metacells for `cells`, showing a progress bar while the pipeline runs.
#
# NOTE(review): the metacells parameter documentation describes a random seed of 0 as
# "a different result each time the code is run" -- confirm. A fixed non-zero seed is used
# here so that re-running the notebook reproduces the same metacells; outputs recorded from
# an earlier run with seed 0 (e.g. the number of metacells) may differ slightly.
with mc.ut.progress_bar():
    mc.pl.divide_and_conquer_pipeline(cells, random_seed=123456)

Detect rare gene modules...
100%|███████████████████████████████████████████████████████████████████▉[04:08]


In [11]:
# Build the metacells AnnData (one observation per metacell) from the per-cell assignment
# computed by the pipeline.
#
# NOTE(review): the metacells parameter documentation describes a random seed of 0 as
# "a different result each time the code is run" -- confirm. A fixed non-zero seed is used
# here so that the collected data is reproducible across runs.
metacells = mc.pl.collect_metacells(
    cells,
    name="hca_bm.one-pass.preliminary.metacells",
    random_seed=123456,
)
print(f"Preliminary: {metacells.n_obs} metacells, {metacells.n_vars} genes")

set hca_bm.one-pass.preliminary.metacells.obs[grouped]: 1972 int64s
set hca_bm.one-pass.preliminary.metacells.obs[total_umis]: 1972 int64s
set hca_bm.one-pass.preliminary.metacells.layers[total_umis]: ndarray 1972 X 24552 float32s
set hca_bm.one-pass.preliminary.metacells.obs[__zeros_downsample_umis]: 1972 int64s
set hca_bm.one-pass.preliminary.metacells.layers[zeros]: ndarray 1972 X 24552 int32s
set hca_bm.one-pass.preliminary.cells.obs[metacell_name]: 111078 <U8s
set hca_bm.one-pass.preliminary.metacells.var[features]: 24552 objects
set hca_bm.one-pass.preliminary.metacells.var[gene_names]: 24552 objects
set hca_bm.one-pass.preliminary.metacells.var[lateral_gene]: 205 true (0.835%) out of 24552 bools
set hca_bm.one-pass.preliminary.metacells.var[noisy_gene]: 12 true (0.04888%) out of 24552 bools
set hca_bm.one-pass.preliminary.metacells.var[selected_gene]: 4997 true (20.35%) out of 24552 bools
set hca_bm.one-pass.preliminary.metacells.var[rare_gene]: 0 true (0%) out of 24552 bools
se

Preliminary: 1972 metacells, 24552 genes


In [12]:
# The vignette calls this result "preliminary" because it goes on to do an additional round of doublet removal
# (the -1 metacells). I'll ignore that step, since only very few of my cells are called doublets.

In [13]:
# Persist the per-cell annotations (cells.obs, which by now includes the "metacell_name"
# assignment) as a pickle named after the processed sample.
# File names used for the other samples: 'E115_metacells.pkl', 'E105_metacells.pkl'.
output_path = 'E95_metacells.pkl'
cells.obs.to_pickle(output_path)