# Monocle 2 pseudotime
Related to Figure 1.



## Input
- An AnnData file (`.h5ad`) available locally; its path is set by `config$input_h5ad`.
- Cell metadata (`obs`) must include a column used to color cells along the trajectory (default: `subtype`).
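- If a sample/batch column is present (the first match among the default candidates `S_ID`, `orig.ident`), it is copied to `orig.ident`, which is the variable used by the residual model formula (`~ orig.ident`) passed to `residualModelFormulaStr`.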

## Outputs (written to disk)
- `output/monocle2_Fibro/objects/cds_monocle2_ddrtree.rds`
- `output/monocle2_Fibro/figures/trajectory_*.pdf`
- `output/monocle2_Fibro/figures/pseudotime_heatmap_top_genes.pdf`
- `output/monocle2_Fibro/figures/genes_in_pseudotime_GOI.pdf` (if GOI present)
- `output/monocle2_Fibro/tables/DEG_along_pseudotime.csv`
- `output/monocle2_Fibro/tables/cells_pseudotime_ddrtree.csv`


In [None]:
## All analysis parameters and paths are collected in `config`.
config <- list(
  # Where the input is read from and where every output is written to
  input_h5ad = file.path("data", "1_Fibro_count_new.h5ad"),
  out_dir = file.path("output", "monocle2_Fibro"),

  # Assay to use and the layer names tried, in this order, for raw counts
  assay_name = "RNA",
  counts_layer_preference = c("counts", "X"),

  # Metadata column that must exist; used for coloring / grouping cells
  group_key = "subtype",

  # First of these columns found in the metadata is copied to `orig.ident`
  batch_key_candidates = c("S_ID", "orig.ident"),
  residual_formula = "~ orig.ident",

  # Gene filtering and selection of ordering genes
  min_cells_expressed = 10L,
  ordering_gene_strategy = "dispersion",  # "dispersion" or "by_group"
  ordering_genes_max = 2000L,             # cap applied by the "dispersion" strategy
  ordering_qval = 1e-3,                   # q-value cutoff applied by the "by_group" strategy

  # Arguments handed to reduceDimension()
  reduce_num_dim = 50L,
  max_components = 2L,
  reduction_method = "DDRTree",

  # Pseudotime differential test, heatmap, and genes of interest
  pseudotime_test_max_genes = 2000L,
  pseudotime_heatmap_clusters = 6L,
  goi = c("FAP", "CCL2", "POSTN"),

  # Seed passed to set.seed() below
  seed = 20251102L
)

# Create the output root first, then one sub-directory per output type.
dir.create(config$out_dir, showWarnings = FALSE, recursive = TRUE)
for (out_subdir in c("figures", "tables", "objects")) {
  dir.create(
    file.path(config$out_dir, out_subdir),
    showWarnings = FALSE,
    recursive = TRUE
  )
}

set.seed(config$seed)

suppressPackageStartupMessages({
  library(monocle)
  library(Biobase)
  library(Matrix)
  library(ggplot2)
  library(dplyr)
  library(anndataR)
  library(SeuratObject)
})

# Print one line per key package that can be loaded; packages that are not
# installed are silently left out of the report.
cat("Key package versions\n")
pkgs <- c("monocle", "DDRTree", "igraph", "Matrix", "ggplot2", "dplyr", "anndataR", "SeuratObject")
pkg_available <- vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)
version_lines <- vapply(
  pkgs[pkg_available],
  function(pkg) sprintf("  %-12s %s\n", pkg, as.character(packageVersion(pkg))),
  character(1)
)
cat(version_lines, sep = "")


In [None]:
## Read the .h5ad input as a Seurat object and turn it into a Monocle 2 CellDataSet.
stopifnot(file.exists(config$input_h5ad))

seu <- read_h5ad(config$input_h5ad, as = "Seurat")
stopifnot(inherits(seu, "Seurat"))

meta <- seu[[]]
cat(sprintf("Loaded: %s\nCells: %d\nGenes: %d\n", config$input_h5ad, nrow(meta), nrow(seu)))

# The residual model formula refers to `orig.ident`, so guarantee the column:
# copy the first candidate column that exists, or use a constant label if none does.
batch_key <- intersect(config$batch_key_candidates, colnames(meta))[1]
if (is.na(batch_key)) {
  meta[["orig.ident"]] <- "sample1"
} else if (batch_key != "orig.ident") {
  meta[["orig.ident"]] <- meta[[batch_key]]
}
seu@meta.data <- meta

# The grouping column is mandatory; stop early when it is absent.
if (!(config$group_key %in% colnames(seu@meta.data))) {
  stop("Missing required column in cell metadata: ", config$group_key)
}

# Use the configured assay when the object has it, otherwise the default assay.
assay_name <- config$assay_name
if (!(assay_name %in% names(seu@assays))) {
  assay_name <- DefaultAssay(seu)
}
assay_obj <- seu[[assay_name]]

# Any error raised by Layers() is treated as "no layers available".
avail_layers <- tryCatch(Layers(assay_obj), error = function(e) character(0))
layer_use <- intersect(config$counts_layer_preference, avail_layers)[1]

layer_found <- !is.na(layer_use) && length(layer_use) == 1
counts <- if (layer_found) {
  LayerData(assay_obj, layer = layer_use)
} else {
  # No preferred layer matched: read the legacy counts slot instead.
  GetAssayData(seu, assay = assay_name, slot = "counts")
}

# Normalise the matrix class and make gene names unique before building the CDS.
if (!inherits(counts, "dgCMatrix")) {
  counts <- as(counts, "dgCMatrix")
}
if (anyDuplicated(rownames(counts)) > 0) {
  rownames(counts) <- make.unique(rownames(counts))
}

# Cells must be in the same order in the matrix and in the metadata.
stopifnot(identical(colnames(counts), rownames(seu@meta.data)))
layer_label <- if (is.na(layer_use)) "<legacy counts>" else layer_use
cat(sprintf("Using assay: %s | counts layer: %s\n", assay_name, layer_label))
cat(sprintf("Counts matrix: %d genes x %d cells\n", nrow(counts), ncol(counts)))

# Per-cell and per-gene annotation in the form newCellDataSet() takes.
gene_annotation <- data.frame(
  gene_short_name = rownames(counts),
  row.names = rownames(counts),
  stringsAsFactors = FALSE
)
pd <- new("AnnotatedDataFrame", data = seu@meta.data)
fd <- new("AnnotatedDataFrame", data = gene_annotation)

cds <- newCellDataSet(
  counts,
  phenoData = pd,
  featureData = fd,
  expressionFamily = negbinomial.size(),
  lowerDetectionLimit = 1
)

# Size factors first, then dispersions (single core).
cds <- estimateSizeFactors(cds)
cds <- estimateDispersions(cds, cores = 1)


In [None]:
## Ordering genes, DDRTree embedding, and cell ordering (pseudotime).

# Annotate per-gene detection and keep genes seen in at least
# `config$min_cells_expressed` cells.
cds <- detectGenes(cds, min_expr = 0.1)
fdat <- fData(cds)
expressed_genes <- rownames(subset(fdat, num_cells_expressed >= config$min_cells_expressed))

if (length(expressed_genes) == 0) {
  stop("No expressed genes found with min_cells_expressed = ", config$min_cells_expressed)
}

# Two ways of choosing ordering genes:
#   "by_group"  - genes differentially expressed between `group_key` levels
#                 (q-value <= config$ordering_qval);
#   otherwise   - the `ordering_genes_max` expressed genes with the highest
#                 empirical dispersion.
if (identical(config$ordering_gene_strategy, "by_group")) {
  if (!config$group_key %in% colnames(pData(cds))) stop("group_key not present in pData(cds)")
  de_by_group <- differentialGeneTest(
    cds[expressed_genes, ],
    fullModelFormulaStr = paste0("~", config$group_key),
    cores = 1
  )
  ordering_genes <- rownames(subset(de_by_group, qval <= config$ordering_qval))
} else {
  disp_tab <- dispersionTable(cds)
  ordering_genes <- disp_tab %>%
    filter(gene_id %in% expressed_genes) %>%
    arrange(desc(dispersion_empirical)) %>%
    head(config$ordering_genes_max) %>%
    pull(gene_id)
}

ordering_genes <- intersect(ordering_genes, rownames(cds))
if (length(ordering_genes) < 50) {
  stop("Too few ordering genes (n = ", length(ordering_genes), "). Check filtering thresholds.")
}

cds <- setOrderingFilter(cds, ordering_genes)
cat(sprintf("Ordering genes: %d\n", length(ordering_genes)))

# A residual model needs something to regress out. When a variable in the
# formula has fewer than two distinct values (for example `orig.ident` filled
# with a single constant label because no batch column was available), building
# the model matrix fails on the one-level factor, so the residual model is
# skipped in that case.
residual_formula <- config$residual_formula
if (!is.null(residual_formula)) {
  residual_vars <- intersect(
    all.vars(stats::as.formula(residual_formula)),
    colnames(pData(cds))
  )
  n_distinct_values <- vapply(
    residual_vars,
    function(var_name) {
      values <- pData(cds)[[var_name]]
      length(unique(values[!is.na(values)]))
    },
    integer(1)
  )
  constant_vars <- residual_vars[n_distinct_values < 2L]
  if (length(constant_vars) > 0) {
    message(
      "Skipping residual model '", residual_formula, "': ",
      paste(constant_vars, collapse = ", "),
      " has fewer than 2 distinct values."
    )
    residual_formula <- NULL
  }
}

# NOTE(review): `num_dim` is forwarded through `...`; confirm that it has an
# effect when reduction_method is "DDRTree".
cds <- reduceDimension(
  cds,
  max_components = config$max_components,
  num_dim = config$reduce_num_dim,
  reduction_method = config$reduction_method,
  residualModelFormulaStr = residual_formula,
  verbose = FALSE
)
cds <- orderCells(cds)

# Persist the ordered CellDataSet so downstream steps can be rerun from disk.
saveRDS(cds, file.path(config$out_dir, "objects", "cds_monocle2_ddrtree.rds"))


In [None]:
## Trajectory plots colored by state, pseudotime, and the grouping column.
fig_dir <- file.path(config$out_dir, "figures")

# Column used for coloring, paired with the tag that ends up in the file name.
trajectory_color_by <- c("State", "Pseudotime", config$group_key)
trajectory_file_tag <- c("state", "pseudotime", config$group_key)

# Build all three plots before anything is written to disk.
trajectory_plots <- lapply(
  trajectory_color_by,
  function(color_key) plot_cell_trajectory(cds, color_by = color_key)
)
p_state <- trajectory_plots[[1]]
p_ptime <- trajectory_plots[[2]]
p_group <- trajectory_plots[[3]]

for (i in seq_along(trajectory_plots)) {
  ggsave(
    file.path(fig_dir, paste0("trajectory_by_", trajectory_file_tag[i], ".pdf")),
    trajectory_plots[[i]],
    width = 6,
    height = 4
  )
}


In [None]:
## Genes associated with pseudotime (differentialGeneTest) + heatmap + GOI trends.
table_dir <- file.path(config$out_dir, "tables")
fig_dir <- file.path(config$out_dir, "figures")

# Candidate genes: the `pseudotime_test_max_genes` genes with the highest
# empirical dispersion. The ids are coerced to character so that
# `cds[genes_test, ]` always indexes by gene name, even if `gene_id` is
# returned as a factor.
disp_tab <- dispersionTable(cds)
genes_test <- disp_tab %>%
  filter(gene_id %in% rownames(cds)) %>%
  arrange(desc(dispersion_empirical)) %>%
  head(config$pseudotime_test_max_genes) %>%
  pull(gene_id) %>%
  as.character()

# Test each candidate gene for expression changes along a smooth function of pseudotime.
pt_de <- differentialGeneTest(
  cds[genes_test, ],
  fullModelFormulaStr = "~sm.ns(Pseudotime)",
  cores = 1
)
pt_de <- pt_de[order(pt_de$qval), ]
write.csv(pt_de, file.path(table_dir, "DEG_along_pseudotime.csv"))

# `pt_de` is sorted by q-value, so this keeps at most the 1000 most significant genes.
sig_genes <- rownames(subset(pt_de, qval < 0.05))
sig_genes <- head(sig_genes, 1000)

if (length(sig_genes) >= 2) {
  # Never ask for more clusters than there are genes to cluster.
  n_heatmap_clusters <- min(config$pseudotime_heatmap_clusters, length(sig_genes))

  pdf(file.path(fig_dir, "pseudotime_heatmap_top_genes.pdf"), width = 8, height = 10)
  # `finally` closes the PDF device even when plotting fails, so an error does
  # not leave an open device behind.
  invisible(tryCatch(
    plot_pseudotime_heatmap(
      cds[sig_genes, ],
      num_clusters = n_heatmap_clusters,
      show_rownames = FALSE,
      return_heatmap = FALSE,
      cores = 1
    ),
    finally = dev.off()
  ))
} else {
  # A clustered heatmap needs at least two genes; skip the figure otherwise.
  message(
    "Skipping pseudotime heatmap: only ", length(sig_genes),
    " gene(s) with qval < 0.05."
  )
}

# Expression trends along pseudotime for the genes of interest present in the data.
goi_present <- intersect(config$goi, rownames(cds))
if (length(goi_present) > 0) {
  p_goi <- plot_genes_in_pseudotime(cds[goi_present, ], color_by = config$group_key)
  ggsave(file.path(fig_dir, "genes_in_pseudotime_GOI.pdf"), p_goi, width = 7, height = 8)
}


In [None]:
## Write one row per cell: DDRTree coordinates, state, pseudotime, group, and batch.
table_dir <- file.path(config$out_dir, "tables")

pheno <- pData(cds)
coords <- reducedDimS(cds)

# The embedding may come back as dims x cells; flip it so that rows are cells.
cells_in_columns <- ncol(coords) == nrow(pheno) && nrow(coords) != nrow(pheno)
if (cells_in_columns) {
  coords <- t(coords)
}
stopifnot(nrow(coords) == nrow(pData(cds)))
stopifnot(ncol(coords) >= 2)

cells_tbl <- data.frame(
  cell_id = rownames(pheno),
  DDRTree_1 = coords[, 1],
  DDRTree_2 = coords[, 2],
  State = pheno$State,
  Pseudotime = pheno$Pseudotime,
  group = pheno[[config$group_key]],
  orig.ident = pheno$orig.ident,
  stringsAsFactors = FALSE
)

out_csv <- file.path(table_dir, "cells_pseudotime_ddrtree.csv")
write.csv(cells_tbl, out_csv, row.names = FALSE)

# Report the absolute locations of the output directory and the table just written.
cat("Outputs written to:\n")
for (written_path in c(config$out_dir, out_csv)) {
  cat("  ", normalizePath(written_path), "\n")
}
