In [3]:
# Cell 1: Setup (paths + libraries) — repo-local, reproducible

source("../setup/r_bootstrap.R")

Repo root: /Users/tommyrucinski/dev/repos/tcga-brca-luminalA-deg-gsea 
No issues found -- the project is in a consistent state.
renv is synchronized.
✅ R bootstrap complete.


In [4]:
# Cell 2: Define Paths and Load Data

options(warnPartialMatchDollar = TRUE)

meta_fp <- file.path(REPO_ROOT, "data/processed/preprocessing_outputs/metadata_LumA_IDC_Tumor_vs_AllNormals.tsv")
expr_fp <- file.path(REPO_ROOT, "data/processed/preprocessing_outputs/expr_LumA_IDC_Tumor_vs_AllNormals.tsv")

cat("meta_fp:", meta_fp, "\n")
cat("expr_fp:", expr_fp, "\n")
cat("meta exists:", file.exists(meta_fp), "\n")
cat("expr exists:", file.exists(expr_fp), "\n")

meta <- read.delim(meta_fp, stringsAsFactors = FALSE, check.names = FALSE)
expr <- read.delim(expr_fp, row.names = 1, check.names = FALSE)

print(colnames(meta))
cat("Metadata:", nrow(meta), "×", ncol(meta), "\n")
cat("Expression:", nrow(expr), "×", ncol(expr), "\n")
print(table(meta$Group))

stopifnot("Sample" %in% colnames(meta))
stopifnot(all(colnames(expr) %in% meta$Sample))

meta <- meta[match(colnames(expr), meta$Sample), ]
stopifnot(all(meta$Sample == colnames(expr)))

meta_fp: /Users/tommyrucinski/dev/repos/tcga-brca-luminalA-deg-gsea/data/processed/preprocessing_outputs/metadata_LumA_IDC_Tumor_vs_AllNormals.tsv 
expr_fp: /Users/tommyrucinski/dev/repos/tcga-brca-luminalA-deg-gsea/data/processed/preprocessing_outputs/expr_LumA_IDC_Tumor_vs_AllNormals.tsv 
meta exists: TRUE 
expr exists: TRUE 
[1] "Sample"            "Group"             "molecular_subtype"
[4] "sample_type"       "histological_type"
Metadata: 413 × 5 
Expression: 20530 × 413 

Normal  Tumor 
   114    299 


In [5]:
# Cell 3: Data Pre-processing for limma (strict alignment)

# 1) Verify exact overlap
missing_in_expr <- setdiff(meta$Sample, colnames(expr))
missing_in_meta <- setdiff(colnames(expr), meta$Sample)

if (length(missing_in_expr) > 0) {
  cat("Samples in meta but missing in expr:", length(missing_in_expr), "\n")
  print(head(missing_in_expr, 10))
  stop("Metadata contains samples not present in expression matrix.")
}

if (length(missing_in_meta) > 0) {
  cat("Samples in expr but missing in meta:", length(missing_in_meta), "\n")
  print(head(missing_in_meta, 10))
  stop("Expression matrix contains samples not present in metadata.")
}

# 2) Reorder expression columns to match metadata order exactly
expr <- expr[, meta$Sample, drop = FALSE]

# 3) Confirm exact alignment
stopifnot(all(colnames(expr) == meta$Sample))

# 4) Build design matrix (Normal = reference)
group <- factor(meta$Group, levels = c("Normal", "Tumor"))
design <- model.matrix(~ group)
colnames(design) <- c("Intercept", "Tumor_vs_Normal")

cat("Design matrix columns:\n")
print(colnames(design))
print(table(group))

Design matrix columns:
[1] "Intercept"       "Tumor_vs_Normal"
group
Normal  Tumor 
   114    299 


In [6]:
# Cell 4: limma-trend + save DEG results

suppressPackageStartupMessages({
  library(limma)
})

# Ensure matrix
expr_mat <- as.matrix(expr)

# Fit model
fit <- lmFit(expr_mat, design)
fit <- eBayes(fit, trend = TRUE)

# Extract all genes (Tumor vs Normal)
# Use the coefficient name "Tumor_vs_Normal"
tt <- topTable(fit, coef = "Tumor_vs_Normal", number = Inf, sort.by = "P")

# Standardized output table
res <- data.frame(
  gene = rownames(tt),
  log2FoldChange = tt$logFC,
  AveExpr = tt$AveExpr,
  t_stat = tt$t,
  pvalue = tt$P.Value,
  padj = tt$adj.P.Val,
  B_statistic = tt$B,
  stringsAsFactors = FALSE
)

# Output directory under results/
deg_dir <- file.path(REPO_ROOT, "results/tables/deg")
dir.create(deg_dir, recursive = TRUE, showWarnings = FALSE)

out_csv <- file.path(deg_dir, "DEG_Results_LumA_IDC_Tumor_vs_AllNormals_limma.csv")
write.csv(res, out_csv, row.names = FALSE)

cat("Wrote DEG results:", out_csv, "\n")
cat("Significant DEGs (FDR<0.05 & |log2FC|>1):",
    sum(res$padj < 0.05 & abs(res$log2FoldChange) > 1, na.rm = TRUE), "\n")

“Zero sample variances detected, have been offset away from zero”


Wrote DEG results: /Users/tommyrucinski/dev/repos/tcga-brca-luminalA-deg-gsea/results/tables/deg/DEG_Results_LumA_IDC_Tumor_vs_AllNormals_limma.csv 
Significant DEGs (FDR<0.05 & |log2FC|>1): 4191 


In [7]:
# Cell 5: Write GSEA ranked list (.rnk) using t-stat (Broad Institute's Standard for GSEA)

gsea_dir <- file.path(REPO_ROOT, "results/tables/gsea")
dir.create(gsea_dir, recursive = TRUE, showWarnings = FALSE)

rnk <- res[, c("gene", "t_stat")]
rnk <- rnk[!is.na(rnk$t_stat) & !is.na(rnk$gene), ]
rnk <- rnk[order(rnk$t_stat, decreasing = TRUE), ]

rnk_fp <- file.path(gsea_dir, "LumA_IDC_Tumor_vs_AllNormals.rnk")
write.table(rnk, rnk_fp, sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE)

cat("Wrote .rnk file:", rnk_fp, "\n")

Wrote .rnk file: /Users/tommyrucinski/dev/repos/tcga-brca-luminalA-deg-gsea/results/tables/gsea/LumA_IDC_Tumor_vs_AllNormals.rnk 


In [8]:
sessionInfo()

R version 4.5.2 (2025-10-31)
Platform: aarch64-apple-darwin24.4.0
Running under: macOS Sequoia 15.6.1

Matrix products: default
BLAS:   /opt/homebrew/Cellar/openblas/0.3.30/lib/libopenblasp-r0.3.30.dylib 
LAPACK: /opt/homebrew/Cellar/r/4.5.2_1/lib/R/lib/libRlapack.dylib;  LAPACK version 3.12.1

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/Los_Angeles
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] limma_3.66.0

loaded via a namespace (and not attached):
 [1] digest_0.6.39   IRdisplay_1.1   base64enc_0.1-3 fastmap_1.2.0  
 [5] splines_4.5.2   glue_1.8.0      htmltools_0.5.9 repr_1.1.7     
 [9] lifecycle_1.0.4 cli_3.6.5       vctrs_0.6.5     renv_1.1.5     
[13] pbdZMQ_0.3-14   statmod_1.5.1   compiler_4.5.2  tools_4.5.2    
[17] evaluate_1.0.5  pillar_1.11.1   rlang_1.1.6     jsonlite_2.0.0 
[21] crayon_1.5.3    IRkernel_1.3.2  uuid_1