In [1]:
# Cell 1: Setup (paths + libraries)

# Locate the repository root by walking upward from `start`.
#
# A directory counts as the repo root when it contains a README.md file plus
# scripts/ and data/ sub-directories.
#
# Args:
#   start: directory to begin the search from (defaults to the working dir).
#
# Returns:
#   The path of the first ancestor (or `start` itself) that looks like the
#   repo root.
#
# Raises:
#   An error if the filesystem root is reached without finding a match.
find_repo_root <- function(start = getwd()) {
  current <- normalizePath(start)
  # Walk all the way up instead of stopping after a fixed number of levels,
  # so deeply nested working directories are still resolved.
  repeat {
    looks_like_root <- file.exists(file.path(current, "README.md")) &&
      dir.exists(file.path(current, "scripts")) &&
      dir.exists(file.path(current, "data"))
    if (looks_like_root) {
      return(current)
    }
    parent <- dirname(current)
    # dirname() of the filesystem root is the root itself: nothing left to try.
    if (identical(parent, current)) {
      break
    }
    current <- parent
  }
  stop("Could not find repo root (expected README.md + scripts/ + data/).")
}

# Resolve the repository root and create a project-local package library so
# anything this notebook installs stays inside the repo.
REPO_ROOT <- find_repo_root()
RLIBS <- file.path(REPO_ROOT, "Rlibs")
dir.create(RLIBS, showWarnings = FALSE, recursive = TRUE)

# Project library goes first on the search path; previously configured
# libraries remain visible after it.
.libPaths(c(RLIBS, .libPaths()))

if (!requireNamespace("BiocManager", quietly = TRUE)) {
  # install.packages() cannot prompt for a mirror in a non-interactive kernel,
  # so fall back to an explicit CRAN URL when none has been configured.
  cran_repo <- unname(getOption("repos")["CRAN"])
  if (length(cran_repo) != 1L || is.na(cran_repo) || cran_repo == "@CRAN@") {
    cran_repo <- "https://cloud.r-project.org"
  }
  install.packages("BiocManager", lib = RLIBS, repos = cran_repo)
}

# Force Bioconductor repositories to avoid the repos warning
suppressMessages({
  options(repos = BiocManager::repositories())
})

cat("Repo root:", REPO_ROOT, "\n")
cat("R library path:", RLIBS, "\n")
cat("libPaths:\n"); print(.libPaths())

if (!requireNamespace("limma", quietly = TRUE)) {
  BiocManager::install("limma", lib = RLIBS, ask = FALSE, update = FALSE)
}

# requireNamespace() above searches every entry of .libPaths(), so limma may
# live outside RLIBS. Load it through the same search path (RLIBS is still
# consulted first) rather than pinning lib.loc to RLIBS only.
library(limma)
cat("limma loaded.\n")

Repo root: /data/Bio2025/Thomas/BIOL616-FinalProject-Repo 
R library path: /data/Bio2025/Thomas/BIOL616-FinalProject-Repo/Rlibs 
libPaths:
[1] "/data/Bio2025/Thomas/BIOL616-FinalProject-Repo/Rlibs"     
[2] "/data/Bio2025/Thomas/R/x86_64-conda-linux-gnu-library/4.2"
[3] "/home/biou20/miniconda3/lib/R/library"                    
limma loaded.


In [2]:
# Cell 2: Define Paths and Load Data
#
# Reads the sample metadata and the expression table, validates them, and
# reorders the metadata rows so they follow the expression column order.

options(warnPartialMatchDollar = TRUE)

meta_fp <- file.path(REPO_ROOT, "data/processed/preprocessing_outputs/metadata_LumA_IDC_Tumor_vs_AllNormals.tsv")
expr_fp <- file.path(REPO_ROOT, "data/processed/preprocessing_outputs/expr_LumA_IDC_Tumor_vs_AllNormals.tsv")

cat("meta_fp:", meta_fp, "\n")
cat("expr_fp:", expr_fp, "\n")
cat("meta exists:", file.exists(meta_fp), "\n")
cat("expr exists:", file.exists(expr_fp), "\n")

# Fail fast with a readable message instead of a low-level read error.
stopifnot(
  "Metadata file not found" = file.exists(meta_fp),
  "Expression file not found" = file.exists(expr_fp)
)

meta <- read.delim(meta_fp, stringsAsFactors = FALSE, check.names = FALSE)
# First column of the expression table becomes the row names.
expr <- read.delim(expr_fp, row.names = 1, check.names = FALSE)

print(colnames(meta))
cat("Metadata:", nrow(meta), "×", ncol(meta), "\n")
cat("Expression:", nrow(expr), "×", ncol(expr), "\n")

# Both columns are used below, so check for them before first use.
stopifnot(
  "Metadata must contain 'Sample' and 'Group' columns" =
    all(c("Sample", "Group") %in% colnames(meta))
)
print(table(meta$Group))

# Duplicate IDs would make match() silently pick the first hit and would make
# name-based column selection on expr return repeated copies of one column.
stopifnot(
  "Duplicate sample IDs in metadata" = anyDuplicated(meta$Sample) == 0L,
  "Duplicate sample IDs in expression matrix" =
    anyDuplicated(colnames(expr)) == 0L
)
stopifnot(all(colnames(expr) %in% meta$Sample))

# Reorder metadata rows to follow the expression column order.
meta <- meta[match(colnames(expr), meta$Sample), , drop = FALSE]
stopifnot(all(meta$Sample == colnames(expr)))

meta_fp: /data/Bio2025/Thomas/BIOL616-FinalProject-Repo/data/processed/preprocessing_outputs/metadata_LumA_IDC_Tumor_vs_AllNormals.tsv 
expr_fp: /data/Bio2025/Thomas/BIOL616-FinalProject-Repo/data/processed/preprocessing_outputs/expr_LumA_IDC_Tumor_vs_AllNormals.tsv 
meta exists: TRUE 
expr exists: TRUE 
[1] "Sample"            "Group"             "molecular_subtype"
[4] "sample_type"       "histological_type"
Metadata: 413 × 5 
Expression: 20530 × 413 

Normal  Tumor 
   114    299 


In [3]:
# Cell 3: Data Pre-processing for limma (strict alignment)
#
# Confirms metadata and expression cover exactly the same samples, aligns the
# expression columns to the metadata order, and builds the two-group design.

# 1) Verify exact overlap
missing_in_expr <- setdiff(meta$Sample, colnames(expr))
missing_in_meta <- setdiff(colnames(expr), meta$Sample)

if (length(missing_in_expr) > 0) {
  cat("Samples in meta but missing in expr:", length(missing_in_expr), "\n")
  print(head(missing_in_expr, 10))
  stop("Metadata contains samples not present in expression matrix.")
}

if (length(missing_in_meta) > 0) {
  cat("Samples in expr but missing in meta:", length(missing_in_meta), "\n")
  print(head(missing_in_meta, 10))
  stop("Expression matrix contains samples not present in metadata.")
}

# 2) Reorder expression columns to match metadata order exactly
expr <- expr[, meta$Sample, drop = FALSE]

# 3) Confirm exact alignment
stopifnot(all(colnames(expr) == meta$Sample))

# 4) Build design matrix (Normal = reference)
group <- factor(meta$Group, levels = c("Normal", "Tumor"))

# Any label other than "Normal"/"Tumor" (or a missing one) becomes NA here.
# model.matrix() would then drop those rows and the design would no longer
# line up with the expression columns, so stop with an explicit message.
if (anyNA(group)) {
  stop(
    "Unexpected or missing Group labels: ",
    paste(unique(meta$Group[is.na(group)]), collapse = ", ")
  )
}
# A two-group contrast needs at least one sample in each group.
if (any(table(group) == 0)) {
  stop("Both 'Normal' and 'Tumor' samples are required to build the design.")
}

design <- model.matrix(~ group)
colnames(design) <- c("Intercept", "Tumor_vs_Normal")
# One design row per expression column, in the same order.
stopifnot(nrow(design) == ncol(expr))

cat("Design matrix columns:\n")
print(colnames(design))
print(table(group))

Design matrix columns:
[1] "Intercept"       "Tumor_vs_Normal"
group
Normal  Tumor 
   114    299 


In [4]:
# Cell 4: Differential expression with limma-trend, then export the full table

suppressPackageStartupMessages(library(limma))

# The model is fitted on a plain matrix rather than the data frame.
expr_mat <- as.matrix(expr)

# Linear model per row followed by empirical-Bayes moderation (trend = TRUE).
# NOTE(review): the recorded run of this cell reported "Zero sample variances
# detected" - consider whether constant rows should be filtered upstream.
fit <- eBayes(lmFit(expr_mat, design), trend = TRUE)

# Full result table for the Tumor_vs_Normal coefficient, ordered by p-value.
tt <- topTable(
  fit,
  coef = "Tumor_vs_Normal",
  number = Inf,
  sort.by = "P"
)

# Re-label the limma columns with the names used in the exported table.
res <- data.frame(
  gene           = rownames(tt),
  log2FoldChange = tt[["logFC"]],
  AveExpr        = tt[["AveExpr"]],
  t_stat         = tt[["t"]],
  pvalue         = tt[["P.Value"]],
  padj           = tt[["adj.P.Val"]],
  B_statistic    = tt[["B"]],
  stringsAsFactors = FALSE
)

# Results are written beneath results/tables/deg in the repository.
deg_dir <- file.path(REPO_ROOT, "results/tables/deg")
dir.create(deg_dir, showWarnings = FALSE, recursive = TRUE)

out_csv <- file.path(
  deg_dir,
  "DEG_Results_LumA_IDC_Tumor_vs_AllNormals_limma.csv"
)
write.csv(res, file = out_csv, row.names = FALSE)

cat("Wrote DEG results:", out_csv, "\n")
# Count genes passing both the FDR and the absolute fold-change cut-offs.
cat(
  "Significant DEGs (FDR<0.05 & |log2FC|>1):",
  sum(with(res, padj < 0.05 & abs(log2FoldChange) > 1), na.rm = TRUE),
  "\n"
)

“Zero sample variances detected, have been offset away from zero”


Wrote DEG results: /data/Bio2025/Thomas/BIOL616-FinalProject-Repo/results/tables/deg/DEG_Results_LumA_IDC_Tumor_vs_AllNormals_limma.csv 
Significant DEGs (FDR<0.05 & |log2FC|>1): 4191 


In [5]:
# Cell 5: Export a ranked gene list (.rnk) for GSEA, ranked by the t-statistic

gsea_dir <- file.path(REPO_ROOT, "results/tables/gsea")
dir.create(gsea_dir, showWarnings = FALSE, recursive = TRUE)

# Keep only rows with both a gene name and a t-statistic, two columns wide.
rnk <- res[!(is.na(res$t_stat) | is.na(res$gene)), c("gene", "t_stat")]
# Highest t-statistic first.
rnk <- rnk[order(rnk$t_stat, decreasing = TRUE), ]

rnk_fp <- file.path(gsea_dir, "LumA_IDC_Tumor_vs_AllNormals.rnk")
# Tab-separated, no header, no row names, no quoting.
write.table(
  rnk,
  file = rnk_fp,
  quote = FALSE,
  sep = "\t",
  row.names = FALSE,
  col.names = FALSE
)

cat("Wrote .rnk file:", rnk_fp, "\n")

Wrote .rnk file: /data/Bio2025/Thomas/BIOL616-FinalProject-Repo/results/tables/gsea/LumA_IDC_Tumor_vs_AllNormals.rnk 


In [6]:
# Cell 6: Print the R version, platform, and attached/loaded package versions
# so the environment used for this run is recorded alongside the results.
sessionInfo()

R version 4.2.0 (2022-04-22)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 20.04.4 LTS

Matrix products: default
BLAS/LAPACK: /home/biou20/miniconda3/lib/libopenblasp-r0.3.25.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] limma_3.54.2

loaded via a namespace (and not attached):
 [1] digest_0.6.37       crayon_1.5.3        IRdisplay_1.1      
 [4] repr_1.1.7          lifecycle_1.0.4     jsonlite_1.8.9     
 [7] evaluate_1.0.1      pillar_1.11.1       rlang_1.1.6        
[10] cli_3.6.3           uuid_1.2-1          vctrs_0.6.5    