In [14]:
# Cell 1: Setup and Library Installation

# Create R library in folder
dir.create("~/BIOL616-FinalProject-Repo/Rlibs", showWarnings = FALSE)

# Set library path so R installs/loading goes to ~/Rlibs
.libPaths(c("~/BIOL616-FinalProject-Repo/Rlibs", .libPaths()))

# Install BiocManager 
if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager", lib = "~/BIOL616-FinalProject-Repo/Rlibs")
}

# Install limma via Bioconductor
BiocManager::install("limma", lib = "~/BIOL616-FinalProject-Repo/Rlibs")

# Load limma from ~/BIOL616-FinalProject-Repo/Rlibs
library(limma, lib.loc = "~/BIOL616-FinalProject-Repo/Rlibs")

cat("limma successfully installed and loaded from ~/BIOL616-FinalProject-Repo/Rlibs\n")

'getOption("repos")' replaces Bioconductor standard repositories, see
'help("repositories", package = "BiocManager")' for details.
Replacement repositories:
    CRAN: https://cran.r-project.org

Bioconductor version 3.16 (BiocManager 1.30.25), R 4.2.0 (2022-04-22)

“package(s) not installed when version(s) same as or greater than current; use
  `force = TRUE` to re-install: 'limma'”


✓ limma successfully installed and loaded from ~/BIOL616-FinalProject-Repo/Rlibs


In [15]:
# Cell 2: Define Paths and Load Data

suppressPackageStartupMessages({
    library(limma)
})

# Paths to new files
meta_fp <- "/data/Bio2025/Thomas/BIOL616-FinalProject-Repo/datasets/preprocessing_outputs/metadata_LumA_IDC_Tumor_vs_AllNormals.tsv"
expr_fp <- "/data/Bio2025/Thomas/BIOL616-FinalProject-Repo/datasets/preprocessing_outputs/expr_LumA_IDC_Tumor_vs_AllNormals.tsv"

# Read metadata 
meta <- read.csv(meta_fp, header = TRUE, sep = ",", stringsAsFactors = FALSE)

# Read expression matrix 
expr <- read.delim(expr_fp, header = TRUE, sep = "\t", row.names = 1, check.names = FALSE)

cat("Metadata:", nrow(meta), "samples ×", ncol(meta), "columns\n")
cat("Expression:", nrow(expr), "genes ×", ncol(expr), "samples\n")

Metadata: 413 samples × 5 columns
Expression: 20530 genes × 413 samples


In [16]:
# Cell 3: Data Pre-processing for limma

# Ensure metadata sample ordering matches expression columns
if (!all(meta$Sample %in% colnames(expr))) {
    stop("Some metadata samples are NOT in the expression matrix.")
}

expr <- expr[, meta$Sample]     # Reorder columns to match metadata order

# Build design matrix (Normal = reference)
# The comparison is Tumor vs. Normal (Normal is set as the reference/intercept).
group <- factor(meta$Group, levels = c("Normal", "Tumor"))
design <- model.matrix(~ group)

cat("Design matrix columns:\n")
print(colnames(design))

Design matrix columns:
[1] "(Intercept)" "groupTumor" 


In [17]:
# Cell 4: Run the limma-trend workflow

# Convert expression data to a matrix
expr_mat <- as.matrix(expr)  # genes × samples

# Fit the linear model to the expression data
fit <- lmFit(expr_mat, design)

# Apply empirical Bayes smoothing, using trend=TRUE for RNA-seq data
fit <- eBayes(fit, trend = TRUE)

“Zero sample variances detected, have been offset away from zero”


In [18]:
# Cell 5: Extract and Save DEG Results

# Extract all genes results for the 'groupTumor' coefficient (logFC Tumor/Normal)
tt <- topTable(fit, coef = "groupTumor", number = Inf, sort.by = "P")

# Save results (base R data.frame)
res <- data.frame(
    gene = rownames(tt),
    log2FoldChange = tt$logFC,
    AveExpr = tt$AveExpr,
    t_stat = tt$t,
    pvalue = tt$P.Value,
    padj = tt$adj.P.Val,
    B_statistic = tt$B,
    stringsAsFactors = FALSE
)

out_csv <- "/data/Bio2025/Thomas/BIOL616-FinalProject-Repo/datasets/DEG_output/DEG_Results_LumA_IDC_Tumor_vs_AllNormals.csv"

write.csv(res, out_csv, row.names = FALSE)

cat("Wrote:", normalizePath(out_csv), "\n")
print(head(res, 10))

Wrote: /data/Bio2025/Thomas/BIOL616-FinalProject-Repo/datasets/outputs/DEG_Results_LumA_IDC_Tumor_vs_AllNormals.csv 
        gene log2FoldChange     AveExpr    t_stat        pvalue          padj
1    COL10A1       7.688791  3.24188312  48.16521 2.218788e-171 4.555172e-167
2      MMP11       6.376813  1.98474750  42.58413 5.089454e-153 5.224325e-149
3     SLC6A2      -3.683062  0.11455235 -39.82296 2.319374e-143 1.587225e-139
4       RYR3      -3.871109 -0.01720238 -37.10937 1.774800e-133 9.109159e-130
5      HIF3A      -5.500721 -1.67441036 -35.57005 1.077495e-127 4.424196e-124
6  LOC284578      -3.914414 -0.33555986 -35.01625 1.395111e-125 4.773606e-122
7        DMD      -3.900682  0.19411215 -34.46836 1.781119e-123 5.223766e-120
8   C10orf90      -4.510218  0.82422592 -34.28951 8.744895e-123 2.244159e-119
9     LRRC3B      -3.773607  0.23977650 -34.20311 1.888865e-122 4.308712e-119
10     NDRG2      -2.875553  0.08472929 -33.93643 2.046810e-121 4.202100e-118
   B_statistic
1     381.

In [19]:
# Cell 6: Generate GSEA Ranking Score

# Path to your DEG results
deg_fp <- "/data/Bio2025/Thomas/BIOL616-FinalProject-Repo/datasets/DEG_output/DEG_Results_LumA_IDC_Tumor_vs_AllNormals.csv"

# Load DEGs
deg <- read.csv(deg_fp, stringsAsFactors = FALSE)

# Calculate GSEA ranking score: Score = sign(log2FC) * -log10(pvalue).
# This ranks by significance and signs by direction (up = +, down = -)
deg$score <- sign(deg$log2FoldChange) * -log10(deg$pvalue)

# Order by decreasing score
deg_ranked <- deg[order(deg$score, decreasing = TRUE), ]

In [20]:
# Cell 7: Save Ranked List for GSEA Input (.rnk file)

rnk_fp <- "/data/Bio2025/Thomas/BIOL616-FinalProject-Repo/datasets/DEG_output/LumA_IDC_Tumor_vs_AllNormals.rnk"

# Write final .rnk file: Two columns (Gene ID, Score) with no header, no quotes, and tab delimited (all required for GSEA)
write.table(
    deg_ranked[, c("gene", "score")],
    rnk_fp,
    sep = "\t",
    col.names = FALSE,
    row.names = FALSE,
    quote = FALSE
)

cat("Wrote .rnk file:", rnk_fp, "\n")

Wrote .rnk file: /data/Bio2025/Thomas/BIOL616-FinalProject-Repo/datasets/outputs/LumA_IDC_Tumor_vs_AllNormals.rnk 
