<a href="https://colab.research.google.com/github/thevirusoup/thevirusoup/blob/main/RNA_Seq_Analysis_by_DESeq2_in_R_Studio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ------------------------------
# 1️⃣ Load packages
# ------------------------------
library(DESeq2)
library(tidyverse)

# ------------------------------
# 2️⃣ Load count matrix
# ------------------------------
# Read the count table: the first line supplies the column names and the
# first column supplies the row names.
# NOTE(review): presumably rows are genes and columns are samples -- confirm
# against the file. The path is absolute and machine-specific; adjust it when
# running on another computer.
counts_file <- "C:/Users/pc/Documents/Tanzil/all_counts_clean.txt"
counts <- read.table(file = counts_file, header = TRUE, row.names = 1)

# Sanity checks: preview the table, then show its dimensions and column names
head(counts)
dim(counts)
colnames(counts)

# ------------------------------
# 3️⃣ Prepare sample metadata
# ------------------------------
# One condition label per column of `counts`, listed in column order.
# The last label (N1Y) was the updated condition for SRR14999757.
# NOTE(review): N1Y has a single sample here, so that condition has no
# replicates -- interpret the differential-expression results with caution.
condition_labels <- c("N1W", "N1W", "N1W", "N1Y")

# data.frame() silently recycles a shorter vector when the lengths divide
# evenly (e.g. 8 count columns), which would mislabel samples. Fail fast.
if (length(condition_labels) != ncol(counts)) {
  stop(
    "`counts` has ", ncol(counts), " sample columns but ",
    length(condition_labels), " condition labels were supplied; ",
    "update `condition_labels` to match.",
    call. = FALSE
  )
}

# Rows are named by sample so they line up with the columns of `counts`.
# `condition` is made a factor explicitly rather than relying on an implicit
# character-to-factor conversion later; levels are in alphabetical order.
sample_info <- data.frame(
  sample = colnames(counts),
  condition = factor(condition_labels),
  row.names = colnames(counts)
)

# ------------------------------
# 4️⃣ Create DESeq2 dataset
# ------------------------------
# Single-factor design: `condition` is the only term in the model.
# Note that `counts` (the table passed as countData) and `counts()` (the
# accessor called below) share a name; R resolves the call to the function.
dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData   = sample_info,
  design    = ~ condition
)

# Pre-filter: keep only rows whose total count across all samples exceeds 1
keep_gene <- rowSums(counts(dds)) > 1
dds <- dds[keep_gene, ]

# Run the DESeq2 pipeline (normalization + differential-expression analysis)
dds <- DESeq(dds)

# ------------------------------
# 5️⃣ Extract normalized counts
# ------------------------------
# Counts as returned by the accessor with normalized = TRUE
normalized_counts <- counts(dds, normalized = TRUE)
head(normalized_counts)

# ------------------------------
# 6️⃣ Extract DEGs
# ------------------------------
# Significance thresholds, defined once so the results() call and the filter
# below always agree.
padj_cutoff <- 0.05
lfc_cutoff <- 1

# Compare N1W vs N1Y (N1W is the numerator of the contrast, N1Y the
# denominator). `alpha` is set to the adjusted p-value cutoff used below;
# left at its default of 0.1, results() would tune its independent filtering
# for a different threshold than the one applied here.
res <- results(
  dds,
  contrast = c("condition", "N1W", "N1Y"),
  alpha = padj_cutoff
)

# Order by adjusted p-value (rows with NA padj sort last)
res <- res[order(res$padj), ]

# Filter significant DEGs: adjusted p-value below the cutoff AND an absolute
# log2 fold change above the cutoff. which() drops rows where the test is NA.
is_deg <- res$padj < padj_cutoff & abs(res$log2FoldChange) > lfc_cutoff
deg_list <- res[which(is_deg), ]

# View top DEGs
head(deg_list)

# Save DEGs to CSV (written to the current working directory)
write.csv(as.data.frame(deg_list), "DEGs_N1W_vs_N1Y.csv")


In [None]:
# Create a clean counts matrix (only gene rows, no summary lines) with a
# header line, from four per-sample ReadsPerGene.out.tab files.
#
# - The header is emitted with printf instead of `sed -i '1i...'`, which
#   relies on GNU sed extensions and fails on BSD/macOS sed.
# - Each input file contributes 4 tab-separated columns after paste, so the
#   gene ID sits in fields 1, 5, 9, 13 and the selected count column in
#   fields 2, 6, 10, 14.
#   NOTE(review): field 2 is presumably the unstranded count column of STAR's
#   GeneCounts output -- confirm this matches the library's strandedness.
# - awk splits on tabs only and aborts if the gene IDs of a row differ between
#   files, so misaligned or truncated inputs cannot silently shift columns.
# - Output goes to a temporary file that replaces all_counts.txt only when the
#   whole pipeline succeeds.
{
  printf 'GeneID\tSRR14999754\tSRR14999755\tSRR14999756\tSRR14999757\n'
  paste SRR14999754_ReadsPerGene.out.tab \
        SRR14999755_ReadsPerGene.out.tab \
        SRR14999756_ReadsPerGene.out.tab \
        SRR14999757_ReadsPerGene.out.tab \
  | awk 'BEGIN { FS = OFS = "\t" }
         # Skip the summary rows at the top of each file
         $1 ~ /^N_(unmapped|multimapping|noFeature|ambiguous)$/ { next }
         # Every file must list the same gene on the same row
         $1 != $5 || $1 != $9 || $1 != $13 {
           print "Gene ID mismatch at pasted line " NR ": " $1 > "/dev/stderr"
           exit 1
         }
         { print $1, $2, $6, $10, $14 }'
} > all_counts.txt.tmp && mv all_counts.txt.tmp all_counts.txt
