<a href="https://colab.research.google.com/github/tgstoecker/teaching/blob/master/AppliedBioinformatics/Notebooks/preMapman2021.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Report the R build in use so version-related install problems are easy to spot
R.Version()

# Bioconductor side: install the manager first, then call install() with no
# package names and ask = FALSE so it proceeds without an interactive prompt,
# and finally add the two differential-expression packages one at a time.
install.packages("BiocManager", verbose = TRUE)
BiocManager::install(ask = FALSE)
for (bioc_pkg in c("limma", "edgeR")) {
  BiocManager::install(bioc_pkg)
}

# CRAN side: helper packages loaded by the analysis
for (cran_pkg in c("reshape2", "statmod")) {
  install.packages(cran_pkg, verbose = TRUE)
}

In [None]:
## load all required packages
library(reshape2)
library(statmod)
library(limma)
library(edgeR)

In [None]:
## Create the input file for MapMan, which we use later for pathway
## visualization and inspection.
# We quickly go through the edgeR workflow with the transcript-level data.

# Location of the TRANSCRIPT-level (!) count matrix generated with
# featureCounts; adapt the path/URL to your own directory structure.
counts <- "https://raw.githubusercontent.com/tgstoecker/teaching/master/AppliedBioinformatics/B73/transcript-level/total_file.count"

# header = TRUE: the first line holds the column names (spelled out because
# the alias `T` is an ordinary variable that can be reassigned).
# row.names = 1: the first column (transcript IDs) becomes the row names.
fc_res_transcripts <- read.table(counts, header = TRUE, row.names = 1)

In [None]:
# Shorten the sample column names by stripping the BAM-file suffix.
# fixed = TRUE makes sub() match the suffix literally; without it the "." in
# ".bam" is a regex wildcard that matches any character.
colnames(fc_res_transcripts) <- sub(
  "_trimmed_sorted.bam", "", colnames(fc_res_transcripts),
  fixed = TRUE
)

In [None]:
# Shorten the sample column names by stripping the BAM-file suffix.
# fixed = TRUE matches the suffix literally (the "." is not a regex wildcard);
# re-running this on already shortened names is a no-op.
colnames(fc_res_transcripts) <- sub(
  "_trimmed_sorted.bam", "", colnames(fc_res_transcripts),
  fixed = TRUE
)

# Treatment condition of each sample - logic: sample columns left to right
group <- c("control", "control", "control", "control",
           "drought", "drought", "drought", "drought")

# Columns of the count table that hold the per-sample counts.
# NOTE(review): assumes the usual featureCounts layout in which five
# annotation columns precede the samples - confirm against your file.
sample_cols <- 6:13
stopifnot(length(sample_cols) == length(group))

# Create a DGEList object; the transcript IDs (row names) are stored as gene
# annotation so they travel with the results
dge <- DGEList(
  counts = fc_res_transcripts[, sample_cols],
  group = group,
  genes = rownames(fc_res_transcripts)
)

# Design matrix without intercept: one column per condition
# (named "groupcontrol" and "groupdrought", as used in the contrast below)
design <- model.matrix(~0+group)

# Filter out transcripts with too little expression to be informative;
# keep.lib.sizes = FALSE recomputes the library sizes after filtering
keep <- filterByExpr(dge, design)
dge_filtered <- dge[keep, , keep.lib.sizes = FALSE]

# TMM normalization
dge_normalized <- calcNormFactors(dge_filtered, method = "TMM")

# Dispersion estimation
dge_disp <- estimateDisp(dge_normalized, design, robust = TRUE)

# Quasi-likelihood (QL) model fit
fit <- glmQLFit(dge_disp, design, robust = TRUE)

# Contrast: drought minus control, i.e. positive logFC = higher in drought
CvsD <- makeContrasts(groupdrought-groupcontrol, levels = design)

# QL F-test for the contrast
res <- glmQLFTest(fit, contrast = CvsD)

## Please feel free to investigate the transcript-level data

# The first thing to check is whether our data contains infinite fold changes.
# This shouldn't be the case because of the way edgeR works - however, it is an
# important detail to keep in mind, as MapMan doesn't know what to do with them.
# In other pipelines it might be necessary to transform infinite values to a
# common number.

# Create an easy-to-view intermediate table with ALL tested transcripts.
# n = Inf is the numeric way to request every row; the string "all" only works
# through incidental type coercion.
MapMan_check <- topTags(res, adjust.method = "BH", n = Inf)
#View(MapMan_check$table)

# Flag non-finite fold changes automatically instead of relying on eyeballing
if (any(!is.finite(MapMan_check$table$logFC))) {
  warning(
    "Non-finite log2 fold changes found; MapMan cannot handle them.",
    call. = FALSE
  )
}

# Keep only the three columns MapMan needs - transcript ID, log2 fold change
# and q-value (FDR) - under simple column names
MapMan_input_B73 <- data.frame(
  transcript_id = MapMan_check$table$genes,
  log2_fold_change = MapMan_check$table$logFC,
  q_value = MapMan_check$table$FDR
)

In [None]:
# Write the data to a tab-separated file with a header row and no row names -
# this file will be our input for MapMan.
# Check and choose your working directory and file name.
# NOTE(review): the path below starts at the filesystem root ("/") - confirm
# that location is writable in your environment, or adapt the path.
write.table(
  MapMan_input_B73[, c("transcript_id", "log2_fold_change", "q_value")],
  file = "/b73_mapman.txt",
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE
)