Skip to content

Commit

Permalink
Merge pull request #10 from virtual-labs/testing
Browse files Browse the repository at this point in the history
Testing
  • Loading branch information
amritamindbrain committed Sep 11, 2021
2 parents c3234e0 + 672fce5 commit d1d06f5
Show file tree
Hide file tree
Showing 4 changed files with 367 additions and 11 deletions.
253 changes: 253 additions & 0 deletions experiment/simulation/RNA_Seq_sample_code.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@


# Install and load packages

if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")

BiocManager::install("DESeq2")
BiocManager::install("edgeR")
BiocManager::install("biomaRt")
BiocManager::install('PCAtools')

library(edgeR)
library(DESeq2)
library("biomaRt")

###################### load the raw count matrix #######################

setwd("wrkng_directory/")

rawcount<-read.table ("your_Rawgenedata.tsv",header=TRUE, sep="\t", row.names=1)

###################### Filter for coding genes ########################

mart <- useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl")
all_coding_genes <- getBM(attributes = c( "hgnc_symbol"), filters = c("biotype"), values = list(biotype="protein_coding"), mart = mart)
rawcount <- rawcount[row.names(rawcount) %in% all_coding_genes$hgnc_symbol,]

###################### Filter low count gene #########################

keep <- rowSums(cpm(rawcount)>1) >= 5 ## depends case to case and on the number of samples
rawcount<- rawcount[keep,]

## if you not sure about the threshold, there are automatics calculation for the threshold. Please refer to the following link
## https://seqqc.wordpress.com/2020/02/17/removing-low-count-genes-for-rna-seq-downstream-analysis/
## I am keeping this script simple as much as possible. But you can check the relevant function at the end of this script.

###################### Data annotation #################################

anno <-read.table ("Your_Annotation_of_samples.csv",header=TRUE, sep=",") ##In this case Two coulmns (a) sample (b) Condition
rownames(anno) <- anno$sample

# Define conditions (for contrast) that you want to compare if you have more than one #control #case
# This is pair-wise comparison, so only consider one pair at one time

firstC<-"case1" #case1 #case2 #case3 etc
SecondC <-"Control"
p.threshold <- 0.05 ##define threshold for filtering


### subset raw and conditional data for defined pairs

anno <- anno[(anno$Condition ==firstC |anno$Condition ==SecondC),]
rawcount <- rawcount[,names(rawcount) %in% anno$sample]

############################### Create DESeq2 datasets #############################

dds <- DESeqDataSetFromMatrix(countData = rawcount, colData = anno, design = ~Condition )

## Run DESEQ2
dds <- DESeq(dds)

################# contrast based comparison ##########################

#In case of multiple comparisons ## we need to change the contrast for every comparision
contrast<- c("Condition",firstC,SecondC)

res <- results(dds, contrast=contrast)
res$threshold <- as.logical(res$padj < p.threshold) #Threshold defined earlier

nam <- paste('down_in',firstC, sep = '_')
#res$nam <- as.logical(res$log2FoldChange < 0)
res[, nam] <- as.logical(res$log2FoldChange < 0)

genes.deseq <- row.names(res)[which(res$threshold)]

genes_deseq2_sig <- res[which(res$threshold),]

file <- paste('Deseq2_',firstC,'_v_',SecondC,'_results_significant_padj0.05.csv',sep = '')
all_results <- paste('Deseq2_',firstC,'_v_',SecondC,'_all_results.csv',sep = '')

write.table(genes_deseq2_sig,file,sep = ",")
write.table(res,all_results,sep = ",")

################### PCA and Heat-MAp Plots ############################

## Varinace transformation vst or rlog
vsd <- vst(dds, blind=FALSE) #Variance type (a) Vst or (b) rlog
#rld <- rlog(dds, blind=FALSE)

###### PCA with design consideration ###
pcaData <- plotPCA(vsd, intgroup=c("Condition", "sample"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=sample, shape=Condition)) +
geom_point(size=3) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()

##heatmap
sampleDists <- dist(t(assay(vsd)))
library("RColorBrewer")
library('pheatmap')
sampleDistMatrix <- as.matrix(sampleDists)

rownames(sampleDistMatrix) <- paste(vsd$Condition, vsd$sample, sep="-")

colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors)


############ Quick enrichment analysis ##################
BiocManager::install("ReactomePA")
library(ReactomePA)

all <- genes.deseq ## retreive EntrezGene id's

genes=getBM(attributes = c("hgnc_symbol", "entrezgene_id"), filters = "hgnc_symbol", values = all, bmHeader = T, mart = mart)

genes1 <- genes$`NCBI gene (formerly Entrezgene) ID`



#?enrichPathway #pvalueCutoff=0.02, #pAdjustMethod = "BH", qvalueCutoff = 0.01,
x <- enrichPathway(gene=genes1, pvalueCutoff=0.05,readable=T)

#head(as.data.frame(x))
barplot(x, showCategory=10)
dotplot(x, showCategory=10)
emapplot(x)
cnetplot(x, categorySize="pvalue", foldChange=genes1)
emapplot(x, color="pvalue")
viewPathway("Extracellular matrix organization", readable=TRUE, foldChange=genes1) ## it's an example


############## edgeR ##########################


dge <- DGEList(counts=rawcount, group=anno$Condition)

# Normalize by total count
dge <- calcNormFactors(dge, method = "TMM")

## PCA ## for more details, please visit following link
##https://bioconductor.org/packages/release/bioc/vignettes/PCAtools/inst/doc/PCAtools.html
library(PCAtools)

cpmlog <- cpm(dge, log = TRUE, prior.count = 1)

p <-pca(cpmlog, metadata = anno, removeVar = 0.2) ## -- removing the lower 20% of variables based on variance
#biplot(p)
plotloadings(p)

biplot(p,
lab = paste0(p$metadata$sample),
colby = 'Condition',
hline = 0, vline = 0,
legendPosition = 'right')




# Create the contrast matrix
design.mat <- model.matrix(~ 0 + dge$samples$group)
colnames(design.mat) <- levels(dge$samples$group)
design.mat

# Estimate dispersion parameter for GLM
dge <- estimateGLMCommonDisp(dge, design.mat)
dge <- estimateGLMTrendedDisp(dge, design.mat)
dge<- estimateGLMTagwiseDisp(dge,design.mat)
# Plot mean-variance
#plotBCV(dge)
# Model fitting

fit.edgeR <- glmFit(dge, design.mat)

# Differential expression

contrasts.edgeR <- makeContrasts(case1 - Control, levels=design.mat) ##FirstC-SecondC ##Define


lrt.edgeR <- glmLRT(fit.edgeR, contrast=contrasts.edgeR)

##### DGE at padjust 0.05

# Access results tables
edgeR_results <- lrt.edgeR$table
sig.edgeR <- decideTestsDGE(lrt.edgeR, adjust.method="BH", p.value = p.threshold)
#View(sig.edgeR)
significant_table <- edgeR_results[which(sig.edgeR != 0),]
significant_table$gene <- row.names(significant_table)
genes.edgeR <- row.names(edgeR_results)[which(sig.edgeR != 0)]

edgeR_results$genes <- row.names(edgeR_results)


file_sigTab <- paste('edgeR_',firstC,'_v_',SecondC,'_results_significant_padj0.05.csv',sep = '')
file_allRes <- paste('edgeR_',firstC,'_v_',SecondC,'_all_results.csv',sep = '')

write.table(significant_table,file_sigTab,sep = ",")
write.table(edgeR_results,file_allRes,sep = ",")


################# Overlapped genes between deseq2 and edgeR ##########

library(gplots)

venn(list(edgeR = genes.edgeR, DESeq2 = genes.deseq))
overlapped_genes <- intersect(genes.deseq,genes.edgeR)


file_common <- paste('Common_DEG_deseq2_edgeR_',firstC,'_v_',SecondC,'.csv',sep = '')
write.table(overlapped_genes,file_common,sep = ",", row.names = F)

## Save session info
sessionInfo()
writeLines(capture.output(sessionInfo()), "sessionInfo.txt")


########## Automate Function for filtering the low count read ###########
## adopted from ## https://seqqc.wordpress.com/2020/02/17/removing-low-count-genes-for-rna-seq-downstream-analysis/

selectGenes <- function(counts, min.count=10, N=0.90){

lib.size <- colSums(counts)
MedianLibSize <- median(lib.size)
CPM.Cutoff <- min.count / MedianLibSize*1e6
CPM <- edgeR::cpm(counts,lib.size=lib.size)

min.samples <- round(N * ncol(counts))

f1 <- genefilter::kOverA(min.samples, CPM.Cutoff)
flist <- genefilter::filterfun(f1)
keep <- genefilter::genefilter(CPM, flist)

## the same as:
#keep <- apply(CPM, 1, function(x, n = min.samples){
# t = sum(x >= CPM.Cutoff) >= n
# t
#})

return(keep)
}

keep.exprs <- selectGenes(assay(x), min.count=10, N=0.90)
myFilt <- x[keep.exprs,]
dim(myFilt)
###########################################################################
Binary file added experiment/simulation/images/Amritalogo.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added experiment/simulation/images/vlablogo2.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
125 changes: 114 additions & 11 deletions experiment/simulation/index.html
Original file line number Diff line number Diff line change
@@ -1,13 +1,116 @@
<!DOCTYPE html>
<html>
<head>
<!-- Add CSS at the head of HTML file -->
<link rel="stylesheet" href="./css/main.css">
</head>
<body>
<!-- Your code goes here-->

<!-- Add JS at the bottom of HTML file -->
<script src="./js/main.js"></script>
</body>
<head>
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.4.1/css/bootstrap.min.css" integrity="sha384-Vkoo8x4CGsO3+Hhxv8T/Q5PaXtkKtu6ug5TOeNV6gBiFeWPGFN9MuhOf23Q9Ifjh" crossorigin="anonymous">

</head>
<title>
Vlab experiment GC count
</title>
<body>

<div class="card-header" style="max-width: 70rem; background: rgb(241, 239, 239);">
<div class="row">

<div class="col-1" style="left: 10px; text-align: center;">
<a href="https://www.amrita.edu/center/mind-brain" target="_blank">
<img alt="amritalogo" src="./images/Amritalogo.png" style="width:300px;height:90px;">
</a>
</div>

<div class="col-2" style="border-right: thick solid #d8d8d8ee; margin-left: 80px;"></div>

<div class="col-6" style="margin-left: 40px;">
<!--h2><b>Simulator</b></h1-->
</br>
<h5 class="card-subtitle mb-2 text-muted"><b><i>Differential expression analyses of RNA-seq</i></b></h2>
</div>

</div>

</div>

<div class="card-header" style="max-width: 70rem; background: rgb(253, 253, 253);">
<form style="padding: 150px;">

<div class="form-group">

<p>Download the sample R code and run in your local system</p>
<br>
<p>Need to install the given below <span><b>Bioconductor</b></span> R packages in your R </p>

<br>

<code>
if (!requireNamespace("BiocManager", quietly = TRUE))<br>
install.packages("BiocManager")<br><br>

BiocManager::install("DESeq2")<br>
BiocManager::install("edgeR")<br>
BiocManager::install("biomaRt")<br>
BiocManager::install('PCAtools')<br>

</code>

<br>

<a href="./RNA_Seq_sample_code.R" download="RNA_Seq_sample_code.R" > <button class="btn btn-success" type="button"> Sample Code </button> </a>

<br>
<br>

<output id="GCValueout">

</output>

</div>

</form>
</div>

<script>

//finding the GC and AT Content of a DNA

function FindGC() {

let dna_seq = document.getElementById('dnainput').value.toUpperCase();

if(dna_seq.match(/^\s+$/) || dna_seq == ""){

alert("Please enter a DNA Sequence");
return
}

let a = count(dna_seq, "A")
let c = count(dna_seq, "C")
let g = count(dna_seq, "G")
let t = count(dna_seq, "T")

let GC_content = ((g + c) / (g + c + a + t) * 100).toFixed(2) + "%";

if (dna_seq == "") {

document.getElementById('dnainput').style.border = "1px solid red";
document.getElementById('GCValueout').innerHTML = "";

} else {
document.getElementById('GCValueout').innerHTML = "GC Content: " + GC_content;
}
}

function count(str, letter) {
var letter_Count = 0;
for (var position = 0; position < str.length; position++) {

if (str.charAt(position) == letter) {
letter_Count++;
}

}
return letter_Count;
}

</script>

</body>
</html>

0 comments on commit d1d06f5

Please sign in to comment.