In [3]:
#This block to load all libraries needed
library("DEXSeq")
library("tidyr")
library("qvalue")
library("edgeR")
library("dplyr")
library("reshape2")
library("EnhancedVolcano")
library("sva")
library("Rtsne")
library("ggplot2")
library("pheatmap")
library("org.Hs.eg.db")
library("readr")
library('scales')

In [2]:
##GFF
flattenedFile = list.files('.', pattern="gff$", full.names=TRUE)

#METADATA
metadata = read.csv('ExCy_metadata_dexseq.csv')
rownames(metadata) <- metadata$SampleID
metadata <- metadata[,-1]

#Entire dataset generation here to do downstream analyses
All.countfiles=list.files('./ALL/', pattern="txt$", full.names=TRUE)

##Generate count file
All.dxd = DEXSeqDataSetFromHTSeq(countfiles = All.countfiles, sampleData = metadata[c(1:28),], design = ~ sample + exon, flattenedfile = flattenedFile )
All.count <- as.data.frame(featureCounts(All.dxd))

#Do the light filtering and aggregation
All.count$ID <- rownames(All.count)
All.count <- All.count %>% mutate(Gene = substr(ID, 1,15))
All.cf <- All.count %>% mutate(row_sum = rowSums(.[,-29:-30])) %>% dplyr::filter(row_sum > 10) %>% select(-ID)
All.cfs <-  melt(All.cf, id.vars = "Gene") %>% group_by(Gene, variable) %>% summarise(sum_value=sum(value)) %>% dcast(Gene ~ variable, value.var="sum_value")
rownames(All.cfs) <- All.cfs$Gene


#prepare for annotation
exomega <- as.data.frame(read.csv('ExoMEGA.txt'))
mRNA <- read_tsv('VESICLEPEDIA_PROTEIN_MRNAS_5.1.txt')

##map gene names to HUGO
annot.df <- mapIds(org.Hs.eg.db, keys = row.names(All.cfs), column = "SYMBOL", keytype = "ENSEMBL") %>% as.data.frame() %>% rename(SYMBOL=1)
annot.df$SYMBOL <- toupper(annot.df$SYMBOL)
All.cfs$gene_id <- annot.df$SYMBOL 

#filter out non-mapped genes and combine duplicate
All.gcfs <- All.cfs %>% dplyr::filter(!grepl("NA", gene_id, fixed= TRUE)) %>% select(-Gene)
All.gpcfs <-  melt(All.gcfs, id.vars = "gene_id") %>% group_by(gene_id, variable) %>% summarise(sum_value=sum(value)) %>% dcast(gene_id ~ variable, value.var="sum_value")
All.gpcfs <- All.gpcfs[complete.cases(All.gpcfs), ]


All.exomega <- as.data.frame(All.gpcfs[All.gpcfs$gene_id %in% exomega[,1] ,])
rownames(All.exomega) <- All.exomega$gene_id
mRNA_All <- All.exomega[rownames(All.exomega) %in% mRNA$mRNA ,]
All.exomega <- mRNA_All[,2:29]

#write.csv(All.gpcfs, 'TOTAL_RNA.csv')

####################################################

#ALL

#edgeR
vol.metadata <- metadata
status <- relevel(factor(vol.metadata$Status), ref="Healthy")
design <- model.matrix(~status)
d <- DGEList(counts = All.exomega)
d <- calcNormFactors(d)
d <- estimateDisp(d, design, robust=TRUE)
d_fit <- glmFit(d, design)
qlf <- glmLRT(d_fit)
qlf_table <- topTags(qlf, n=Inf)$table
write.csv(qlf_table, 'All_Tumor_vs_healthy.csv')
#head(qlf_table)


####################################################

#ExCy
vol.metadata <- metadata[c(1,5,9,13,17,21,25),]
status <- relevel(factor(vol.metadata$Status), ref="Healthy")
design <- model.matrix(~status)
d <- DGEList(counts = All.exomega[,c(1,5,9,13,17,21,25)])
d <- calcNormFactors(d)
d <- estimateDisp(d, design, robust=TRUE)
d_fit <- glmFit(d, design)
qlf <- glmLRT(d_fit)
qlf_table <- topTags(qlf, n=Inf)$table
write.csv(qlf_table, 'ExCy_Tumor_vs_healthy.csv')



####################################################

#ExoEasy
vol.metadata <- metadata[c(2,6,10,14,18,22,26),]
status <- relevel(factor(vol.metadata$Status), ref="Healthy")
design <- model.matrix(~status)
d <- DGEList(counts = All.exomega[,c(2,6,10,14,18,22,26)])
d <- calcNormFactors(d)
d <- estimateDisp(d, design, robust=TRUE)
d_fit <- glmFit(d, design)
qlf <- glmLRT(d_fit)
qlf_table <- topTags(qlf, n=Inf)$table
write.csv(qlf_table, 'EE_Tumor_vs_healthy.csv')

####################################################

#FF
vol.metadata <- metadata[c(3,7,11,15,19,23,27),]
status <- relevel(factor(vol.metadata$Status), ref="Healthy")
design <- model.matrix(~status)
d <- DGEList(counts = All.exomega[,c(3,7,11,15,19,23,27)])
d <- calcNormFactors(d)
d <- estimateDisp(d, design, robust=TRUE)
d_fit <- glmFit(d, design)
qlf <- glmLRT(d_fit)
qlf_table <- topTags(qlf, n=Inf)$table
write.csv(qlf_table, 'FF_Tumor_vs_healthy.csv')

####################################################

#UC
vol.metadata <- metadata[c(4,8,12,16,20,24,28),]
status <- relevel(factor(vol.metadata$Status), ref="Healthy")
design <- model.matrix(~status)
d <- DGEList(counts = All.exomega[,c(4,8,12,16,20,24,28)])
d <- calcNormFactors(d)
d <- estimateDisp(d, design, robust=TRUE)
d_fit <- glmFit(d, design)
qlf <- glmLRT(d_fit)
qlf_table <- topTags(qlf, n=Inf)$table
write.csv(qlf_table, 'UC_Tumor_vs_healthy.csv')

####################################################

#Summary of results down below. We expand the fdr to 0.10 since small dataset. 

#ALL - MOG, FBL

#ExoEasy - ZFPL1, NDUFB11, 

#FF -  NT5C, RASF7, BAG6, COX6A2, ACR, IGKV1D-33

#ExCy - ATP6V0B

#UC - PRPS2, ARHGDIA, ERAS, LILRB3, S100A8, TMC6, NIT1

converting counts to integer mode

“some variables in design formula are characters, converting to factors”
  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters

  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters

  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters

  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as the

In [11]:
##GFF
flattenedFile = list.files('.', pattern="gff$", full.names=TRUE)

#METADATA
metadata = read.csv('ExCy_metadata_dexseq.csv')
rownames(metadata) <- metadata$SampleID
metadata <- metadata[,-1]

#Entire dataset generation here to do downstream analyses
All.countfiles=list.files('./ALL/', pattern="txt$", full.names=TRUE)

##Generate count file
All.dxd = DEXSeqDataSetFromHTSeq(countfiles = All.countfiles, sampleData = metadata[c(1:28),], design = ~ sample + exon, flattenedfile = flattenedFile )
All.count <- as.data.frame(featureCounts(All.dxd))

#Do the light filtering and aggregation
All.count$ID <- rownames(All.count)
All.count <- All.count %>% mutate(Gene = substr(ID, 1,15))
All.cf <- All.count %>% mutate(row_sum = rowSums(.[,-29:-30])) %>% dplyr::filter(row_sum > 10) %>% select(-ID)
All.cfs <-  melt(All.cf, id.vars = "Gene") %>% group_by(Gene, variable) %>% summarise(sum_value=sum(value)) %>% dcast(Gene ~ variable, value.var="sum_value")
rownames(All.cfs) <- All.cfs$Gene


#prepare for annotation
exomega <- as.data.frame(read.csv('ExoMEGA.txt'))
mRNA <- read_tsv('VESICLEPEDIA_PROTEIN_MRNAS_5.1.txt')

##map gene names to HUGO
annot.df <- mapIds(org.Hs.eg.db, keys = row.names(All.cfs), column = "SYMBOL", keytype = "ENSEMBL") %>% as.data.frame() %>% rename(SYMBOL=1)
annot.df$SYMBOL <- toupper(annot.df$SYMBOL)
All.cfs$gene_id <- annot.df$SYMBOL 

#filter out non-mapped genes and combine duplicate
All.gcfs <- All.cfs %>% dplyr::filter(!grepl("NA", gene_id, fixed= TRUE)) %>% select(-Gene)
All.gpcfs <-  melt(All.gcfs, id.vars = "gene_id") %>% group_by(gene_id, variable) %>% summarise(sum_value=sum(value)) %>% dcast(gene_id ~ variable, value.var="sum_value")
All.gpcfs <- All.gpcfs[complete.cases(All.gpcfs), ]


All.exomega <- as.data.frame(All.gpcfs[All.gpcfs$gene_id %in% exomega[,1] ,])
rownames(All.exomega) <- All.exomega$gene_id
mRNA_All <- All.exomega[rownames(All.exomega) %in% mRNA$mRNA ,]
All.exomega <- mRNA_All[,2:29]
#ExCy
vol.metadata <- metadata[c(1,5,9,13,17,21,25),]
status <- relevel(factor(vol.metadata$Status), ref="Healthy")
design <- model.matrix(~status)
d <- DGEList(counts = All.exomega[,c(1,5,9,13,17,21,25)])
d <- calcNormFactors(d)
d <- estimateDisp(d, design, robust=TRUE)
d_fit <- glmFit(d, design)
qlf <- glmLRT(d_fit)
qlf_table <- topTags(qlf, n=Inf)$table
head(qlf_table)


converting counts to integer mode

“some variables in design formula are characters, converting to factors”
  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters

  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters

  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters

  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as the

Unnamed: 0_level_0,logFC,logCPM,LR,PValue,FDR
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ATP6V0B,2.024615,6.569898,19.15903,1.202674e-05,0.09608159
ICOSLG,4.480779,3.101811,16.6981,4.382494e-05,0.17505872
IRF7,8.026398,2.894858,14.90488,0.00011307,0.22986096
HRK,1.600193,6.807256,14.24983,0.0001600749,0.22986096
PSMB4,-1.947523,6.047434,13.87947,0.0001949164,0.22986096
MPHOSPH10,-1.427288,6.65063,13.62371,0.0002233463,0.22986096
