In [2]:
#This block to load all libraries needed
library("DEXSeq")
library("tidyr")
library("qvalue")
library("edgeR")
library("dplyr")
library("reshape2")
library("EnhancedVolcano")
library("sva")
library("Rtsne")
library("ggplot2")
library("pheatmap")
library("org.Hs.eg.db")
library("readr")


Loading required package: BiocParallel

Loading required package: Biobase

Loading required package: BiocGenerics


Attaching package: 'BiocGenerics'


The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs


The following objects are masked from 'package:base':

    Filter, Find, Map, Position, Reduce, anyDuplicated, append,
    as.data.frame, basename, cbind, colnames, dirname, do.call,
    duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
    lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
    pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
    tapply, union, unique, unsplit, which.max, which.min


Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.


Loading required package: SummarizedExperiment

Loading required package: MatrixGenerics

Loading required packa

In [3]:
#GFF
flattenedFile = list.files('.', pattern="gff$", full.names=TRUE)

#METADATA
metadata = read.csv('ExCy_metadata_dexseq.csv')
rownames(metadata) <- metadata$SampleID
metadata <- metadata[,-1]

#Entire dataset generation here to do downstream analyses
All.countfiles=list.files('./Tumors/', pattern="txt$", full.names=TRUE)

#Generate count file
All.dxd = DEXSeqDataSetFromHTSeq(countfiles = All.countfiles, sampleData = metadata[c(1:4,13:20,25:28),], design = ~ sample + exon, flattenedfile = flattenedFile )
All.count <- as.data.frame(featureCounts(All.dxd))

#Do the light filtering and aggregation
All.count$ID <- rownames(All.count)
All.count <- All.count %>% mutate(Gene = substr(ID, 1,15))
All.cf <- All.count %>% mutate(row_sum = rowSums(.[,-17:-18])) %>% dplyr::filter(row_sum > 10) %>% select(-ID)
All.cfs <-  melt(All.cf, id.vars = "Gene") %>% group_by(Gene, variable) %>% summarise(sum_value=sum(value)) %>% dcast(Gene ~ variable, value.var="sum_value")
rownames(All.cfs) <- All.cfs$Gene

#prepare for annotation
exomega <- as.data.frame(read.csv('ExoMEGA.txt'))
mRNA <- read_tsv('VESICLEPEDIA_PROTEIN_MRNAS_5.1.txt')

##map gene names to HUGO
annot.df <- mapIds(org.Hs.eg.db, keys = row.names(All.cfs), column = "SYMBOL", keytype = "ENSEMBL") %>% as.data.frame() %>% rename(SYMBOL=1)
annot.df$SYMBOL <- toupper(annot.df$SYMBOL)
All.cfs$gene_id <- annot.df$SYMBOL 

#filter out non-mapped genes and combine duplicate
All.gcfs <- All.cfs %>% dplyr::filter(!grepl("NA", gene_id, fixed= TRUE)) %>% select(-Gene)
All.gpcfs <-  melt(All.gcfs, id.vars = "gene_id") %>% group_by(gene_id, variable) %>% summarise(sum_value=sum(value)) %>% dcast(gene_id ~ variable, value.var="sum_value")
All.gpcfs <- All.gpcfs[complete.cases(All.gpcfs), ]


All.exomega <- as.data.frame(All.gpcfs[All.gpcfs$gene_id %in% exomega[,1] ,])
rownames(All.exomega) <- All.exomega$gene_id
mRNA_All <- All.exomega[rownames(All.exomega) %in% mRNA$mRNA ,]
All.exomega <- mRNA_All[,2:17]

#write.table(All.exomega, "raw_counts_tumors.txt", sep="\t")


######################################

#edgeR
vol.metadata <- metadata[c(1:4,13:20,25:28),]
method <- relevel(factor(vol.metadata$Method), ref="UC")
#condition <- factor(vol.metadata$Condition)
#histology <- factor(vol.metadata$Histology)
#sample <- factor(vol.metadata$ID)
#pathology <- factor(vol.metadata$Pathology)
design <- model.matrix(~method)
d <- DGEList(counts = All.exomega)
d <- calcNormFactors(d)
d <- estimateDisp(d, design, robust=TRUE)
d_fit <- glmFit(d, design)
qlf <- glmLRT(d_fit, coef=2:4)
qlf_table <- topTags(qlf, n=Inf)$table

#write.csv(qlf_table, 'altogether_tumor_deg_mrna.csv')





####################################


#prepare for heatmap
All.heatmap <- All.exomega
Gene_List = rownames(All.exomega)
All.heatmap <- cbind(All.heatmap, Gene_List)
rownames(All.heatmap) <- rownames(All.exomega)
DEG <- topTags(qlf, n=Inf)$table %>% slice(1:14)

All_prep_heatmap <- as.data.frame(All.heatmap[All.heatmap$Gene_List %in% rownames(DEG), ])
All_prep_heatmap <- All_prep_heatmap[,-17] 
colnames(All_prep_heatmap) <- vol.metadata$Method
labels <- colnames(All_prep_heatmap)

colnames(All_prep_heatmap) <- c(1:16)
meta <- data.frame(Patient=list(vol.metadata$ID))
colnames(meta) <- c("PC")

ann_colors=list(PC=c(PC1="#edf8fb", PC2="#b2e2e2", PC3="#66c2a4", PC4="#2ca25f"))

#Graphing

pheatmap(
    log2(All_prep_heatmap+1),
    scale='row',
    border_color="white",
    clustering_method="ward.D2",
    fontsize=16,
    number_color="black",
    display_numbers=F,
    show_rownames=T,
    cutree_col=4,
    fontface="bold",
    angle_col=90,
    annotation_col=meta,
    annotation_colors=ann_colors,
    labels_col=labels,
    width=12,
    height=6,
    filename="All_tumor_heatmap_update.png",
    breaks=seq(-2, 2, length.out=101))

#write.csv(log2(All_prep_heatmap+1), "tumor_deg_mrna.csv")



####################################################

#Ranking analysis

#prep
colnames(All_prep_heatmap) <- c(1:16)

#Ranked genes for HumanBase
Ranked <- log2(All_prep_heatmap+1) %>% rowwise() %>% dplyr::mutate(Ex_avg=mean(c_across(c(1,5,9,13)))) %>% 
          dplyr::mutate(EE_avg=mean(c_across(c(2,6,10,14)))) %>%
          dplyr::mutate(FF_avg=mean(c_across(c(3,7,11,15)))) %>%
          dplyr::mutate(UC_avg=mean(c_across(c(4,8,12,16)))) %>%
          dplyr::mutate(Ex_rank=case_when(Ex_avg==max(c_across(c('Ex_avg','EE_avg','FF_avg','UC_avg')))~1)) %>%
          dplyr::mutate(EE_rank=case_when(EE_avg==max(c_across(c('Ex_avg','EE_avg','FF_avg','UC_avg')))~1)) %>%
          dplyr::mutate(FF_rank=case_when(FF_avg==max(c_across(c('Ex_avg','EE_avg','FF_avg','UC_avg')))~1)) %>%
          dplyr::mutate(UC_rank=case_when(UC_avg==max(c_across(c('Ex_avg','EE_avg','FF_avg','UC_avg')))~1)) 

Ranked <- cbind(Ranked, Genes=rownames(All_prep_heatmap))

arrange(Ranked, desc(Ex_avg)) %>% arrange(., desc(Ex_rank))


####################################################################
#Supplementary GMNN figure
options(repr.plot.width = 12, repr.plot.height =10)

GMNN <- All.heatmap['GMNN',]
GMNN <- GMNN[,-17] 
colnames(GMNN) <- vol.metadata$Method
labels <- colnames(GMNN)
prep <- t(log2(GMNN+1)) %>% as.data.frame() %>% mutate(ID=rownames(.))
prep['ID'] <- labels %>% as.factor()
colours <- c("ExCy" ="red3", "ExoEasy"="orange", "Fujifilm"="grey", "UC"="blue2")
#
ggplot(prep, aes(x=GMNN, y=ID, fill=ID)) + 
    geom_boxplot(show.legend=F, alpha=0.5) +
    geom_point(aes(color=ID, size=50), show.legend=F)+
    scale_y_discrete(limits=rev) +
    scale_fill_manual(values=colours) + 
    scale_color_manual(values=colours)+
    theme_bw(base_size=25) +
    xlab(expression(bold(GMNN ~ Log[2] ~ Transcript))) + 
    theme(
        panel.grid.major = element_blank(),
        panel.grid.minor=element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_text(face="bold", size=35),
        axis.title.x=element_text(face="bold", size=35)
        ) + 
    ylab(element_blank()) 
    

converting counts to integer mode

"some variables in design formula are characters, converting to factors"
  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters

  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters

  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters

  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as the

Unnamed: 0_level_0,logFC,logCPM,LR,PValue,FDR
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
GAPDHS,5.910818,4.135902,22.58558,2.009874e-06,0.01592021
RPS12,7.528071,3.919328,15.59221,7.857784e-05,0.13846198
TFF1,7.653591,3.662601,15.36555,8.858898e-05,0.13846198
CD247,1.495815,7.634337,15.31186,9.114229e-05,0.13846198
TMEM69,7.229293,3.142136,15.247,9.432628e-05,0.13846198
CDC25C,2.190536,6.332131,15.04672,0.0001048822,0.13846198
