# In [None]:  (Jupyter cell marker, kept as a comment so the file parses as R)
#load libraries

library(GenomicRanges, verbose=FALSE) 
library(pheatmap) 
library(dplyr)
library(tidyr)
library(ggplot2)
library(DESeq2)
library(colorRamp2)
library(ComplexHeatmap)
library(tximport)

## Creating Graphs
### Import raw (gene) count data ###

# Directory holding the per-sample HTSeq count files.
directory <- "/Users/tisesuzuki/RNAseq/htseq"

# Every file whose name contains "PDX4" is treated as one sample.
# list.files() returns names in alphabetical order; the hard-coded labels
# below are assumed to follow that same order (CR files first, then SE)
# -- TODO confirm against the actual file names.
sampleFiles <- grep("PDX4", list.files(directory), value = TRUE)
sampleCondition <- c("PDX4_CR", "PDX4_CR", "PDX4_CR",
                     "PDX4_SE", "PDX4_SE", "PDX4_SE")

# Fail early instead of letting data.frame() recycle or reject the labels
# when the number of files found differs from the number of labels.
stopifnot(length(sampleFiles) == length(sampleCondition))

# The file name doubles as the sample name. The condition column must come
# from sampleCondition (there is no object called `condition` at this point).
sampleTable <- data.frame(sampleName = sampleFiles,
                          fileName = sampleFiles,
                          condition = sampleCondition)
sampleTable$condition <- factor(sampleTable$condition)

# Build the DESeq2 data set with a single-factor design on `condition`.
ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
                                       directory = directory,
                                       design = ~ condition)
ddsHTSeq

### Keep genes whose total count summed over all samples is at least 10 ###

keepGenes <- rowSums(counts(ddsHTSeq)) >= 10
dds_1 <- ddsHTSeq[keepGenes, ]
summary(dds_1)
dim(dds_1)

# Use PDX4_SE as the reference level of the design factor, then fit the model.
dds_1$condition <- relevel(dds_1$condition, ref = "PDX4_SE")
dds_1 <- DESeq(dds_1)

# Diagnostic plots: PCA on variance-stabilised data and dispersion estimates.
vsdata <- vst(dds_1, blind = FALSE)
plotPCA(vsdata, intgroup = "condition")
plotDispEsts(dds_1)

# NOTE(review): the contrast lists PDX4_SE first and PDX4_CR second, so the
# reported log2 fold change is SE relative to CR -- confirm this is the
# intended direction given that SE is also set as the reference level above.
res <- results(dds_1, contrast = c("condition", "PDX4_SE", "PDX4_CR"))
res

# The full, unfiltered result table is what gets written to disk.
write.csv(res, file = "deseq_results.csv")

# Significant genes: rows without any NA and with adjusted p-value below 0.1.
sigs <- local({
  complete_res <- na.omit(res)
  complete_res[complete_res$padj < 0.1, ]
})
sigs

df <- as.data.frame(sigs)
df

# Fetch the transcript ID / gene ID / gene symbol table from Ensembl (biomaRt)
mart <- biomaRt::useMart("ensembl", dataset = "hsapiens_gene_ensembl")
t2g <- biomaRt::getBM(attributes = c("ensembl_transcript_id", "ensembl_gene_id",
                                     "external_gene_name"), mart = mart)
t2g <- dplyr::rename(t2g, target_id = ensembl_transcript_id,
                     ens_gene = ensembl_gene_id, ext_gene = external_gene_name)

# One row per distinct (gene ID, gene symbol) pair
genes <- unique(t2g[, c("ens_gene", "ext_gene")])

# Vectorised lookup of the symbol for every row of df (row names are matched
# against ens_gene). If a gene ID were listed more than once, the last entry
# is used, which is what a sequential overwrite of a keyed list would give.
symbol_lookup <- genes[!duplicated(genes$ens_gene, fromLast = TRUE), ]
symbol_idx <- match(rownames(df), symbol_lookup$ens_gene)
df$symbol <- symbol_lookup$ext_gene[symbol_idx]

# Row IDs that are absent from the table get the literal label "NA".
# Doing this by mask is also safe when every ID is mapped or df is empty.
df$symbol[is.na(symbol_idx)] <- "NA"

# Genes passing both cut-offs (adjusted p <= 0.1 and |log2FC| > 0.58, i.e.
# roughly a 1.5-fold change), sorted from most positive to most negative
# log2 fold change.
df_top <- df[df$padj <= 0.1 & (abs(df$log2FoldChange) > 0.58), ]
df_top <- df_top[order(df_top$log2FoldChange, decreasing = TRUE), ]
# Everything below needs at least one gene; stop with a clear message otherwise.
stopifnot(nrow(df_top) > 0)

rlog_out <- rlog(dds_1, blind = FALSE)

# Sample annotation keyed by sample name: drop the sampleName column, use it
# as row names, then drop fileName so only `condition` is left.
sampleTable_2 <- sampleTable[, -1]
rownames(sampleTable_2) <- sampleTable[, 1]
coldata <- sampleTable_2[-c(1)]

# rlog expression of the selected genes; drop = FALSE keeps a matrix even
# when only a single gene passed the filter.
mat <- assay(rlog_out)[rownames(df_top), rownames(coldata), drop = FALSE]
colnames(mat) <- rownames(coldata)
base_mean <- rowMeans(mat)
# Row-wise z-scores (scale() is applied to each gene separately).
mat.scaled <- t(apply(mat, 1, scale))
colnames(mat.scaled) <- colnames(mat)

# Rows shown in the heatmap: the num_keep genes at the top of the sorted
# table plus the num_keep genes at the bottom. When the table has no more
# than 2 * num_keep rows, every row is shown exactly once.
num_keep <- 25
n_top <- nrow(mat.scaled)
rows_keep <- if (n_top <= 2 * num_keep) {
  seq_len(n_top)
} else {
  c(seq_len(num_keep), seq(n_top - num_keep + 1, n_top))
}

# One-column matrices used as side heatmaps.
l2_val <- as.matrix(df_top[rows_keep, ]$log2FoldChange)
colnames(l2_val) <- "logFC"

# NOTE: this object is called `mean` because the heatmap code that follows
# refers to it by that name. It shadows base::mean as a variable only; calls
# such as mean(x) still resolve to the function.
mean <- as.matrix(df_top[rows_keep, ]$baseMean)
colnames(mean) <- "AvgExp"

# Colour scales: logFC is anchored at 0; AvgExp runs from the minimum to the
# 75th percentile (elements 1 and 4 of quantile()'s default output).
col_logFC <- colorRamp2(c(min(l2_val), 0, max(l2_val)), c("gray50", "white", "#BE575F"))
col_AvgExp <- colorRamp2(c(quantile(mean)[1], quantile(mean)[4]), c("white", "#941A2B"))

# Summary annotation drawn above the logFC column.
ha <- HeatmapAnnotation(summary = anno_summary(gp = gpar(fill = 2),
                                               height = unit(2, "cm")))

# Z-score heatmap of the selected genes. Rows keep the logFC ordering
# (no row clustering); columns (samples) are clustered.
h1 <- Heatmap(mat.scaled[rows_keep, , drop = FALSE],
              col = colorRampPalette(c("black", "white", "#941A2B"))(50),
              cluster_rows = FALSE, cluster_columns = TRUE,
              column_labels = colnames(mat.scaled), name = "Z-score")

# log2 fold change column, with the rounded value printed in every cell.
h2 <- Heatmap(l2_val, row_labels = df_top$symbol[rows_keep],
              cluster_rows = FALSE, name = "logFC", top_annotation = ha,
              col = col_logFC,
              cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
                grid.text(round(l2_val[i, j], 2), x, y)
              })

# Average expression column (the `mean` matrix), also with values printed.
h3 <- Heatmap(mean, row_labels = df_top$symbol[rows_keep],
              cluster_rows = FALSE, name = "AvgExp", col = col_AvgExp,
              cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
                grid.text(round(mean[i, j], 2), x, y)
              })

# Combine the three heatmaps side by side and show them on the active device.
h <- h1 + h2 + h3
h

# Write the same figure to a PNG. The device has to be closed with dev.off(),
# otherwise the file is never finalised and the device stays open.
png("~/RNAseq/results/20240108_heatmap.png", res = 300, width = 2000, height = 5500)
draw(h)
dev.off()


### PROGENy ###
library(progeny)
# annotate matrix with HGNC symbols
matched <- match(rownames(mat), genes$ens_gene)
rownames(mat) <- genes$ext_gene[matched]

# Pathway activity scores; the result has one row per sample (its row names
# are used as sample names below) and one column per pathway.
pathways <- progeny(mat,
                    scale = TRUE,
                    organism = "Human")
pathways_tidy <- as.data.frame(pathways)
pathways_tidy$sample <- rownames(pathways)

# Derive the treatment label from each sample's recorded condition rather
# than from a hard-coded vector: the samples are ordered CR first, then SE,
# so a fixed "Sensitive" x3, "Resistant" x3 vector labels them the wrong way
# round. PDX4_SE maps to "Sensitive", everything else to "Resistant" (the
# same mapping the box-plot section of this script uses).
sample_condition <- as.character(dds_1$condition)[
  match(rownames(pathways), colnames(dds_1))
]
stopifnot(!anyNA(sample_condition))
pathways_tidy$Treatment <- ifelse(sample_condition == "PDX4_SE",
                                  "Sensitive", "Resistant")

# Long format: one row per (sample, pathway) with the activity score.
pathways_tidy <- tidyr::pivot_longer(pathways_tidy,
                                     cols = -c(sample, Treatment),
                                     names_to = "Pathway",
                                     values_to = "Activity")

# Fit a linear model of Activity on Treatment for a single pathway.
#
# pathway: pathway name, compared against the Pathway column of the global
#          `pathways_tidy` table.
# Returns a one-row data.frame with the pathway name, the estimate in the
# second row of the coefficient table (Coefficient) and its p-value (pval).
testPathway <- function(pathway) {
  one_pathway <- filter(pathways_tidy, Pathway == pathway)
  fit <- lm(Activity ~ Treatment, data = one_pathway)
  coef_table <- coef(summary(fit))
  data.frame(pathway = pathway,
             Coefficient = coef_table[2, "Estimate"],
             pval = coef_table[2, "Pr(>|t|)"])
}

# Test every pathway column and stack the one-row results into one table.
pathway_test <- do.call(rbind, lapply(colnames(pathways), testPathway))
# Benjamini-Hochberg adjustment across all tested pathways.
pathway_test$padj <- p.adjust(pathway_test$pval, method = "BH")

# Names of the pathways with adjusted p-value at or below 0.05.
sig_pathways <- pathway_test %>%
  filter(padj <= 0.05) %>%
  pull(pathway)

# Heatmap of pathway scores (pathways as rows, samples as columns), written
# straight to file. 100 colours are spread over 101 breaks from -2 to 2;
# only the rows are clustered.
pheatmap(t(pathways),
         color = colorRampPalette(c("black", "white", "#941A2B"))(100),
         breaks = seq(-2, 2, length.out = 101),
         cluster_cols = FALSE,
         clustering_method = "ward.D2",
         border_color = "black",
         filename = "~/RNAseq/results/rlog_progeny_rr.png",
         width = 3.2, height = 4.2)

### Overlap of DEGs associated with Cisplatin Resistance, Cancer Stemness, and EMT
# NOTE(review): Platinum_Resistent_Genes, BCSCdb and EMT_gene_sig are not
# created anywhere in this script -- presumably they are loaded beforehand;
# confirm before running.
# Each marker column holds the gene symbol where the gene is in the
# reference list and an empty string otherwise.

# Cisplatin Resistance (Huang D, 2021)
CR <- which(df$symbol %in% Platinum_Resistent_Genes$`HUGO Gene symbol`)
df$Cisplatin_Resistance <- replace(rep("", nrow(df)), CR, df$symbol[CR])

# Cancer stem cell markers (first column of BCSCdb)
CSC <- which(df$symbol %in% BCSCdb$...1)
df$CSC <- replace(rep("", nrow(df)), CSC, df$symbol[CSC])

# EMT gene signature
EMT <- which(df$symbol %in% EMT_gene_sig$Gene)
df$EMT <- replace(rep("", nrow(df)), EMT, df$symbol[EMT])

# Tab-separated export of the annotated table, row names included.
write.table(df,
            "~/RNAseq/results/20240107_RR_CSC_EMT_RES_markers.txt",
            sep = "\t",
            row.names = TRUE,
            quote = FALSE)

### Volcano Plot ###

# Genes that get a text label on the plot.
labels_volcano <- c("CAV1", "WNT5A", "POSTN", "ITGA6", "CCN2",
                    "COL1A1", "SNAI2", "SLC7A5", "ERBB3", "MMP10",
                    "LUM", "COL6A3", "SPP1", "COL16A1", "FGF9",
                    "PLCB", "POU5F1B", "BGN", "SOX8", "ID1", "KRT8")

label_rows_rr <- which(df$symbol %in% labels_volcano)
df$labels <- "" # create column for volcano plot genes
df$labels[label_rows_rr] <- df$symbol[label_rows_rr] # add gene names to label rows

# Flag points passing both cut-offs (adjusted p <= 0.1 and |log2FC| > 0.58).
df$sig <- "No"
df$sig[which(df$padj <= 0.1 & abs(df$log2FoldChange) > 0.58)] <- "Yes"

library(ggrepel)
volcano_rr <- ggplot(df, aes(x = log2FoldChange, y = -log10(padj))) +
  geom_point(size = 2, alpha = 0.5, aes(color = sig)) +
  # Dashed guides at the fold-change and adjusted p-value cut-offs.
  geom_vline(xintercept = c(-0.58, 0.58), linetype = 2) +
  geom_hline(yintercept = -log10(0.1), linetype = 2) +
  geom_text_repel(aes(label = labels),
                  direction = "both",
                  max.overlaps = Inf,
                  size = 4,
                  segment.alpha = 0.75,
                  segment.color = "grey50",
                  min.segment.length = 0.1,
                  force = 25,
                  max.iter = 5000) +
  xlab("logFC") + ylab("-log10(Adj. P-value)") +
  # Colours are keyed by value so that "Yes" stays red and "No" stays grey
  # even when only one of the two classes is present in the data.
  scale_color_manual(values = c(No = "grey40", Yes = "#941A2B")) +
  theme_classic() +
  # Points outside these limits are dropped from the plot by ggplot2.
  xlim(c(-10, 10)) + ylim(c(0, 500)) +
  theme(legend.position = "none",
        axis.text = element_text(size = 12, color = "black"),
        axis.title = element_text(size = 14, color = "black"))

# Save as PDF and PNG; arguments are spelled out in full instead of relying
# on `file` partially matching `filename`.
ggsave(filename = "~/RNAseq/results/20240107_rr_volcano_plot_labels.pdf",
       plot = volcano_rr, width = 7, height = 7)
ggsave(filename = "~/RNAseq/results/20240107_rr_volcano_plot_labels.png",
       plot = volcano_rr, width = 7, height = 7, dpi = 300)


### Box and Whiskers Plots for Individual Genes ###
# rlog-transformed expression for all genes. rlog_out was computed earlier
# with the identical call rlog(dds_1, blind = FALSE), so it is reused here
# instead of running the transform a second time.
gene_exp <- assay(rlog_out)
gene_df <- as.data.frame(gene_exp)

# The row names are the Ensembl gene IDs used throughout this script, while
# the marker list filtered on later contains gene symbols. Translate the IDs
# through the `genes` table; IDs without a (non-empty) symbol keep the ID.
gene_symbol <- genes$ext_gene[match(rownames(gene_df), genes$ens_gene)]
unmapped <- is.na(gene_symbol) | gene_symbol == ""
gene_symbol[unmapped] <- rownames(gene_df)[unmapped]
gene_df$SYMBOL <- gene_symbol

# rename table columns to make the names uniform (cell line_treatment_replicate number)
gene_df <- gene_df %>%
  dplyr::rename("PDX4_SE_1" = "PDX4_SE1",
                "PDX4_SE_2" = "PDX4_SE2",
                "PDX4_SE_3" = "PDX4_SE3",
                "PDX4_CR_1" = "PDX4_CR1",
                "PDX4_CR_2" = "PDX4_CR2",
                "PDX4_CR_3" = "PDX4_CR3")

# Long format with the sample name split into its three components.
exp_tidy_rr <- tidyr::pivot_longer(gene_df, cols = -SYMBOL,
                                   names_to = "Sample",
                                   values_to = "Expression")
exp_tidy_rr <- separate(exp_tidy_rr, Sample,
                        into = c("Cell_Line", "Treatment", "Replicate"))
exp_tidy_rr$Treatment <- factor(exp_tidy_rr$Treatment, levels = c("SE", "CR"))

# Marker genes to plot individually.
emt_csc_markers <- c("CDH2", "VIM", "TGFB1", "TGFB2", "ZEB2",
                     "BMI1", "ENG", "MYC", "NOTCH1", "POU5F1B")

# Rows for the marker genes only, with columns ordered SE replicates first
# and CR replicates second, then reshaped to one row per (gene, sample).
rr_exp <- filter(gene_df, SYMBOL %in% emt_csc_markers)
rr_exp <- rr_exp[, c("SYMBOL",
                     paste0("PDX4_SE_", 1:3),
                     paste0("PDX4_CR_", 1:3))]
rr_exp <- tidyr::gather(rr_exp, key = "Sample", value = "Expression",
                        -SYMBOL)

# SE samples are labelled "Sensitive", all others "Resistant"; the level
# order fixes the left-to-right order on the x axis.
rr_exp$Condition <- factor(
  ifelse(rr_exp$Sample %in% paste0("PDX4_SE_", 1:3), "Sensitive", "Resistant"),
  levels = c("Sensitive", "Resistant")
)
# The level order here sets the order of the facets.
rr_exp$SYMBOL <- factor(rr_exp$SYMBOL,
                        levels = c("CDH2", "BMI1", "VIM", "ENG", "TGFB1",
                                   "MYC", "TGFB2", "NOTCH1", "ZEB2",
                                   "POU5F1B"))

# One facet per gene: box plot per condition with the individual samples
# drawn on top as points.
exp_plot <- ggplot(rr_exp, aes(x = Condition, y = Expression)) +
  geom_boxplot(aes(fill = Condition), color = "black") +
  geom_point(size = 2.5, color = "black") +
  scale_fill_manual(values = c("gray25", "#941A2B")) +
  labs(x = "", y = "Normalized Counts") +
  facet_wrap(~SYMBOL, ncol = 2, scales = "free") +
  theme_bw() +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 12, color = "black", angle = 45, hjust = 1),
        axis.text.y = element_text(size = 12, color = "black"),
        axis.title = element_text(size = 12),
        strip.background = element_blank(),
        strip.text = element_text(size = 14),
        panel.grid = element_blank())
exp_plot

ggsave(filename = "~/RNAseq/results/20240108_single_genes.png",
       plot = exp_plot, width = 3, height = 10)

