# create figures for med4 DE long term 

In [1]:
library(DESeq2)
library(RColorBrewer)
library(reshape2)
library("gplots")
library("GGally")                      # Load GGally package
library(goseq)
library(readxl)
library(tidyverse)
library("pheatmap")
library(corrplot)
library(circlize)

Loading required package: S4Vectors

Loading required package: stats4

Loading required package: BiocGenerics


Attaching package: 'BiocGenerics'


The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs


The following objects are masked from 'package:base':

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min



Attaching package: 'S4Vectors'


The following objects are masked from 'package:base':

    expand.grid, I, unname


Loading required package: IRanges


Attaching package: 'IRanges'


The following object is masked from 'package:grDevices':

    windows


Loading required package: GenomicRanges

Loading req

In [2]:
library(ggrepel)

In [3]:
options(repr.plot.width = 10, repr.plot.height = 10, repr.plot.res = 300)

In [5]:
fname_hcounts = file.path('..', 'RNASEQ', 'data', 'MED4_counts_combined.txt')
hcdf = read.csv(fname_hcounts, sep='\t' )
hgenes = hcdf[,c(1,2,3,4,5)]
hpath_fname = file.path('..', 'genomes', 'MED4', 'MED4_pathways.csv')
hpath_df = read.csv(hpath_fname)
hpath_df = left_join(hgenes, hpath_df, by='protein_id', multiple='all')

In [6]:
hpath_df = hpath_df  %>% 
    extract(path, c("pathway"), "^[[:digit:]]* *([^\\[]+) *\\[?", remove = FALSE) %>%
    mutate(pathway = str_replace(pathway, "in photosynthetic organisms", "")) %>%
    mutate(pathway = str_trim(pathway))

In [7]:
hpath_df = hpath_df %>% 
    extract(sub, c("module"), "^[[:digit:]]* *([^\\[]+) *\\[?", remove = FALSE) %>%
    mutate(module = str_replace(module, "in photosynthetic organisms", "")) %>%
    mutate(module = str_replace(module, "metabolism", "")) %>%
    mutate(module = str_replace(module, "Biosynthesis of", "")) %>%
    mutate(module = str_replace(module, "Metabolism of", "")) %>%
    mutate(module = str_replace(module, "biosynthesis", "")) %>%
    mutate(module = str_trim(module)) 



In [8]:
hpath_df = hpath_df %>% 
    extract(ecpath, c("kegg_gene"), "^K[[:digit:]]+ *([^;]*);", remove = FALSE) 

In [9]:
hpath_df = hpath_df %>% 
    mutate(
        prev_gene = gene,
        gene = if_else(gene != '', gene, paste0('(',kegg_gene,')'))
        
    )

In [10]:
hpath_df_to_export = hpath_df %>% 
     group_by(gene_id) %>% 
     mutate(
         main = paste0(unique(main), collapse = ";"),
         sub = paste0(unique(sub), collapse = ";"),
         path = paste0(unique(path), collapse = ";"),
         ecpath = paste0(unique(ecpath), collapse = ";"),
         pathway = paste0(unique(pathway), collapse = ";"),
         module = paste0(unique(module), collapse = ";")
     ) %>%
    ungroup() %>%
    distinct(gene_id, .keep_all = TRUE)

In [11]:
df_all = read_csv(file.path('DE_results', 'MED4_all_FC_results.csv'))

[1mRows: [22m[34m24377[39m [1mColumns: [22m[34m11[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (4): contrast, gene_id, Assay, contrast1
[32mdbl[39m (5): logFC, AveExpr, pvalue, padj, Length
[33mlgl[39m (2): down, up

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [12]:
df_all = left_join(df_all, hpath_df_to_export, by='gene_id')

In [14]:
unique(df_all$contrast1)

In [19]:
df_all = df_all %>%
    filter(contrast1 %in% c('LATEvsC1_prot', 'LATEvsC1P1_rna'))

In [20]:
t(head(df_all))

0,1,2,3,4,5,6
contrast,Clongterm - C1,Clongterm - C1,Clongterm - C1,Clongterm - C1,Clongterm - C1,Clongterm - C1
gene_id,TX50_RS01980,TX50_RS00040,TX50_RS04695,TX50_RS01970,TX50_RS05170,TX50_RS05585
logFC,3.191075,-1.443788,1.373159,3.706744,2.210139,2.786227
AveExpr,0.9167156,1.5845359,2.2254880,3.6657560,-1.3469405,-1.7260158
pvalue,6.252913e-10,1.202458e-08,1.731707e-08,2.510390e-08,2.650587e-08,3.478220e-08
padj,8.904149e-07,7.127836e-06,7.127836e-06,7.127836e-06,7.127836e-06,7.127836e-06
Assay,Proteome,Proteome,Proteome,Proteome,Proteome,Proteome
contrast1,LATEvsC1_prot,LATEvsC1_prot,LATEvsC1_prot,LATEvsC1_prot,LATEvsC1_prot,LATEvsC1_prot
Length,852,2439,912,1491,1134,954
down,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE


In [24]:
# number of differentiall expressed genes
df_all %>%
    filter(padj < 0.05) %>% count(Assay, down)

Assay,down,n
<chr>,<lgl>,<int>
Proteome,False,208
Proteome,True,206
RNA,False,207
RNA,True,178


In [35]:
df_all %>%
    filter(padj < 0.05) %>% top_n(-50, logFC) %>% 
    select(Assay,logFC,gene, product, pathway) #%>% 
    #select(Assay,logFC,gene)

Assay,logFC,gene,product,pathway
<chr>,<dbl>,<chr>,<chr>,<chr>
Proteome,-3.062849,(pepE),peptidase E,Peptidases and inhibitors
Proteome,-3.089786,(NA),oligoketide cyclase,
Proteome,-2.437843,(NA),GAF domain-containing protein,
Proteome,-1.956601,(NA),hypothetical protein,
Proteome,-1.959779,raiA,ribosome-associated translation inhibitor RaiA,Ribosome biogenesis
Proteome,-2.659823,"(wecA, tagO, rfe)",undecaprenyl/decaprenyl-phosphate alpha-N-acetylglucosaminyl 1-phosphate transferase,O-Antigen repeat unit biosynthesis;Teichoic acid biosynthesis;Arabinogalactan biosynthesis - Mycobacterium;Glycosyltransferases;Lipopolysaccharide biosynthesis proteins
Proteome,-2.122447,(NA),DUF6447 family protein,
Proteome,-2.428226,"(rfbD, rmlD)",sugar nucleotide-binding protein,O-Antigen nucleotide sugar biosynthesis;Polyketide sugar unit biosynthesis;Streptomycin biosynthesis
RNA,-1.790114,gap,type I glyceraldehyde-3-phosphate dehydrogenase,Glycolysis / Gluconeogenesis;Carbon fixation
RNA,-2.005085,(K07086),sodium-dependent bicarbonate transport family permease,Function unknown


In [26]:
help(top_n)

0,1
top_n {dplyr},R Documentation

0,1
x,A data frame.
n,"Number of rows to return for top_n(), fraction of rows to return for top_frac(). If n is positive, selects the top rows. If negative, selects the bottom rows. If x is grouped, this is the number (or fraction) of rows per group. Will include more rows if there are ties."
wt,"(Optional). The variable to use for ordering. If not specified, defaults to the last variable in the tbl."
