# Pseudobulk differential expression to compare df1 vs WT

In [1]:
library(tidyverse)
library(Seurat)
library(cowplot)
library(ComplexHeatmap)
library(circlize)
library(GeneOverlap)
library(gprofiler2)
library(ggrepel)
library(muscat)
library(purrr)
library(limma)
library(scran)
library(future)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.5     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.4     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Registered S3 method overwritten by 'spatstat.geom':
  method     from
  print.boxx cli 

Attaching SeuratObject

Loading required package: grid

ComplexHeatmap version 2.11.1
Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
Github page: https://github.com/jokergoo/ComplexHeatmap
Documentation: http://jokergoo.github.io/ComplexHeatmap-

In [2]:
#for 200gb ram 
options(future.globals.maxSize = 200000 * 1024^2)

In [3]:
rc.integrated <- readRDS("../data/integrations/rc.integrated_11S_gtl1_df1_Li_seu3_clean.rds")

In [4]:
rc.integrated

An object of class Seurat 
70780 features across 74810 samples within 3 assays 
Active assay: integrated (17681 features, 17681 variable features)
 2 other assays present: RNA, SCT
 4 dimensional reductions calculated: pca, umap, umap_3D, umap_2D

## Cell type and developmental stage metadata

- Developmental stage: `time_zone`
- Cell type:`cell_type`
- Combination of cell type and developmental stage: `time_zone_cell_type`
- Combination of cell type and developmental stage with cell subtypes (not used): `time_zone_cell_subtypes` 

In [5]:
feature_names <- read_tsv("./data/features.tsv.gz", col_names = c("AGI", "Name", "Type")) %>%
  select(-Type) %>%
  distinct()


[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────[39m
cols(
  AGI = [31mcol_character()[39m,
  Name = [31mcol_character()[39m,
  Type = [31mcol_character()[39m
)




In [6]:
table(rc.integrated$genotype)


     df1     gtl1 gtl1_df1       WT 
   15678    22594    17634    18904 

In [7]:
rc.integrated <- subset(rc.integrated, 
                        subset = sample %in% c("sc_122",
                                              "sc_123",
                                              "sc_124",
                                              "sc_125",
                                              "sc_126",
                                              "sc_127",
                                              "sc_128",
                                              "sc_129"))

In [8]:
rc.integrated$genotype <- factor(rc.integrated$genotype, 
                                 levels=c("WT", 
                                          "gtl1", 
                                          "df1", 
                                          "gtl1_df1"))

In [9]:
table(rc.integrated$genotype)


      WT     gtl1      df1 gtl1_df1 
   18904    22594    15678    17634 

# Differential State Analysis with Muscat

In [10]:
# subset samples you want to compare

integrated.de <- subset(rc.integrated, subset = sample %in% c("sc_122", "sc_126","sc_124","sc_128"))
integrated.de$genotype <- factor(integrated.de$genotype, levels=c("WT", "df1"))

In [11]:
integrated.de

An object of class Seurat 
70780 features across 34582 samples within 3 assays 
Active assay: integrated (17681 features, 17681 variable features)
 2 other assays present: RNA, SCT
 4 dimensional reductions calculated: pca, umap, umap_3D, umap_2D

## Convert to sce

In [12]:
#  construct sce manually
my_metadata <- data.frame(sample_id = integrated.de$sample,
                              group_id = integrated.de$genotype,
                              cluster_id = integrated.de$time_zone_cell_type, 
                             rep=integrated.de$rep) # include experimental rep as co-variate

sce <- SingleCellExperiment(assays = list(counts = integrated.de@assays$RNA@counts),
	                            colData = my_metadata)

In [13]:
    (sce <- prepSCE(sce, 
        kid = "cluster_id", # subpopulation assignments
        gid = "group_id",   # group IDs (ctrl/stim)
        sid = "sample_id",    # sample IDs (ctrl/stim.1234)
        drop = FALSE))        # drop all other colData columns

class: SingleCellExperiment 
dim: 28688 34582 
metadata(1): experiment_info
assays(1): counts
rownames(28688): AT1G01010 AT1G01020 ... AT5G37474 AT5G07835
rowData names(0):
colnames(34582): AAACCCAAGAGCCGTA_4 AAACCCAAGCATTGTC_4 ...
  TTTGTTGGTACGCTTA_10 TTTGTTGTCAGTGTTG_10
colData names(4): cluster_id sample_id group_id rep
reducedDimNames(0):
altExpNames(0):

## pre-filtering

In [14]:
# remove undetected genes
sce <- sce[rowSums(counts(sce) > 0) > 0, ]
dim(sce)

In [15]:
# remove lowly expressed genes
sce <- sce[rowSums(counts(sce) > 1) >= 1, ]
dim(sce)

In [16]:
# create pseudobulk profiles
pb <- aggregateData(sce,
    assay = "counts", fun = "sum",
    by = c("cluster_id", "sample_id"))
# one sheet per subpopulation
assayNames(pb)

In [17]:
# pseudobulks for 1st subpopulation
t(head(assay(pb)))

Unnamed: 0,AT1G01010,AT1G01020,AT1G01030,AT1G01040,AT1G01050,AT1G01060
sc_122,37,32,12,9,862,6
sc_124,8,20,5,5,478,1
sc_126,26,41,17,12,1287,9
sc_128,17,31,22,8,932,7


In [18]:
# metadata to manually add to DE contrast
bscs <- read.csv("./data/GEO_upload_BR_time_scRNA_samples_metadata_with_stats.csv", na.strings=c("","NA"), stringsAsFactors = F)
bscs$date <- gsub('^([0-9]{4})([0-9]{2})([0-9]+)$', '\\1-\\2-\\3', bscs$date)

In [19]:
# experiment info for contrasts, add rep from csv

ei <- metadata(sce)$experiment_info
ei

sample_date <- select(bscs, sample_id=sample, rep=rep)

ei <- left_join(ei, sample_date)

ei

sample_id,group_id,n_cells
<fct>,<fct>,<dbl>
sc_122,WT,11614
sc_124,df1,9058
sc_126,WT,7290
sc_128,df1,6620


Joining, by = "sample_id"



sample_id,group_id,n_cells,rep
<chr>,<fct>,<dbl>,<int>
sc_122,WT,11614,1
sc_124,df1,9058,1
sc_126,WT,7290,2
sc_128,df1,6620,2


In [20]:
mm <- model.matrix(~ 0 + ei$group_id + ei$rep)
dimnames(mm) <- list(ei$sample_id, c(levels(ei$group_id), "rep"))

mm

Unnamed: 0,WT,df1,rep
sc_122,1,0,1
sc_124,0,1,1
sc_126,1,0,2
sc_128,0,1,2


In [21]:
contrast <- makeContrasts("df1-WT", levels = mm)

contrast

Unnamed: 0,df1-WT
WT,-1
df1,1
rep,0


In [22]:
res <- pbDS(pb, design = mm, 
            contrast = contrast, 
            method="edgeR", 
            min_cells=5, 
            filter = c("none"))

Distal Columella..Distal Lateral Root Cap..Elongation_Atrichoblast..Elongation_Cortex..Elongation_Endodermis..Elongation_Pericycle..Elongation_Phloem..Elongation_Procambium..Elongation_Trichoblast..Elongation_Xylem..Maturation_Atrichoblast..Maturation_Cortex..Maturation_Endodermis..Maturation_Pericycle..Maturation_Phloem..Maturation_Procambium..Maturation_Trichoblast..Maturation_Xylem..Proliferation Domain_Atrichoblast..Proliferation Domain_Cortex..Proliferation Domain_Endodermis..Proliferation Domain_Pericycle..Proliferation Domain_Quiescent Center..Proliferation Domain_Trichoblast..Proliferation Domain_Xylem..Proximal Columella..Proximal Lateral Root Cap..Transition Domain_Atrichoblast..Transition Domain_Cortex..Transition Domain_Pericycle..Transition Domain_Phloem..Transition Domain_Trichoblast..Transition Domain_Xylem..

### DEG results

In [23]:
# DEG results with gene freqs
(res_to_write_frq <- resDS(sce, res, bind = "row", cpm=TRUE, frq=T))

gene,cluster_id,sc_122.cpm,sc_126.cpm,sc_124.cpm,sc_128.cpm,sc_122.frq,sc_126.frq,sc_124.frq,sc_128.frq,WT.frq,df1.frq,logFC,logCPM,F,p_val,p_adj.loc,p_adj.glb,contrast
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
AT1G01010,Distal Columella,10.700,6.290,4.900,4.860,0.06100,0.05040,0.03470,0.04140,0.055900,0.03900,-0.69400,2.9100,3.49e+00,6.19e-02,0.53900,1,df1-WT
AT1G01020,Distal Columella,9.210,9.920,12.200,8.860,0.05570,0.07170,0.09410,0.07730,0.063400,0.08330,0.08750,3.3800,8.36e-02,7.73e-01,0.98400,1,df1-WT
AT1G01030,Distal Columella,3.450,4.110,3.060,6.280,0.01970,0.03290,0.01980,0.05250,0.026100,0.04080,0.34700,2.3200,5.66e-01,4.52e-01,0.95300,1,df1-WT
AT1G01040,Distal Columella,2.590,2.900,3.060,2.290,0.01620,0.01940,0.02480,0.02210,0.017700,0.02300,-0.09930,1.7200,2.56e-02,8.73e-01,0.98400,1,df1-WT
AT1G01050,Distal Columella,248.000,311.000,293.000,266.000,0.69300,0.77900,0.83700,0.75700,0.734000,0.78500,0.00032,8.1300,4.44e-06,9.98e-01,1.00000,1,df1-WT
AT1G01060,Distal Columella,1.730,2.180,0.612,2.000,0.01080,0.01740,0.00495,0.01930,0.014000,0.01420,-0.46800,1.2800,3.15e-01,5.75e-01,0.95300,1,df1-WT
AT1G01070,Distal Columella,0.576,0.242,0.612,0.000,0.00359,0.00194,0.00495,0.00000,0.002800,0.00177,-0.66000,-0.0846,7.49e-02,7.84e-01,0.98400,1,df1-WT
AT1G01080,Distal Columella,0.288,0.242,0.000,0.857,0.00180,0.00194,0.00000,0.00829,0.001860,0.00532,0.84700,0.0278,1.72e-01,6.78e-01,0.96400,1,df1-WT
AT1G01090,Distal Columella,37.700,40.400,31.200,35.400,0.19600,0.26400,0.18800,0.26000,0.228000,0.23400,-0.22500,5.2200,1.34e+00,2.48e-01,0.92100,1,df1-WT
AT1G01100,Distal Columella,268.000,239.000,298.000,325.000,0.68000,0.67600,0.76200,0.70400,0.678000,0.72500,0.29800,8.1400,3.85e+00,4.97e-02,0.48200,1,df1-WT


In [24]:
## all genes background 

all_bg <- filter(res_to_write_frq,
                WT.frq >=0.1 | df1.frq >=0.1)

In [25]:
length(unique(all_bg$gene))

In [26]:
#total DE genes p_adj.loc < 0.05, abs(logFC) > 1.5
sig_DE <- filter(res_to_write_frq, p_adj.loc<=0.05 & abs(logFC) > log2(1.5))
sig_DE <- left_join(sig_DE, feature_names, by=c("gene"="AGI"))

length(unique(sig_DE$gene))

In [27]:
# filter gene freqs to avoid calling lowly detected genes
sig_DE_fil <- filter(sig_DE, WT.frq >=0.1 | df1.frq >=0.1)

In [28]:
length(unique(sig_DE_fil$gene))

In [29]:
# load TFs
TF_list <- read_csv("./data/Kay_TF_thalemine_annotations.csv", col_names = c("gene", "TF_Name", "Description")) 


[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────[39m
cols(
  gene = [31mcol_character()[39m,
  TF_Name = [31mcol_character()[39m,
  Description = [31mcol_character()[39m
)




In [30]:
sig_DE_fil <- left_join(sig_DE_fil, TF_list)

Joining, by = "gene"



In [31]:
# label up vs down
sig_DE_fil <- sig_DE_fil %>%
  mutate(up_dn_label = case_when(logFC >=log2(1.5) ~ "Up",  
                                       logFC <=log2(1/1.5) ~ "Down",
                                       TRUE ~ "Not DE"))

sig_DE_fil$clust_up_dn <- paste(sig_DE_fil$cluster_id, sig_DE_fil$up_dn_label, sep="_")

sig_DE_fil

gene,cluster_id,sc_122.cpm,sc_126.cpm,sc_124.cpm,sc_128.cpm,sc_122.frq,sc_126.frq,sc_124.frq,sc_128.frq,⋯,F,p_val,p_adj.loc,p_adj.glb,contrast,Name,TF_Name,Description,up_dn_label,clust_up_dn
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
AT1G02360,Distal Columella,38.30,57.60,82.00,91.10,0.1560,0.2330,0.3070,0.3150,⋯,24.7,6.67e-07,8.12e-05,3.49e-01,df1-WT,AT1G02360,,,Up,Distal Columella_Up
AT1G03475,Distal Columella,17.00,21.30,28.20,36.30,0.0969,0.1550,0.1780,0.2730,⋯,12.5,4.03e-04,1.64e-02,1.00e+00,df1-WT,CPX1,,,Up,Distal Columella_Up
AT1G03850,Distal Columella,35.40,39.20,26.30,12.60,0.1830,0.2230,0.1730,0.0967,⋯,24.6,7.30e-07,8.73e-05,3.82e-01,df1-WT,AT1G03850,,,Down,Distal Columella_Down
AT1G03870,Distal Columella,163.00,100.00,53.30,94.80,0.4690,0.4840,0.2920,0.3400,⋯,21.2,4.08e-06,3.69e-04,1.00e+00,df1-WT,FLA9,,,Down,Distal Columella_Down
AT1G05060,Distal Columella,20.40,30.70,47.10,48.30,0.1010,0.1690,0.1980,0.2600,⋯,20.8,5.23e-06,4.51e-04,1.00e+00,df1-WT,AT1G05060,,,Up,Distal Columella_Up
AT1G05260,Distal Columella,45.80,40.60,20.80,10.90,0.1510,0.1670,0.1530,0.0663,⋯,49.7,1.90e-12,8.60e-10,9.94e-07,df1-WT,PER3,,,Down,Distal Columella_Down
AT1G05340,Distal Columella,87.50,83.20,56.30,35.70,0.4040,0.4320,0.3610,0.2540,⋯,28.6,9.01e-08,1.42e-05,4.71e-02,df1-WT,AT1G05340,,,Down,Distal Columella_Down
AT1G05575,Distal Columella,47.20,82.70,108.00,104.00,0.1440,0.2110,0.2820,0.2850,⋯,18.8,1.45e-05,1.06e-03,1.00e+00,df1-WT,AT1G05575,,,Up,Distal Columella_Up
AT1G05870,Distal Columella,22.70,27.60,11.00,12.90,0.1150,0.1740,0.0842,0.1050,⋯,19.7,9.14e-06,7.15e-04,1.00e+00,df1-WT,AT1G05870,,,Down,Distal Columella_Down
AT1G06080,Distal Columella,10.70,15.20,69.20,53.40,0.0503,0.0853,0.3660,0.2790,⋯,113.0,2.52e-26,5.45e-23,1.32e-20,df1-WT,ADS1,,,Up,Distal Columella_Up


In [32]:
sig_DE_fil
write.csv(sig_DE_fil, file = "./output/v4_df1_v_WT_cell_time_EdgeR_q0.05_FC1.5_r_v_4_20220121.csv")

gene,cluster_id,sc_122.cpm,sc_126.cpm,sc_124.cpm,sc_128.cpm,sc_122.frq,sc_126.frq,sc_124.frq,sc_128.frq,⋯,F,p_val,p_adj.loc,p_adj.glb,contrast,Name,TF_Name,Description,up_dn_label,clust_up_dn
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
AT1G02360,Distal Columella,38.30,57.60,82.00,91.10,0.1560,0.2330,0.3070,0.3150,⋯,24.7,6.67e-07,8.12e-05,3.49e-01,df1-WT,AT1G02360,,,Up,Distal Columella_Up
AT1G03475,Distal Columella,17.00,21.30,28.20,36.30,0.0969,0.1550,0.1780,0.2730,⋯,12.5,4.03e-04,1.64e-02,1.00e+00,df1-WT,CPX1,,,Up,Distal Columella_Up
AT1G03850,Distal Columella,35.40,39.20,26.30,12.60,0.1830,0.2230,0.1730,0.0967,⋯,24.6,7.30e-07,8.73e-05,3.82e-01,df1-WT,AT1G03850,,,Down,Distal Columella_Down
AT1G03870,Distal Columella,163.00,100.00,53.30,94.80,0.4690,0.4840,0.2920,0.3400,⋯,21.2,4.08e-06,3.69e-04,1.00e+00,df1-WT,FLA9,,,Down,Distal Columella_Down
AT1G05060,Distal Columella,20.40,30.70,47.10,48.30,0.1010,0.1690,0.1980,0.2600,⋯,20.8,5.23e-06,4.51e-04,1.00e+00,df1-WT,AT1G05060,,,Up,Distal Columella_Up
AT1G05260,Distal Columella,45.80,40.60,20.80,10.90,0.1510,0.1670,0.1530,0.0663,⋯,49.7,1.90e-12,8.60e-10,9.94e-07,df1-WT,PER3,,,Down,Distal Columella_Down
AT1G05340,Distal Columella,87.50,83.20,56.30,35.70,0.4040,0.4320,0.3610,0.2540,⋯,28.6,9.01e-08,1.42e-05,4.71e-02,df1-WT,AT1G05340,,,Down,Distal Columella_Down
AT1G05575,Distal Columella,47.20,82.70,108.00,104.00,0.1440,0.2110,0.2820,0.2850,⋯,18.8,1.45e-05,1.06e-03,1.00e+00,df1-WT,AT1G05575,,,Up,Distal Columella_Up
AT1G05870,Distal Columella,22.70,27.60,11.00,12.90,0.1150,0.1740,0.0842,0.1050,⋯,19.7,9.14e-06,7.15e-04,1.00e+00,df1-WT,AT1G05870,,,Down,Distal Columella_Down
AT1G06080,Distal Columella,10.70,15.20,69.20,53.40,0.0503,0.0853,0.3660,0.2790,⋯,113.0,2.52e-26,5.45e-23,1.32e-20,df1-WT,ADS1,,,Up,Distal Columella_Up


In [33]:
# add DE and up/dn to total list
sig_to_join <- sig_DE_fil %>%
mutate(clust_gene=paste(cluster_id, gene, sep="_")) %>%
select(clust_gene, up_dn_label, clust_up_dn)

In [34]:
# join all genes list to DE labels
all_bg <- mutate(all_bg, clust_gene=paste(cluster_id, gene, sep="_"))

all_bg <- left_join(all_bg, feature_names, by=c("gene"="AGI"))

all_bg$DE <- all_bg$clust_gene %in% sig_to_join$clust_gene


all_bg <- all_bg %>%
left_join(sig_to_join, by="clust_gene") %>%
arrange(all_bg, p_adj.loc)

write.csv(all_bg, file = "./output/v4_all_df1_v_WT_cell_time_EdgeR_q0.05_FC1.5_r_v_4_20220121.csv")