# Pseudobulk differential expression to compare gtl1 df1 vs WT

In [1]:
library(tidyverse)
library(Seurat)
library(cowplot)
library(ComplexHeatmap)
library(circlize)
library(GeneOverlap)
library(gprofiler2)
library(ggrepel)
library(muscat)
library(purrr)
library(limma)
library(scran)
library(future)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.5     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.4     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Registered S3 method overwritten by 'spatstat.geom':
  method     from
  print.boxx cli 

Attaching SeuratObject

Loading required package: grid

ComplexHeatmap version 2.11.1
Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
Github page: https://github.com/jokergoo/ComplexHeatmap
Documentation: http://jokergoo.github.io/ComplexHeatmap-

In [2]:
#for 200gb ram 
options(future.globals.maxSize = 200000 * 1024^2)

In [3]:
rc.integrated <- readRDS("../data/integrations/rc.integrated_11S_gtl1_df1_Li_seu3_clean.rds")

In [4]:
rc.integrated

An object of class Seurat 
70780 features across 74810 samples within 3 assays 
Active assay: integrated (17681 features, 17681 variable features)
 2 other assays present: RNA, SCT
 4 dimensional reductions calculated: pca, umap, umap_3D, umap_2D

## Cell type and developmental stage metadata

- Developmental stage: `time_zone`
- Cell type:`cell_type`
- Combination of cell type and developmental stage: `time_zone_cell_type`
- Combination of cell type and developmental stage with cell subtypes (not used): `time_zone_cell_subtypes` 

In [5]:
feature_names <- read_tsv("./data/features.tsv.gz", col_names = c("AGI", "Name", "Type")) %>%
  select(-Type) %>%
  distinct()


[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────[39m
cols(
  AGI = [31mcol_character()[39m,
  Name = [31mcol_character()[39m,
  Type = [31mcol_character()[39m
)




In [6]:
table(rc.integrated$genotype)


     df1     gtl1 gtl1_df1       WT 
   15678    22594    17634    18904 

In [7]:
rc.integrated <- subset(rc.integrated, 
                        subset = sample %in% c("sc_122",
                                              "sc_123",
                                              "sc_124",
                                              "sc_125",
                                              "sc_126",
                                              "sc_127",
                                              "sc_128",
                                              "sc_129"))

In [8]:
rc.integrated$genotype <- factor(rc.integrated$genotype, 
                                 levels=c("WT", 
                                          "gtl1", 
                                          "df1", 
                                          "gtl1_df1"))

In [9]:
table(rc.integrated$genotype)


      WT     gtl1      df1 gtl1_df1 
   18904    22594    15678    17634 

# Differential State Analysis with Muscat

In [10]:
# subset samples you want to compare

integrated.de <- subset(rc.integrated, subset = sample %in% c("sc_122", "sc_126","sc_125","sc_129"))
integrated.de$genotype <- factor(integrated.de$genotype, levels=c("WT", "gtl1_df1"))

In [11]:
integrated.de

An object of class Seurat 
70780 features across 36538 samples within 3 assays 
Active assay: integrated (17681 features, 17681 variable features)
 2 other assays present: RNA, SCT
 4 dimensional reductions calculated: pca, umap, umap_3D, umap_2D

## Convert to sce

In [12]:
#  construct sce manually
my_metadata <- data.frame(sample_id = integrated.de$sample,
                              group_id = integrated.de$genotype,
                              cluster_id = integrated.de$time_zone_cell_type, 
                             rep=integrated.de$rep) # include experimental rep as co-variate

sce <- SingleCellExperiment(assays = list(counts = integrated.de@assays$RNA@counts),
	                            colData = my_metadata)

In [13]:
    (sce <- prepSCE(sce, 
        kid = "cluster_id", # subpopulation assignments
        gid = "group_id",   # group IDs (ctrl/stim)
        sid = "sample_id",    # sample IDs (ctrl/stim.1234)
        drop = FALSE))        # drop all other colData columns

class: SingleCellExperiment 
dim: 28688 36538 
metadata(1): experiment_info
assays(1): counts
rownames(28688): AT1G01010 AT1G01020 ... AT5G37474 AT5G07835
rowData names(0):
colnames(36538): AAACCCAAGAGCCGTA_4 AAACCCAAGCATTGTC_4 ...
  TTTGTTGTCGGCTGGT_11 TTTGTTGTCTCTGACC_11
colData names(4): cluster_id sample_id group_id rep
reducedDimNames(0):
altExpNames(0):

## pre-filtering

In [14]:
# remove undetected genes
sce <- sce[rowSums(counts(sce) > 0) > 0, ]
dim(sce)

In [15]:
# remove lowly expressed genes
sce <- sce[rowSums(counts(sce) > 1) >= 1, ]
dim(sce)

In [16]:
# create pseudobulk profiles
pb <- aggregateData(sce,
    assay = "counts", fun = "sum",
    by = c("cluster_id", "sample_id"))
# one sheet per subpopulation
assayNames(pb)

In [17]:
# pseudobulks for 1st subpopulation
t(head(assay(pb)))

Unnamed: 0,AT1G01010,AT1G01020,AT1G01030,AT1G01040,AT1G01050,AT1G01060
sc_122,37,32,12,9,862,6
sc_125,22,43,11,11,765,2
sc_126,26,41,17,12,1287,9
sc_129,18,36,9,8,735,3


In [18]:
# metadata to manually add to DE contrast
bscs <- read.csv("./data/GEO_upload_BR_time_scRNA_samples_metadata_with_stats.csv", na.strings=c("","NA"), stringsAsFactors = F)
bscs$date <- gsub('^([0-9]{4})([0-9]{2})([0-9]+)$', '\\1-\\2-\\3', bscs$date)

In [19]:
# experiment info for contrasts, add rep from csv

ei <- metadata(sce)$experiment_info
ei

sample_date <- select(bscs, sample_id=sample, rep=rep)

ei <- left_join(ei, sample_date)

ei

sample_id,group_id,n_cells
<fct>,<fct>,<dbl>
sc_122,WT,11614
sc_125,gtl1_df1,10012
sc_126,WT,7290
sc_129,gtl1_df1,7622


Joining, by = "sample_id"



sample_id,group_id,n_cells,rep
<chr>,<fct>,<dbl>,<int>
sc_122,WT,11614,1
sc_125,gtl1_df1,10012,1
sc_126,WT,7290,2
sc_129,gtl1_df1,7622,2


In [20]:
mm <- model.matrix(~ 0 + ei$group_id + ei$rep)
dimnames(mm) <- list(ei$sample_id, c(levels(ei$group_id), "rep"))

mm

Unnamed: 0,WT,gtl1_df1,rep
sc_122,1,0,1
sc_125,0,1,1
sc_126,1,0,2
sc_129,0,1,2


In [21]:
contrast <- makeContrasts("gtl1_df1-WT", levels = mm)

contrast

Unnamed: 0,gtl1_df1-WT
WT,-1
gtl1_df1,1
rep,0


In [22]:
res <- pbDS(pb, design = mm, 
            contrast = contrast, 
            method="edgeR", 
            min_cells=5, 
            filter = c("none"))

Distal Columella..Distal Lateral Root Cap..Elongation_Atrichoblast..Elongation_Cortex..Elongation_Endodermis..Elongation_Pericycle..Elongation_Phloem..Elongation_Procambium..Elongation_Trichoblast..Elongation_Xylem..Maturation_Atrichoblast..Maturation_Cortex..Maturation_Endodermis..Maturation_Pericycle..Maturation_Phloem..Maturation_Procambium..Maturation_Trichoblast..Maturation_Xylem..Proliferation Domain_Atrichoblast..Proliferation Domain_Cortex..Proliferation Domain_Endodermis..Proliferation Domain_Pericycle..Proliferation Domain_Quiescent Center..Proliferation Domain_Trichoblast..Proximal Columella..Proximal Lateral Root Cap..Transition Domain_Atrichoblast..Transition Domain_Cortex..Transition Domain_Pericycle..Transition Domain_Phloem..Transition Domain_Trichoblast..Transition Domain_Xylem..

### DEG results

In [23]:
# DEG results with gene freqs
(res_to_write_frq <- resDS(sce, res, bind = "row", cpm=TRUE, frq=T))

gene,cluster_id,sc_122.cpm,sc_126.cpm,sc_125.cpm,sc_129.cpm,sc_122.frq,sc_126.frq,sc_125.frq,sc_129.frq,WT.frq,gtl1_df1.frq,logFC,logCPM,F,p_val,p_adj.loc,p_adj.glb,contrast
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
AT1G01010,Distal Columella,10.500,6.230,6.110,4.870,0.06100,0.05040,0.04010,0.02360,0.055900,0.03180,-0.5900,2.890,3.69e+00,5.47e-02,2.68e-01,1,gtl1_df1-WT
AT1G01020,Distal Columella,9.120,9.820,12.000,9.730,0.05570,0.07170,0.07470,0.06180,0.063400,0.06820,0.1850,3.420,5.04e-01,4.78e-01,8.24e-01,1,gtl1_df1-WT
AT1G01030,Distal Columella,3.420,4.070,3.060,2.430,0.01970,0.03290,0.02000,0.01270,0.026100,0.01640,-0.4530,1.930,1.01e+00,3.15e-01,7.25e-01,1,gtl1_df1-WT
AT1G01040,Distal Columella,2.570,2.870,3.060,2.160,0.01620,0.01940,0.02000,0.01450,0.017700,0.01730,-0.0749,1.680,2.18e-02,8.83e-01,9.73e-01,1,gtl1_df1-WT
AT1G01050,Distal Columella,246.000,308.000,213.000,199.000,0.69300,0.77900,0.63900,0.60000,0.734000,0.62000,-0.4230,7.920,7.31e+00,6.88e-03,5.84e-02,1,gtl1_df1-WT
AT1G01060,Distal Columella,1.710,2.160,0.556,0.811,0.01080,0.01740,0.00364,0.00545,0.014000,0.00455,-1.4600,0.901,2.84e+00,9.18e-02,3.75e-01,1,gtl1_df1-WT
AT1G01070,Distal Columella,0.570,0.240,0.000,0.000,0.00359,0.00194,0.00000,0.00000,0.002800,0.00000,-3.7000,-0.445,1.26e+00,2.62e-01,6.67e-01,1,gtl1_df1-WT
AT1G01080,Distal Columella,0.285,0.240,0.556,0.000,0.00180,0.00194,0.00364,0.00000,0.001860,0.00182,-0.0251,-0.320,1.32e-04,9.91e-01,9.96e-01,1,gtl1_df1-WT
AT1G01090,Distal Columella,37.300,40.000,31.400,20.800,0.19600,0.26400,0.18400,0.12000,0.228000,0.15200,-0.5860,5.040,1.00e+01,1.56e-03,1.77e-02,1,gtl1_df1-WT
AT1G01100,Distal Columella,266.000,237.000,333.000,345.000,0.68000,0.67600,0.72700,0.81300,0.678000,0.77000,0.4340,8.210,7.77e+00,5.32e-03,4.76e-02,1,gtl1_df1-WT


In [24]:
## all genes background 

all_bg <- filter(res_to_write_frq,
                WT.frq >=0.1 | gtl1_df1.frq >=0.1)

In [25]:
length(unique(all_bg$gene))

In [26]:
#total DE genes p_adj.loc < 0.05, abs(logFC) > 1.5
sig_DE <- filter(res_to_write_frq, p_adj.loc<=0.05 & abs(logFC) > log2(1.5))
sig_DE <- left_join(sig_DE, feature_names, by=c("gene"="AGI"))

length(unique(sig_DE$gene))

In [27]:
# filter gene freqs to avoid calling lowly detected genes
sig_DE_fil <- filter(sig_DE, WT.frq >=0.1 | gtl1_df1.frq >=0.1)

In [28]:
length(unique(sig_DE_fil$gene))

In [29]:
# load TFs
TF_list <- read_csv("./data/Kay_TF_thalemine_annotations.csv", col_names = c("gene", "TF_Name", "Description")) 


[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────[39m
cols(
  gene = [31mcol_character()[39m,
  TF_Name = [31mcol_character()[39m,
  Description = [31mcol_character()[39m
)




In [30]:
sig_DE_fil <- left_join(sig_DE_fil, TF_list)

Joining, by = "gene"



In [31]:
# label up vs down
sig_DE_fil <- sig_DE_fil %>%
  mutate(up_dn_label = case_when(logFC >=log2(1.5) ~ "Up",  
                                       logFC <=log2(1/1.5) ~ "Down",
                                       TRUE ~ "Not DE"))

sig_DE_fil$clust_up_dn <- paste(sig_DE_fil$cluster_id, sig_DE_fil$up_dn_label, sep="_")

sig_DE_fil

gene,cluster_id,sc_122.cpm,sc_126.cpm,sc_125.cpm,sc_129.cpm,sc_122.frq,sc_126.frq,sc_125.frq,sc_129.frq,⋯,F,p_val,p_adj.loc,p_adj.glb,contrast,Name,TF_Name,Description,up_dn_label,clust_up_dn
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
AT1G01090,Distal Columella,37.3,40.0,31.4,20.8,0.1960,0.2640,0.1840,0.1200,⋯,10.00,1.56e-03,1.77e-02,1.00e+00,gtl1_df1-WT,PDH-E1 ALPHA,,,Down,Distal Columella_Down
AT1G01470,Distal Columella,3000.0,3080.0,1090.0,994.0,0.9660,0.9260,0.6830,0.6040,⋯,97.90,5.01e-23,7.77e-21,2.66e-17,gtl1_df1-WT,LEA14,,,Down,Distal Columella_Down
AT1G01500,Distal Columella,11.1,14.9,25.0,24.9,0.0628,0.1050,0.1550,0.1490,⋯,19.10,1.23e-05,2.70e-04,1.00e+00,gtl1_df1-WT,AT1G01500,,,Up,Distal Columella_Up
AT1G01710,Distal Columella,16.5,16.0,7.5,11.4,0.0969,0.1180,0.0474,0.0745,⋯,10.50,1.20e-03,1.44e-02,1.00e+00,gtl1_df1-WT,AT1G01710,,,Down,Distal Columella_Down
AT1G01820,Distal Columella,43.0,46.5,73.9,70.3,0.2190,0.2970,0.3620,0.3450,⋯,16.60,4.63e-05,8.64e-04,1.00e+00,gtl1_df1-WT,PEX11C,,,Up,Distal Columella_Up
AT1G02360,Distal Columella,37.9,57.0,158.0,175.0,0.1560,0.2330,0.3920,0.3800,⋯,119.00,1.20e-27,2.46e-25,6.40e-22,gtl1_df1-WT,AT1G02360,,,Up,Distal Columella_Up
AT1G02610,Distal Columella,12.3,18.9,25.3,35.7,0.0718,0.1240,0.1350,0.1750,⋯,22.60,1.98e-06,5.14e-05,1.00e+00,gtl1_df1-WT,AT1G02610,,,Up,Distal Columella_Up
AT1G02810,Distal Columella,14.0,10.5,22.5,22.2,0.0754,0.0659,0.1150,0.1090,⋯,15.90,6.84e-05,1.23e-03,1.00e+00,gtl1_df1-WT,PME7,,,Up,Distal Columella_Up
AT1G02860,Distal Columella,37.3,63.7,112.0,158.0,0.1870,0.3310,0.4500,0.5000,⋯,74.90,5.38e-18,5.89e-16,2.86e-12,gtl1_df1-WT,BAH1,,,Up,Distal Columella_Up
AT1G02920,Distal Columella,16.0,82.4,387.0,636.0,0.0395,0.1090,0.2900,0.3580,⋯,399.00,6.04e-88,1.10e-84,3.21e-82,gtl1_df1-WT,GSTF7,,,Up,Distal Columella_Up


In [32]:
sig_DE_fil
write.csv(sig_DE_fil, file = "./output/v4_gtl1_df1_v_WT_cell_time_EdgeR_q0.05_FC1.5_r_v_4_20220121.csv")

gene,cluster_id,sc_122.cpm,sc_126.cpm,sc_125.cpm,sc_129.cpm,sc_122.frq,sc_126.frq,sc_125.frq,sc_129.frq,⋯,F,p_val,p_adj.loc,p_adj.glb,contrast,Name,TF_Name,Description,up_dn_label,clust_up_dn
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
AT1G01090,Distal Columella,37.3,40.0,31.4,20.8,0.1960,0.2640,0.1840,0.1200,⋯,10.00,1.56e-03,1.77e-02,1.00e+00,gtl1_df1-WT,PDH-E1 ALPHA,,,Down,Distal Columella_Down
AT1G01470,Distal Columella,3000.0,3080.0,1090.0,994.0,0.9660,0.9260,0.6830,0.6040,⋯,97.90,5.01e-23,7.77e-21,2.66e-17,gtl1_df1-WT,LEA14,,,Down,Distal Columella_Down
AT1G01500,Distal Columella,11.1,14.9,25.0,24.9,0.0628,0.1050,0.1550,0.1490,⋯,19.10,1.23e-05,2.70e-04,1.00e+00,gtl1_df1-WT,AT1G01500,,,Up,Distal Columella_Up
AT1G01710,Distal Columella,16.5,16.0,7.5,11.4,0.0969,0.1180,0.0474,0.0745,⋯,10.50,1.20e-03,1.44e-02,1.00e+00,gtl1_df1-WT,AT1G01710,,,Down,Distal Columella_Down
AT1G01820,Distal Columella,43.0,46.5,73.9,70.3,0.2190,0.2970,0.3620,0.3450,⋯,16.60,4.63e-05,8.64e-04,1.00e+00,gtl1_df1-WT,PEX11C,,,Up,Distal Columella_Up
AT1G02360,Distal Columella,37.9,57.0,158.0,175.0,0.1560,0.2330,0.3920,0.3800,⋯,119.00,1.20e-27,2.46e-25,6.40e-22,gtl1_df1-WT,AT1G02360,,,Up,Distal Columella_Up
AT1G02610,Distal Columella,12.3,18.9,25.3,35.7,0.0718,0.1240,0.1350,0.1750,⋯,22.60,1.98e-06,5.14e-05,1.00e+00,gtl1_df1-WT,AT1G02610,,,Up,Distal Columella_Up
AT1G02810,Distal Columella,14.0,10.5,22.5,22.2,0.0754,0.0659,0.1150,0.1090,⋯,15.90,6.84e-05,1.23e-03,1.00e+00,gtl1_df1-WT,PME7,,,Up,Distal Columella_Up
AT1G02860,Distal Columella,37.3,63.7,112.0,158.0,0.1870,0.3310,0.4500,0.5000,⋯,74.90,5.38e-18,5.89e-16,2.86e-12,gtl1_df1-WT,BAH1,,,Up,Distal Columella_Up
AT1G02920,Distal Columella,16.0,82.4,387.0,636.0,0.0395,0.1090,0.2900,0.3580,⋯,399.00,6.04e-88,1.10e-84,3.21e-82,gtl1_df1-WT,GSTF7,,,Up,Distal Columella_Up


In [33]:
# add DE and up/dn to total list
sig_to_join <- sig_DE_fil %>%
mutate(clust_gene=paste(cluster_id, gene, sep="_")) %>%
select(clust_gene, up_dn_label, clust_up_dn)

In [34]:
# join all genes list to DE labels
all_bg <- mutate(all_bg, clust_gene=paste(cluster_id, gene, sep="_"))

all_bg <- left_join(all_bg, feature_names, by=c("gene"="AGI"))

all_bg$DE <- all_bg$clust_gene %in% sig_to_join$clust_gene


all_bg <- all_bg %>%
left_join(sig_to_join, by="clust_gene") %>%
arrange(all_bg, p_adj.loc)

write.csv(all_bg, file = "./output/v4_all_gtl1_df1_v_WT_cell_time_EdgeR_q0.05_FC1.5_r_v_4_20220121.csv")