### setup

In [None]:
library(ggplot2)
library(viridis)
library(ggpubr)

# Set the plot size used for figures drawn after the call.
#
# width  - value stored in the `repr.plot.width` option.
# heigth - value stored in the `repr.plot.height` option (the parameter name
#          is misspelled; it is kept so existing calls keep working).
#
# Returns whatever `options()` returns for this update (the previous values).
fig <- function(width, heigth) {
  plot.size <- list(repr.plot.width = width, repr.plot.height = heigth)
  options(plot.size)
}

In [None]:
# Project root; the relative paths used in this notebook are resolved against
# it once setwd() has run.
wd <- '~/codebases/MacBrainDev/'
setwd(wd)
data.dir <- 'data/'
# `fig.dir` was passed to dir.create() without ever being assigned in this
# notebook, so a fresh session stopped with "object 'fig.dir' not found".
# A value already present in the session is kept; otherwise fall back to a
# local folder.
# NOTE(review): the fallback name 'figures/' is a guess - confirm the intended
# output directory.
if (!exists('fig.dir')) {
  fig.dir <- 'figures/'
}
dir.create(fig.dir, showWarnings = FALSE)

# Number of workers: taken from SLURM_CPUS_PER_TASK when that variable holds a
# number, otherwise from the core count detected on this machine.
n.cores <- as.numeric(Sys.getenv('SLURM_CPUS_PER_TASK'))
if (is.na(n.cores)) {
  n.cores <- parallel::detectCores()
}

# Disease lists stored as an RDS file inside `data.dir`; echoed for inspection.
disease_lists <- readRDS(file = paste0(data.dir, "all_diseases_list.rds"))
disease_lists


# Base name of the dataset (not referenced in the visible part of this notebook).
base.name <- "All.MNN.v1.org.fct"

# Cluster annotation table, rebuilt as a lookup keyed by cluster name:
#   row names - the `cluster` column of the file
#   subclass  - the `label` column, as a factor
#   subtype   - the `cluster` column, as a factor
# Factor levels follow the order of first appearance in the file.
# (The original also added `subtype`/`subclass` columns to the raw table just
# before replacing the whole table; those discarded assignments are dropped.)
clusters <- read.table('raw/all.nhp.cbn.v6.txt', sep = '\t', header = TRUE)
clusters <- data.frame(row.names = clusters$cluster,
                       subclass = factor(clusters$label, levels = unique(clusters$label)),
                       subtype = factor(clusters$cluster, levels = unique(clusters$cluster)))
clusters

In [None]:
### EWCE Results

# EWCE result table, read from the working directory; its first column
# supplies the row names.
EWCE_results.fname <- "EWCE_results.20000_reps.dsInf.csv"
# Alternative location: paste0(data.dir, 'EWCE_results.20000_reps.dsInf.csv')
normalizePath(path = EWCE_results.fname)  # echo the resolved path
results <- read.csv(file = EWCE_results.fname, row.names = 1)

# Logical flag for q < 0.05 (mapped to the alpha aesthetic in the heat maps).
results[["q-value < 0.05"]] <- results[["q"]] < 0.05
# Cell type names: underscores become spaces, then the hyphen in
# "PAT-related" is restored.
results$CellType <- gsub("PAT related", "PAT-related",
                         gsub("_", " ", results$CellType, fixed = TRUE),
                         fixed = TRUE)

# Reconcile cell type names between the results and the cluster table.
#   benames  - cluster subtypes that never occur in the results
#   malnames - result names that are neither a known subtype nor a subclass
# The two sorted vectors are paired by position.
# NOTE(review): the positional pairing assumes that sorting puts each misspelt
# name next to its intended cluster name - confirm against the data.
result.types <- unique(results$CellType)
benames <- sort(as.character(clusters$subtype[!clusters$subtype %in% result.types]))
malnames <- sort(result.types[!result.types %in% clusters$subtype &
                              !result.types %in% clusters$subclass])
if (length(benames) != length(malnames)) {
  stop("Cannot pair cell type names: ", length(malnames),
       " unmatched name(s) in the results vs ", length(benames),
       " unused cluster name(s).", call. = FALSE)
}
mis.names <- setNames(benames, malnames)
# Replace whole values only. The previous stringr::str_replace_all() call
# treated every name as a regular expression and rewrote substrings, so a name
# that is a prefix of another (or contains regex metacharacters) could be
# altered incorrectly.
to.rename <- results$CellType %in% names(mis.names)
results$CellType[to.rename] <- unname(mis.names[results$CellType[to.rename]])
# Drop rows in which every statistic is missing.
stat.cols <- c('p', 'fold_change', 'sd_from_mean', 'q', 'hit.cells')
results <- results[rowSums(!is.na(results[, stat.cols])) > 0, ]


# List identifiers look like "<source>::<name>" or just "<name>". Split them
# once and extract both parts with vapply(), so the new columns are always
# character vectors (sapply() changes its return type on empty input).
list.parts <- strsplit(as.character(results$list), '::', fixed = TRUE)
results$Disease.source <- vapply(list.parts,
                                 function(x) if (length(x) > 1) x[[1]] else '',
                                 character(1))
results$Disease.list <- vapply(list.parts,
                               function(x) if (length(x) > 1) x[[2]] else x[[1]],
                               character(1))
# Facet group: MAGMA and DISGENET keep their own panel, every other source
# (including lists without a source prefix) goes to 'ASD + DD + GWAS'.
results$Disease.group <- ifelse(results$Disease.source == 'MAGMA', 'MAGMA',
                                ifelse(results$Disease.source == 'DISGENET', 'DISGENET',
                                       'ASD + DD + GWAS'))
results$Disease.group <- factor(results$Disease.group,
                                levels = c('ASD + DD + GWAS', 'MAGMA', 'DISGENET'))
# Full list name: "<source>::<name>", or the bare name when there is no source.
results$Disease.listname <- ifelse(results$Disease.source == '',
                                   results$Disease.list,
                                   paste(results$Disease.source, results$Disease.list, sep = '::'))
# Exclude the 'mixed gliomas' list.
results <- results[!results$Disease.list %in% c('mixed gliomas'), ]
# Levels sorted in decreasing order of the disease names.
results$Disease.list <- factor(results$Disease.list,
                               levels = sort(unique(results$Disease.list), decreasing = TRUE))

# Export diseases
#
# One row per disease list, with columns Disease.listname, Disease.group and
# Disease.name, ordered GWAS -> MAGMA -> DISGENET and alphabetically by name
# within each group. The table is written to data/DiseaseOrderFinal.csv.

disease.df <- data.frame(Disease.listname = unique(results$Disease.listname))
name.parts <- strsplit(as.character(disease.df$Disease.listname), split = '::', fixed = TRUE)
# Only MAGMA and DISGENET form their own group; everything else is 'GWAS'.
# Previously any other "<source>::<name>" entry was given its source as the
# group and then dropped by the group subset below, unlike the grouping used
# for `results$Disease.group`, which keeps such lists in the first group.
disease.df$Disease.group <- vapply(name.parts, function(x) {
  if (length(x) == 2 && x[1] %in% c('MAGMA', 'DISGENET')) x[1] else 'GWAS'
}, character(1))
disease.df$Disease.name <- vapply(name.parts, function(x) x[length(x)], character(1))
group.order <- c('GWAS', 'MAGMA', 'DISGENET')
disease.df <- disease.df[order(match(disease.df$Disease.group, group.order),
                               disease.df$Disease.name), ]
rownames(disease.df) <- NULL

disease.df
write.csv(disease.df, 'data/DiseaseOrderFinal.csv')


head(results)
summary(results)

# Heatmap


In [None]:
#### Per type (LEVEL 1)

# Level-1 (cell type) results with gene-size control switched off.
l1_f_res <- results[results$annotLevel == 1 & results$geneSizeControl == FALSE, ]
# Keep the subclass order of `clusters`, restricted to types that are present.
l1_f_res$CellType <- factor(l1_f_res$CellType,
                            levels = intersect(levels(clusters$subclass), l1_f_res$CellType))

# Heat map of SD-from-mean per cell type and disease. Tiles with q >= 0.05 are
# fully transparent, and the colour scale starts at the smallest value found
# among the tiles with q < 0.05.
l1_f_htmp <- ggplot(data = l1_f_res,
                    mapping = aes(x = CellType, y = Disease.list,
                                  fill = sd_from_mean,
                                  alpha = `q-value < 0.05`)) +
    geom_tile(color = 'darkgrey') +
    facet_grid(rows = 'Disease.group', space = 'free', scales = 'free') +
    scale_alpha_manual(values = c('TRUE' = 1, 'FALSE' = 0)) +
    scale_fill_viridis(
      limits = c(min(l1_f_res$sd_from_mean[which(l1_f_res$`q-value < 0.05`)], na.rm = TRUE),
                 NA)) +
    labs(y = "Disease", x = "Cell type", fill = 'SD from mean') +
    ggtitle(label = 'EWCE results of cell types',
            subtitle = 'NOT controlling for GC content and gene length') +
    theme_pubr(base_family = 'Arial', legend = 'right') + labs_pubr() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
          panel.background = element_rect(fill = "#F2F2F2"))
          # , panel.grid.major = element_line(size = 0.5, linetype = 'solid', colour = "darkgrey"))


# Show the figure at 10 x 10, then save it as a PDF of the same size.
fig(10, 10)
l1_f_htmp
ggsave(filename = 'EWCE results of cell types - Not controlled.pdf',
       plot = l1_f_htmp,
       width = 10, height = 10, useDingbats = FALSE)



#### Per subtype (LEVEL 2)

# Level-2 (cell subtype) results with gene-size control switched off.
l2_f_res <- results[results$annotLevel == 2 & results$geneSizeControl == FALSE, ]
# Keep the subtype order of `clusters`, restricted to subtypes that are present.
l2_f_res$CellType <- factor(l2_f_res$CellType,
                            levels = intersect(levels(clusters$subtype), l2_f_res$CellType))
# Subclass of every subtype, looked up through the row names of `clusters`.
l2_f_res$Type <- clusters$subclass[match(as.character(l2_f_res$CellType), rownames(clusters))]

fig(25, 13)
# Heat map of SD-from-mean per subtype and disease, with one column facet per
# subclass. Tiles with q >= 0.05 are fully transparent, and the colour scale
# starts at the smallest value found among the tiles with q < 0.05.
l2_f_htmp <- ggplot(data = l2_f_res,
                    mapping = aes(x = CellType, y = Disease.list,
                                  fill = sd_from_mean,
                                  alpha = `q-value < 0.05`)) +
    geom_tile(color = 'darkgrey') +
    facet_grid(rows = vars(Disease.group), cols = vars(Type), space = 'free', scales = 'free') +
    scale_alpha_manual(values = c('TRUE' = 1, 'FALSE' = 0)) +
    scale_fill_viridis(
      limits = c(min(l2_f_res$sd_from_mean[which(l2_f_res$`q-value < 0.05`)], na.rm = TRUE),
                 NA)) +
    labs(y = "Disease", x = "Cell subtype", fill = 'SD from mean') +
    ggtitle(label = 'EWCE results of cell subtypes',
            subtitle = 'NOT controlling for GC content and gene length') +
    theme_pubr(base_family = 'Arial', legend = 'bottom') + labs_pubr() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
          strip.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5),
          panel.background = element_rect(fill = "#F2F2F2"))
          # , panel.grid.major = element_line(size = 0.5, linetype = 'solid', colour = "darkgrey"))


# Show the figure at 26 x 13; the saved PDF is 26 x 15.
fig(26, 13)
l2_f_htmp
ggsave(filename = 'EWCE results of cell subtypes - Not controlled.pdf',
       plot = l2_f_htmp,
       width = 26, height = 15, useDingbats = FALSE)

# manhattan


In [None]:
## Manhattan plots for EWCE results
library(dplyr)
library(ggplot2)

# Level-2 EWCE results without gene-size control, excluding the
# "DISGENET::mixed gliomas" list. Cell type names get spaces instead of
# underscores and their hyphens restored; `mlogp` is -log10(q), capped at 5.
ewce <- read.csv(file = "EWCE_results.20000_reps.dsInf.csv") %>%
  filter(annotLevel == 2L,
         geneSizeControl == FALSE,
         list != "DISGENET::mixed gliomas") %>%
  mutate(CellType = gsub("_", " ", CellType),
         CellType = gsub("NKX2 1", "NKX2-1", CellType),
         CellType = gsub("NKX6 2", "NKX6-2", CellType),
         CellType = gsub("PAT related", "PAT-related", CellType),
         mlogp = pmin(-log10(q), 5))

In [None]:
## Load cell class information
# Cluster table for the Manhattan plot: rows labelled "remove" are dropped,
# cluster names are adjusted, and the "Mesenchymal" label is shortened to
# "Meschy".
# NOTE(review): the palettes defined later in this notebook use names such as
# `Mes` and `enIPC`, which the labels produced here ("Meschy", ...) may not
# match now that the relabelling steps below are commented out - confirm.
ord_tb <- read.table(file = "raw/all.nhp.cbn.v6.txt", sep = "\t",
                     stringsAsFactors = FALSE, header = TRUE) %>%
  filter(label != "remove") %>%
  mutate(cluster = gsub("_", " ", cluster),
         cluster = gsub("DLX1 ASCL1", "ASCL1 DLX1", cluster),
         cluster = gsub("InN ASCL1", "IPC ASCL1", cluster),
         label = gsub("Mesenchymal", "Meschy", label))
# Disabled relabelling steps, kept for reference:
#   mutate(label = gsub('nIPC', "enIPC", label, fixed=T)) %>%
#   mutate(label = gsub('InN IPC', "inIPC", label)) %>%
#   mutate(label = gsub('^InN$', "Inhibitory neurons", label)) %>%
#   mutate(label = gsub('Vascular', "RB&Vascular", label)) %>%
#   mutate(label = gsub('^CR$', "Cajal Retzius cells", label)) %>%
#   mutate(label = gsub("Others", "Early clusters", label))
unique(ord_tb$label)

In [None]:
# Per-point annotations for the Manhattan plot:
#   class   - label of the matching cluster in `ord_tb` (NA when unmatched)
#   label   - cell type name for points with q <= 0.05, NA otherwise
#   disease - copy of the list identifier
#   size    - 2.5 where mlogp >= -log10(0.05), 1 otherwise
# Earlier variant of `label`: ifelse(mlogp >= -log10(0.05), CellType, NA)
ewce <- ewce %>%
  mutate(class = ord_tb$label[match(CellType, ord_tb$cluster)],
         label = ifelse(q <= 0.05, CellType, NA),
         disease = .data$list,
         size = ifelse(mlogp >= -log10(0.05), 2.5, 1))

In [None]:
## These are the colors used in MF1, keyed by cell class.
group_colors.sign <- c(
	`Patterning centers`   = "#821f44",
	`dorsal NSC`           = "#f573ee",
	enIPC                  = "#7ca4f9",
	`Excitatory neurons`   = "#2166ac",
	`CR`                   = "#bccf42",
	`GE NSC`               = "#f1b6da",
	inIPC                  = "#7fe63e",
	`Inhibitory neurons`   = "#0e9c23",
	gIPC                   = "#ffc277",
	Astro                  = "#e08214",
	`OPC&Oligo`            = "#ad630a",
	Mes                    = "#6aada3",
	Immune                 = "#7a7878",
	`RB&Vas`               = "#525759",
	`PAT-related subtypes` = "#fa3980"
)

## Palette for the disease main figure (MF7-A): the non-significant subclasses
## are set to light grey (#D3D3D3), and "Immune" and "CR" (Cajal Retzius cells)
## get different colors for better contrast.
## If a light-grey class becomes significant in an updated dataset, its color
## can be restored from `group_colors.sign` above.
group_colors <- replace(
	group_colors.sign,
	c("Patterning centers", "dorsal NSC", "GE NSC", "Mes", "RB&Vas", "PAT-related subtypes"),
	"#D3D3D3"
)
group_colors[c("CR", "Immune")] <- c("#AF9AE2", "#e05085")

In [None]:
# Rebuild the palette from the data: a class that has at least one labelled
# (q <= 0.05) point takes its color from `group_colors.sign`; every other
# class becomes light grey. The names and their order are those of the
# existing `group_colors`.
# (Earlier variant: take the color from `group_colors` instead of '#D3D3D3'.)
group_colors <- local({
  palette.names <- names(group_colors)
  has.hit <- palette.names %in% unique(ewce$class[!is.na(ewce$label)])
  recolored <- rep('#D3D3D3', length(palette.names))
  recolored[has.hit] <- group_colors.sign[palette.names][has.hit]
  setNames(recolored, palette.names)
})
group_colors

In [None]:
## Define the disease order (taken from the exported disease table)
dis_df <- disease.df
dis_ord <- dis_df$Disease.listname

## Facet category: the source prefix for MAGMA / DISGENET lists, and
## "SFARI + GWAS" for everything else. The prefix is compared exactly (the
## earlier grepl() also matched names that merely contain "MAGMA"/"DISGENET").
ewce$category <- vapply(strsplit(as.character(ewce$disease), "::", fixed = TRUE),
                        `[[`, character(1), 1)
ewce$category[!ewce$category %in% c("DISGENET", "MAGMA")] <- "SFARI + GWAS"
ewce$category <- factor(ewce$category, levels = c("SFARI + GWAS", "MAGMA", "DISGENET"))
ewce$disease <- factor(as.character(ewce$disease), levels = dis_ord)

## Manhattan-style plot: one jittered point per cell type and disease,
## y = -log10(q) (capped upstream), colored by cell class.
## Jitter is horizontal only (height = 0): left unset, position_jitter() also
## perturbs y, so plotted points would no longer sit at their true -log10(q).
## Axis labels show the part of each list name after the last "::".
## `size` in element_line() is kept for compatibility with older ggplot2
## versions (newer releases prefer `linewidth`).
p <- ggplot(ewce, mapping = aes(x = disease, y = mlogp, label = label, color = class)) +
            geom_jitter(aes(size = size), shape = 16, alpha = 0.9,
                        position = position_jitter(seed = 1, width = 0.2, height = 0)) +
            scale_size_identity() +
            scale_color_manual(values = group_colors) +
            theme_classic() +
            scale_x_discrete(breaks = dis_ord,
                             labels = vapply(strsplit(dis_ord, "::", fixed = TRUE),
                                             function(x) rev(x)[1], character(1))) +
            facet_grid(cols = vars(category), scales = "free_x", space = "free_x") +
            theme(legend.position = "bottom",
                axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = rel(1.2)),
                axis.line = element_line(size = 0.2),
                axis.ticks = element_line(size = 0.2))

                                                                                                         

In [None]:
# Write the Manhattan plot to a 15 x 6.4 inch PDF. The device is closed in a
# `finally` handler so that a failure while drawing does not leave the PDF
# device open and capture later plots.
pdf("EWCE_Manhattan_v4.pdf", width = 15, height = 6.4, useDingbats = FALSE)
invisible(tryCatch(print(p), finally = dev.off()))

In [None]:
# Preview at 10 x 10 with a horizontal line at -log10(0.05).
fig(width = 10, heigth = 10)
p + geom_hline(yintercept = -log10(0.05))