# Plots

In [1]:
library(dplyr)
library(ggplot2)
library(ggpubr)
library(ggh4x)
library(ggrepel)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
figsize <- function(x,y){
    options(repr.plot.width=x, repr.plot.height=y)
}

In [None]:
# Define the path to the graphical data file
graphical_data_path <- '/users/genomics/xoel/codebases/cortical_disorders2/data/graphical.rda'
# Load graphical objects (e.g., color palettes) from an R data file (.rda)
load(graphical_data_path, verbose=T)

Loading objects:
  GeneAnnotation
  meta.colors
  facets


In [None]:
# Define base output directories
output_base_dir <- './'
selection_subdir <- 'selection'
selection_dir <- file.path(output_base_dir, selection_subdir)

In [4]:
selection.export <- list()

# Diseases

In [None]:
# Define input and output paths for disease data
# Input: CSV file mapping genes (rows) to diseases (columns)
disease_input_csv_path <- '/users/genomics/xoel/codebases/gene_disease_asociation/parsed_lists_with_nicola.union.csv'
disease_output_csv_path <- '/users//genomics//xoel//Review_CD//1.exp_linnarson//DiseasesForJay.csv'

# Read, process, and write disease data
Diseases <- read.csv(disease_input_csv_path, row.names=1)
DiseaseExclude <- c('IQ_2018')
Diseases <- Diseases[,!colnames(Diseases) %in% DiseaseExclude]
Diseases <- Diseases[rowSums(Diseases) > 0 , ]
Diseases$All <- as.numeric(rowSums(Diseases)>0)

write.csv(Diseases, disease_output_csv_path)
dis.list <- lapply(colnames(Diseases), function(x){rownames(Diseases)[as.logical(Diseases[,x])]}) %>% setNames(nm = colnames(Diseases))
# dis.list %>% lapply(head)

In [None]:
dis.info <- list(
    'All' = c('All'='All'),
    
    'Cortical Malformations' = c(
        'Microcephaly' = 'MIC',
        'Lissencephaly' = 'LIS',
        'Cobblestone'= 'COB', 
        'Heterotopia' = 'HET',
        'Polymicrogyria' = 'POLY', 
        'Hydrocephaly' = 'HYD', 
        'RareMCD' = 'MCD',
        'FCDandmTOR' = 'FCD & mTOR',
        'DevDyslexia' = 'DevDys'
    ),
    'Dev.Delay-ASD' = c(
        'DD' = 'DevDel',
        'SFAR_Synd' = 'SFAR (SYND)',
        'SFAR_S1' = 'SFAR (S1)',
        'SFAR_S2' = 'SFAR (S2)', 
        'SFAR_S3' = 'SFAR (S3)',
        'ASD_HC65' = 'ASD HC65'
    ),
    'GWAS Hits' = c(
        'ASD_2019' = 'ASD 2019', 
        'ADHD_2019' = 'ADHD 2019', 
        'AN_2019' = 'AN 2019',
        'SCZ_2020' = 'SCZ 2020',
        'BD_2019' = 'BD 2019',
        'MDD_2018' = 'MDD 2018',
        'NEUROT_2018' = 'NEUROT 2018',
        'PD_2014' = 'PD 2014',
        'AD_2019' = 'AD 2019'
    )
)

dis.groups <- sapply(names(dis.info), function(x){
    diss <- dis.info[[x]]
    rep(x, length.out = length(diss)) %>% setNames(nm=names(diss))
}) %>% unname() %>% unlist()

dis.levels <- unlist(unname(dis.info))



setNames(names(dis.list) %in% unlist(lapply(dis.info, names)), nm=names(dis.list)) %>% all # TRUE

duplicated(unlist(lapply(dis.info, names))) %>% sum # 0

In [7]:
pal.DiseaseGroup <- setNames(wesanderson::wes_palettes$FantasticFox1[c(3,2,4,5)],
                             c('All','Cortical Malformations','Dev.Delay-ASD','GWAS Hits'))

# Expression in vitro

In [None]:
# Define path for in vitro expression data
# Input: CSV file with log2 RPKM expression values (genes as rows, samples as columns)
invitro_exp_data_path <- '/users/genomics/xoel/codebases/cortical_disorders2/data/nico_expression_log2.original_genes.csv'
# Read in vitro expression data
exp.data <- read.csv(invitro_exp_data_path, row.names=1)

In [None]:
exp.df <- reshape2::melt(exp.data %>% mutate(Gene = rownames(exp.data)), id.vars = 'Gene') %>% rename(Sample = variable)

In [11]:
lapply(names(dis.list), function(dis.name){

    dis.genes <- dis.list[[dis.name]]
    
    # print(sum(dis.genes %in% unique(exp.df$Gene)))
    # print(sum(!dis.genes %in% unique(exp.df$Gene)))
    
    exp.df %>% subset(Gene %in% dis.genes) %>% group_by(Gene) %>%
    summarise(expressed=any(value>1)) %>% ungroup() %>%
    summarise(
        n.exp = sum(expressed),
        # n.lowerthan1 = sum(value <1), 
        n.present = sum(dis.genes%in%Gene),
        n.NOTpresent = sum(!dis.genes%in%Gene),
        Disease = dis.name
    )
    }) %>% do.call(what='rbind') %>% mutate(
        DiseaseLabel = factor(Disease, levels = names(dis.levels), labels = dis.levels),
        DiseaseGroup = plyr::mapvalues(x=Disease, from = names(dis.groups), to = dis.groups) %>% factor(levels = names(dis.info)),
        Dataset = factor('in vitro', 
                         levels = c('in vivo', 'in vitro'),
                         labels = c('in vivo\n(Braun et al)', 'in vitro\n(Micali et al)'))
    ) %>% arrange(DiseaseGroup, DiseaseLabel) -> dis.gene.exp 


In [None]:
# Define output paths for disease info
disease_info_output_path <- '/users//genomics//xoel//Review_CD//1.exp_linnarson//DiseaseInfo.csv'
disease_info_jay_output_path <- '/users//genomics//xoel//Review_CD//1.exp_linnarson//DiseaseInfoForJay.csv'

# save disease info
write.csv(dis.gene.exp, disease_info_output_path)
write.csv(dis.gene.exp[,(4:6)] %>% mutate(index = 1:nrow(dis.gene.exp)), disease_info_jay_output_path)

# Expression in vivo  
Data is collected in a separate script since the dataset is giant and it is easier to manage using python

In [None]:
# Define path for in vivo expression data
# Input: Pre-processed CSV summarizing disease gene expression in vivo (Linnarsson data)
invivo_exp_data_path <- '/users//genomics//xoel//Review_CD//1.exp_linnarson//disease_expression.get5pct.Linnarsson.RadialGlia.FT.FCST.csv'

# Read in vivo expression data
linnarson <- read.csv(
    invivo_exp_data_path,
    row.names = 1) %>% mutate(
        DiseaseLabel = factor(DiseaseLabel, levels = levels(dis.gene.exp$DiseaseLabel)),
        DiseaseGroup = factor(DiseaseGroup, levels = levels(dis.gene.exp$DiseaseGroup)),
        Dataset = factor('in vivo', 
                         levels = c('in vivo', 'in vitro'),
                         labels = c('in vivo\n(Braun et al)', 'in vitro\n(Micali et al)'))
    )
head(linnarson)

Unnamed: 0_level_0,n.exp,n.present,n.NOTpresent,Disease,DiseaseLabel,DiseaseGroup,Dataset
Unnamed: 0_level_1,<int>,<int>,<int>,<chr>,<fct>,<fct>,<fct>
0,171,286,14,NEUROT_2018,NEUROT 2018,GWAS Hits,in vivo (Braun et al)
1,24,35,3,ADHD_2019,ADHD 2019,GWAS Hits,in vivo (Braun et al)
2,39,64,6,MDD_2018,MDD 2018,GWAS Hits,in vivo (Braun et al)
3,137,209,9,BD_2019,BD 2019,GWAS Hits,in vivo (Braun et al)
4,63,98,9,AN_2019,AN 2019,GWAS Hits,in vivo (Braun et al)
5,74,179,13,AD_2019,AD 2019,GWAS Hits,in vivo (Braun et al)


In [14]:
dim(dis.gene.exp)

In [None]:
# Combine in vivo (linnarson) and in vitro (dis.gene.exp) data
dis.gene.exp.combined <- rbind(linnarson, dis.gene.exp)

In [None]:
# Calculate derived metrics on the combined data
dis.gene.exp.combined <- dis.gene.exp.combined %>% mutate(
        EoverP = n.exp/n.present,
        EoverT = n.exp/(n.present+n.NOTpresent),
        NPoverT = n.NOTpresent/(n.present+n.NOTpresent)
)

In [None]:
figsize(8,7)

# Create plots based on the combined data
plist <- list(
    p1 = ggplot(dis.gene.exp.combined, aes(x = EoverP, y = forcats::fct_rev(DiseaseLabel))),
    p2 = ggplot(dis.gene.exp.combined, aes(x = EoverT, y = forcats::fct_rev(DiseaseLabel)))

)

plist <- lapply(plist, function(p){
    p +
    geom_bar(stat='identity', fill='#92A8D1') + 
    facet_grid(rows=vars(DiseaseGroup), cols = vars(Dataset), switch = 'y', scales = 'free_y', space = 'free_y') + 
    theme_light() +
    ggtitle('Expression of disease-risk genes in NSC',
            subtitle = 'Minimum expression in vivo: ≥5% of RG cells\nMinimum expression in vitro: ≥1RPKM') +
    ggpubr::labs_pubr() +
    scale_x_continuous(labels = scales::label_percent(), expand = expansion(mult=c(0,0)), limits = c(0,1)) +
    labs(x = '% expressed genes over', y = 'Disease') +
    theme(
        strip.placement = 'outside',
        strip.text = element_text(color = 'black', angle=0),
        strip.text.y.left = element_text(color = 'black', angle=0),
        strip.background = element_rect(fill='#DDDDDD', color = 'transparent'),
        legend.position = 'left', 
        axis.text.x = element_text(angle = 90, hjust = 1, vjust=0.5), 
        panel.grid.major.y = element_blank(), 
        panel.border = element_blank(),
        panel.spacing.y = unit(1, 'mm'),
        panel.spacing.x = unit(4, 'mm')
        )
})

# Define output PDF path
output_pdf_v1 <- file.path(output_base_dir, 'Expression_of_disease_genes.Braun_and_Micali.v1.FT_FCST.pdf')
cairo_pdf(output_pdf_v1, onefile = T, height = 7, width = 8)
plist
dev.off()

selection.export$F1.Supp.Barplot.EoverP <- plist$p2

$p1

$p2


In [None]:
# Reshape the combined data for plotting
dis.gene.stats <- reshape2::melt(dis.gene.exp.combined, id.vars = c('Disease', 'DiseaseLabel',	'DiseaseGroup', 'Dataset'))
head(dis.gene.stats)

Unnamed: 0_level_0,Disease,DiseaseLabel,DiseaseGroup,Dataset,variable,value
Unnamed: 0_level_1,<chr>,<fct>,<fct>,<fct>,<fct>,<dbl>
1,NEUROT_2018,NEUROT 2018,GWAS Hits,in vivo (Braun et al),n.exp,171
2,ADHD_2019,ADHD 2019,GWAS Hits,in vivo (Braun et al),n.exp,24
3,MDD_2018,MDD 2018,GWAS Hits,in vivo (Braun et al),n.exp,39
4,BD_2019,BD 2019,GWAS Hits,in vivo (Braun et al),n.exp,137
5,AN_2019,AN 2019,GWAS Hits,in vivo (Braun et al),n.exp,63
6,AD_2019,AD 2019,GWAS Hits,in vivo (Braun et al),n.exp,74


In [None]:
pdata <- dis.gene.stats %>% subset(variable %in% c('EoverT', 'NPoverT')) %>% 
    mutate(variable = factor(variable, levels = c('NPoverT', 'EoverT'), labels = c('Not in data', 'Expressed')))

p <- ggplot(pdata, aes(x = value, fill = variable, y = forcats::fct_rev(DiseaseLabel))) +
    geom_bar(stat='identity') + 
    scale_fill_manual('Expressed', values = c('lightgrey', '#92A8D1')) +
    facet_grid(rows=vars(DiseaseGroup), cols = vars(Dataset), switch = 'y', scales = 'free_y', space = 'free_y') + 
    # scale_fill_manual(values=meta.colors$PeakSample) +
    theme_light() +
    ggtitle('Expression of disease-risk genes in NSC',
            subtitle = 'Minimum expression in vivo: ≥5% of RG cells\nMinimum expression in vitro: ≥1RPKM') +
    ggpubr::labs_pubr() +
    scale_x_continuous(labels = scales::label_percent(), expand = expansion(mult=c(0,0)), limits = c(0,1)) +
    labs(x = '% expressed genes', y = 'Disease') +
    theme(
        strip.placement = 'outside',
        strip.text = element_text(color = 'black', angle=0),
        strip.text.y.left = element_text(color = 'black', angle=0),
        strip.background = element_rect(fill='#DDDDDD', color = 'transparent'),
        legend.position = 'right', 
        axis.text.x = element_text(angle = 90, hjust = 1, vjust=0.5), 
        panel.grid.major.y = element_blank(), 
        panel.border = element_blank(),
        panel.spacing.y = unit(1, 'mm'),
        panel.spacing.x = unit(4, 'mm')
        )
# Define output PDF path
output_pdf_v2 <- file.path(output_base_dir, 'Expression_of_disease_genes.Braun_and_Micali.v2.FT_FCST.pdf')
cairo_pdf(output_pdf_v2, onefile = T, height = 7, width = 10)
p
dev.off()



p <- ggplot(pdata, aes(x = value, fill = DiseaseGroup, alpha = variable, y = forcats::fct_rev(DiseaseLabel))) +
    geom_bar(stat='identity', color = 'darkgrey', linewidth = 0.3) + 
    scale_fill_manual(values = pal.DiseaseGroup) +
    facet_grid(rows=vars(DiseaseGroup), cols = vars(Dataset), switch = 'y', scales = 'free_y', space = 'free_y') + 
    # scale_fill_manual(values=meta.colors$PeakSample) +
    theme_light() +
    ggtitle('Expression of disease-risk genes in NSC',
            subtitle = 'Minimum expression in vivo: ≥5% of RG cells\nMinimum expression in vitro: ≥1RPKM') +
    ggpubr::labs_pubr() +
    scale_x_continuous(labels = scales::label_percent(), expand = expansion(mult=c(0,0)), limits = c(0,1)) +
    labs(x = '% expressed genes', y = 'Disease', alpha = 'Expressed', fill='Disease group') +
    theme(
        strip.placement = 'outside',
        strip.text = element_text(color = 'black', angle=0),
        strip.text.y.left = element_text(color = 'black', angle=0),
        strip.background = element_rect(fill='#DDDDDD', color = 'transparent'),
        legend.position = 'right', 
        axis.text.x = element_text(angle = 90, hjust = 1, vjust=0.5), 
        panel.grid.major.y = element_blank(), 
        panel.border = element_blank(),
        panel.spacing.y = unit(1, 'mm'),
        panel.spacing.x = unit(4, 'mm')
        )
# Define output PDF path
output_pdf_v3 <- file.path(output_base_dir, 'Expression_of_disease_genes.Braun_and_Micali.v3.FT_FCST.pdf')
cairo_pdf(output_pdf_v3, onefile = T, height = 7, width = 10)
p
dev.off()


“[1m[22mUsing alpha for a discrete variable is not advised.”


In [20]:
corrdata <- reshape2::dcast(
    dis.gene.stats,
    formula = 'Disease + DiseaseLabel + DiseaseGroup + variable ~ Dataset',
    value.var = 'value')

In [None]:
pdata <- subset(corrdata, variable  == 'EoverP')
p <- ggplot(pdata, 
       aes(x = `in vivo\n(Braun et al)`,
           y = `in vitro\n(Micali et al)`,
           fill = DiseaseGroup, 
           color = DiseaseGroup, 
           label = DiseaseLabel
           )) + 
    theme_light() +
    ggtitle('Expression of disease-risk genes in NSC',
            subtitle = 'Minimum expression in vivo: ≥5% of RG cells\nMinimum expression in vitro: ≥1RPKM') +
    ggpubr::labs_pubr() +
    scale_x_continuous(labels = scales::label_percent(), limits = c(0,1), expand = expansion(mult = c(0,0))) +
    scale_y_continuous(labels = scales::label_percent(), limits = c(0,1), expand = expansion(mult = c(0,0))) +
    # scale_fill_manual(values = c('yellow', 'red', 'blue', 'green')) +
    scale_fill_manual('Disease group', values = pal.DiseaseGroup) +
    scale_color_manual('Disease group', values = pal.DiseaseGroup) +
    # labs(x = '% expressed genes', y = 'Disease') +
    theme(
        strip.placement = 'outside',
        strip.text = element_text(color = 'black', angle=0),
        strip.text.y.left = element_text(color = 'black', angle=0),
        strip.background = element_rect(fill='#DDDDDD', color = 'transparent'),
        legend.position = 'right', 
        axis.text.x = element_text(angle = 90, hjust = 1, vjust=0.5), 
        # panel.grid.major.y = element_blank(), 
        # panel.border = element_blank(),
        panel.spacing.y = unit(1, 'mm'),
        panel.spacing.x = unit(4, 'mm')
        ) + 
    coord_equal()


p1 <- p +
    geom_point(stroke = .5, color = 'black', shape = 21, size = 3) + 
    geom_text_repel(color = 'black', min.segment.length = 0, max.overlaps = Inf)
    
p2 <- p +
    geom_text_repel(min.segment.length = 0, max.overlaps = Inf)

p3 <- p +
    geom_smooth(inherit.aes = F, 
                aes(x = `in vivo\n(Braun et al)`,
                    y = `in vitro\n(Micali et al)`),
                method = 'lm', formula = 'y~x', fullrange=TRUE) +
    geom_point() 

p4 <- p3 + geom_text_repel(color = 'black', min.segment.length = 0, max.overlaps = Inf) +
    scale_x_continuous(labels = scales::label_percent(), limits = c(0,1), expand = expansion(mult = c(0,0.1))) +
    scale_y_continuous(labels = scales::label_percent(), limits = c(0,1), expand = expansion(mult = c(0,0.1))) 

# Define output PDF path
output_pdf_subpanel <- file.path(output_base_dir, 'Expression_of_disease_genes.EoverP.Braun_and_Micali.subpanel.FT_FCST.pdf')
cairo_pdf(output_pdf_subpanel, onefile = T, height = 7.5, width = 8.5)

p1 
p2
p3
p4

dev.off()
selection.export$F1.Main.CorrelationPlot.EoverP <- p4

[1m[22mScale for [32mx[39m is already present.
Adding another scale for [32mx[39m, which will replace the existing scale.
[1m[22mScale for [32my[39m is already present.
Adding another scale for [32my[39m, which will replace the existing scale.
“[1m[22mNo shared levels found between `names(values)` of the manual scale and the
data's [32mcolour[39m values.”
“[1m[22mRemoved 7 rows containing missing values or values outside the scale range
(`geom_smooth()`).”
“[1m[22mRemoved 7 rows containing missing values or values outside the scale range
(`geom_smooth()`).”


In [None]:
selection.export

In [None]:
# Create the selection directory if it doesn't exist
dir.create(selection_dir, showWarnings = FALSE, recursive = TRUE)

# Define output PDF paths for selection plots
selection_pdf_main <- file.path(selection_dir, 'F1.Main.CorrelationPlot.EoverP.FT_FCST.pdf')
selection_pdf_supp <- file.path(selection_dir, 'F1.Supp.Barplot.EoverP.FT_FCST.pdf')

# Save main correlation plot
cairo_pdf(selection_pdf_main, onefile = T, height = 7.5, width = 8.5)
plot(selection.export$F1.Main.CorrelationPlot.EoverP)
dev.off()

# Save supplementary bar plot
cairo_pdf(selection_pdf_supp, onefile = T, height = 7, width = 8)
plot(selection.export$F1.Supp.Barplot.EoverP)
dev.off()

“'selection' already exists”
“[1m[22mRemoved 7 rows containing missing values or values outside the scale range
(`geom_smooth()`).”
