In [1]:
wd <- '~/codebases/MacBrainDev/'
setwd(wd)
dir.create('raw')
dir.create('data')

“'raw' already exists”
“'data' already exists”


# Diseases

## Create table of disease-gene associations

### Look-up diseases in directory

In [2]:
diseases_dir <- 'raw/Disease_genes/'
dir(diseases_dir)

### DISGENET

Diseases to include,

In [5]:
disg.disnames.fname <- paste0('raw/Disgenet_forXoel.txt')
disg.disnames <- read.table(disg.disnames.fname, sep='\t', col.names=c('diseaseName', 'include'))

disgenet_diseases <- disg.disnames$diseaseName[as.logical(disg.disnames$include)]

Disgenet table

In [6]:
disgenet.fname <- paste0(diseases_dir, 'curated_gene_disease_associations.tsv')
disgenet.min_genes <- 30

disgenet <- read.csv(disgenet.fname, sep='\t', row.names=NULL)
disgenet[disgenet$diseaseName %in% c('Mixed oligoastrocytoma', 'oligodendroglioma'),'diseaseName'] <- 'M.Oligoastr+Oligodendrogliomas'

disgenet <- disgenet[disgenet$diseaseName %in% c(disgenet_diseases, 'M.Oligoastr+Oligodendrogliomas'),]

disgenet_genes <- lapply(split(disgenet$geneSymbol, disgenet$diseaseName), unique)
names(disgenet_genes) <- paste0('DISGENET::', names(disgenet_genes))

Meduloblastomas 

In [7]:
# Condense medulloblastomas
medullo <- unique(unlist(disgenet_genes[grep('(M|m)edullo', names(disgenet_genes), fixed=F, value=T)]))
disgenet_genes[grep('(M|m)edullo', names(disgenet_genes), fixed=F, value=T)] <- NULL
disgenet_genes[['DISGENET::Medulloblastomas']] <- medullo

Minimum number of genes

In [8]:
disgenet_genes <- disgenet_genes[sapply(disgenet_genes, length) > disgenet.min_genes]

Remove mixed gliomas

In [9]:
# Remove mixed gliomas
disgenet_genes['DISGENET::mixed gliomas'] <- NULL

### All other tables

In [10]:
dis_dfs <- setNames(lapply(grep('.txt', dir(diseases_dir), invert=F, value=T), function(x){read.table(paste0(diseases_dir,x))}),
                    sapply(grep('.txt', dir(diseases_dir), invert=F, value=T), tools::file_path_sans_ext))

dis_genes <- list()

for (dis in names(dis_dfs)){
    if (dis %in% c('ASD_CANCER','SFARI_DEVELOPMENT')){
        next
    }

    if (ncol(dis_dfs[[dis]]) > 1){
        colnames(dis_dfs[[dis]]) <- dis_dfs[[dis]][1,]
        dis_dfs[[dis]]<- dis_dfs[[dis]][-1,]
    } else {
        dis_genes[[dis]] <- as.character(dis_dfs[[dis]][,1])
        dis_dfs[[dis]] <- NULL
    }
}

DD

In [11]:
names(dis_genes) <- gsub('DD_genes', 'DD', names(dis_genes), fixed=T)

GWAS

In [12]:
dis_dfs[['GWAS_genes']] <- dis_dfs[['GWAS_genes']][order(dis_dfs[['GWAS_genes']]$Dataset),]
gwas_genes <- split(dis_dfs[['GWAS_genes']]$Gene, paste('GWAS', dis_dfs[['GWAS_genes']]$Dataset,
                                                        sep='::'))
dis_dfs[['GWAS_genes']] <- NULL

MAGMA

In [13]:
rownames(dis_dfs[['AllDiseasesMAGMA_GeneLevel_181220']]) <- dis_dfs[['AllDiseasesMAGMA_GeneLevel_181220']]$GENE_SYMBOL
dis_dfs[['AllDiseasesMAGMA_GeneLevel_181220']]$GENE_SYMBOL <- NULL
dis_dfs[['AllDiseasesMAGMA_GeneLevel_181220']]$GENE <- NULL
colnames(dis_dfs[['AllDiseasesMAGMA_GeneLevel_181220']]) <- paste('MAGMA', 
                                                                  colnames(dis_dfs[['AllDiseasesMAGMA_GeneLevel_181220']]),
                                                                  sep='::')

In [14]:
max.pval <- 0.05
top.magma <- 200

In [15]:
dis_dfs[['AllDiseasesMAGMA_GeneLevel_181220']] <- as.data.frame(apply(
    dis_dfs[['AllDiseasesMAGMA_GeneLevel_181220']], 
    2,
    as.numeric
    ), row.names=rownames(dis_dfs[['AllDiseasesMAGMA_GeneLevel_181220']]))

In [16]:
magma_genes <- lapply(
    dis_dfs[['AllDiseasesMAGMA_GeneLevel_181220']], 
    function(x){
        x<-setNames(x, rownames(dis_dfs[['AllDiseasesMAGMA_GeneLevel_181220']]))
        x <- x[order(x, na.last = TRUE, decreasing = FALSE)]
        x <- x[x < max.pval & !is.na(x)]
        head(names(x), min(length(x), top.magma))
    }
)

In [17]:
dis_dfs[['AllDiseasesMAGMA_GeneLevel_181220']] <- NULL

### Merge lists

In [18]:
dis_genes <- dis_genes[sort(names(dis_genes))]
disgenet_genes <- disgenet_genes[sort(names(disgenet_genes))]
gwas_genes <- gwas_genes[sort(names(gwas_genes))]
magma_genes <- magma_genes[sort(names(magma_genes))]

In [19]:
print('### SFARI + ASD + DD')
print(names(dis_genes))
print('### DISGENET')
print(names(disgenet_genes))
print('### GWAS')
print(names(gwas_genes))
print('### MAGMA')
print(names(magma_genes))

[1] "### SFARI + ASD + DD"
[1] "ASD_HC65"        "DD"              "SFARI_Score2"    "SFARI_Score3"   
[5] "SFARI_Syndromic"
[1] "### DISGENET"
[1] "DISGENET::Giant Cell Glioblastoma"       
[2] "DISGENET::Glioblastoma"                  
[3] "DISGENET::Glioblastoma Multiforme"       
[4] "DISGENET::Glioma"                        
[5] "DISGENET::M.Oligoastr+Oligodendrogliomas"
[6] "DISGENET::Malignant Glioma"              
[7] "DISGENET::Medulloblastomas"              
[8] "DISGENET::Neuroblastoma"                 
[1] "### GWAS"
 [1] "GWAS::AD_2019"     "GWAS::ADHD_2019"   "GWAS::AN_2019"    
 [4] "GWAS::ASD_2019"    "GWAS::BD_2019"     "GWAS::IQ_2018"    
 [7] "GWAS::MDD_2018"    "GWAS::NEUROT_2018" "GWAS::PD_2014"    
[10] "GWAS::SCZ_2020"   
[1] "### MAGMA"
 [1] "MAGMA::AD_2019"     "MAGMA::ADHD_2019"   "MAGMA::AN_2019"    
 [4] "MAGMA::ASD_2019"    "MAGMA::BD_2019"     "MAGMA::IQ_2018"    
 [7] "MAGMA::MDD_2018"    "MAGMA::NEUROT_2018" "MAGMA::OCD_2018"   
[10] "MAGMA::PD_2014"    

In [20]:
all_diseases <- c(dis_genes, gwas_genes, magma_genes, disgenet_genes)
names(all_diseases) <- gsub('_', ' ', names(all_diseases), fixed=T)



print('All')
print(names(all_diseases))

[1] "All"
 [1] "ASD HC65"                                
 [2] "DD"                                      
 [3] "SFARI Score2"                            
 [4] "SFARI Score3"                            
 [5] "SFARI Syndromic"                         
 [6] "GWAS::AD 2019"                           
 [7] "GWAS::ADHD 2019"                         
 [8] "GWAS::AN 2019"                           
 [9] "GWAS::ASD 2019"                          
[10] "GWAS::BD 2019"                           
[11] "GWAS::IQ 2018"                           
[12] "GWAS::MDD 2018"                          
[13] "GWAS::NEUROT 2018"                       
[14] "GWAS::PD 2014"                           
[15] "GWAS::SCZ 2020"                          
[16] "MAGMA::AD 2019"                          
[17] "MAGMA::ADHD 2019"                        
[18] "MAGMA::AN 2019"                          
[19] "MAGMA::ASD 2019"                         
[20] "MAGMA::BD 2019"                          
[21] "MAGMA::IQ 2018"         

In [21]:
saveRDS(all_diseases, 'data/all_diseases_list.rds')

In [22]:
sapply(all_diseases, head)

ASD HC65,DD,SFARI Score2,SFARI Score3,SFARI Syndromic,GWAS::AD 2019,GWAS::ADHD 2019,GWAS::AN 2019,GWAS::ASD 2019,GWAS::BD 2019,⋯,MAGMA::SCZ 2020,MAGMA::TS 2019,DISGENET::Giant Cell Glioblastoma,DISGENET::Glioblastoma,DISGENET::Glioblastoma Multiforme,DISGENET::Glioma,DISGENET::M.Oligoastr+Oligodendrogliomas,DISGENET::Malignant Glioma,DISGENET::Medulloblastomas,DISGENET::Neuroblastoma
CHD8,ARID1B,ACHE,ABAT,ACTB,PPOX,ARTN,SEMA3F,C8orf74,AC003102.1,⋯,DPYD,FLT3,JAG1,JAG1,JAG1,ACVR1,ACVR1,ACVR1,APC,ALK
SCN2A,ANKRD11,ADA,ABCA10,ACTL6B,B4GALT3,ATP6V0B,RBM5,CADPS,ACOT12,⋯,CACNA1C,BCL7A,APC,APC,APC,ALK,AR,APOD,ATOH1,ATRX
ARID1B,DDX3X,ADCY3,ABCA13,ACY1,ADAMTS4,ATP6V1E1P1,RBM6,KIZ,ADAMTSL3,⋯,PPP1R16B,CFAP99,BCHE,BCHE,ATRX,APC,BRAF,ATP1B2,BRCA2,BARD1
NRXN1,ADNP,AGAP2,ABCA7,AHI1,NDUFS2,B4GALT2,ZMYND10,KMT2E,ADAMTSL4,⋯,ARL6IP4,GABRG1,BMI1,BMI1,BCHE,APOD,DTX1,ATRX,CCNE1,BCHE
SYNGAP1,KMT2A,AGO1,ACE,ALDH1A3,FCER1G,CCDC24,CACNA2D2,LOC102723661,ADAMTSL4-AS1,⋯,CACNB2,TMEM236,RUNX1,RUNX1,BMI1,ATM,ESR1,BMI1,CDK6,CDKN2A
DYRK1A,SYNGAP1,AGO4,ACTN4,ANKS1B,APOA2,DUSP6,CELSR3,MACROD2,ADCK2,⋯,MAD1L1,LMO3,RUNX3,RUNX3,RUNX1,ATP1B2,ESR2,BRAF,CTNNB1,DBH


## Order diseases

### Clustering

In [23]:
order_diss <- function(dis.lists){
    all.dis.genes <- unique(unlist(dis.lists))
    dis.df <- t(data.frame(row.names=all.dis.genes, 
                           lapply(dis.lists, function(x){as.numeric(all.dis.genes %in% x)})))
    eq_cols <- setNames(names(dis.lists), rownames(dis.df))
    hc <- hclust(dist(dis.df, method = 'binary'))
    return(dis.lists[eq_cols[hc$labels[hc$order]]])
}

In [24]:
clustered_diseases <- order_diss(all_diseases)

In [25]:
subclustered_diseases <- c(
    order_diss(dis_genes),
    order_diss(gwas_genes),
    order_diss(magma_genes),
    order_diss(disgenet_genes))

names(subclustered_diseases) <- gsub('_', ' ', names(subclustered_diseases), fixed=T)

### Groups and palette

In [26]:
dis.groups <- c(setNames(rep('SFARI + GWAS', length(names(c(dis_genes, gwas_genes)))),
         names(c(dis_genes, gwas_genes))),
                setNames(rep('MAGMA', length(names(magma_genes))),
                         names(magma_genes)),
                setNames(rep('DISGENET', length(names(disgenet_genes))),
                         names(disgenet_genes)))

In [27]:
dis.class <- c("ASD HC65" = 'ASD',
"DD"='PSY',
"SFARI Score1"='ASD', 
"SFARI Score2"='ASD', 
"SFARI Score3"='ASD', 
"SFARI Syndromic"='ASD',
"GWAS::ADHD 2019"='PSY', 
"GWAS::AD 2019"='PSY', 
"GWAS::AN 2019"='PSY', 
"GWAS::ASD 2019"='ASD',
"GWAS::BD 2019"='NDD',
"GWAS::IQ 2018"='PSY',
"GWAS::MDD 2018"='PSY',
"GWAS::NEUROT 2018"='PSY',
"GWAS::PD 2014"='NDD',
"GWAS::SCZ 2020"='PSY',
"MAGMA::ASD 2019"='ASD',
"MAGMA::ADHD 2019"='PSY',
"MAGMA::SCZ 2020"='PSY',
"MAGMA::BD 2019"='NDD',
"MAGMA::OCD 2018"='PSY',
"MAGMA::AN 2019"='PSY',
"MAGMA::TS 2019"='PSY',
"MAGMA::MDD 2018"='PSY',
"MAGMA::NEUROT 2018"='PSY',
"MAGMA::IQ 2018"='PSY',
"MAGMA::AD 2019"='PSY',
"MAGMA::PD 2014"='PSY',
"DISGENET::Giant Cell Glioblastoma"='CANCER 1',
"DISGENET::Glioblastoma"='CANCER 1',
"DISGENET::Glioblastoma Multiforme"='CANCER 1',
"DISGENET::Glioma"='CANCER 1',
"DISGENET::M.Oligoastr+Oligodendrogliomas"='CANCER 2',
"DISGENET::Malignant Glioma"='CANCER 1',
"DISGENET::Neuroblastoma"='CANCER 2',
"DISGENET::mixed gliomas"='CANCER 1',
"DISGENET::Medulloblastomas"='CANCER 2')

library(RColorBrewer)
class.cols <- setNames(brewer.pal(n = length(unique(dis.class)), name='Accent'), unique(dis.class))
dis.class.cols <- setNames(class.cols[dis.class], names(dis.class))


dis.class.cols <- dis.class.cols[unlist(unname(split(names(dis.class.cols), dis.class.cols)))]

In [28]:
DiseasesOrdered <- list(
    alphabetical=all_diseases,
    clustered_together=clustered_diseases,
    clustered_bygroup=subclustered_diseases,
    groups=dis.groups,
    class=dis.class,
    class.colors=class.cols,
    dis.class.colors=dis.class.cols
)

In [29]:
saveRDS(DiseasesOrdered, 'data/DiseaseListsInfo.rds')

# RNA Assay

---
title: Data description
author: Shaojie Ma
date: August 31, 2022
---

## Updated seurat object


### Object name: All.MNN.v1.org.rds
```R
## Cell class
object@meta.data$subclass

## Cell subtype
object@meta.data$subtype

## Age
object@meta.data$cbnage

## Region information
object@meta.data$lobe
```


### Subtype order
Subtype order is stored in this file: all.nhp.cbn.v6.txt
You will just need the "cluster" column and the current order is the cell type order.


### Subtype Colors (for MF7-A)
```R
## This is the colors we used in MF1
group_colors <- c(`Patterning centers` = "#821f44", 
					`dorsal NSC` = "#f573ee",
					enIPC = "#7ca4f9",
					`Excitatory neurons` = "#2166ac",
					`CR` = "#bccf42",
					`GE NSC` = "#f1b6da",
					inIPC = "#7fe63e",
					`Inhibitory neurons` = "#0e9c23",
					gIPC = "#ffc277",
					Astro = "#e08214",
					`OPC&Oligo` = "#ad630a",
					Mes = "#6aada3",
					Immune = "#7a7878",
					`RB&Vas` = "#525759",
					`PAT-related subtypes` = "#fa3980")

## For the disease main figure (MF7-A), I changed these non-significant subclasses to "lightgrey(#D3D3D3)"
## Also for better contrasting, I changed the colors of "Immune" & "CR(Cajal Retzius cells)"
## In case that some "lightgrey" cell types have significant scores in the updated dataset, maybe you can update their colors based on the above color codes.
group_colors <- c(`Patterning centers` = "#D3D3D3", 
					`dorsal NSC` = "#D3D3D3",
					enIPC = "#7ca4f9",
					`Excitatory neurons` = "#2166ac",
					`CR` = "#AF9AE2",
					`GE NSC` = "#D3D3D3",
					inIPC = "#7fe63e",
					`Inhibitory neurons` = "#0e9c23",
					gIPC = "#ffc277",
					Astro = "#e08214",
					`OPC&Oligo` = "#ad630a",
					Mes = "#D3D3D3",
					Immune = "#e05085",
					`RB&Vas` = "#D3D3D3",
					`PAT-related subtypes` = "#D3D3D3")
```





In [28]:
celltype.info <- data.table::fread('raw/all.nhp.cbn.v6.txt')
celltype.info

cluster,subclass,label,sample
<chr>,<chr>,<chr>,<chr>
PC FGF17,rostral,Patterning centers,all
PC SFRP1,rostral,Patterning centers,all
PC NKX2-1 LMO1,rostral,Patterning centers,all
PC NKX2-1 NKX6-2,rostral,Patterning centers,all
GE RG NKX2-1 DLK1,ventral,Patterning centers,all
GE RG NKX2-1 OLIG1,ventral,Patterning centers,all
PC NKX2-1 RAX,caudal,Patterning centers,all
PC RSPO3,caudal,Patterning centers,all
PC TTR,caudal,Patterning centers,all
PC SFRP2,caudal,Patterning centers,all


In [29]:
object <- readRDS('raw/All.MNN.v1.org.rds')

In [30]:
object

Loading required package: Seurat

The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
which was just loaded, will retire in October 2023.
Please refer to R-spatial evolution reports for details, especially
https://r-spatial.org/r/2023/05/15/evolution4.html.
It may be desirable to make the sf package available;
package maintainers should consider adding sf to Suggests:.
The sp package is now running under evolution status 2
     (status 2 uses the sf package in place of rgdal)

Attaching SeuratObject



An object of class Seurat 
34619 features across 761529 samples within 1 assay 
Active assay: RNA (34619 features, 0 variable features)
 2 dimensional reductions calculated: mnn, umap

In [31]:
## Cell class
str(object@meta.data$subclass)

## Cell subtype
str(object@meta.data$subtype)

## Age
str(object@meta.data$cbnage)

## Region information
str(object@meta.data$lobe)


 chr [1:761529] "PAT-related subtypes" "dorsal NSC" "CR" ...
 chr [1:761529] "Cls FGF17 LGI1" "RGC FABP7 PMP22" "IPC RSPO3 NHLH1" ...
 chr [1:761529] "E37" "E37" "E37" "E37" "E37" "E37" "E37" "E37" "E37" "E37" ...
 chr [1:761529] "FC" "FC" "FC" "FC" "FC" "FC" "FC" "FC" "FC" "FC" "FC" "FC" ...


In object, subclass refers to label in the annotation, which is the subclass label.
In object, subtype refers to cluster in the annotation.

In [32]:
all(unique(object$subclass) %in% unique(celltype.info$label))

In [33]:
all(unique(object$subtype) %in% unique(celltype.info$cluster))

In [34]:
unique(object$subclass)
unique(celltype.info$label)

In [35]:
unique(object$subtype)
unique(celltype.info$cluster)

In [36]:
object$subclass <- factor(object$subclass,
                          levels=unique(celltype.info$label))

In [37]:
object$subtype <- factor(object$subtype,
                         unique(celltype.info$cluster))

In [38]:
saveRDS(object = object, 
        file = 'data/All.MNN.v1.org.fct.rds')

In [39]:
file.exists('data/All.MNN.v1.org.fct.rds')