In [1]:
library(dplyr)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




# Part 1: Create an Excel workbook with the cluster lineage annotation

In [None]:
# Location of the external cluster-annotation CSV.
path_to_annotation_info <- '~/ewce/AnnotationInfo.csv' # <-- PLEASE UPDATE THIS PATH

# st2: data frame holding the initial cluster information from Table S2.
st2 <- openxlsx::read.xlsx('./science.adf1226_table_s2.xlsx')
# Rows whose cluster ID is the placeholder '--' are not valid clusters: drop them.
st2 <- subset(st2, `ClusterID.(PoolClean)` != '--')
# With the placeholder gone, convert the IDs to numbers and sort by them.
st2 <- mutate(st2, `ClusterID.(PoolClean)` = as.numeric(`ClusterID.(PoolClean)`))
st2 <- arrange(st2, `ClusterID.(PoolClean)`)

# AnnoInfo: external annotation table; the first CSV column becomes the row names.
AnnoInfo <- read.csv(path_to_annotation_info, row.names = 1)

In [3]:
# Display the loaded annotation table for visual inspection.
AnnoInfo

Unnamed: 0_level_0,AnnotationDefinition,AnnotationDescription,AnnotationName
Unnamed: 0_level_1,<chr>,<chr>,<chr>
0,+MPZ,Schwann cell-like (E-SCHWL; +MPZ),E-SCHWL
1,+EYA1 +ISL1,Otic vesicle of the hindbrain (HB-OTV; +EYA1 +ISL1),HB-OTV
2,+NHLH1,Neuroblast (NBL; +NHLH1),NBL
3,+MEIS2 +ISL1 +SIX3,Reticular nucleus of the thalamus (TH-RETN; +MEIS2 +ISL1 +SIX3),TH-RETN
4,+PCP2 +CA8,Purkinje cell (CB-PURK; +PCP2 +CA8),CB-PURK
5,+INA,Neuron (NEUR; +INA),NEUR
6,+HES1,Radial glia (RGL; +HES1),RGL
7,+PDGFRA +OLIG1,Oligodendrocyte precursor (OPC; +PDGFRA +OLIG1),OPC
8,+BCAN +TNC,Glioblast (GBL; +BCAN +TNC),GBL
9,+TTR +CLIC6,choroid plexus (mesoderm) (M-CHRP; +TTR +CLIC6),M-CHRP


In [None]:
# Classify clusters by gene markers (Excitatory/Inhibitory)
#
# For every lineage and every enrichment column of st2, record which marker
# patterns occur in that column, then collapse the evidence into
# 'gene.anno.exc', 'gene.anno.inh' and the combined 'gene.anno'.

# Marker patterns per lineage. Each pattern is matched as a regular expression,
# so 'DLX' also hits DLX1, DLX2, ...
# NOTE(review): 'VGLUT' is a protein alias; if the enrichment columns list gene
# symbols (e.g. SLC17A6/7/8) this pattern can never match -- confirm.
lingenes <- list(
    'Excitatory' = c('VGLUT', 'EMX1'),
    'Inhibitory' = c('GAD2', 'DLX')
)

# Enrichment columns of st2 that are searched for the markers.
check_cols <- c('PoolEnriched', 'TopLevelEnriched', 'SubsetEnriched')

lin.anno.genes <- st2[, c('ClusterID.(PoolClean)', check_cols)]

# Element-wise paste of any number of vectors, comma separated.
paste.coma <- function(...) {paste(..., sep = ',')}

for (lin in names(lingenes)) {
    for (col in check_cols) {

        col.name <- paste(col, lin, sep = '.')

        # One vector per marker: the marker name where it matches, '' elsewhere.
        # The list is left unnamed so a marker name can never be mistaken for a
        # named argument of paste().
        lin.list <- lapply(lingenes[[lin]], function(g) {
            ifelse(grepl(pattern = g, x = lin.anno.genes[, col], fixed = FALSE), g, '')
        })

        hits <- do.call(paste.coma, lin.list)
        # Non-matching markers leave empty slots. Squeeze runs of commas and trim
        # the ends so that lists with three or more markers are also cleaned up
        # (trimming only the ends would leave 'A,,C').
        hits <- gsub(pattern = ',+', replacement = ',', x = hits)
        hits <- gsub(pattern = '^,|,$', replacement = '', x = hits)

        # No hit -> NA; otherwise prefix the hits with the source column name.
        lin.anno.genes[, col.name] <- ifelse(
            hits == '', NA_character_, paste(col, hits, sep = ':'))
    }
}

# Row-wise collapse of the non-missing entries of `cols`, with an optional
# prefix; rows without any entry yield NA.
collapse.evidence <- function(cols, prefix = '', collapse = ';') {
    apply(cols, MARGIN = 1, FUN = function(x) {
        x <- x[!is.na(x)]
        if (length(x) == 0) {
            NA
        } else {
            paste0(prefix, paste(x, collapse = collapse))
        }
    })
}

is.exc.col <- grepl('Excitatory', colnames(lin.anno.genes), fixed = TRUE)
lin.anno.genes$gene.anno.exc <- collapse.evidence(
    lin.anno.genes[, is.exc.col, drop = FALSE], prefix = 'EXCIT>')

is.inh.col <- grepl('Inhibitory', colnames(lin.anno.genes), fixed = TRUE)
lin.anno.genes$gene.anno.inh <- collapse.evidence(
    lin.anno.genes[, is.inh.col, drop = FALSE], prefix = 'INHIB>')

# Combined marker evidence: excitatory and inhibitory summaries joined by ' || '.
lin.anno.genes$gene.anno <- collapse.evidence(
    lin.anno.genes[, c('gene.anno.exc', 'gene.anno.inh')], collapse = ' || ')

# apply(lin.anno.genes[,-(1:4)], MARGIN = 2, FUN = unique)


In [None]:
# Classify clusters by lineage annotations (Excitatory/Inhibitory)
# Annotation tags that assign a cluster to each lineage.
lineages <- list(
    'Excitatory' = c(
        'NT-VGLUT1', 'NT-VGLUT2', 'NT-VGLUT3', # GLUT
        'P-PALL-M', 'P-PALL' # EMX1
    ),
    'Inhibitory' = c(
        'NT-GABA', # GAD2
        'P-DLGE', 'P-VLGE', 'P-SUBPALL', # DLX2
        'P-MGE-PO' # GAD2
    )
)

lin.annos <- st2[, c('ClusterID.(PoolClean)', 'NCells', 'AutoClass', 'Class', 'Subclass')]

# Flatten the lineage list into parallel vectors: one entry per tag, with the
# lineage it belongs to.
tag.lineage <- rep(names(lineages), times = lengths(lineages))
tag.label <- unlist(lineages, use.names = FALSE)

# One column per tag ('<lineage>.<tag>'): the lineage name where the tag occurs
# literally in AutoAnnotation, NA elsewhere.
for (i in seq_along(tag.label)) {
    has.tag <- grepl(pattern = tag.label[i], x = st2$AutoAnnotation, fixed = TRUE)
    flag.name <- paste(tag.lineage[i], tag.label[i], sep = '.')
    lin.annos[[flag.name]] <- ifelse(has.tag, tag.lineage[i], NA)
}

# annot: distinct non-missing values of every column after the first three
# (Class, Subclass and the lineage flags), joined with '::'.
label.cols <- lin.annos[, -(1:3)]
lin.annos$annot <- apply(label.cols, 1, function(labels) {
    present <- labels[!is.na(labels)]
    paste(unique(present), collapse = '::')
})

In [7]:
# Join the lineage summary (cluster ID + 'annot') with the marker-based columns
# (enrichment columns 2:4 dropped; st2 supplies them below) ...
annotation.table <- merge(
    x = lin.annos[, c(1, ncol(lin.annos))],
    y = lin.anno.genes[, -(2:4)],
    by = 'ClusterID.(PoolClean)'
)
# ... and attach the full Table S2 record of every cluster.
annotation.table <- merge(annotation.table, y = st2, by = 'ClusterID.(PoolClean)')

In [None]:
# Evidence flags: which lineage is supported by the annotation-based label
# ('annot') and by the marker-based label ('gene.anno').
lineage.inh <- grepl('Inhibitory', x = annotation.table$annot, fixed = TRUE)
lineage.exc <- grepl('Excitatory', x = annotation.table$annot, fixed = TRUE)
marker.inh <- grepl('INHIB>', x = annotation.table$gene.anno, fixed = TRUE)
marker.exc <- grepl('EXCIT>', x = annotation.table$gene.anno, fixed = TRUE)

# Self-consistency: a source must not support both lineages at once.
annotation.table$annot.selfconsistent <- !(lineage.inh & lineage.exc)
annotation.table$geneannot.selfconsistent <- !(marker.inh & marker.exc)

# Between-source consistency: the two sources must not point to opposite lineages.
annotation.table$annot.betweenconsistent <-
    !(lineage.inh & marker.exc) & !(lineage.exc & marker.inh)

# Overall consistency: all three checks pass.
annotation.table$consistent.all <-
    annotation.table$annot.betweenconsistent &
    annotation.table$annot.selfconsistent &
    annotation.table$geneannot.selfconsistent

In [9]:
# Put the summary columns first, followed by every other known column in the
# order st2 -> lin.annos -> lin.anno.genes. Columns that are not present in
# annotation.table are skipped.
first.cols <- intersect(
    c('PoolOrder', 'ClusterID.(PoolClean)', 'NCells', 'AutoClass', 'Class', 'Subclass',
      'consistent.all', 'annot', 'gene.anno', 'annot.selfconsistent',
      'geneannot.selfconsistent', 'annot.betweenconsistent'),
    colnames(annotation.table)
)

all.cols <- unique(c(colnames(st2), colnames(lin.annos), colnames(lin.anno.genes)))
all.cols <- intersect(all.cols, colnames(annotation.table))
all.cols <- setdiff(all.cols, first.cols)

annotation.table <- annotation.table[, c(first.cols, all.cols)]

In [10]:
# Export the annotated cluster table for manual review.
openxlsx::write.xlsx(
    x = annotation.table,
    file = './cluster_lineage_annotation.xlsx'
)

In [None]:
# Show the clusters that fail at least one consistency check.
annotation.table[which(!annotation.table$consistent.all), ]

Unnamed: 0_level_0,PoolOrder,ClusterID.(PoolClean),NCells,AutoClass,Class,Subclass,consistent.all,annot,gene.anno,annot.selfconsistent,⋯,Location.D-V.(Week5),Germinal.zone.(Week5),PoolEnriched.Excitatory,TopLevelEnriched.Excitatory,SubsetEnriched.Excitatory,PoolEnriched.Inhibitory,TopLevelEnriched.Inhibitory,SubsetEnriched.Inhibitory,gene.anno.exc,gene.anno.inh
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<lgl>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
65,459,64,554,Radial glia,Radial glia,Radial glia,False,Radial glia::Excitatory::Inhibitory,,False,⋯,,,,,,,,,,
156,381,155,7013,Radial glia,Radial glia,Radial glia,False,Radial glia::Excitatory::Inhibitory,,False,⋯,,,,,,,,,,
159,378,158,1171,Radial glia,Radial glia,Radial glia,False,Radial glia::Excitatory,"INHIB>PoolEnriched:DLX;SubsetEnriched:GAD2,DLX",True,⋯,,,,,,PoolEnriched:DLX,,"SubsetEnriched:GAD2,DLX",,"INHIB>PoolEnriched:DLX;SubsetEnriched:GAD2,DLX"
168,374,167,2144,Radial glia,Radial glia,Radial glia,False,Radial glia::Excitatory::Inhibitory,,False,⋯,,,,,,,,,,
172,384,171,3988,Glioblast,Glioblast,Glioblast,False,Glioblast::Excitatory,INHIB>SubsetEnriched:DLX,True,⋯,,,,,,,,SubsetEnriched:DLX,,INHIB>SubsetEnriched:DLX
173,385,172,375,Glioblast,Radial glia,Radial glia,False,Radial glia::Excitatory,"INHIB>SubsetEnriched:GAD2,DLX",True,⋯,,,,,,,,"SubsetEnriched:GAD2,DLX",,"INHIB>SubsetEnriched:GAD2,DLX"
206,428,205,3320,Radial glia,Neuron,Neuron,False,Neuron::Excitatory,"INHIB>PoolEnriched:GAD2,DLX;TopLevelEnriched:DLX",True,⋯,,,,,,"PoolEnriched:GAD2,DLX",TopLevelEnriched:DLX,,,"INHIB>PoolEnriched:GAD2,DLX;TopLevelEnriched:DLX"
296,357,295,5479,Neuroblast,Neuroblast,Neuroblast,False,Neuroblast::Excitatory::Inhibitory,,False,⋯,,,,,,,,,,
299,361,298,664,Radial glia,Neuroblast,Neuroblast,False,Neuroblast::Excitatory,"INHIB>SubsetEnriched:GAD2,DLX",True,⋯,,,,,,,,"SubsetEnriched:GAD2,DLX",,"INHIB>SubsetEnriched:GAD2,DLX"
325,331,324,1051,Neuron,Neuron,Neuron,False,Neuron::Excitatory,"INHIB>SubsetEnriched:GAD2,DLX",True,⋯,,,,,,,,"SubsetEnriched:GAD2,DLX",,"INHIB>SubsetEnriched:GAD2,DLX"


# Part 2: Resolve inconsistencies manually

In [None]:
# res: data frame with the manually reviewed and commented annotations.
res <- openxlsx::read.xlsx(xlsxFile = './cluster_lineage_annotation.commented.xlsx')
# Preview the first rows.
head(res, n = 6L)

Unnamed: 0_level_0,PoolOrder,ClusterID.(PoolClean),NCells,AutoClass,Class,Subclass,Diagnotic,consistent.all,comment.anno,annot,⋯,Location.D-V.(Week5),Germinal.zone.(Week5),PoolEnriched.Excitatory,TopLevelEnriched.Excitatory,SubsetEnriched.Excitatory,PoolEnriched.Inhibitory,TopLevelEnriched.Inhibitory,SubsetEnriched.Inhibitory,gene.anno.exc,gene.anno.inh
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,361,298,664,Radial glia,Neuroblast,Neuroblast,ExN,False,"NT-GLUT + GAD2,DLX",Neuroblast::Excitatory,⋯,,,,,,,,"SubsetEnriched:GAD2,DLX",,"INHIB>SubsetEnriched:GAD2,DLX"
2,331,324,1051,Neuron,Neuron,Neuron,ExN,False,"NT-GLUT + GAD2,DLX",Neuron::Excitatory,⋯,,,,,,,,"SubsetEnriched:GAD2,DLX",,"INHIB>SubsetEnriched:GAD2,DLX"
3,297,335,623,Neuroblast,Neuron,Neuron,DUDOSO,False,"NT-GLUT + GAD2,DLX",Neuron::Excitatory::Inhibitory,⋯,,,,,,PoolEnriched:DLX,TopLevelEnriched:DLX,,,INHIB>PoolEnriched:DLX;TopLevelEnriched:DLX
4,29,564,2312,Neuron,Neuron,Neuron,DUDOSO,False,NT-GLUT + GAD2,Neuron::Excitatory,⋯,Ventral,MZ,,,,,,SubsetEnriched:GAD2,,INHIB>SubsetEnriched:GAD2
5,147,402,550,Neuron,Neuron,Neuron,DUDOSO,False,NT-GLUT + DLX,Neuron::Excitatory,⋯,,,,,,,,SubsetEnriched:DLX,,INHIB>SubsetEnriched:DLX
6,269,500,716,Neuron,Neuron,Neuron,DUDOSO,False,NT-GLUT + DLX,Neuron::Excitatory,⋯,,,,,,PoolEnriched:DLX,TopLevelEnriched:DLX,,,INHIB>PoolEnriched:DLX;TopLevelEnriched:DLX


In [13]:
# Keep only the columns needed to build the final cluster labels.
summary.cols <- c(
    'ClusterID.(PoolClean)', 'Class', 'Subclass', 'Diagnotic',
    'annot', 'gene.anno', 'consistent.all'
)
res <- res[, summary.cols]

In [None]:
# ClassSubclass: 'Class:Subclass', or just the shared value when both are equal.
res$ClassSubclass <- apply(
    X = res[, c('Class', 'Subclass')],
    MARGIN = 1,
    FUN = function(labels) {
        distinct.labels <- unique(labels)
        paste(distinct.labels, collapse = ':')
    }
)

In [15]:
# A cluster is discarded when its manual call is 'DUDOSO'; a missing call
# counts as not discarded (%in% never returns NA).
res$discarded <- res$Diagnotic %in% 'DUDOSO'

In [16]:
# Only these classes receive an inhibitory/excitatory label.
neuronal.classes <- c('Neuroblast', 'Neuron', 'Neuronal IPC')
res$do.InEx <- is.element(res$Class, neuronal.classes)
# InEx starts empty and is filled in below.
res$InEx <- NA

In [None]:
# Fill res$InEx ('In' / 'Ex') for the non-discarded clusters flagged in do.InEx,
# using the sources in this order:
#   1. inconsistent clusters -> manual call in 'Diagnotic'
#   2. consistent clusters   -> lineage label in 'annot', when it has one
#   3. remaining consistent  -> marker evidence in 'gene.anno'
eligible <- (!res$discarded) & res$do.InEx

# Resolve inconsistent cases using 'Diagnotic' column
mask <- (!res$consistent.all) & eligible
res[mask, 'InEx'] <- res[mask, 'Diagnotic']
# Keep only the first two characters of the call (e.g. 'ExN' -> 'Ex');
# NA stays NA.
res$InEx <- substr(as.character(res$InEx), start = 1, stop = 2)

# Assign In/Ex based on consistent 'annot' column.
# Only rows whose 'annot' actually carries a lineage label are assigned here.
# Labelling every consistent row at this point (with 'Ex' as default) left no
# NA behind, so the gene.anno fallback below could never run.
annot.inh <- grepl('Inhibitory', res$annot, fixed = TRUE)
annot.exc <- grepl('Excitatory', res$annot, fixed = TRUE)
mask <- (res$consistent.all) & eligible & (annot.inh | annot.exc)
res[mask, 'InEx'] <- ifelse(annot.inh[mask], 'In', 'Ex')

# Assign remaining In/Ex based on 'gene.anno' column for consistent cases
# NOTE(review): rows with neither lineage nor inhibitory marker evidence still
# default to 'Ex', as before -- confirm this default is intended.
mask <- (res$consistent.all) & eligible & is.na(res$InEx)
res[mask, 'InEx'] <- ifelse(
    grepl('INHIB', res[mask, 'gene.anno'], fixed = TRUE), 'In', 'Ex')

[1] 0
[1] 0
[1] 0


In [19]:
# Lookup table: Subclass label -> abbreviation used in the short cluster names.
short.names <- c(
    `COPs (premyelinating)` = 'COPs',
    Endothelial             = 'Endo',
    Erythrocyte             = 'Eryt',
    Fibroblast              = 'Fibr',
    Glioblast               = 'GlioBl',
    Immune                  = 'Imm',
    Neuroblast              = 'NeuBl',
    Neuron                  = 'Neu',
    `Neuronal IPC`          = 'NeuIPC',
    OPC                     = 'OPC',
    Pericytes               = 'Peri',
    Placodes                = 'Plac',
    `Radial glia`           = 'RGC',
    Schwann                 = 'Schwann',
    VSMC                    = 'VSMC'
)


In [None]:
# Cross-tabulate the assigned In/Ex labels against eligibility and Subclass.
table(res[, c('do.InEx', 'InEx')])
table(res[, c('Subclass', 'InEx')])


# Note: Using 'Subclass' for final naming convention
# Long name identifier: In/Ex label followed by Class:Subclass, skipping
# missing parts. Discarded clusters get NA.
long.candidates <- vapply(
    seq_len(nrow(res)),
    function(i) {
        parts <- c(res$InEx[i], res$ClassSubclass[i])
        paste(parts[!is.na(parts)], collapse = '')
    },
    character(1)
)
res$LongName <- ifelse(res$discarded, NA, long.candidates)

# Short name identifier: In/Ex label followed by the abbreviated Subclass from
# short.names. A Subclass without an abbreviation contributes nothing.
short.candidates <- vapply(
    seq_len(nrow(res)),
    function(i) {
        parts <- c(res$InEx[i], short.names[as.character(res$Subclass[i])])
        paste(parts[!is.na(parts)], collapse = '')
    },
    character(1)
)
res$ShortName <- ifelse(res$discarded, NA, short.candidates)


       InEx
do.InEx  Ex  In
  FALSE   0   0
  TRUE  213 133

                       InEx
Subclass                 Ex  In
  COPs (premyelinating)   0   0
  Endothelial             0   0
  Erythrocyte             0   0
  Fibroblast              0   0
  Glioblast               0   0
  Immune                  0   0
  Neuroblast             95   9
  Neuron                 99 111
  Neuronal IPC           19  13
  OPC                     0   0
  Pericytes               0   0
  Placodes                0   0
  Radial glia             0   0
  Schwann                 0   0
  VSMC                    0   0

In [21]:
# Export the final per-cluster summary (labels and names).
openxlsx::write.xlsx(
    x = res,
    file = './cluster_lineage_annotation.summary.xlsx'
)