## Integration of early male and female samples (<= 10 PCW)

In [None]:
suppressPackageStartupMessages(library(ArchR))
library(argparse)
library(ArchR)
library(parallel)

### 1. Create ArchR Project file from Arrow files generated with nextflow pipeline

In [None]:
# Select hg38 as the genome ArchR uses for the rest of this session.
addArchRGenome("hg38")

In [None]:
# Environment check: show which matrixStats version is installed
# (a commented-out note further down pins version 1.1.0).
packageVersion("matrixStats")

#### Load the sample-level metadata produced by analysing each sample independently

In [None]:
# Root directory with one sub-directory per sample. The trailing slash matters:
# the file paths below are assembled with paste0(), not file.path().
data_dir <- "/nfs/team292/vl6/FetalReproductiveTract/ATAC_QC/ArchR/" 

In [None]:
# Metadata table for sample HD_F_GON13254251; CSV column "X" becomes the row names.
HD_F_GON13254251_meta <- read.csv(
  file = paste0(data_dir, "HD_F_GON13254251/HD_F_GON13254251_metadata.csv"),
  row.names = "X"
)
# Rows x columns of the table just read.
dim(HD_F_GON13254251_meta)

In [None]:
# Metadata table for sample HD_F_GON13254249; CSV column "X" becomes the row names.
HD_F_GON13254249_meta <- read.csv(
  file = paste0(data_dir, "HD_F_GON13254249/HD_F_GON13254249_metadata.csv"),
  row.names = "X"
)
# Rows x columns of the table just read.
print(dim(HD_F_GON13254249_meta))

In [None]:
# Preview the row names of this sample's metadata table.
head(row.names(HD_F_GON13254249_meta))

In [None]:
# List the metadata columns available for this sample.
colnames(HD_F_GON13254249_meta)

In [None]:
# Metadata table for sample HD_F_GON12877981; CSV column "X" becomes the row names.
HD_F_GON12877981_meta <- read.csv(
  file = paste0(data_dir, "HD_F_GON12877981/HD_F_GON12877981_metadata.csv"),
  row.names = "X"
)
# Rows x columns of the table just read.
print(dim(HD_F_GON12877981_meta))

In [None]:
# Metadata table for sample HD_F_GON13254253; CSV column "X" becomes the row names.
HD_F_GON13254253_meta <- read.csv(
  file = paste0(data_dir, "HD_F_GON13254253/HD_F_GON13254253_metadata.csv"),
  row.names = "X"
)
# Rows x columns of the table just read.
print(dim(HD_F_GON13254253_meta))

In [None]:
# Metadata table for sample HD_F_GON13654400; CSV column "X" becomes the row names.
HD_F_GON13654400_meta <- read.csv(
  file = paste0(data_dir, "HD_F_GON13654400/HD_F_GON13654400_metadata.csv"),
  row.names = "X"
)
# Rows x columns of the table just read.
print(dim(HD_F_GON13654400_meta))


In [None]:
# Metadata table for sample HD_F_GON14609875; CSV column "X" becomes the row names.
HD_F_GON14609875_meta <- read.csv(
  file = paste0(data_dir, "HD_F_GON14609875/HD_F_GON14609875_metadata.csv"),
  row.names = "X"
)
# Rows x columns of the table just read.
print(dim(HD_F_GON14609875_meta))


In [None]:
# Metadata table for sample HD_F_GON14524656; CSV column "X" becomes the row names.
HD_F_GON14524656_meta <- read.csv(
  file = paste0(data_dir, "HD_F_GON14524656/HD_F_GON14524656_metadata.csv"),
  row.names = "X"
)
# Report the dimensions of the table just read. This previously printed the
# dimensions of HD_F_GON14609875_meta, a copy-paste slip from the cell above.
print(dim(HD_F_GON14524656_meta))

In [None]:
# Echo the metadata root directory as a sanity check.
data_dir

In [None]:
# Metadata table for sample HD_F_GON13654401; CSV column "X" becomes the row names.
HD_F_GON13654401_meta <- read.csv(
  file = paste0(data_dir, "HD_F_GON13654401/HD_F_GON13654401_metadata.csv"),
  row.names = "X"
)
# Rows x columns of the table just read.
print(dim(HD_F_GON13654401_meta))

In [None]:
# Preview the row names of this sample's metadata table.
head(row.names(HD_F_GON13654401_meta))

In [None]:
# List the metadata columns available for this sample.
colnames(HD_F_GON13654401_meta)

In [None]:
# Metadata table for the combined sample HD_F_GON13149227_and_HD_F_GON13188002;
# CSV column "X" becomes the row names.
HD_F_GON13149227_and_HD_F_GON13188002_meta <- read.csv(
  file = paste0(
    data_dir,
    "HD_F_GON13149227_and_HD_F_GON13188002/HD_F_GON13149227_and_HD_F_GON13188002_metadata.csv"
  ),
  row.names = "X"
)
# Rows x columns of the table just read.
print(dim(HD_F_GON13149227_and_HD_F_GON13188002_meta))

In [None]:
# List the metadata columns available for this combined sample.
colnames(HD_F_GON13149227_and_HD_F_GON13188002_meta)

In [None]:
# Metadata table for the combined sample HD_F_GON12296727_and_HD_F_GON12220625;
# CSV column "X" becomes the row names.
HD_F_GON12296727_and_HD_F_GON12220625_meta <- read.csv(
  file = paste0(
    data_dir,
    "HD_F_GON12296727_and_HD_F_GON12220625/HD_F_GON12296727_and_HD_F_GON12220625_metadata.csv"
  ),
  row.names = "X"
)
# Rows x columns of the table just read.
print(dim(HD_F_GON12296727_and_HD_F_GON12220625_meta))


In [None]:
# List the metadata columns available for this combined sample.
colnames(HD_F_GON12296727_and_HD_F_GON12220625_meta)

In [None]:
# Stack the ten per-sample metadata tables row-wise into a single table.
# rbind() on data frames requires every table to carry the same columns.
metadata <- rbind(
  HD_F_GON13254249_meta,
  HD_F_GON13254251_meta,
  HD_F_GON13654401_meta,
  HD_F_GON14524656_meta,
  HD_F_GON14609875_meta,
  HD_F_GON13654400_meta,
  HD_F_GON13254253_meta,
  HD_F_GON12877981_meta,
  HD_F_GON13149227_and_HD_F_GON13188002_meta,
  HD_F_GON12296727_and_HD_F_GON12220625_meta
)
# Rows x columns of the combined table.
dim(metadata)

In [None]:
# Rows x columns of the combined metadata table.
dim(metadata)

In [None]:
# Early samples   
# Early samples
# Sample IDs, in the order their Arrow files are handed to ArchR.
early_samples <- c(
  "HD_F_GON13254249",
  "HD_F_GON13254251",
  "HD_F_GON13654401",
  "HD_F_GON14524656",
  "HD_F_GON14609875",
  "HD_F_GON13654400",
  "HD_F_GON12877981",
  "HD_F_GON13254253",
  "HD_F_GON13149227_and_HD_F_GON13188002",
  "HD_F_GON12296727_and_HD_F_GON12220625"
)

# Each Arrow file sits at <data_dir><sample>/<sample>.arrow. data_dir already
# ends in "/", so paste0() yields exactly the paths that used to be written out
# one by one, and the root directory is now defined in a single place.
early_arrow_files <- paste0(data_dir, early_samples, "/", early_samples, ".arrow")

early <- ArchRProject(
  ArrowFiles = early_arrow_files,
  outputDirectory = "/nfs/team292/vl6/FetalReproductiveTract/ATAC_QC/ArchR/early/",
  # Recommended so that if you modify the Arrow files you have an original copy
  # for later usage.
  copyArrows = TRUE
)

In [None]:
# Display the project object.
early

In [None]:
# In-memory size of the project object, in units of 10^6 bytes, to 3 decimals.
project_size_mb <- round(object.size(early) / 10^6, 3)
paste0("Memory Size = ", project_size_mb, " MB")

In [None]:
# List the matrices ArchR reports as available for this project.
getAvailableMatrices(early)

In [None]:
# Scratch cell: evaluates a constant and does not touch the analysis state.
2+3

### 2. Add sample-level information and filter cells 

In [None]:
# Convert the combined table to a DataFrame so it can be assigned to the
# project's cellColData slot below.
metadata <- DataFrame(metadata)

In [None]:
# Preview the row names; they are used below to select cells from the project.
head(row.names(metadata))

In [None]:
# Restrict the project to the cells that have per-sample metadata, then use
# that metadata as the project's cell annotations.
cells_with_metadata <- row.names(metadata)

# Fail early, with a readable message, if the metadata names cells that the
# project does not contain.
unknown_cells <- setdiff(cells_with_metadata, getCellNames(early))
if (length(unknown_cells) > 0) {
  stop(
    length(unknown_cells), " metadata rows have no matching cell in the ",
    "ArchR project, e.g. ", unknown_cells[1],
    call. = FALSE
  )
}

early <- early[cells_with_metadata]

# Index the metadata by the project's own cell names, so every annotation row
# lands on the right cell whatever row order the subsetting step produced.
# (The metadata tables and the Arrow files are listed in different sample
# orders, so positional alignment should not be taken for granted.)
early@cellColData <- metadata[getCellNames(early), , drop = FALSE]
early

In [None]:
# Number of cells per sample.
table(early$Sample)

In [None]:
# Add donor and stage information --> integrate by donor for consistency with scRNA-seq 
library(dplyr)
donor <- case_when(
    early$Sample == "HD_F_GON13149227_and_HD_F_GON13188002" ~ "C142",
    early$Sample == "HD_F_GON13254251" ~ "C152",
    early$Sample == "HD_F_GON13254249" ~ "C152",
    early$Sample == "HD_F_GON12296727_and_HD_F_GON12220625" ~ "C106", 
     early$Sample == "HD_F_GON13654400" ~ "Hrv220",
     early$Sample == "HD_F_GON12877981" ~ "C118",
     early$Sample == "HD_F_GON14524656" ~ "Hrv243",
     early$Sample == "HD_F_GON13254253" ~ "Hrv201",
     early$Sample == "HD_F_GON13654401" ~ "Hrv217",
     early$Sample == "HD_F_GON14609875" ~ "Hrv254",
    )

stage <- case_when(
    early$Sample == "HD_F_GON13149227_and_HD_F_GON13188002" ~ 6.2,
    early$Sample == "HD_F_GON13254251" ~ 7.8,
    early$Sample == "HD_F_GON13254249" ~ 7.8,
    early$Sample == "HD_F_GON12296727_and_HD_F_GON12220625" ~ 6.6, 
    early$Sample == "HD_F_GON13654400" ~ 10.0,
     early$Sample == "HD_F_GON12877981" ~ 10.0,
     early$Sample == "HD_F_GON14524656" ~ 9.0,
     early$Sample == "HD_F_GON13254253" ~ 9.0,
     early$Sample == "HD_F_GON13654401" ~ 10.0,
     early$Sample == "HD_F_GON14609875" ~ 10.0,
    )

sex <- case_when(
    early$Sample == "HD_F_GON13149227_and_HD_F_GON13188002" ~ "female",
    early$Sample == "HD_F_GON13254251" ~ "female",
    early$Sample == "HD_F_GON13254249" ~ "female",
    early$Sample == "HD_F_GON12296727_and_HD_F_GON12220625" ~ "male",
    early$Sample == "HD_F_GON13654400" ~ "female",
     early$Sample == "HD_F_GON12877981" ~ "male",
     early$Sample == "HD_F_GON14524656" ~ "female",
     early$Sample == "HD_F_GON13254253" ~ "female",
     early$Sample == "HD_F_GON13654401" ~ "male",
     early$Sample == "HD_F_GON14609875" ~ "male",
    
    )

technology <- case_when(
    early$Sample == "HD_F_GON13149227_and_HD_F_GON13188002" ~ "MULTIOME",
    early$Sample == "HD_F_GON13254251" ~ "ATAC",
    early$Sample == "HD_F_GON13254249" ~ "ATAC",
    early$Sample == "HD_F_GON12296727_and_HD_F_GON12220625" ~ "MULTIOME",
    early$Sample == "HD_F_GON13654400" ~ "ATAC",
     early$Sample == "HD_F_GON12877981" ~ "ATAC",
     early$Sample == "HD_F_GON14524656" ~ "ATAC",
     early$Sample == "HD_F_GON13254253" ~ "ATAC",
     early$Sample == "HD_F_GON13654401" ~ "ATAC",
     early$Sample == "HD_F_GON14609875" ~ "ATAC",
    )

early$donor <- donor 
early$stage <- stage
early$technology <- technology
early$sex <- sex

In [None]:
# Number of cells per donor.
table(early$donor)

In [None]:
# Number of cells per stage value.
table(early$stage)

In [None]:
# Number of cells per sex.
table(early$sex)

In [None]:
# Number of cells per technology (ATAC vs MULTIOME).
table(early$technology)

### 3. Filter cells with specific TSS enrichment score and Log10(uniqueFragments)

In [None]:
# Pull the two QC columns plotted below: column 1 is log10(nFrags),
# column 2 is TSSEnrichment.
df <- getCellColData(early, select = c("log10(nFrags)", "TSSEnrichment"))
df

In [None]:
# QC scatter: log10 unique fragments (x) against TSS enrichment (y), coloured
# by point density.
log10_frags <- df[, 1]
tss_enrichment <- df[, 2]

# Axes are clipped at the 99th percentile of each variable.
x_limits <- c(log10(500), quantile(log10_frags, probs = 0.99))
y_limits <- c(0, quantile(tss_enrichment, probs = 0.99))

qc_scatter <- ggPoint(
  x = log10_frags,
  y = tss_enrichment,
  colorDensity = TRUE,
  continuousSet = "sambaNight",
  xlabel = "Log10 Unique Fragments",
  ylabel = "TSS Enrichment",
  xlim = x_limits,
  ylim = y_limits
)

# Dashed guides at TSS enrichment 8 and log10(nFrags) 3.25
# (presumably the QC cut-offs under consideration -- confirm).
p <- qc_scatter +
  geom_hline(yintercept = 8, lty = "dashed") +
  geom_vline(xintercept = 3.25, lty = "dashed")

p

In [None]:
# plotPDF(p, name = "TSS-vs-Frags.pdf", 
#         ArchRProj = early, addDOC = FALSE)

### 4. Plotting sample-level statistics

In [None]:
# Ridge plot of the TSSEnrichment cell column, one ridge per sample.
p1 <- plotGroups(
  ArchRProj = early,
  colorBy = "cellColData",
  name = "TSSEnrichment",
  groupBy = "Sample",
  plotAs = "ridges"
)
p1

In [None]:
# Violin plot (with an overlaid box plot) of the TSSEnrichment cell column,
# one violin per sample.
p2 <- plotGroups(
  ArchRProj = early,
  colorBy = "cellColData",
  name = "TSSEnrichment",
  groupBy = "Sample",
  plotAs = "violin",
  addBoxPlot = TRUE,
  alpha = 0.4
)
p2

In [None]:
# Ridge plot of log10(nFrags), one ridge per sample.
p3 <- plotGroups(
  ArchRProj = early,
  colorBy = "cellColData",
  name = "log10(nFrags)",
  groupBy = "Sample",
  plotAs = "ridges"
)
p3

In [None]:
# Violin plot (with an overlaid box plot) of log10(nFrags), one violin per sample.
p4 <- plotGroups(
  ArchRProj = early,
  colorBy = "cellColData",
  name = "log10(nFrags)",
  groupBy = "Sample",
  plotAs = "violin",
  addBoxPlot = TRUE,
  alpha = 0.4
)
p4

In [None]:
# plotPDF(p1,p2,p3,p4, name = "QC-Sample-Statistics.pdf", ArchRProj = early, 
#         addDOC = FALSE, width = 4, height = 4)

In [None]:
# Fragment size distribution plot for the project, as produced by ArchR.
p5 <- plotFragmentSizes(ArchRProj = early)
p5

In [None]:
# TSS enrichment profile plot for the project, as produced by ArchR.
p6 <- plotTSSEnrichment(ArchRProj = early)
p6

In [None]:
# plotPDF(p5,p6, name = "QC-Sample-FragSizes-TSSProfile.pdf", ArchRProj = early, 
#         addDOC = FALSE, width = 5, height = 5)

### 6. Filtering doublets 

In [None]:
#early <- filterDoublets(early)
#early

### 7. Dimensionality reduction with iterative Latent Semantic Indexing

In [None]:
# Clustering settings used between LSI iterations (see Seurat::FindClusters).
lsi_cluster_params <- list(
  resolution = c(0.2),
  sampleCells = 10000,
  n.start = 10
)

# Two rounds of iterative LSI on the tile matrix; the result is stored in the
# project under the name "IterativeLSI".
early <- addIterativeLSI(
  ArchRProj = early,
  name = "IterativeLSI",
  useMatrix = "TileMatrix",
  iterations = 2,
  varFeatures = 25000,
  dimsToUse = 1:30,
  clusterParams = lsi_cluster_params
)

In [None]:
# Scratch cell: evaluates a constant and does not touch the analysis state.
2+3

In [None]:
# Scratch cell: evaluates a constant and does not touch the analysis state.
3

In [None]:
# ---- Extract LSI coordinates ----
# Fetch the "IterativeLSI" reduced dimensions as a plain matrix.
# NOTE(review): per the ArchR documentation, corCutOff excludes dimensions whose
# correlation with sequencing depth exceeds the cut-off, so the matrix may have
# fewer than 30 columns -- hence the dim() check. Confirm for the ArchR version
# in use.
lsi.mtx <- getReducedDims(
  ArchRProj = early,
  reducedDims = "IterativeLSI",
  dimsToUse = 1:30,
  corCutOff = 0.75,
  scaleDims = TRUE,
  returnMatrix = TRUE
)
dim(lsi.mtx)

In [None]:
# Preview the first rows of the LSI matrix.
head(lsi.mtx)

In [None]:
# Check the class of the object returned by getReducedDims().
class(lsi.mtx)

In [None]:
# Scratch cell: evaluates a constant and does not touch the analysis state.
2 + 3

### 8. Batch effects correction with Mutual Nearest Neighbors 

A convenient property of MNN is that we can choose the order in which the samples are integrated, which lets us account for the different degrees of variability between samples that are closer together or further apart in developmental time.

In [None]:
library(uwot) # UMAP: provides umap(), used for the embedding further down

In [None]:
# Split the LSI matrix into a two-level list: stage -> sample -> matrix of the
# cells from that sample. Each innermost matrix is one batch for MNN.
# NOTE(review): this assumes the rows of lsi.mtx are in the same order as the
# rows of getCellColData(early) -- confirm.
cell_annotations <- getCellColData(early)
timepoints <- cell_annotations$stage
samples <- cell_annotations$Sample

stage_values <- unique(timepoints)

lsi_list <- lapply(stage_values, function(stage_value) {
  in_stage <- timepoints == stage_value
  stage_lsi <- lsi.mtx[in_stage, , drop = FALSE]
  stage_samples <- samples[in_stage]

  # One matrix per sample present at this stage, named by sample ID.
  sample_ids <- unique(stage_samples)
  per_sample <- lapply(sample_ids, function(sample_id) {
    stage_lsi[stage_samples == sample_id, , drop = FALSE]
  })
  names(per_sample) <- sample_ids
  per_sample
})

# List names are the stage values coerced to character (e.g. 9 becomes "9").
names(lsi_list) <- stage_values

In [None]:
# Stage values present in the data, in order of first appearance.
unique(timepoints)

In [None]:
# Stage values in ascending order; the stage-level list is sorted into this
# order below, before the MNN correction.
timepoint_order <- c(6.2, 6.6, 7.8, 9.0, 10.0)

In [None]:
# Order in which samples are arranged within each stage before the MNN correction.
sample_order <- c(
  "HD_F_GON13149227_and_HD_F_GON13188002",
  "HD_F_GON12296727_and_HD_F_GON12220625",
  "HD_F_GON13254251",
  "HD_F_GON13254249",
  "HD_F_GON13254253",
  "HD_F_GON14524656",
  "HD_F_GON12877981",
  "HD_F_GON13654400",
  "HD_F_GON13654401",
  "HD_F_GON14609875"
)

In [None]:
#arrange to match timepoint order
lsi_list <- lsi_list[order(match(names(lsi_list), timepoint_order))]
lsi_list <- lapply(lsi_list, function(x){ x[order(match(names(x), sample_order))]})

In [None]:
# batchelor provides reducedMNN(), used for the batch correction below.
library(batchelor)

In [None]:
# remotes::install_version("matrixStats", version="1.1.0") # restart your session and run previous scripts

In [None]:
# Environment check: installed matrixStats version (see the install note above).
packageVersion("matrixStats")

In [None]:
# Environment check: installed htmltools version.
packageVersion("htmltools")

In [None]:
#perform corrections within stages
correct_list <- lapply(lsi_list, function(x){
    if(length(x) > 1){
        return(do.call(reducedMNN, x)$corrected)
      } else {
        return(x[[1]])
      }
    })

In [None]:
# perform correction over stages
lsi.mtx <- reducedMNN(correct_list, merge.order=1:length(correct_list))$corrected 
rm(correct_list,lsi_list)

In [None]:
# Scratch cell: evaluates a constant and does not touch the analysis state.
2+3

### 9. UMAP embedding

In [None]:
library(tibble)
# Per-cell annotations as a data.table, with the row names kept as a column.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
sample_metadata <- getCellColData(early) %>% as.data.table(keep.rownames = TRUE)
# as.data.table() calls the row-name column "rn"; rename it to "cell" so it can
# serve as the join key against the UMAP coordinates.
names(sample_metadata)[names(sample_metadata) == "rn"] <- "cell"
head(sample_metadata)

In [None]:
# UMAP on the MNN-corrected LSI coordinates.
pt.size <- ifelse(nrow(lsi.mtx) >= 1e4, 0.8, 1.2)
n_neighbors <- 25
min_dist <- 0.50
dimred <- "IterativeLSI_MNN"

# The loops allow a grid of (n_neighbors, min_dist) values. Nothing is saved
# inside the loop, so only the last combination survives in umap.dt / to.plot.
for (i in n_neighbors) {
  for (j in min_dist) {

    # Run UMAP
    # NOTE(review): the uwot documentation states that fast_sgd = TRUE gives
    # results that are not exactly reproducible even with a fixed seed, so
    # set.seed() may not make this embedding repeatable -- confirm.
    set.seed(1996)
    umap_embedding.mtx <- umap(
      lsi.mtx,
      n_neighbors = i,
      min_dist = j,
      metric = "cosine",
      fast_sgd = TRUE
    ) %>% round(2)
    rownames(umap_embedding.mtx) <- rownames(lsi.mtx)

    # Fetch UMAP coordinates, carrying the cell names along as a column.
    # TRUE is spelled out because T is an ordinary variable that can be reassigned.
    umap.dt <- umap_embedding.mtx %>%
      as.data.table(keep.rownames = TRUE) %>%
      setnames(c("cell", "umap1", "umap2"))

    # Attach the per-cell metadata; only cells present in both tables are kept.
    to.plot <- umap.dt %>% merge(sample_metadata, by = "cell")
  }
}

In [None]:
# Set the plot size
options(repr.plot.width = 11, repr.plot.height = 11)

In [None]:
# UMAP coloured by sample. aes() with bare column names replaces the deprecated
# aes_string(); the mapping is the same.
p <- ggplot(to.plot, aes(x = umap1, y = umap2, fill = Sample)) +
  geom_point(size = 1, shape = 21, stroke = 0.05) +
  # ggrastr::geom_point_rast(size=1.5, shape=21, stroke=0.05) +  # DOES NOT WORK IN THE CLUSTER
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    text = element_text(size = 15),
    legend.position = "bottom"
  )
p

In [None]:
# Save the per-sample UMAP as a 9x9 inch PDF. print() is what draws the ggplot
# onto the open device; dev.off() closes the file.
pdf("/nfs/team292/vl6/FetalReproductiveTract/ATAC_QC/ArchR/early/umap_sample.pdf", width=9, height=9)
print(p)
dev.off()

In [None]:
# UMAP coloured by the "Clusters2" metadata column. aes() with bare column
# names replaces the deprecated aes_string(); the mapping is the same.
# This reuses the name p2, replacing the QC violin plot created earlier.
p2 <- ggplot(to.plot, aes(x = umap1, y = umap2, fill = Clusters2)) +
  geom_point(size = 1, shape = 21, stroke = 0.05) +
  # ggrastr::geom_point_rast(size=1.5, shape=21, stroke=0.05) +  # DOES NOT WORK IN THE CLUSTER
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    text = element_text(size = 15)
  )
p2

In [None]:
# Save the Clusters2 UMAP as a 9x7 inch PDF. print() is what draws the ggplot
# onto the open device; dev.off() closes the file.
pdf("/nfs/team292/vl6/FetalReproductiveTract/ATAC_QC/ArchR/early/umap_clusters2.pdf", width=9, height=7)
print(p2)
dev.off()

In [None]:
# UMAP coloured by stage. aes() with bare column names replaces the deprecated
# aes_string(); the mapping is the same.
# NOTE(review): stage was assigned as a number, so if it is still numeric in
# to.plot ggplot2 will use a continuous fill scale; wrap it in factor() if
# discrete colours per stage are wanted.
p3 <- ggplot(to.plot, aes(x = umap1, y = umap2, fill = stage)) +
  geom_point(size = 1, shape = 21, stroke = 0.05) +
  # ggrastr::geom_point_rast(size=1.5, shape=21, stroke=0.05) +  # DOES NOT WORK IN THE CLUSTER
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    text = element_text(size = 15),
    legend.position = "bottom"
  )
p3

In [None]:
# Save the stage UMAP as a 9x9 inch PDF. print() is what draws the ggplot onto
# the open device; dev.off() closes the file.
pdf("/nfs/team292/vl6/FetalReproductiveTract/ATAC_QC/ArchR/early/umap_stage.pdf", width=9, height=9)
print(p3)
dev.off()

In [None]:
# UMAP coloured by technology (ATAC vs MULTIOME). aes() with bare column names
# replaces the deprecated aes_string(); the mapping is the same.
p4 <- ggplot(to.plot, aes(x = umap1, y = umap2, fill = technology)) +
  geom_point(size = 1, shape = 21, stroke = 0.05) +
  # ggrastr::geom_point_rast(size=1.5, shape=21, stroke=0.05) +  # DOES NOT WORK IN THE CLUSTER
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    text = element_text(size = 15),
    legend.position = "bottom"
  )
p4

In [None]:
# Save the technology UMAP as a 9x9 inch PDF. print() is what draws the ggplot
# onto the open device; dev.off() closes the file.
pdf("/nfs/team292/vl6/FetalReproductiveTract/ATAC_QC/ArchR/early/umap_technology.pdf", width=9, height=9)
print(p4)
dev.off()

In [None]:

# UMAP coloured by sex. aes() with bare column names replaces the deprecated
# aes_string(); the mapping is the same.
p5 <- ggplot(to.plot, aes(x = umap1, y = umap2, fill = sex)) +
  geom_point(size = 1, shape = 21, stroke = 0.05) +
  # ggrastr::geom_point_rast(size=1.5, shape=21, stroke=0.05) +  # DOES NOT WORK IN THE CLUSTER
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    text = element_text(size = 15),
    legend.position = "bottom"
  )
p5

In [None]:
# Save the sex UMAP as a 9x9 inch PDF. print() is what draws the ggplot onto
# the open device; dev.off() closes the file.
pdf("/nfs/team292/vl6/FetalReproductiveTract/ATAC_QC/ArchR/early/umap_sex.pdf", width=9, height=9)
print(p5)
dev.off()

In [None]:
# Save the UMAP coordinates together with the per-cell metadata. The cell names
# are kept in the "cell" column, so row names are not needed.
write.csv(to.plot, "/nfs/team292/vl6/FetalReproductiveTract/ATAC_QC/ArchR/early/umap_coords.csv", row.names = FALSE)

In [None]:
# Save the MNN-corrected LSI coordinates.
# NOTE(review): row.names = FALSE means the cell names held in rownames(lsi.mtx)
# are not written, and umap_coords.csv is the result of a merge() so its row
# order presumably differs from this matrix. Confirm that downstream readers can
# still match rows to cells, or write the row names as well.
write.csv(lsi.mtx, "/nfs/team292/vl6/FetalReproductiveTract/ATAC_QC/ArchR/early/mnn_embedding.csv", row.names = FALSE)