## Integration of late male samples (>10 PCW)

In [None]:
suppressPackageStartupMessages(library(ArchR))
library(argparse)
library(ArchR)
library(parallel)

### 1. Create ArchR Project file from Arrow files generated with nextflow pipeline

In [None]:
# Select hg38 as the genome annotation for this ArchR session.
addArchRGenome("hg38")

In [None]:
# Report the installed htmltools version (environment check).
packageVersion("htmltools")

In [None]:
# Report the installed matrixStats version (environment check).
packageVersion("matrixStats")

# Root directory of the per-sample ArchR outputs; the sample-level metadata
# produced by analysing each sample independently is read from here.
data_dir <- "/nfs/team292/vl6/FetalReproductiveTract/ATAC_QC/ArchR/"

In [None]:
# Root directory holding one sub-directory of ArchR output per sample.
data_dir <- "/nfs/team292/vl6/FetalReproductiveTract/ATAC_QC/ArchR/" 

In [None]:
# Read the per-cell metadata CSV stored under `data_dir` for one sample.
# The CSV's "X" column is used as the row names.
read_sample_metadata <- function(sample_id) {
  csv_path <- paste0(data_dir, sample_id, "/", sample_id, "_metadata.csv")
  read.csv(csv_path, row.names = "X")
}

# One metadata table per single-sample library.
HD_F_GON14666993_meta <- read_sample_metadata("HD_F_GON14666993")
HD_F_GON11282673_meta <- read_sample_metadata("HD_F_GON11282673")
HD_F_GON11282674_meta <- read_sample_metadata("HD_F_GON11282674")
HD_F_GON13254250_meta <- read_sample_metadata("HD_F_GON13254250")
HD_F_GON13254252_meta <- read_sample_metadata("HD_F_GON13254252")
HD_F_GON15261135_meta <- read_sample_metadata("HD_F_GON15261135")
HD_F_GON15261138_meta <- read_sample_metadata("HD_F_GON15261138")
HD_F_GON15265170_meta <- read_sample_metadata("HD_F_GON15265170")

In [None]:
# Build the metadata CSV path for a paired ("<id>_and_<id>") sample directory.
paired_metadata_csv <- function(sample_id) {
  paste0(data_dir, sample_id, "/", sample_id, "_metadata.csv")
}

# One metadata table per paired library; the "X" column holds the row names.
HCA_F_GON11173193_and_HCA_F_GON11212448_meta <- read.csv(
  paired_metadata_csv("HCA_F_GON11173193_and_HCA_F_GON11212448"),
  row.names = "X"
)
HD_F_GON12139982_and_HD_F_GON12220623_meta <- read.csv(
  paired_metadata_csv("HD_F_GON12139982_and_HD_F_GON12220623"),
  row.names = "X"
)
HD_F_GON12139983_and_HD_F_GON12220624_meta <- read.csv(
  paired_metadata_csv("HD_F_GON12139983_and_HD_F_GON12220624"),
  row.names = "X"
)

In [None]:
# Stack the per-sample metadata tables row-wise into a single table.
metadata <- rbind(
  HD_F_GON14666993_meta,
  HD_F_GON11282673_meta,
  HD_F_GON11282674_meta,
  HD_F_GON13254250_meta,
  HD_F_GON13254252_meta,
  HD_F_GON15261135_meta,
  HD_F_GON15261138_meta,
  HD_F_GON15265170_meta,
  HCA_F_GON11173193_and_HCA_F_GON11212448_meta,
  HD_F_GON12139982_and_HD_F_GON12220623_meta,
  HD_F_GON12139983_and_HD_F_GON12220624_meta
)
# Rows x columns of the combined table.
dim(metadata)

In [None]:
# Late male samples
# Each sample's Arrow file lives at <root>/<sample>/<sample>.arrow.
late_male_arrow_root <- "/nfs/team292/vl6/FetalReproductiveTract/ATAC_QC/ArchR/"
late_male_sample_ids <- c(
  "HD_F_GON14666993",
  "HD_F_GON11282673",
  "HD_F_GON11282674",
  "HD_F_GON13254250",
  "HD_F_GON13254252",
  "HD_F_GON15261135",
  "HD_F_GON15261138",
  "HD_F_GON15265170",
  "HCA_F_GON11173193_and_HCA_F_GON11212448",
  "HD_F_GON12139982_and_HD_F_GON12220623",
  "HD_F_GON12139983_and_HD_F_GON12220624"
)
late_male_arrow_files <- paste0(
  late_male_arrow_root, late_male_sample_ids, "/", late_male_sample_ids, ".arrow"
)

males_late <- ArchRProject(
  ArrowFiles = late_male_arrow_files,
  outputDirectory = "/nfs/team292/vl6/FetalReproductiveTract/ATAC_QC/ArchR/males_late/",
  # Recommended, so that an original copy of the Arrow files remains
  # available if the project's copies are modified later.
  copyArrows = TRUE
)

In [None]:
# Display the project summary.
males_late

In [None]:
# In-memory size of the project object, in MB (bytes / 10^6, 3 decimals).
project_size_mb <- round(object.size(males_late) / 10^6, 3)
paste0("Memory Size = ", project_size_mb, " MB")

In [None]:
# List the matrices available in the project.
getAvailableMatrices(males_late)

### 2. Add sample-level metadata and filter cells 

In [None]:
# Convert the combined metadata to a DataFrame before it is stored as the
# project's cellColData.
metadata <- DataFrame(metadata)

In [None]:
# Keep only the cells described in `metadata`, then install `metadata` as the
# project's per-cell annotation table.
# The subset is driven by the metadata's cell names. The previous code
# indexed the project with `row.names(males_late)`, i.e. with the project's
# own row names, which cannot restrict it to the cells in `metadata`.
# NOTE(review): ArchRProject does not appear to define row names, so that
# call likely returned NULL -- confirm.
metadata_cells <- rownames(metadata)

# Every annotated cell must exist in the project, otherwise cellColData would
# describe cells the Arrow files do not contain.
stopifnot(all(metadata_cells %in% getCellNames(males_late)))

males_late <- males_late[metadata_cells]
males_late@cellColData <- metadata
males_late

In [None]:
# Number of cells per sample.
table(males_late$Sample)

In [None]:
# Add donor and stage information --> integrate by donor for consistency with scRNA-seq
library(dplyr)

# Single lookup table for all sample-level annotations, so donor, stage and
# technology cannot drift out of sync across separate per-field mappings.
# NOTE(review): `stage` is presumably in post-conception weeks -- confirm.
sample_info <- data.frame(
  sample = c(
    "HD_F_GON14666993",
    "HD_F_GON11282673",
    "HD_F_GON11282674",
    "HD_F_GON13254250",
    "HD_F_GON13254252",
    "HD_F_GON15261135",
    "HD_F_GON15261138",
    "HD_F_GON15265170",
    "HCA_F_GON11173193_and_HCA_F_GON11212448",
    "HD_F_GON12139983_and_HD_F_GON12220624",
    "HD_F_GON12139982_and_HD_F_GON12220623"
  ),
  donor = c(
    "Hrv263", "Hrv116", "Hrv117", "Hrv184", "Hrv184", "Hrv272",
    "Hrv272", "Hrv273", "Hrv104", "Hrv129", "Hrv129"
  ),
  stage = c(14, 12, 20, 15, 15, 18, 18, 20, 17, 17, 17),
  technology = c(
    "ATAC", "ATAC", "ATAC", "ATAC", "ATAC", "ATAC",
    "ATAC", "ATAC", "MULTIOME", "MULTIOME", "MULTIOME"
  ),
  stringsAsFactors = FALSE
)

# Row of `sample_info` that annotates each cell.
cell_samples <- as.character(males_late$Sample)
sample_idx <- match(cell_samples, sample_info$sample)

# Fail loudly instead of silently assigning NA: an unannotated sample would
# otherwise carry an NA stage into the stage-wise batch correction.
if (anyNA(sample_idx)) {
  stop(
    "Samples without donor/stage/technology annotation: ",
    paste(unique(cell_samples[is.na(sample_idx)]), collapse = ", "),
    call. = FALSE
  )
}

donor <- sample_info$donor[sample_idx]
stage <- sample_info$stage[sample_idx]
technology <- sample_info$technology[sample_idx]

# Store the annotations as per-cell columns of the project.
males_late$donor <- donor
males_late$stage <- stage
males_late$technology <- technology

In [None]:
# Number of cells per donor.
table(males_late$donor)

In [None]:
# Number of cells per stage.
table(males_late$stage)

### 3. Filter cells with specific TSS enrichment score and Log10(uniqueFragments)

In [None]:
# Per-cell QC metrics: log10 fragment count (column 1) and TSS enrichment
# (column 2).
qc_metric_columns <- c("log10(nFrags)", "TSSEnrichment")
df <- getCellColData(males_late, select = qc_metric_columns)
df

In [None]:
# Density scatter of fragment count against TSS enrichment. Both axes are
# capped at the 99th percentile of the plotted metric; the x axis starts at
# log10(500).
p <- ggPoint(
  x = df[, 1],
  y = df[, 2],
  colorDensity = TRUE,
  continuousSet = "sambaNight",
  xlabel = "Log10 Unique Fragments",
  ylabel = "TSS Enrichment",
  xlim = c(log10(500), quantile(df[, 1], probs = 0.99)),
  ylim = c(0, quantile(df[, 2], probs = 0.99))
)

# Dashed reference lines at TSS enrichment = 8 and log10(nFrags) = 3.25.
p <- p +
  geom_hline(yintercept = 8, lty = "dashed") +
  geom_vline(xintercept = 3.25, lty = "dashed")

p

In [None]:
# plotPDF(p, name = "TSS-vs-Frags.pdf", 
#         ArchRProj = males_late, addDOC = FALSE)

### 4. Plotting sample-level statistics

In [None]:
# Ridge plot of per-cell TSS enrichment, grouped by sample.
p1 <- plotGroups(
  ArchRProj = males_late,
  colorBy = "cellColData",
  name = "TSSEnrichment",
  groupBy = "Sample",
  plotAs = "ridges"
)
p1

In [None]:
# Violin plot (with box plot overlay) of per-cell TSS enrichment, grouped by
# sample.
p2 <- plotGroups(
  ArchRProj = males_late,
  colorBy = "cellColData",
  name = "TSSEnrichment",
  groupBy = "Sample",
  plotAs = "violin",
  addBoxPlot = TRUE,
  alpha = 0.4
)
p2

In [None]:
# Ridge plot of per-cell log10 fragment counts, grouped by sample.
p3 <- plotGroups(
  ArchRProj = males_late,
  colorBy = "cellColData",
  name = "log10(nFrags)",
  groupBy = "Sample",
  plotAs = "ridges"
)
p3

In [None]:
# Violin plot (with box plot overlay) of per-cell log10 fragment counts,
# grouped by sample.
p4 <- plotGroups(
  ArchRProj = males_late,
  colorBy = "cellColData",
  name = "log10(nFrags)",
  groupBy = "Sample",
  plotAs = "violin",
  addBoxPlot = TRUE,
  alpha = 0.4
)
p4

In [None]:
# plotPDF(p1,p2,p3,p4, name = "QC-Sample-Statistics.pdf", ArchRProj = males_late, 
#         addDOC = FALSE, width = 4, height = 4)

In [None]:
# Fragment size plot for the project.
p5 <- plotFragmentSizes(ArchRProj = males_late)
p5

In [None]:
# TSS enrichment plot for the project.
p6 <- plotTSSEnrichment(ArchRProj = males_late)
p6

In [None]:
# plotPDF(p5,p6, name = "QC-Sample-FragSizes-TSSProfile.pdf", ArchRProj = males_late, 
#         addDOC = FALSE, width = 5, height = 5)

### 6. Filtering doublets 

In [None]:
# males_late <- filterDoublets(males_late)
# males_late

### 7. Dimensionality reduction with iterative Latent Semantic Indexing

In [None]:
# Re-check the installed htmltools version before running the LSI step.
packageVersion("htmltools")

In [None]:
# Clustering settings used between LSI iterations (see Seurat::FindClusters).
lsi_cluster_params <- list(
  resolution = 0.2,
  sampleCells = 10000,
  n.start = 10
)

# Two rounds of iterative LSI on the tile matrix, using 25,000 variable
# features and dimensions 1-30; stored in the project as "IterativeLSI".
males_late <- addIterativeLSI(
  ArchRProj = males_late,
  useMatrix = "TileMatrix",
  name = "IterativeLSI",
  iterations = 2,
  clusterParams = lsi_cluster_params,
  varFeatures = 25000,
  dimsToUse = 1:30
)

In [None]:
# ---- Extract LSI coordinates ----
# Pull the "IterativeLSI" reduction as a plain matrix, requesting dimensions
# 1-30 with scaling and a correlation cut-off of 0.75.
lsi.mtx <- getReducedDims(
  ArchRProj = males_late,
  reducedDims = "IterativeLSI",
  dimsToUse = 1:30,
  scaleDims = TRUE,
  corCutOff = 0.75,
  returnMatrix = TRUE
)
# Cells x retained dimensions.
dim(lsi.mtx)

### 8. Batch effects correction with Mutual Nearest Neighbors 

A convenient property of MNN is that we can choose the order in which the samples are integrated. This lets us account for the different degrees of variability between samples that are closer together or further apart in developmental time.

In [None]:
library(uwot) # UMAP 

In [None]:
# Stage of every cell, taken from the project's per-cell annotations.
stage_annotations <- getCellColData(males_late)
timepoints <- stage_annotations$stage
table(timepoints)

In [None]:
# Sample of origin of every cell, taken from the project's per-cell annotations.
sample_annotations <- getCellColData(males_late)
samples <- sample_annotations$Sample
table(samples)

In [None]:
# Split the LSI matrix into a two-level list: one element per stage, each
# holding one cells-x-dimensions matrix per sample of that stage.
# Stages and samples appear in order of first occurrence.
lsi_list <- lapply(unique(timepoints), function(stage_value) {
  in_stage <- timepoints == stage_value
  stage_lsi <- lsi.mtx[in_stage, , drop = FALSE]
  stage_samples <- samples[in_stage]

  per_sample <- lapply(unique(stage_samples), function(sample_id) {
    stage_lsi[stage_samples == sample_id, , drop = FALSE]
  })
  names(per_sample) <- unique(stage_samples)
  per_sample
})

In [None]:
# Per-cell stage and sample labels from the project's annotations.
cell_annotations <- getCellColData(males_late)
timepoints <- cell_annotations$stage
samples <- cell_annotations$Sample

# Two-level split of the LSI matrix: stage -> sample -> cells-x-dimensions
# matrix. Stages and samples appear in order of first occurrence.
stage_values <- unique(timepoints)
lsi_list <- lapply(stage_values, function(stage_value) {
  in_stage <- timepoints == stage_value
  stage_lsi <- lsi.mtx[in_stage, , drop = FALSE]
  stage_samples <- samples[in_stage]

  per_sample <- lapply(unique(stage_samples), function(sample_id) {
    stage_lsi[stage_samples == sample_id, , drop = FALSE]
  })
  names(per_sample) <- unique(stage_samples)
  per_sample
})

# Label the outer list by stage.
names(lsi_list) <- stage_values

In [None]:
# Cross-tabulate cells by sample (rows) and stage (columns).
sample_stage_annotations <- getCellColData(males_late)
table(sample_stage_annotations$Sample, sample_stage_annotations$stage)

In [None]:
names(lsi_list) <- unique(timepoints)

# Desired integration order: stages ascending, and samples within each stage
# in the order listed here.
timepoint_order <- c(12, 14, 15, 17, 18, 20)
sample_order <- c(
  "HD_F_GON11282673",
  "HD_F_GON14666993",
  "HD_F_GON13254250",
  "HD_F_GON13254252",
  "HCA_F_GON11173193_and_HCA_F_GON11212448",
  "HD_F_GON12139982_and_HD_F_GON12220623",
  "HD_F_GON12139983_and_HD_F_GON12220624",
  "HD_F_GON15261138",
  "HD_F_GON15261135",
  "HD_F_GON15265170",
  "HD_F_GON11282674"
)

# Arrange the stages to match timepoint_order.
stage_rank <- match(names(lsi_list), timepoint_order)
lsi_list <- lsi_list[order(stage_rank)]

# Arrange the samples inside each stage to match sample_order.
lsi_list <- lapply(lsi_list, function(stage_batches) {
  sample_rank <- match(names(stage_batches), sample_order)
  stage_batches[order(sample_rank)]
})

In [None]:
library(batchelor)

In [None]:
# Perform corrections within stages: samples of the same stage are
# MNN-corrected against each other; a stage with a single sample is passed
# through unchanged.
correct_list <- lapply(lsi_list, function(stage_batches) {
  if (length(stage_batches) <= 1) {
    return(stage_batches[[1]])
  }
  do.call(reducedMNN, stage_batches)$corrected
})

In [None]:
# Perform correction over stages: merge the per-stage matrices in list order.
# seq_along() replaces 1:length(), which would yield c(1, 0) for an empty
# list.
# NOTE(review): the corrected rows are presumably returned batch by batch
# (stage, then sample) rather than in the project's original cell order --
# confirm before matching rows to cells by position.
lsi.mtx <- reducedMNN(correct_list, merge.order = seq_along(correct_list))$corrected

# The intermediate lists are no longer needed.
rm(correct_list, lsi_list)

### 9. UMAP embedding

In [None]:
library(tibble)

# Per-cell annotations as a data.table; keep.rownames puts the cell names in
# a column called "rn".
sample_metadata <- as.data.table(getCellColData(males_late), keep.rownames = TRUE)

# Rename that column to "cell" (no-op if it is absent).
is_rowname_col <- names(sample_metadata) == "rn"
names(sample_metadata)[is_rowname_col] <- "cell"

head(sample_metadata)

In [None]:
# Embedding settings. `pt.size` and `dimred` are defined here but not used
# inside this cell.
pt.size <- if (nrow(lsi.mtx) >= 1e4) 0.8 else 1.2
n_neighbors <- 25
min_dist <- 0.50
dimred <- "IterativeLSI_MNN"

# The loops allow several parameter values to be listed; `to.plot` keeps the
# result of the last combination.
for (i in n_neighbors) {
  for (j in min_dist) {

    # Run UMAP (seeded) on the corrected LSI matrix, rounded to 2 decimals
    set.seed(1996)
    umap_embedding.mtx <- round(
      umap(lsi.mtx, n_neighbors = i, min_dist = j, metric = "cosine", fast_sgd = TRUE),
      2
    )
    rownames(umap_embedding.mtx) <- rownames(lsi.mtx)

    # Fetch UMAP coordinates as a table keyed by cell name
    umap.dt <- as.data.table(umap_embedding.mtx, keep.rownames = TRUE)
    setnames(umap.dt, c("cell", "umap1", "umap2"))

    # Attach the per-cell metadata
    to.plot <- merge(umap.dt, sample_metadata, by = "cell")
  }
}

In [None]:
# UMAP scatter with points filled by sample.
p <- ggplot(to.plot, aes(x = umap1, y = umap2, fill = Sample)) +
  geom_point(shape = 21, size = 1, stroke = 0.05) +
  # ggrastr::geom_point_rast(size=1.5, shape=21, stroke=0.05) +  # DOES NOT WORK IN THE CLUSTER
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    text = element_text(size = 15),
    legend.position = "bottom"
  )
p

In [None]:
# Save the per-sample UMAP as a 9x9 PDF; print() draws the ggplot on the
# open device and dev.off() closes the file.
pdf("/nfs/team292/vl6/FetalReproductiveTract/ATAC_QC/ArchR/males_late/umap_sample.pdf", width=9, height=9)
print(p)
dev.off()

In [None]:
# UMAP scatter with points filled by the "Clusters2" metadata column
# (legend left at its default position).
p2 <- ggplot(to.plot, aes(x = umap1, y = umap2, fill = Clusters2)) +
  geom_point(shape = 21, size = 1, stroke = 0.05) +
  # ggrastr::geom_point_rast(size=1.5, shape=21, stroke=0.05) +  # DOES NOT WORK IN THE CLUSTER
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    text = element_text(size = 15)
  )
p2

In [None]:
# Save the Clusters2 UMAP as a 9x9 PDF; print() draws the ggplot on the
# open device and dev.off() closes the file.
pdf("/nfs/team292/vl6/FetalReproductiveTract/ATAC_QC/ArchR/males_late/umap_clusters2.pdf", width=9, height=9)
print(p2)
dev.off()

In [None]:
# UMAP scatter with points filled by stage.
p3 <- ggplot(to.plot, aes(x = umap1, y = umap2, fill = stage)) +
  geom_point(shape = 21, size = 1, stroke = 0.05) +
  # ggrastr::geom_point_rast(size=1.5, shape=21, stroke=0.05) +  # DOES NOT WORK IN THE CLUSTER
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    text = element_text(size = 15),
    legend.position = "bottom"
  )
p3

In [None]:
# Save the per-stage UMAP as a 7x7 PDF; print() draws the ggplot on the
# open device and dev.off() closes the file.
pdf("/nfs/team292/vl6/FetalReproductiveTract/ATAC_QC/ArchR/males_late/umap_stage.pdf", width=7, height=7)
print(p3)
dev.off()

In [None]:
# UMAP scatter with points filled by donor.
p4 <- ggplot(to.plot, aes(x = umap1, y = umap2, fill = donor)) +
  geom_point(shape = 21, size = 1, stroke = 0.05) +
  # ggrastr::geom_point_rast(size=1.5, shape=21, stroke=0.05) +  # DOES NOT WORK IN THE CLUSTER
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    text = element_text(size = 15),
    legend.position = "bottom"
  )
p4

In [None]:
# Save the per-donor UMAP as a 7x7 PDF; print() draws the ggplot on the
# open device and dev.off() closes the file.
pdf("/nfs/team292/vl6/FetalReproductiveTract/ATAC_QC/ArchR/males_late/umap_donor.pdf", width=7, height=7)
print(p4)
dev.off()

In [None]:
# UMAP scatter with points filled by technology (ATAC vs MULTIOME).
p5 <- ggplot(to.plot, aes(x = umap1, y = umap2, fill = technology)) +
  geom_point(shape = 21, size = 1, stroke = 0.05) +
  # ggrastr::geom_point_rast(size=1.5, shape=21, stroke=0.05) +  # DOES NOT WORK IN THE CLUSTER
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    text = element_text(size = 15),
    legend.position = "bottom"
  )
p5

In [None]:
# Save the per-technology UMAP as a 7x7 PDF; print() draws the ggplot on the
# open device and dev.off() closes the file.
pdf("/nfs/team292/vl6/FetalReproductiveTract/ATAC_QC/ArchR/males_late/umap_technology.pdf", width=7, height=7)
print(p5)
dev.off()

In [None]:
# Save the UMAP coordinates together with the per-cell metadata; the cell
# names are in the "cell" column, so row names are not written.
write.csv(to.plot, "/nfs/team292/vl6/FetalReproductiveTract/ATAC_QC/ArchR/males_late/umap_coords.csv", row.names = FALSE)

In [None]:
# Save the MNN-corrected LSI embedding (values only; row names are not
# written).
# NOTE(review): the cell names are dropped here, and the rows of `lsi.mtx`
# are presumably in reducedMNN output order rather than the project's cell
# order -- confirm that downstream code can still match rows to cells.
write.csv(lsi.mtx, "/nfs/team292/vl6/FetalReproductiveTract/ATAC_QC/ArchR/males_late/mnn_embedding.csv", row.names = FALSE)