## R scripts for ArchR on a per-sample basis

### 1. QC, dimensionality reduction with LSI (TF-IDF + SVD) and gene scores 

In [None]:
###################
# Import packages #
###################

library(ArchR)
library(parallel)
library(argparse)

In [None]:
# ---- Define parameters ----

# Root output folder. The trailing "/" matters because paths are assembled
# from it with paste0().
outdir <- "/nfs/team292/vl6/FetalReproductiveTract/ATAC_QC/ArchR/"
# Path of the .rds file holding the scRNA-seq reference (read with readRDS()).
rnadir <- "/nfs/team292/vl6/FetalReproductiveTract/males_post10pcw_iss.rds"
# Name of the reference annotation passed as groupRNA during integration.
annots <- "celltype"

In [None]:
# Identifier of the sample processed by this notebook; used to build paths.
sample <- "HD_F_GON11282673"
print(sample)

In [None]:
# ---- Create ArchR project from ArrowFiles from Nextflow ----

# Genome build used by ArchR.
addArchRGenome("hg38")

# Build the project from this sample's ArrowFile, located at
# <outdir><sample>/<sample>.arrow.
project <- ArchRProject(
  ArrowFiles = paste0(outdir, sample, "/", sample, ".arrow"),
  outputDirectory = outdir,
  # Recommended so that an original copy of the Arrow files remains available
  # for later use if they get modified.
  copyArrows = TRUE
)

# Display the project summary.
project

In [None]:
# ---- QC filtering: cells before any filter ----

# Per-cell QC table: column 1 = log10(nFrags), column 2 = TSSEnrichment.
df <- getCellColData(project, select = c("log10(nFrags)", "TSSEnrichment"))

# Density-coloured scatter of the two metrics; each axis is capped at its 99th
# percentile. The dashed lines sit at TSS enrichment 8 and log10(nFrags) 3.25.
p_raw <- ggPoint(
  x = df[, 1],
  y = df[, 2],
  colorDensity = TRUE,
  continuousSet = "sambaNight",
  xlabel = "Log10 Unique Fragments",
  ylabel = "TSS Enrichment",
  xlim = c(log10(500), quantile(df[, 1], probs = 0.99)),
  ylim = c(0, quantile(df[, 2], probs = 0.99))
) +
  geom_hline(yintercept = 8, lty = "dashed") +
  geom_vline(xintercept = 3.25, lty = "dashed")

p_raw

In [None]:
# ---- QC filtering: apply thresholds ----

# Cut-offs used both by the filters and by the dashed guide lines on the plot,
# so the two can never disagree.
min_log10_frags <- 3.25
min_tss <- 8

# Log10(nFrags): minimum 3.25.
# `df` is the QC table fetched from `project` before this cell; this assumes
# its rows are in the same order as project$cellNames.
idxPass2 <- which(df[, 1] >= min_log10_frags)
cellsPass2 <- project$cellNames[idxPass2]
project <- project[cellsPass2, ]

# TSS enrichment score: minimum 8.
idxPass1 <- which(project$TSSEnrichment >= min_tss)
cellsPass1 <- project$cellNames[idxPass1]
project <- project[cellsPass1, ]

# Refresh the QC table and redraw the scatter for the cells that were kept.
df <- getCellColData(project, select = c("log10(nFrags)", "TSSEnrichment"))
p_filtered <- ggPoint(
  x = df[, 1],
  y = df[, 2],
  colorDensity = TRUE,
  continuousSet = "sambaNight",
  xlabel = "Log10 Unique Fragments",
  ylabel = "TSS Enrichment",
  xlim = c(log10(500), quantile(df[, 1], probs = 0.99)),
  ylim = c(0, quantile(df[, 2], probs = 0.99))
) +
  geom_hline(yintercept = min_tss, lty = "dashed") +
  geom_vline(xintercept = min_log10_frags, lty = "dashed")

p_filtered

In [None]:
# Display the project summary after filtering.
project

In [None]:
# ---- QC plots ----

# TSS enrichment grouped by sample, drawn as ridges.
p_tss <- plotGroups(
  ArchRProj = project,
  groupBy = "Sample",
  colorBy = "cellColData",
  name = "TSSEnrichment",
  plotAs = "ridges"
)
p_tss

In [None]:
# TSS enrichment grouped by sample, drawn as violins with a box plot added.
p_tss_v <- plotGroups(
  ArchRProj = project,
  groupBy = "Sample",
  colorBy = "cellColData",
  name = "TSSEnrichment",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_tss_v

In [None]:
# log10(nFrags) grouped by sample, drawn as ridges.
p_frags <- plotGroups(
  ArchRProj = project,
  groupBy = "Sample",
  colorBy = "cellColData",
  name = "log10(nFrags)",
  plotAs = "ridges"
)
p_frags

In [None]:
# log10(nFrags) grouped by sample, drawn as violins with a box plot added.
p_frags_v <- plotGroups(
  ArchRProj = project,
  groupBy = "Sample",
  colorBy = "cellColData",
  name = "log10(nFrags)",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_frags_v

In [None]:
# Fragment size plot for the project.
p_frag_2 <- plotFragmentSizes(ArchRProj = project)
p_frag_2

In [None]:
# TSS enrichment plot for the project.
p_tss_2 <- plotTSSEnrichment(ArchRProj = project)
p_tss_2

In [None]:
# ---- Doublets filtering ----

# NOTE(review): no doublet scoring step is visible in this notebook;
# presumably the scores were added when the ArrowFile was created — confirm.
project <- filterDoublets(ArchRProj = project)

In [None]:
# ---- Dimensionality reduction with LSI ----

# Two LSI iterations on the tile matrix, stored under "IterativeLSI".
project <- addIterativeLSI(
  ArchRProj = project,
  useMatrix = "TileMatrix",
  name = "IterativeLSI",
  iterations = 2,
  clusterParams = list(
    resolution = 2,
    sampleCells = 10000,
    maxClusters = 6,
    n.start = 10
  ),
  varFeatures = 25000,
  dimsToUse = 1:30,
  LSIMethod = 2
)

In [None]:
# ---- Clustering ----

# Cluster cells on the "IterativeLSI" dimensions; the labels are stored in the
# "Clusters" column, and force = TRUE allows re-running this cell.
project <- addClusters(
  input = project,
  reducedDims = "IterativeLSI",
  method = "Seurat",
  name = "Clusters",
  resolution = 2,
  # maxClusters = 25,
  knnAssign = 20,
  force = TRUE
)

# Number of cells in each cluster.
table(project$Clusters)

In [None]:
# ---- UMAP embedding ----

# Compute a UMAP from the "IterativeLSI" dimensions and store it as "UMAP".
project <- addUMAP(
  ArchRProj = project,
  reducedDims = "IterativeLSI",
  name = "UMAP",
  nNeighbors = 30,
  minDist = 0.5,
  metric = "cosine"
)

In [None]:
# UMAP coloured by the "Sample" column.
p_sample <- plotEmbedding(
  ArchRProj = project,
  colorBy = "cellColData",
  name = "Sample",
  embedding = "UMAP"
)
p_sample

In [None]:
# UMAP coloured by the "Clusters" column.
p_clusters <- plotEmbedding(
  ArchRProj = project,
  colorBy = "cellColData",
  name = "Clusters",
  embedding = "UMAP"
)
p_clusters

In [None]:
# ---- Compute marker features ----

# Per-cluster marker gene scores (Wilcoxon test), with TSS enrichment and
# log10(nFrags) supplied as bias variables.
markersGS <- getMarkerFeatures(
  ArchRProj = project,
  useMatrix = "GeneScoreMatrix",
  groupBy = "Clusters",
  bias = c("TSSEnrichment", "log10(nFrags)"),
  testMethod = "wilcoxon"
)

In [None]:
# Keep markers with FDR <= 0.01 and Log2FC >= 1.25, then preview them.
markerList <- getMarkers(
  seMarker = markersGS,
  cutOff = "FDR <= 0.01 & Log2FC >= 1.25"
)
head(markerList)

In [None]:
# Genes to label on the heatmap and to plot on the UMAP, grouped by the
# lineage labels used in this notebook.
markerGenes <- c(
  # Epithelial
  "EPCAM", "PAX8", "PAX2", "GATA3", "TP63", "DPYS", "AMN", "WNT9B", "KLK11",
  # Erythroid
  "GATA1", "ALAS2",
  # Endothelial
  "CDH5",
  # Immune
  "PTPRC",
  # Mesenchymal
  "PDGFRA", "RGS5", "DLK1", "DES", "PTGER3", "DCN", "EMX2",
  "HOXA7", "HOXA9", "HOXA10", "HOXD10", "HOXD11", "HOXA13", "ISL1",
  # Neural
  "STMN2", "PLP1", "CDK1"
)

In [None]:
# Marker heatmap using the same cut-off as the marker list, with the genes in
# `markerGenes` labelled.
heatmapGS <- markerHeatmap(
  seMarker = markersGS,
  cutOff = "FDR <= 0.01 & Log2FC >= 1.25",
  labelMarkers = markerGenes,
  transpose = TRUE
)
heatmapGS

In [None]:
# Draw the heatmap with both legends placed at the bottom.
ComplexHeatmap::draw(
  heatmapGS,
  heatmap_legend_side = "bot",
  annotation_legend_side = "bot"
)

In [None]:
# Gene scores of the marker genes on the UMAP, without imputation.
p_genes <- plotEmbedding(
  ArchRProj = project,
  colorBy = "GeneScoreMatrix",
  name = markerGenes,
  embedding = "UMAP",
  quantCut = c(0.01, 0.95),
  imputeWeights = NULL,
  continuousSet = "whiteBlue"
)
#p_genes

In [None]:
# Add imputation weights to the project for the plots that follow.
project <- addImputeWeights(project)

In [None]:
# Same marker genes on the UMAP, this time using the imputation weights.
p_genes_impute <- plotEmbedding(
  ArchRProj = project,
  colorBy = "GeneScoreMatrix",
  name = markerGenes,
  embedding = "UMAP",
  imputeWeights = getImputeWeights(project)
)
#p_genes_impute

In [None]:
# ---- Assigning cluster identities with scRNA-seq matching data ----

# Load the scRNA-seq reference object from the .rds file at `rnadir`.
rna <- readRDS(rnadir)
rna

In [None]:
# Cells per reference label. The column is looked up through `annots` rather
# than a hard-coded name, so this table always describes the same labels that
# are passed as groupRNA to the integration.
print(table(colData(rna)[[annots]]))

In [None]:
# Show which annotation column is in use.
annots

In [None]:
# ---- Unconstrained integration ----

# Integrate the gene scores with the scRNA-seq reference `rna`, grouping the
# reference by `annots`. Results go to the *_Un columns; the matrix is written
# to the Arrow files (addToArrow = TRUE), overwriting any existing one
# (force = TRUE).
project <- addGeneIntegrationMatrix(
  ArchRProj = project,
  useMatrix = "GeneScoreMatrix",
  matrixName = "GeneIntegrationMatrix",
  reducedDims = "IterativeLSI",
  seRNA = rna,
  groupRNA = annots,
  nameCell = "predictedCell_Un",
  nameGroup = "predictedGroup_Un",
  nameScore = "predictedScore_Un",
  addToArrow = TRUE,
  force = TRUE
)

In [None]:
# Cross-tabulate ATAC clusters (rows) against the predicted groups of the
# unconstrained integration (columns), converted to a base matrix.
cM <- as.matrix(
  confusionMatrix(i = project$Clusters, j = project$predictedGroup_Un)
)
cM

In [None]:
library(ggplot2)

In [None]:
# 100 colours interpolated from white to red.
my_palette <- colorRampPalette(colors = c("white", "red"))(100)


In [None]:
# Set the notebook plot size to 15 x 15 for the figure drawn next.
options(
  repr.plot.width = 15,
  repr.plot.height = 15
)

In [None]:
# Reshape the confusion matrix to long format: Var1 = row name (cluster),
# Var2 = column name (predicted group), value = matrix entry.
# NOTE(review): melt() is not loaded explicitly in this file; presumably it is
# resolved through a package attached elsewhere — confirm.
to.plot <- melt(cM)

# Order the clusters numerically (C1, C2, ..., C10, ...) using the clusters
# that are actually present, so that no cluster is turned into NA when there
# are more than 25 of them.
cluster_ids <- unique(as.character(to.plot$Var1))
cluster_ids <- cluster_ids[order(as.integer(sub("^C", "", cluster_ids)))]
to.plot$Var1 <- factor(to.plot$Var1, levels = cluster_ids)

# Tile heatmap with the value printed in every tile.
p <- ggplot(to.plot, aes(x = Var2, y = Var1, fill = value)) +
  geom_tile() +
  geom_text(aes(label = round(value, 1))) +
  scale_fill_gradient(low = "white", high = "red") +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    plot.title = element_text(size = 20),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    legend.text = element_text(size = 16)
  )
print(p)

In [None]:
# Set the notebook plot size back to 10 x 10.
options(
  repr.plot.width = 10,
  repr.plot.height = 10
)

In [None]:
# Re-display the UMAP coloured by cluster.
p_clusters

In [None]:
# Preview the per-cell metadata.
head(project@cellColData)

In [None]:
# UMAP coloured by TSS enrichment.
# Note: this reassigns `p_tss`, which held the ridge plot created earlier.
p_tss <- plotEmbedding(
  ArchRProj = project,
  colorBy = "cellColData",
  name = "TSSEnrichment",
  embedding = "UMAP"
)
p_tss

In [None]:
# UMAP coloured by the number of fragments.
p_nfrags <- plotEmbedding(
  ArchRProj = project,
  colorBy = "cellColData",
  name = "nFrags",
  embedding = "UMAP"
)
p_nfrags

In [None]:
# UMAP coloured by the prediction score of the unconstrained integration.
p_score <- plotEmbedding(
  ArchRProj = project,
  colorBy = "cellColData",
  name = "predictedScore_Un",
  embedding = "UMAP"
)
p_score

In [None]:
# For every row of cM (a cluster) take the column name (predicted group) that
# holds the largest entry, then show it next to the cluster name.
preClust <- colnames(cM)[apply(X = cM, MARGIN = 1, FUN = which.max)]
cbind(preClust, rownames(cM)) # Assignments

In [None]:
# UMAP coloured by the group predicted in the unconstrained integration.
p_unconstrained <- plotEmbedding(
  project,
  colorBy = "cellColData",
  name = "predictedGroup_Un",
  embedding = "UMAP"
)
p_unconstrained

In [None]:
# Run the integration again with the same inputs, storing the results in the
# predictedCell / predictedGroup / predictedScore columns. No groupList is
# supplied here, and force = TRUE overwrites the existing
# "GeneIntegrationMatrix".
project <- addGeneIntegrationMatrix(
  ArchRProj = project,
  useMatrix = "GeneScoreMatrix",
  matrixName = "GeneIntegrationMatrix",
  reducedDims = "IterativeLSI",
  seRNA = rna,
  groupRNA = annots,
  nameCell = "predictedCell",
  nameGroup = "predictedGroup",
  nameScore = "predictedScore",
  addToArrow = TRUE,
  force = TRUE
)

In [None]:
# Recompute the imputation weights.
project <- addImputeWeights(project)

In [None]:
# Pull the integration matrix out of the project.
GeneIntegration <- getMatrixFromProject(
  ArchRProj = project,
  useMatrix = "GeneIntegrationMatrix"
)

In [None]:
# Keep only the marker genes that are present in the integration matrix.
markerGenes <- intersect(markerGenes, rowData(GeneIntegration)$name)

In [None]:
# Marker genes on the UMAP, coloured by the integration matrix values.
p_integ <- plotEmbedding(
  ArchRProj = project,
  colorBy = "GeneIntegrationMatrix",
  name = markerGenes,
  embedding = "UMAP",
  continuousSet = "horizonExtra",
  imputeWeights = getImputeWeights(project)
)
# p_integ

In [None]:
# Same genes and palette, coloured by gene scores for comparison.
p_integ2 <- plotEmbedding(
  ArchRProj = project,
  colorBy = "GeneScoreMatrix",
  name = markerGenes,
  embedding = "UMAP",
  continuousSet = "horizonExtra",
  imputeWeights = getImputeWeights(project)
)
#p_integ2

In [None]:
# Relabel every cluster with the predicted group that has the largest entry in
# its row of the confusion matrix, and store the result as "Clusters2".
cM <- confusionMatrix(i = project$Clusters, j = project$predictedGroup)
labelOld <- rownames(cM)
labelNew <- colnames(cM)[apply(X = cM, MARGIN = 1, FUN = which.max)]
project$Clusters2 <- mapLabels(
  project$Clusters,
  newLabels = labelNew,
  oldLabels = labelOld
)

In [None]:
# Cross-check the original cluster labels against the new ones.
table(project$Clusters, project$Clusters2)

In [None]:
# Cell counts per (cluster, predicted group) pair in long format, keeping NA
# as a category when present.
confusion_matrix <- as.data.frame(
  table(project$Clusters, project$predictedGroup, useNA = "ifany")
)

# Tile heatmap of the counts with the count printed in every tile.
ggplot(data = confusion_matrix, mapping = aes(x = Var2, y = Var1)) +
  geom_tile(aes(fill = Freq)) +
  geom_text(aes(label = sprintf("%1.0f", Freq)), vjust = 1) +
  scale_fill_gradient(low = "white", high = "coral") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    text = element_text(size = 15)
  )

In [None]:
# UMAP coloured by the relabelled clusters stored in "Clusters2".
p_clusters2 <- plotEmbedding(
  project,
  colorBy = "cellColData",
  name = "Clusters2",
  embedding = "UMAP"
)
p_clusters2

In [None]:
# Re-display the unconstrained prediction plot for comparison.
p_unconstrained

In [None]:
# Number of cells per group predicted by the unconstrained integration.
table(project$predictedGroup_Un)

In [None]:
# Save ArchR project (currently disabled)
#project <- saveArchRProject(ArchRProj = project, outputDirectory = paste0(outdir, sample, "/"), load = FALSE)

In [None]:
# Show the output root used for the export below.
outdir

In [None]:
# Show the sample whose metadata is exported below.
sample

In [None]:
# Write the per-cell metadata into this sample's folder under `outdir`. The
# file name is derived from `sample` rather than hard-coded, so re-running the
# notebook for another sample cannot write a file labelled with the wrong
# sample ID.
write.csv(
  project@cellColData,
  paste0(outdir, sample, "/", sample, "_metadata.csv"),
  row.names = TRUE
)