# Load packages

In [2]:
import rpy2
import rpy2.robjects as robjects
import pandas as pd
import rpy2.robjects.lib.ggplot2 as ggplot2
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
base = importr('base')
import pandas
from IPython.display import Image

grdevices = importr('grDevices')

## To aid in printing HTML in notebooks
import rpy2.ipython.html
rpy2.ipython.html.init_printing()

## To see plots in an output cell
from rpy2.ipython.ggplot import image_png



# User input

In [4]:
# --- Input table and output folder ------------------------------------------
# NOTE(review): both values are absolute, machine-specific paths; they have to
# be adjusted before the notebook can run anywhere else.
filepath_df = "/Users/timnoahkempchen/Library/CloudStorage/GoogleDrive-timkem@stanford.edu/Meine Ablage/Datasets/U54/2023_01_30_subcluster8_and_metadata_incl_xy.csv"
output_dir = "/Users/timnoahkempchen/Library/CloudStorage/GoogleDrive-timkem@stanford.edu/Meine Ablage/Datasets/U54/Output_U54"

# --- Names of the columns to read from the input table -----------------------
treatment_column = "ben_mal"
cell_index_column = "index"
x_position_column = "x"
y_position_column = "y"
cell_type_column = "subcluster8"
region_column = "unique_region"

# --- The two treatment groups that are compared ------------------------------
treatment_condition_1 = "Malignant"
treatment_condition_2 = "Benign"

# --- Folder passed as `csv_output` when the distances are computed -----------
filepath_avg_dist = output_dir

# --- Number of label permutations for the iterated (expected) distances ------
number_of_iterations = 100

# --- Entries handed to the Dumbbell plot as `pair_to` -------------------------
# NOTE(review): Dumbbell_plot_interactions keeps only rows whose
# paste0(celltype1, "_", celltype2) value is contained in `pair_to`. The
# entries below look like single cell-type names rather than such pairs, so
# the filter may match nothing -- confirm the intended pairs.
pairs_for_comparisson_Dumbbell_plot = robjects.StrVector([
    "unidentified",
    "stromal",
    "plasma",
    "stromal_FAP_CD90",
    "macrophage_M2",
    "tumor_CA9",
    "stromal_TNC",
    "tumor_PDPN_CA9",
    "Treg",
    "mast",
    "CD8T",
    "stromal_CD90",
    "B",
    "DC_CD11c",
    "tumor",
    "tumor_PDPN",
    "macrophage_M1",
    "CD4T",
    "granulocyte",
    "vasculature",
    "NK",
    "stromal_Vimentin_CXCL12",
    "DC_HLADR",
    "myofibroblast",
    "gdT",
    "CD4T_DC_CD11c_assoc",
    "CD8T_DC_CD11c_assoc",
    "LEC",
    "B_FDC_assoc",
    "nerve",
    "epithelial",
    "B_DC_CD11c_assoc",
    "FDC",
])

## End of user input

# Prepare df

In [5]:
# Read the complete table and keep only the cells that belong to one of the
# two treatment conditions being compared.
df_full = pd.read_csv(filepath_df)
df = df_full[df_full[treatment_column].isin([treatment_condition_1, treatment_condition_2])]

# The second column of the CSV is used as the cell identifier (presumably the
# exported cell index -- confirm against the input file). Rename it to the
# user-configured id column instead of a hard-coded "index", so that changing
# `cell_index_column` in the user-input cell does not break the selection below.
df = df.rename(columns={df.columns[1]: cell_index_column})

# Keep only the columns the R functions need.
df = df[[cell_index_column, x_position_column, y_position_column, treatment_column, region_column, cell_type_column]]

# One row per (treatment, region) combination; used later to attach the
# treatment label to each region. `.first()` keeps the first value of every
# remaining column within each group.
metadata = (
    df.groupby([treatment_column, region_column])
    .first()
    .reset_index()
)

# Define R functions

In [6]:
get_triangulation_distances = robjects.r('''
# ---------------------------------------------------------------------------
# calculate_triangulation_distances
#
# Triangulates the cells of ONE region and returns every triangulation edge in
# both directions, annotated with the id and cell type of both end points.
#
# Arguments
#   df_input  data frame holding the cells of a single region. It must already
#             contain the helper columns `uniqueID` and `XYcellID`
#             (XYcellID = paste0(x, "_", y)), which the caller creates.
#   id, x_pos, y_pos, cell_type, region
#             column names, passed as character strings.
#
# Returns a data frame with the columns
#   <region>, celltype1_index, celltype1, celltype1_X, celltype1_Y,
#   celltype2_index, celltype2, celltype2_X, celltype2_Y, distance
#
# Relies on `deldir` and `%>%` being attached (the parallel loops below load
# "deldir" and "tidyverse" on every worker).
# ---------------------------------------------------------------------------
     calculate_triangulation_distances <- function(df_input, 
                                              id, 
                                              x_pos, 
                                              y_pos, 
                                              cell_type, 
                                              region) {
  # Compute the rdelaun distances; `delsgs` is the table of triangulation edges
  vtress <- deldir(df_input[[x_pos]], df_input[[y_pos]])
  rdelaun_result <- vtress$delsgs
  
  # Get interactions going both directions: swap the roles of end point 1 and
  # end point 2 by renaming, then restore the original column order
  inverse_result <- rdelaun_result 
  colnames(inverse_result) <- c("x2", "y2", "x1", "y1", "ind2", "ind1") 
  inverse_result <- inverse_result %>%
    dplyr::select(x1, y1, x2, y2, ind1, ind2)
  
  # Combine distances and annotate results with cell type and region information.
  # Cells are matched back to df_input through their "x_y" coordinate key.
  rdelaun_result <- rbind(rdelaun_result,
                         inverse_result) %>%
    dplyr::mutate(cell1ID = paste0(x1, "_", y1),
           cell2ID = paste0(x2, "_", y2))

  # Annotate end point 1
  annotated_result <- rdelaun_result %>%
    dplyr::left_join(df_input,
              by = c("cell1ID" = "XYcellID")) %>%
    dplyr::rename(celltype1 = {{ cell_type }}) %>%
    dplyr::select(-{{ x_pos }},
           -{{ y_pos }},
           -{{ region }},
           -uniqueID)

  # Annotate end point 2; the id column now exists twice and therefore carries
  # the join suffixes ".x" (end point 1) and ".y" (end point 2)
  annotated_result <- annotated_result %>%
    dplyr::left_join(df_input,
              by = c("cell2ID" = "XYcellID")) %>%
    dplyr::rename(celltype2 = {{ cell_type }}) %>%
    dplyr::select(x1, y1, celltype1, !!as.symbol(paste0(id, ".x")),
           x2, y2, celltype2, !!as.symbol(paste0(id, ".y")),
           {{ region }})
  
  # Calculate distance and reorder columns
  annotated_result <- annotated_result%>%
    dplyr::mutate(distance = sqrt((x2-x1)^2 + (y2-y1)^2)) %>%
    dplyr::select(!!as.symbol(region), 
           !!as.symbol(paste0(id, ".x")), celltype1, x1, y1,
           !!as.symbol(paste0(id, ".y")), celltype2, x2, y2,
           distance)
  colnames(annotated_result) <- c(region, 
                                  "celltype1_index", "celltype1", "celltype1_X", "celltype1_Y",
                                  "celltype2_index", "celltype2", "celltype2_X", "celltype2_Y", 
                                  "distance")
  return(annotated_result)
}  
       
       
# ---------------------------------------------------------------------------
# get_triangulation_distances
#
# Runs calculate_triangulation_distances() for every region of `df_input` in
# parallel and row-binds the per-region results.
#
# Arguments
#   df_input    data frame with one row per cell
#   id, x_pos, y_pos, cell_type, region
#               column names, passed as character strings
#   num_cores   number of worker processes; NULL (default) uses half of the
#               detected cores, but never fewer than one
#   csv_output  accepted for backward compatibility; not used by this function
#
# Returns the combined data frame of annotated triangulation edges.
# ---------------------------------------------------------------------------
        get_triangulation_distances <- function(df_input, 
                                        id, 
                                        x_pos, 
                                        y_pos, 
                                        cell_type, 
                                        region, 
                                        num_cores = NULL,
                                        csv_output = getwd()) {
  require(tidyverse)
  library(doSNOW)
  library(foreach)
  library(parallel)
  
  # Get unique regions
  unique_regions <- unique(df_input[[region]])
  
  # Select only necessary columns
  df_input <- df_input %>%
    dplyr::select({{ id }},
           {{ x_pos }},
           {{ y_pos }},
           {{ cell_type }},
           {{ region }})
  
  # Set up parallelization
  if (is.null(num_cores)){
    # Default to half of the available cores. detectCores() may return 1 or NA,
    # so clamp the result to at least one worker.
    num_cores <- max(1, floor(detectCores()/2), na.rm = TRUE)
  }
  cl <- makeCluster(num_cores)
  # Always release the worker processes, also when the parallel loop fails
  on.exit(stopCluster(cl), add = TRUE)
  clusterExport(cl, c("calculate_triangulation_distances"))
  registerDoSNOW(cl)
  
  # Progress bar (one tick per finished region)
  pb <- utils::txtProgressBar(max = length(unique_regions), style = 3)
  on.exit(close(pb), add = TRUE)
  progress <- function(n) utils::setTxtProgressBar(pb, n)
  opts <- list(progress = progress)
  
  triangulation_distances <- foreach(reg_index = seq_along(unique_regions), 
                                     .packages = c("deldir", "tidyverse"), 
                                     .combine = "rbind", 
                                     .options.snow = opts)%dopar%{
    # Subset to the current region and add the helper keys that
    # calculate_triangulation_distances() joins on
    subset <- df_input %>%
      dplyr::filter(!!as.symbol(region) == unique_regions[reg_index]) %>%
      dplyr::mutate(uniqueID = paste0(!!as.symbol(id), "-",
                               !!as.symbol(x_pos), "-",
                               !!as.symbol(y_pos)),
             XYcellID = paste0(!!as.symbol(x_pos),"_", !!as.symbol(y_pos)))
    
    result <- calculate_triangulation_distances(df_input = subset,
                                                id = id,
                                                x_pos = x_pos,
                                                y_pos = y_pos,
                                                cell_type = cell_type,
                                                region = region)
    return(result)
    }
  
  return(triangulation_distances)
}
''')

calculate_avg_distance = robjects.r('''
# ---------------------------------------------------------------------------
# calculate_avg_distance
#
# Summarises triangulation distances at two levels and writes both summaries
# as CSV files into `csv_output`:
#   per_cell_summary.csv      mean distance from each cell to each cell type
#   per_celltype_summary.csv  mean of those per-cell means for every
#                             (celltype1, celltype2, region) combination
#
# Arguments
#   triangulation_distances  data frame with the columns celltype1_index,
#                            celltype1, celltype2, distance and the region
#                            column. The default is self-referential and
#                            cannot be evaluated, so the argument must always
#                            be supplied.
#   csv_output               folder the two CSV files are written to
#   region                   name of the region column (character string);
#                            defaults to "unique_region"
#
# Returns the per-cell-type summary invisibly.
# ---------------------------------------------------------------------------
calculate_avg_distance <- function(triangulation_distances = triangulation_distances,
                                   csv_output = getwd(),
                                   region = "unique_region") {
  `%>%` <- magrittr::`%>%`
  
  print("Calculateing the average distance to different cell types on a per individual cell level. This can be interpreted as >> For cell #1, the average distance to a cell of type X is ____. <<")
  # Calculate the average distance to different cell types on a per individual cell level
  # This can be interpreted as
  # "For cell #1, the average distance to a cell of type X is ____."
  per_cell_summary <- triangulation_distances %>%
    dplyr::group_by(celltype1_index, celltype1, celltype2, !!as.symbol(region)) %>%
    dplyr::summarize(per_cell_mean_dist = mean(distance)) %>%
    dplyr::ungroup()
  print(head(per_cell_summary))
  readr::write_csv(per_cell_summary, file.path(csv_output, "per_cell_summary.csv"))
  
  print("Calculateing the average distance between different cell types. This can be interpreted as >> The average distance between cell type X and cell type Y is ___. <<")
  # Calculate the average distance between different cell types
  # This can be interpreted as
  # "The average distance between cell type X and cell type Y is ___."
  per_celltype_summary <- per_cell_summary %>%
    dplyr::group_by(celltype1, celltype2, !!as.symbol(region)) %>%
    dplyr::summarize(mean_dist = mean(per_cell_mean_dist)) %>%
    dplyr::ungroup()
  print(head(per_celltype_summary))
  readr::write_csv(per_celltype_summary, file.path(csv_output, "per_celltype_summary.csv"))
  
  invisible(per_celltype_summary)
}
''')

iterate_triangulation_distances = robjects.r('''
# ---------------------------------------------------------------------------
# shuffle_annotations
#
# Permutes the cell-type labels within every region of `df_input` and appends
# the permuted labels as the column `random_annotations`.
#
# Arguments
#   df_input     data frame with one row per cell
#   cell_type    name of the cell-type column (character string)
#   region       name of the region column (character string)
#   permutation  integer; the random seed is permutation + 1234, so a given
#                permutation number always yields the same shuffle
#
# Relies on `%>%` being attached in the calling session / worker.
# ---------------------------------------------------------------------------
shuffle_annotations <- function(df_input, 
                                cell_type, 
                                region, 
                                permutation) {
  unique_regions <- unique(df_input[[region]])
  
  df_shuffled <- lapply(seq_along(unique_regions),
                        function(region_num){
                          # Subset dataframe
                          df_subset <- df_input %>%
                            dplyr::filter(!!as.symbol(region) == unique_regions[region_num])
                          
                          # Shuffle annotations
                          shuffled_annotations <- data.frame(annotations = df_subset[[cell_type]])
                          set.seed(permutation + 1234) # change seed with every permutation
                          rows <- sample(nrow(shuffled_annotations))
                          shuffled_annotations <- data.frame(shuffled_annotations[rows,])
                          colnames(shuffled_annotations) <- c("random_annotations")
                          
                          df_subset <- cbind(df_subset, shuffled_annotations) 
                          
                          return(df_subset)
                        })
  df_shuffled <- do.call(rbind, df_shuffled)
  return(df_shuffled)
}

# ---------------------------------------------------------------------------
# iterate_triangulation_distances
#
# For every region and every iteration: shuffle the cell-type labels,
# recompute the triangulation distances with the shuffled labels and reduce
# them to the mean distance per (celltype1, celltype2) pair.
#
# Arguments
#   df_input        data frame with one row per cell
#   num_iterations  number of label permutations per region
#   id, x_pos, y_pos, cell_type, region
#                   column names, passed as character strings
#   num_cores       number of worker processes; NULL (default) uses half of
#                   the detected cores, but never fewer than one
#
# Returns a data frame with the columns celltype1, celltype2, mean_dist,
# <region> and iteration.
#
# Requires calculate_triangulation_distances() to be defined in the global
# environment (it is exported to the workers from there).
# ---------------------------------------------------------------------------
iterate_triangulation_distances <- function(df_input,
                                            num_iterations = 1000,
                                            id,
                                            x_pos,
                                            y_pos,
                                            cell_type,
                                            region,
                                            num_cores = NULL) {
  # tidyverse supplies `%>%` on the master; previously this only worked when
  # another function had already attached it
  require(tidyverse)
  library(doSNOW)
  library(foreach)
  library(parallel)
  
  # Get unique regions
  unique_regions <- unique(df_input[[region]])
  
  # Select only necessary columns to speed up computation time
  df_input <- df_input %>%
    dplyr::select(!!as.symbol(id), 
           !!as.symbol(x_pos), 
           !!as.symbol(y_pos), 
           !!as.symbol(cell_type), 
           !!as.symbol(region))
  
  # Set up parallelization
  if (is.null(num_cores)){
    # Default to half of the available cores. detectCores() may return 1 or NA,
    # so clamp the result to at least one worker.
    num_cores <- max(1, floor(detectCores()/2), na.rm = TRUE)
  }
  cl <- makeCluster(num_cores)
  # Always release the worker processes, also when the parallel loop fails
  on.exit(stopCluster(cl), add = TRUE)
  clusterExport(cl, c("shuffle_annotations", "calculate_triangulation_distances"))
  registerDoSNOW(cl)
  
  # Progress bar (one tick per finished region/iteration task)
  pb <- txtProgressBar(max = (length(unique_regions)*num_iterations), style = 3)
  on.exit(close(pb), add = TRUE)
  progress <- function(n) utils::setTxtProgressBar(pb, n)
  opts <- list(progress = progress)
  
  
  iterative_triangulation_distances <- foreach(reg_index = seq_along(unique_regions)) %:%
    foreach(iteration_index = seq_len(num_iterations),
            .packages = c("deldir", "tidyverse"),
            .combine = "rbind",
            .options.snow = opts)%dopar%{
              # Cells of the current region plus the helper keys that
              # calculate_triangulation_distances() joins on
              region_cells <- df_input %>%
                dplyr::filter(!!as.symbol(region) == unique_regions[reg_index]) %>%
                dplyr::mutate(uniqueID = paste0(!!as.symbol(id), "-",
                                         !!as.symbol(x_pos), "-",
                                         !!as.symbol(y_pos)),
                       XYcellID = paste0(!!as.symbol(x_pos),"_", !!as.symbol(y_pos)))
              
              df_shuffled <- shuffle_annotations(df_input = region_cells,
                                                 cell_type = cell_type,
                                                 region = region,
                                                 permutation = iteration_index)
              
              # Only the shuffled labels are used from here on; drop the
              # original annotation column so it cannot clash in the joins
              if (cell_type != "random_annotations") {
                df_shuffled <- df_shuffled[, setdiff(colnames(df_shuffled), cell_type), drop = FALSE]
              }
              
              # The task already holds exactly one region, so the per-region
              # helper is called directly. Going through
              # get_triangulation_distances() here would start a complete new
              # cluster inside every single worker task.
              results <- calculate_triangulation_distances(df_input = df_shuffled,
                                                           id = id,
                                                           x_pos = x_pos,
                                                           y_pos = y_pos,
                                                           cell_type = "random_annotations",
                                                           region = region)
              
              # Mean distance from each cell to each (shuffled) cell type
              per_cell_summary <- results %>%
                dplyr::group_by(celltype1_index, celltype1, celltype2) %>%
                dplyr::summarize(per_cell_mean_dist = mean(distance)) %>%
                dplyr::ungroup()
              
              # Mean of the per-cell means for every cell-type pair
              per_celltype_summary <- per_cell_summary %>%
                dplyr::group_by(celltype1, celltype2) %>%
                dplyr::summarize(mean_dist = mean(per_cell_mean_dist)) %>%
                dplyr::ungroup() %>%
                dplyr::mutate(region = unique_regions[reg_index],
                       iteration = iteration_index)
              # Give the region column the caller-supplied column name
              colnames(per_celltype_summary) <- c("celltype1", "celltype2", "mean_dist", region, "iteration")
              
              return(per_celltype_summary)
            }
  
  # The outer loop returns one data frame per region; stack them
  iterative_triangulation_distances <- do.call(rbind, iterative_triangulation_distances)
  
  return(iterative_triangulation_distances)
}
''')

Dumbbell_plot_interactions = robjects.r('''
# ---------------------------------------------------------------------------
# Dumbbell_plot_interactions
#
# Compares observed triangulation distances with the distances obtained from
# shuffled annotations and draws, for the selected cell-type pairs, the log2
# ratio observed/expected of both treatment groups as a dumbbell plot. The
# plot is saved as "Dumbbell_plot.png" in `output_dir`, drawn on the active
# graphics device and returned.
#
# Arguments
#   triangulation_distances           observed distances
#   iterated_triangulation_distances  per-iteration mean distances from
#                                     shuffled annotations
#   distance_threshold                observed distances / expected means
#                                     above this value are discarded
#   treatment_condition_1, treatment_condition_2
#                                     the two values of the treatment column
#   pair_to                           pairs to plot, each written as
#                                     paste0(celltype1, "_", celltype2)
#   colors                            fill colours for the two groups
#   output_dir                        folder the PNG is written to
#   metadata                          data frame with the treatment column and
#                                     a column named unique_region
#   treatment_column                  name of the treatment column in metadata
#
# Most defaults are self-referential (e.g. metadata = metadata) and cannot be
# evaluated, so those arguments must always be supplied by the caller.
# Both distance tables are joined to `metadata` on a column named
# "unique_region".
# ---------------------------------------------------------------------------
Dumbbell_plot_interactions <- function(triangulation_distances = triangulation_distances,
                                       iterated_triangulation_distances = iterated_triangulation_distances,
                                       distance_threshold = 128,
                                       treatment_condition_1 = treatment_condition_1,
                                       treatment_condition_2 = treatment_condition_2,
                                       pair_to = c("CD4+ Treg_Stromal", "CD8+ T cell_NK", "DC_NK", "NK_CD8+ T cell", "NK_CD8+ T cells", "Stromal_CD4+ Treg", "CD4+ T cell_Neutrophil", "CD8+ T cell PD1+_NK", "CD4+ T cell_CD4+ T cell", "CD4+ T cell_CD8+ T cell", "CD8+ T cell_Tumor PDL1+ MHCI+"),
                                       colors = c("#00BFC4","#F8766D"),
                                       output_dir = output_dir,
                                       metadata = metadata,
                                       treatment_column = treatment_column
) {
  
  `%>%` <- magrittr::`%>%`
  
  # Two-sample Wilcoxon p-value between expected and observed values.
  # Returns NA when either side is empty: with a missing second sample
  # wilcox.test() would silently switch to a one-sample test.
  wilcox_pvalue <- function(expected_values, observed_values) {
    expected_values <- unlist(expected_values)
    observed_values <- unlist(observed_values)
    if (length(expected_values) == 0 || length(observed_values) == 0) {
      return(NA_real_)
    }
    stats::wilcox.test(expected_values, observed_values, exact = FALSE)$p.value
  }
  
  names(metadata)[names(metadata) == treatment_column] <- "treatment"
  # Set distance threshold for observed cell-cell interactions
  # distance_threshold = 128  corresponds to 100um
  
  # Reformat observed dataset
  observed_distances <- triangulation_distances %>%
    # Append metadata
    dplyr::left_join(metadata,
                     by = c("unique_region")) %>%
    dplyr::filter(distance <= distance_threshold) %>%
    # Calculate the average distance to every cell type for each cell
    dplyr::group_by(celltype1_index, celltype1, celltype2, treatment, unique_region) %>%
    dplyr::summarize(mean_per_cell = mean(distance)) %>%
    dplyr::ungroup() %>%
    # Calculate the average distance between cell type to cell type on a per group basis
    dplyr::group_by(celltype1, celltype2, treatment) %>%
    dplyr::summarize(observed = list(mean_per_cell),
                     observed_mean = mean(unlist(observed), na.rm = TRUE)) %>%
    dplyr::ungroup()
  
  # Reformat expected dataset
  expected_distances <- iterated_triangulation_distances %>%
    # Append metadata
    dplyr::left_join(metadata,
                     by = c("unique_region")) %>%
    dplyr::filter(mean_dist <= distance_threshold) %>%
    # Calculate expected mean distance and list values
    dplyr::group_by(celltype1, celltype2, treatment) %>%
    dplyr::summarize(expected = list(mean_dist),
                     expected_mean = mean(mean_dist, na.rm = TRUE)) %>%
    dplyr::ungroup() 
  
  # Calculate pvalues and log fold differences
  distance_pvals <- expected_distances %>%
    dplyr::left_join(observed_distances,
                     by = c("celltype1", "celltype2", "treatment")) %>%
    # Calculate wilcoxon test between observed and expected distances
    dplyr::group_by(celltype1, celltype2, treatment) %>%
    dplyr::mutate(pvalue = wilcox_pvalue(expected, observed)) %>%
    dplyr::ungroup() %>%
    dplyr::select(-observed, -expected) %>%
    # Calculate log fold enrichment
    dplyr::mutate(logfold_group = log2(observed_mean/expected_mean),
                  interaction = paste0(celltype1, " --> ", celltype2)) 
  
  # Get order of plot: one column of log fold values per treatment group
  intermed <- distance_pvals %>%
    dplyr::select(interaction, treatment, logfold_group) %>%
    tidyr::spread(key = treatment, value = logfold_group) 
  
  # `[[` extracts plain vectors; `[, name]` on a tibble would return a
  # one-column data frame and turn `difference` into a data-frame column
  intermed$difference <- intermed[[treatment_condition_2]] - intermed[[treatment_condition_1]]
  
  # Keep interactions present in both groups and sort them by the log fold
  # value of treatment condition 1. The condition name has to be turned into a
  # column reference; passed as a plain string it is a constant and the rows
  # are not reordered at all.
  ord <- (intermed %>%
    dplyr::filter(!is.na(difference)) %>%
    dplyr::arrange(!!as.symbol(treatment_condition_1)))$interaction
  
  # Assign interaction order; interactions that are not part of `ord` become NA
  distance_pvals$interaction <- factor(distance_pvals$interaction,
                                       levels = ord)
  
  # Restrict to the requested cell-type pairs
  distance_pvals$pairs = paste0(distance_pvals$celltype1, "_", distance_pvals$celltype2)
  distance_pvals_sub = distance_pvals[distance_pvals$pairs %in%  pair_to, ]
  
  plot_data <- distance_pvals_sub %>%
    dplyr::filter(!is.na(interaction))
  
  # Dumbbell plot. All ggplot2 functions are namespace-qualified so that the
  # function does not depend on ggplot2 having been attached elsewhere.
  Dumbell_plot <- ggplot2::ggplot(data = plot_data) +
    ggplot2::geom_vline(mapping = ggplot2::aes(xintercept = 0), linetype = "dashed") +
    ggplot2::geom_line(mapping = ggplot2::aes(x = logfold_group, y = interaction),
              na.rm = TRUE) +
    ggplot2::geom_point(mapping = ggplot2::aes(x = logfold_group, y = interaction, fill = treatment, shape = treatment), 
               size = 4, stroke = 0.5, na.rm = TRUE) +
    ggplot2::scale_shape_manual(values = c(24, 22)) +
    # unlist() so that a list of colour strings is accepted as well as a
    # character vector
    ggplot2::scale_fill_manual(values = unlist(colors)) +
    ggplot2::theme_bw()+
    ggplot2::theme(panel.grid.major.x = ggplot2::element_blank(),
          panel.grid.minor.x = ggplot2::element_blank(),
          axis.text.y = ggplot2::element_text(size = 16),
          axis.text.x = ggplot2::element_text(size = 16, angle = 45, hjust = 1),
          axis.title.y = ggplot2::element_text(size = 16),
          axis.title.x = ggplot2::element_text(size = 16))

  # Save exactly this plot instead of relying on the last plot ggplot2 recorded
  ggplot2::ggsave(paste0(output_dir, "/", "Dumbbell_plot", ".png"), plot = Dumbell_plot)

  plot(Dumbell_plot)
  return(Dumbell_plot)
}

''')

# Run analysis 

In [7]:
from rpy2.robjects import pandas2ri

# Switch on the automatic conversion between pandas DataFrames and R data
# frames for all R calls below.
pandas2ri.activate()

# 1) Observed triangulation distances for every region.
triangulation_distances = get_triangulation_distances(df_input = df,
                                                       id = cell_index_column,
                                                       x_pos = x_position_column,
                                                       y_pos = y_position_column,
                                                       cell_type = cell_type_column,
                                                       region = region_column,
                                                       csv_output = filepath_avg_dist)

# 2) Per-cell and per-cell-type averages; both are written as CSV files to
#    output_dir.
calculate_avg_distance(triangulation_distances = triangulation_distances,
                                   csv_output = output_dir)

# 3) Distances after shuffling the cell-type labels (number_of_iterations
#    shuffles per region).
iterated_triangulation_distances = iterate_triangulation_distances(df_input = df,
                                                                      id = cell_index_column,
                                                                      x_pos = x_position_column,
                                                                      y_pos = y_position_column,
                                                                      cell_type = cell_type_column,
                                                                      region = region_column,
                                                                      num_iterations = number_of_iterations)

# 4) Dumbbell plot of observed vs. shuffled distances. The colours are passed
#    as an explicit R character vector (like `pair_to`), not as a Python list.
Dumbbell_plot_interactions(triangulation_distances = triangulation_distances,
                                       iterated_triangulation_distances = iterated_triangulation_distances,
                                       distance_threshold = 128,
                                       treatment_condition_1 = treatment_condition_1,
                                       treatment_condition_2 = treatment_condition_2,
                                       pair_to = pairs_for_comparisson_Dumbbell_plot,
                                       colors = robjects.StrVector(["#00BFC4", "#F8766D"]),
                                       output_dir = output_dir,
                                       metadata = metadata,
                                       treatment_column = treatment_column)


# Show the PNG written by Dumbbell_plot_interactions as the cell output.
Image(filename= output_dir + '/Dumbbell_plot.png')

R[write to console]: Lade nötiges Paket: tidyverse



── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.2.1      ✔ stringr 1.5.0 
✔ readr   2.1.3      ✔ forcats 0.5.2 
✔ purrr   0.3.5      
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ purrr::%@%()                 masks rlang::%@%()
✖ purrr::as_function()         masks rlang::as_function()
✖ lazyeval::as_name()          masks rlang::as_name()
✖ lazyeval::call_modify()      masks rlang::call_modify()
✖ lazyeval::call_standardise() masks rlang::call_standardise()
✖ lazyeval::expr_label()       masks rlang::expr_label()
✖ lazyeval::expr_text()        masks rlang::expr_text()
✖ lazyeval::f_env()            masks rlang::f_env()
✖ lazyeval::f_env<-()          masks rlang::f_env<-()
✖ lazyeval::f_label()          masks rlang::f_label()
✖ lazyeval::f_lhs()            masks rlang::f_lhs()
✖ lazyeval::f_lhs<-()          masks rlang::f_lhs<-()
✖ lazyeval::f_rhs()            masks rla

R[write to console]: Lade nötiges Paket: foreach

R[write to console]: 
Attache Paket: ‘foreach’


R[write to console]: Die folgenden Objekte sind maskiert von ‘package:purrr’:

    accumulate, when


R[write to console]: Lade nötiges Paket: iterators

R[write to console]: Lade nötiges Paket: snow

R[write to console]: 
Attache Paket: ‘parallel’


R[write to console]: Die folgenden Objekte sind maskiert von ‘package:snow’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, clusterSplit, makeCluster, parApply,
    parCapply, parLapply, parRapply, parSapply, splitIndices,
    stopCluster




[1] "Calculateing the average distance to different cell types on a per individual cell level. This can be interpreted as >> For cell #1, the average distance to a cell of type X is ____. <<"
`summarise()` has grouped output by 'celltype1_index', 'celltype1',
'celltype2'. You can override using the `.groups` argument.
# A tibble: 6 × 5
  celltype1_index celltype1    celltype2     unique_region per_cell_mean_dist
  <chr>           <chr>        <chr>         <chr>                      <dbl>
1 521_S1reg001_0  unidentified CD8T          521_S1reg001                20.9
2 521_S1reg001_0  unidentified Treg          521_S1reg001                18  
3 521_S1reg001_0  unidentified macrophage_M2 521_S1reg001                33.8
4 521_S1reg001_0  unidentified tumor         521_S1reg001                48.9
5 521_S1reg001_1  stromal      CD8T          521_S1reg001                39.8
6 521_S1reg001_1  stromal      plasma        521_S1reg001                34.9
[1] "Calculateing the average distance

R[write to console]: 

R[write to console]: 



RRuntimeError: StopIteration