In [6]:
import pandas as pd
import anndata as ad
import sys

In [7]:
# read adata object
input_dir = '/Volumes/Tim_Extern/'
adata = ad.read_h5ad(input_dir+"adata_nn_demo_hubmap_intestine_20231014_annotated.h5ad")

In [8]:
df = adata.obs
df.head()

Unnamed: 0,x,y,region_num,donor,tissue,region,area,unique_region,leiden_1,leiden_1.5,leiden_1_subcluster,leiden_2,celltype,leiden_1_subcluster_22,celltype_fine,celltype_fine_new
0,2.019231,35.865385,4,B004,CL,reg004,52.0,B004_CL_reg4,11,14,11,4,Smooth muscle,11,Smooth muscle,Smooth muscle
1,4.009615,110.221154,4,B004,CL,reg004,104.0,B004_CL_reg4,26,35,26,48,ICC,26,ICC,ICC
2,1.315789,347.157895,4,B004,CL,reg004,19.0,B004_CL_reg4,11,14,11,4,Smooth muscle,11,Smooth muscle,Smooth muscle
3,3.132353,705.058824,4,B004,CL,reg004,68.0,B004_CL_reg4,9,13,9,21,Stroma,9,Stroma,Stroma
4,3.740741,1795.537037,4,B004,CL,reg004,54.0,B004_CL_reg4,15,12,15,9,TA,15,Smooth muscle,Smooth muscle


In [9]:
# subset df for metadata
metadata = df[["donor", "tissue", "region", "unique_region"]]
metadata.head()

Unnamed: 0,donor,tissue,region,unique_region
0,B004,CL,reg004,B004_CL_reg4
1,B004,CL,reg004,B004_CL_reg4
2,B004,CL,reg004,B004_CL_reg4
3,B004,CL,reg004,B004_CL_reg4
4,B004,CL,reg004,B004_CL_reg4


In [10]:
df["index"] = df.index

In [None]:

#' @description 
#' 
#' @param df_input: a dataframe containing the original data
#' @param cell_type: a string representing the name of the column containing the cell type annotations
#' @param region: a string representing the name of the column containing the region information
#' @param permutation: an integer representing a specific permutation number to use as a seed for the random number generator
#' 
#' @return The output of this function is a modified version of the input dataframe df_input where the annotations for the cell types are shuffled, 
#' the shuffling is done based on the unique regions


shuffle_annotations <- function(df_input, 
                                cell_type, 
                                region, 
                                permutation) {
  unique_regions <- unique(df_input[[region]])
  
  df_shuffled <- lapply(1:length(unique_regions),
                        function(region_num){
                          # Subset dataframe
                          df_subset <- df_input %>%
                            dplyr::filter(!!as.symbol(region) == unique_regions[region_num])
                          
                          # Shuffle annotaitons
                          shuffled_annotations <- data.frame(annotations = df_subset[[cell_type]])
                          set.seed(permutation + 1234) # change seed with every permutation
                          rows <- sample(nrow(shuffled_annotations))
                          shuffled_annotations <- data.frame(shuffled_annotations[rows,])
                          colnames(shuffled_annotations) <- c("random_annotations")
                          
                          df_subset <- cbind(df_subset, shuffled_annotations) 
                          
                          return(df_subset)
                        })
  df_shuffled <- do.call(rbind, df_shuffled)
  return(df_shuffled)
}



In [None]:
#' Calculate triangulation distances
#' Last Update: 2023-01-18
#' 
#' @description Using delauney triangulation, compute the interactions between cells in a 2D space
#' 
#' @param df_input dataframe containing unique cell id, x position, y position, cell type annotaiton, and region FOV
#' @param id string referring to  name of column containing unique cell id (there should be no duplicates in the dataframe)
#' @param x_pos string referring to name of column containing x location
#' @param y_pos string referring to name of column containing y location
#' @param cell_type string referring to name of column containing cell type annotations
#' @param region string referring to name of column containing region annotations
#' 
#' @return dataframe containing indices, annotation, and XY positions of all 
#' triangulated cell type interactions and their distance

calculate_triangulation_distances <- function(df_input, 
                                              id, 
                                              x_pos, 
                                              y_pos, 
                                              cell_type, 
                                              region) {
  # Compute the rdelaun distances
  vtress <- deldir::deldir(df_input[[x_pos]], df_input[[y_pos]])
  rdelaun_result <- vtress$delsgs
  
  # Get interactions going both directions
  inverse_result <- rdelaun_result 
  colnames(inverse_result) <- c("x2", "y2", "x1", "y1", "ind2", "ind1") 
  inverse_result <- inverse_result %>%
    dplyr::select(x1, y1, x2, y2, ind1, ind2)
  
  # Combine distances and annotate results with cell type and region information
  rdelaun_result <- rbind(rdelaun_result,
                         inverse_result) %>%
    dplyr::mutate(cell1ID = paste0(x1, "_", y1),
           cell2ID = paste0(x2, "_", y2))

  annotated_result <- rdelaun_result %>%
    dplyr::left_join(df_input,
              by = c("cell1ID" = "XYcellID")) %>%
    dplyr::rename(celltype1 = {{ cell_type }}) %>%
    dplyr::select(-{{ x_pos }},
           -{{ y_pos }},
           -{{ region }},
           -uniqueID)

  annotated_result <- annotated_result %>%
    dplyr::left_join(df_input,
              by = c("cell2ID" = "XYcellID")) %>%
    dplyr::rename(celltype2 = {{ cell_type }}) %>%
    dplyr::select(x1, y1, celltype1, !!as.symbol(paste0(id, ".x")),
           x2, y2, celltype2, !!as.symbol(paste0(id, ".y")),
           {{ region }})
  
  # Calculate distance and reorder columns
  annotated_result <- annotated_result%>%
    dplyr::mutate(distance = sqrt((x2-x1)^2 + (y2-y1)^2)) %>%
    dplyr::select(!!as.symbol(region), 
           !!as.symbol(paste0(id, ".x")), celltype1, x1, y1,
           !!as.symbol(paste0(id, ".y")), celltype2, x2, y2,
           distance)
  colnames(annotated_result) <- c(region, 
                                  "celltype1_index", "celltype1", "celltype1_X", "celltype1_Y",
                                  "celltype2_index", "celltype2", "celltype2_X", "celltype2_Y", 
                                  "distance")
  return(annotated_result)
}


In [None]:
import pandas as pd
import numpy as np
from scipy.spatial import Delaunay

def calculate_triangulation_distances(df_input, id, x_pos, y_pos, cell_type, region):
    # Perform Delaunay triangulation
    points = df_input[[x_pos, y_pos]].values
    tri = Delaunay(points)
    indices = tri.simplices
    
    # Get interactions going both directions
    edges = set()
    for simplex in indices:
        for i in range(3):
            for j in range(i + 1, 3):
                edges.add(tuple(sorted([simplex[i], simplex[j]])))
    edges = np.array(list(edges))
    
    # Create dataframe from edges
    rdelaun_result = pd.DataFrame(edges, columns=['ind1', 'ind2'])
    rdelaun_result[['x1', 'y1']] = df_input.iloc[rdelaun_result['ind1']][[x_pos, y_pos]].values
    rdelaun_result[['x2', 'y2']] = df_input.iloc[rdelaun_result['ind2']][[x_pos, y_pos]].values
    
    # Annotate results with cell type and region information
    df_input['XYcellID'] = df_input[x_pos].astype(str) + "_" + df_input[y_pos].astype(str)
    rdelaun_result['cell1ID'] = rdelaun_result['x1'].astype(str) + "_" + rdelaun_result['y1'].astype(str)
    rdelaun_result['cell2ID'] = rdelaun_result['x2'].astype(str) + "_" + rdelaun_result['y2'].astype(str)
    
    annotated_result = pd.merge(rdelaun_result, df_input, left_on='cell1ID', right_on='XYcellID')
    annotated_result = annotated_result.rename(columns={cell_type: 'celltype1', id: 'celltype1_index'})
    annotated_result = annotated_result.drop(columns=[x_pos, y_pos, region, 'XYcellID'])
    
    annotated_result = pd.merge(annotated_result, df_input, left_on='cell2ID', right_on='XYcellID', suffixes=('.x', '.y'))
    annotated_result = annotated_result.rename(columns={cell_type: 'celltype2', id: 'celltype2_index'})
    annotated_result = annotated_result.drop(columns=[x_pos, y_pos, 'XYcellID'])
    
    # Calculate distance
    annotated_result['distance'] = np.sqrt((annotated_result['x2'] - annotated_result['x1']) ** 2 +
                                           (annotated_result['y2'] - annotated_result['y1']) ** 2)
    
    # Reorder columns
    annotated_result = annotated_result[[region, 'celltype1_index', 'celltype1', 'x1', 'y1', 'celltype2_index', 'celltype2', 'x2', 'y2', 'distance']]
    annotated_result.columns = [region, 'celltype1_index', 'celltype1', 'celltype1_X', 'celltype1_Y',
                                'celltype2_index', 'celltype2', 'celltype2_X', 'celltype2_Y', 'distance']
    
    return annotated_result

# Example usage:
# df = pd.read_csv('your_data.csv')
# result_df = calculate_triangulation_distances(df, 'cellID', 'x', 'y', 'cellType', 'regionFOV')


In [None]:

#' @description This is a function in R that calculates the triangulation distances between cells of different types in a given dataset. 
#' 
#' @param df_input: The input dataframe that contains the cell information.
#' @param id: The name of the column in the dataframe that corresponds to the ID of the cells.
#' @param x_pos: The name of the column in the dataframe that corresponds to the x-position of the cells.
#' @param y_pos: The name of the column in the dataframe that corresponds to the y-position of the cells.
#' @param cell_type: The name of the column in the dataframe that corresponds to the type of the cells.
#' @param region: The name of the column in the dataframe that corresponds to the region of the cells.
#' @param num_cores: The number of cores to be used for parallel processing. Defaults to half the number of available cores.
#' @param calc_avg_distance: A Boolean that controls whether the function calculates the average distance between cell types and individual cells. 
#' Defaults to TRUE. The results are stored under the directory, which is defined in csv_output.
#' @param csv_output: The file path where the results of calc_avg_distance will be saved in csv format. Defaults to the working directory.
#' 
#' @return The output of the function is the triangulation distances between cells of different types in the input dataset, 
#' for each region in the dataset. The output is a data frame containing the triangulation distances for each region

get_triangulation_distances <- function(df_input, 
                                        id, 
                                        x_pos, 
                                        y_pos, 
                                        cell_type, 
                                        region, 
                                        num_cores = NULL,
                                        calc_avg_distance = TRUE,
                                        csv_output = getwd()) {
  
  if(typeof(df_input[,x_pos]) != "integer"){
    
    warning("This function expects integer values for xy coordinates.")
    warning("Class will be changed to integer. Please check the generated output!")
    
    i <- c(x_pos, y_pos)   
    df_input[ , i] <- apply(df_input[ , i], 2,            # Specify own function within apply
                        function(x) as.integer(x))
  }
  
  library(doSNOW)
  library(foreach)
  library(parallel)
  
  # Get unique regions
  unique_regions <- unique(df_input[[region]])
  
  # Select only necessary columns
  df_input <- df_input %>%
    dplyr::select({{ id }},
           {{ x_pos }},
           {{ y_pos }},
           {{ cell_type }},
           {{ region }})
  
  # Set up parallelization
  if (is.null(num_cores)){
    num_cores <- floor(detectCores()/2) # default to using half of available cores
  }
  cl <- makeCluster(num_cores)
  clusterExport(cl, c("calculate_triangulation_distances"))
  registerDoSNOW(cl)
  
  # Progress bar
  pb <- utils::txtProgressBar(max = length(unique_regions), style = 3)
  progress <- function(n) utils::setTxtProgressBar(pb, n)
  opts <- list(progress = progress)
  
  triangulation_distances <- foreach(reg_index = 1:length(unique_regions), 
                                     .packages = c("deldir", "tidyverse"), 
                                     .combine = "rbind", 
                                     .options.snow = opts)%dopar%{
    # SUBSET DATASET
    subset <- df_input %>%
      dplyr::filter(!!as.symbol(region) == unique_regions[reg_index]) %>%
      dplyr::mutate(uniqueID = paste0(!!as.symbol(id), "-",
                               !!as.symbol(x_pos), "-",
                               !!as.symbol(y_pos)),
             XYcellID = paste0(!!as.symbol(x_pos),"_", !!as.symbol(y_pos)))
    
    result <- calculate_triangulation_distances(df_input = subset,
                                                id = id,
                                                x_pos = x_pos,
                                                y_pos = y_pos,
                                                cell_type = cell_type,
                                                region = region)
    return(result)
    }
  
  close(pb)
  stopCluster(cl)
  
  if(calc_avg_distance == TRUE) {
    calculate_avg_distance(triangulation_distances = triangulation_distances,
                           csv_output = csv_output)
  }
  
  
  return(triangulation_distances)
}


In [15]:
import os
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
from functools import partial

def get_triangulation_distances(df_input, id, x_pos, y_pos, cell_type, region,
                                num_cores=None):
    # Check if x_pos and y_pos are integers, and if not, convert them
    if not issubclass(df_input[x_pos].dtype.type, np.integer):
        print("This function expects integer values for xy coordinates.")
        print("Class will be changed to integer. Please check the generated output!")
        df_input[x_pos] = df_input[x_pos].astype(int)
        df_input[y_pos] = df_input[y_pos].astype(int)
    
    # Get unique regions
    unique_regions = df_input[region].unique()
    
    # Select only necessary columns
    df_input = df_input[[id, x_pos, y_pos, cell_type, region]]
    
    # Set up parallelization
    if num_cores is None:
        num_cores = os.cpu_count() // 2  # default to using half of available cores
    
    # Define the process to be parallelized
    def process_region(df, unique_region):
        subset = df[df[region] == unique_region]
        subset['uniqueID'] = subset[id].astype(str) + "-" + subset[x_pos].astype(str) + "-" + subset[y_pos].astype(str)
        subset['XYcellID'] = subset[x_pos].astype(str) + "_" + subset[y_pos].astype(str)
        result = calculate_triangulation_distances(df_input=subset, id=id, x_pos=x_pos, y_pos=y_pos,
                                                   cell_type=cell_type, region=region)
        return result
    
    # Parallel processing
    with ProcessPoolExecutor(max_workers=num_cores) as executor:
        futures = {executor.submit(process_region, df_input, region): region for region in unique_regions}
        triangulation_distances = pd.concat([future.result() for future in futures])
    
    
    return triangulation_distances

# Example usage:
# df = pd.read_csv('your_data.csv')
# result_df = get_triangulation_distances(df, 'cellID', 'x', 'y', 'cellType', 'regionFOV')


In [17]:
# Define the process_region function at the top level
def process_region(df, unique_region, id, x_pos, y_pos, cell_type, region):
    subset = df[df[region] == unique_region].copy()
    subset['uniqueID'] = subset[id].astype(str) + "-" + subset[x_pos].astype(str) + "-" + subset[y_pos].astype(str)
    subset['XYcellID'] = subset[x_pos].astype(str) + "_" + subset[y_pos].astype(str)
    result = calculate_triangulation_distances(df_input=subset, id=id, x_pos=x_pos, y_pos=y_pos,
                                               cell_type=cell_type, region=region)
    return result

def get_triangulation_distances(df_input, id, x_pos, y_pos, cell_type, region,
                                num_cores=None):
    # Check if x_pos and y_pos are integers, and if not, convert them
    if not issubclass(df_input[x_pos].dtype.type, np.integer):
        print("This function expects integer values for xy coordinates.")
        print("Class will be changed to integer. Please check the generated output!")
        df_input[x_pos] = df_input[x_pos].astype(int)
        df_input[y_pos] = df_input[y_pos].astype(int)
    
    # Get unique regions
    unique_regions = df_input[region].unique()
    
    # Select only necessary columns
    df_input = df_input[[id, x_pos, y_pos, cell_type, region]]
    
    # Set up parallelization
    if num_cores is None:
        num_cores = os.cpu_count() // 2  # default to using half of available cores
    
    # Parallel processing
    with ProcessPoolExecutor(max_workers=num_cores) as executor:
        futures = [executor.submit(process_region, df_input, reg, id, x_pos, y_pos, cell_type, region) for reg in unique_regions]
        results = [future.result() for future in futures]
    
    triangulation_distances = pd.concat(results)
    
    
    return triangulation_distances


In [20]:
pip install joblib


Note: you may need to restart the kernel to use updated packages.


In [36]:
import os
import pandas as pd
from joblib import Parallel, delayed
import numpy as np
from scipy.spatial import Delaunay

def calculate_triangulation_distances(df_input, id, x_pos, y_pos, cell_type, region):
    # Perform Delaunay triangulation
    points = df_input[[x_pos, y_pos]].values
    tri = Delaunay(points)
    indices = tri.simplices
    
    # Get interactions going both directions
    edges = set()
    for simplex in indices:
        for i in range(3):
            for j in range(i + 1, 3):
                edges.add(tuple(sorted([simplex[i], simplex[j]])))
    edges = np.array(list(edges))
    
    # Create dataframe from edges
    rdelaun_result = pd.DataFrame(edges, columns=['ind1', 'ind2'])
    rdelaun_result[['x1', 'y1']] = df_input.iloc[rdelaun_result['ind1']][[x_pos, y_pos]].values
    rdelaun_result[['x2', 'y2']] = df_input.iloc[rdelaun_result['ind2']][[x_pos, y_pos]].values
    
    # Annotate results with cell type and region information
    df_input['XYcellID'] = df_input[x_pos].astype(str) + "_" + df_input[y_pos].astype(str)
    rdelaun_result['cell1ID'] = rdelaun_result['x1'].astype(str) + "_" + rdelaun_result['y1'].astype(str)
    rdelaun_result['cell2ID'] = rdelaun_result['x2'].astype(str) + "_" + rdelaun_result['y2'].astype(str)
    
    annotated_result = pd.merge(rdelaun_result, df_input, left_on='cell1ID', right_on='XYcellID')
    annotated_result = annotated_result.rename(columns={cell_type: 'celltype1', id: 'celltype1_index'})
    annotated_result = annotated_result.drop(columns=[x_pos, y_pos, region, 'XYcellID'])
    
    annotated_result = pd.merge(annotated_result, df_input, left_on='cell2ID', right_on='XYcellID', suffixes=('.x', '.y'))
    annotated_result = annotated_result.rename(columns={cell_type: 'celltype2', id: 'celltype2_index'})
    annotated_result = annotated_result.drop(columns=[x_pos, y_pos, 'XYcellID'])
    
    # Calculate distance
    annotated_result['distance'] = np.sqrt((annotated_result['x2'] - annotated_result['x1']) ** 2 +
                                           (annotated_result['y2'] - annotated_result['y1']) ** 2)
    
    # Reorder columns
    annotated_result = annotated_result[[region, 'celltype1_index', 'celltype1', 'x1', 'y1', 'celltype2_index', 'celltype2', 'x2', 'y2', 'distance']]
    annotated_result.columns = [region, 'celltype1_index', 'celltype1', 'celltype1_X', 'celltype1_Y',
                                'celltype2_index', 'celltype2', 'celltype2_X', 'celltype2_Y', 'distance']
    
    return annotated_result


# Define the process_region function at the top level
def process_region(df, unique_region, id, x_pos, y_pos, cell_type, region):
    subset = df[df[region] == unique_region].copy()
    subset['uniqueID'] = subset[id].astype(str) + "-" + subset[x_pos].astype(str) + "-" + subset[y_pos].astype(str)
    subset['XYcellID'] = subset[x_pos].astype(str) + "_" + subset[y_pos].astype(str)
    result = calculate_triangulation_distances(df_input=subset, id=id, x_pos=x_pos, y_pos=y_pos,
                                               cell_type=cell_type, region=region)
    return result

def get_triangulation_distances(df_input, id, x_pos, y_pos, cell_type, region,
                                num_cores=None):
    # Check if x_pos and y_pos are integers, and if not, convert them
    if not issubclass(df_input[x_pos].dtype.type, np.integer):
        print("This function expects integer values for xy coordinates.")
        print("Class will be changed to integer. Please check the generated output!")
        df_input[x_pos] = df_input[x_pos].astype(int)
        df_input[y_pos] = df_input[y_pos].astype(int)
    
    # Get unique regions
    unique_regions = df_input[region].unique()
    
    # Select only necessary columns
    df_input = df_input[[id, x_pos, y_pos, cell_type, region]]
    
    # Set up parallelization
    if num_cores is None:
        num_cores = os.cpu_count() // 2  # default to using half of available cores
    
    # Parallel processing using joblib
    results = Parallel(n_jobs=num_cores)(delayed(process_region)(df_input, reg, id, x_pos, y_pos, cell_type, region)
                                         for reg in unique_regions)
    
    triangulation_distances = pd.concat(results)
    
    
    return triangulation_distances


In [8]:
df.head()

Unnamed: 0,x,y,region_num,donor,tissue,region,area,unique_region,leiden_1,leiden_1.5,leiden_1_subcluster,leiden_2,celltype,leiden_1_subcluster_22,celltype_fine,celltype_fine_new,index
0,2.019231,35.865385,4,B004,CL,reg004,52.0,B004_CL_reg4,11,14,11,4,Smooth muscle,11,Smooth muscle,Smooth muscle,0
1,4.009615,110.221154,4,B004,CL,reg004,104.0,B004_CL_reg4,26,35,26,48,ICC,26,ICC,ICC,1
2,1.315789,347.157895,4,B004,CL,reg004,19.0,B004_CL_reg4,11,14,11,4,Smooth muscle,11,Smooth muscle,Smooth muscle,2
3,3.132353,705.058824,4,B004,CL,reg004,68.0,B004_CL_reg4,9,13,9,21,Stroma,9,Stroma,Stroma,3
4,3.740741,1795.537037,4,B004,CL,reg004,54.0,B004_CL_reg4,15,12,15,9,TA,15,Smooth muscle,Smooth muscle,4


In [11]:
import numpy as np

In [25]:
triangulation_distances = get_triangulation_distances(df_input = df, 
                                                      id = "index", 
                                                      x_pos = "x", 
                                                      y_pos = "y", 
                                                      cell_type = "celltype", 
                                                      region = "unique_region", 
                                                      num_cores=10)

In [26]:
triangulation_distances.head()

Unnamed: 0,unique_region,celltype1_index,celltype1,celltype1_X,celltype1_Y,celltype2_index,celltype2,celltype2_X,celltype2_Y,distance
0,B004_CL_reg4,6527,Nerve,3248,494,6548,TA,3257,393,101.400197
1,B004_CL_reg4,6535,TA,3253,355,6548,TA,3257,393,38.209946
2,B004_CL_reg4,6500,TA,3231,393,6548,TA,3257,393,26.0
3,B004_CL_reg4,6527,Nerve,3248,494,6558,TA,3262,550,57.723479
4,B004_CL_reg4,6454,Smooth muscle,3202,592,6558,TA,3262,550,73.239334


In [27]:
triangulation_distances.shape

(701686, 10)

In [1]:
# Define a function to shuffle annotations within each unique region
def shuffle_region_annotations(df, unique_region):
    df_subset = df[df[region] == unique_region].copy()
    shuffled_annotations = df_subset[cell_type].sample(frac=1).reset_index(drop=True)
    df_subset['random_annotations'] = shuffled_annotations
    return df_subset

def shuffle_annotations(df_input, cell_type, region, permutation):
    np.random.seed(permutation + 1234)  # Set the random seed based on permutation
    unique_regions = df_input[region].unique()

    # Apply the shuffle function to each unique region
    df_shuffled = pd.concat([shuffle_region_annotations(df_input, reg) for reg in unique_regions])

    return df_shuffled

In [None]:

#' @description 
#' 
#' @param df_input: a dataframe containing the original data
#' @param num_iterations: an integer representing the number of iterations to perform (defaults to 1000)
#' @param id: a string representing the name of the column containing the unique IDs of each cell
#' @param x_pos: a string representing the name of the column containing the x-coordinate position of each cell
#' @param y_pos: a string representing the name of the column containing the y-coordinate position of each cell
#' @param cell_type: a string representing the name of the column containing the cell type annotations
#' @param region: a string representing the name of the column containing the region information
#' @param num_cores: an optional integer representing the number of cores to use for parallel computation (defaults to half of available cores)
#' 
#' @return 


iterate_triangulation_distances <- function(df_input,
                                            num_iterations = 1000,
                                            id,
                                            x_pos,
                                            y_pos,
                                            cell_type,
                                            region,
                                            num_cores = NULL) {
  library(doSNOW)
  library(foreach)
  library(parallel)
  
  # Get unique regions
  unique_regions <- unique(df_input[[region]])
  
  # Select only necessary columns to speed up computation time
  df_input <- df_input %>%
    dplyr::select(!!as.symbol(id), 
           !!as.symbol(x_pos), 
           !!as.symbol(y_pos), 
           !!as.symbol(cell_type), 
           !!as.symbol(region))
  
  # Set up parallelization
  if (is.null(num_cores)){
    num_cores <- floor(detectCores()/2) # default to using half of available cores
  }
  cl <- makeCluster(num_cores)
  clusterExport(cl, c("shuffle_annotations", "get_triangulation_distances", "calculate_triangulation_distances"))
  registerDoSNOW(cl)
  
  # Progress bar
  pb <- txtProgressBar(max = (length(unique_regions)*num_iterations), style = 3)
  progress <- function(n) utils::setTxtProgressBar(pb, n)
  opts <- list(progress = progress)
  
  
  iterative_triangulation_distances <- foreach(reg_index = 1:length(unique_regions)) %:%
    foreach(iteration_index = 1:num_iterations,
            .packages = c("deldir", "tidyverse"),
            .combine = "rbind",
            .options.snow = opts)%dopar%{
              subset <- df_input %>%
                dplyr::filter(!!as.symbol(region) == unique_regions[reg_index]) %>%
                dplyr::mutate(uniqueID = paste0(!!as.symbol(id), "-",
                                         !!as.symbol(x_pos), "-",
                                         !!as.symbol(y_pos)),
                       XYcellID = paste0(!!as.symbol(x_pos),"_", !!as.symbol(y_pos)))
              
              df_shuffled <- shuffle_annotations(df_input = subset,
                                                 cell_type = cell_type,
                                                 region = region,
                                                 permutation = iteration_index)
              
              results <- get_triangulation_distances(df_input = df_shuffled,
                                                     id = id,
                                                     x_pos = x_pos,
                                                     y_pos = y_pos,
                                                     cell_type = "random_annotations",
                                                     region = region,
                                                     num_cores = num_cores,
                                                     calc_avg_distance = FALSE)
              
              per_cell_summary <- results %>%
                dplyr::group_by(celltype1_index, celltype1, celltype2) %>%
                dplyr::summarize(per_cell_mean_dist = mean(distance)) %>%
                dplyr::ungroup()
              
              per_celltype_summary <- per_cell_summary %>%
                dplyr::group_by(celltype1, celltype2) %>%
                dplyr::summarize(mean_dist = mean(per_cell_mean_dist)) %>%
                dplyr::ungroup() %>%
                dplyr::mutate(region = unique_regions[reg_index],
                       iteration = iteration_index)
              colnames(per_celltype_summary) <- c("celltype1", "celltype2", "mean_dist", region, "iteration")
              
              return(per_celltype_summary)
            }
  
  iterative_triangulation_distances <- do.call(rbind, iterative_triangulation_distances)
  close(pb)
  stopCluster(cl)
  
  return(iterative_triangulation_distances)
}



In [23]:
import os
import pandas as pd
from joblib import Parallel, delayed

# Assuming shuffle_annotations and get_triangulation_distances are already defined

def process_region_iteration(df_input, region_name, iteration, id_col, x_pos, y_pos, cell_type):
    subset = df_input[df_input[region] == region_name].copy()
    subset['uniqueID'] = subset[id_col].astype(str) + "-" + subset[x_pos].astype(str) + "-" + subset[y_pos].astype(str)
    subset['XYcellID'] = subset[x_pos].astype(str) + "_" + subset[y_pos].astype(str)

    df_shuffled = shuffle_annotations(subset, cell_type, region, iteration)

    results = get_triangulation_distances(df_shuffled, id_col, x_pos, y_pos, "random_annotations", region_name, num_cores=1, calc_avg_distance=False)

    # Replace the below code with the appropriate pandas operations to summarize the per-cell and per-cell-type distances
    per_cell_summary = results.groupby(['celltype1_index', 'celltype1', 'celltype2']).distance.mean().reset_index()
    per_celltype_summary = per_cell_summary.groupby(['celltype1', 'celltype2']).mean_dist.mean().reset_index()
    per_celltype_summary['region'] = region_name
    per_celltype_summary['iteration'] = iteration

    return per_celltype_summary

def iterate_triangulation_distances(df_input, id_col, x_pos, y_pos, cell_type, region, num_cores=None, num_iterations=1000):
    unique_regions = df_input[region].unique()
    df_input = df_input[[id_col, x_pos, y_pos, cell_type, region]]

    if num_cores is None:
        num_cores = os.cpu_count() // 2  # default to using half of available cores

    # Create a list of all combinations of regions and iterations
    region_iteration_combinations = [(region_name, i) for region_name in unique_regions for i in range(num_iterations)]

    # Use joblib to parallel process each combination of region and iteration
    iterative_results = Parallel(n_jobs=num_cores)(
        delayed(process_region_iteration)(df_input, region_name, iteration, id_col, x_pos, y_pos, cell_type)
        for region_name, iteration in region_iteration_combinations
    )

    # Concatenate results into a single DataFrame
    iterative_triangulation_distances = pd.concat(iterative_results)
    
    return iterative_triangulation_distances

In [18]:
import numpy as np

In [None]:

#' @description 
#' 
#' @param df_input: a dataframe containing the original data
#' @param cell_type: a string representing the name of the column containing the cell type annotations
#' @param region: a string representing the name of the column containing the region information
#' @param permutation: an integer representing a specific permutation number to use as a seed for the random number generator
#' 
#' @return The output of this function is a modified version of the input dataframe df_input where the annotations for the cell types are shuffled, 
#' the shuffling is done based on the unique regions


shuffle_annotations <- function(df_input, 
                                cell_type, 
                                region, 
                                permutation) {
  unique_regions <- unique(df_input[[region]])
  
  df_shuffled <- lapply(1:length(unique_regions),
                        function(region_num){
                          # Subset dataframe
                          df_subset <- df_input %>%
                            dplyr::filter(!!as.symbol(region) == unique_regions[region_num])
                          
                          # Shuffle annotaitons
                          shuffled_annotations <- data.frame(annotations = df_subset[[cell_type]])
                          set.seed(permutation + 1234) # change seed with every permutation
                          rows <- sample(nrow(shuffled_annotations))
                          shuffled_annotations <- data.frame(shuffled_annotations[rows,])
                          colnames(shuffled_annotations) <- c("random_annotations")
                          
                          df_subset <- cbind(df_subset, shuffled_annotations) 
                          
                          return(df_subset)
                        })
  df_shuffled <- do.call(rbind, df_shuffled)
  return(df_shuffled)
}


In [41]:
import pandas as pd
import numpy as np

def shuffle_annotations(df_input, cell_type, region, permutation):
    # Get unique regions
    unique_regions = df_input[region].unique()
    
    df_shuffled_list = []
    
    # Set the seed for reproducibility
    np.random.seed(permutation + 1234)
    
    for region_name in unique_regions:
        # Subset dataframe by region
        df_subset = df_input[df_input[region] == region_name].copy()
        
        # Shuffle annotations within each subset
        shuffled_annotations = df_subset[cell_type].sample(frac=1).reset_index(drop=True)
        df_subset['random_annotations'] = shuffled_annotations
        
        df_shuffled_list.append(df_subset)
    
    # Combine all subsets back into a single DataFrame
    df_shuffled = pd.concat(df_shuffled_list, ignore_index=True)
    
    return df_shuffled

# Example usage:
# df = pd.read_csv('your_data.csv')
# shuffled_df = shuffle_annotations(df, 'cellType', 'region', permutation=42)


In [39]:
import pandas as pd
import numpy as np

def shuffle_annotations(df_input, cell_type, region, permutation):
    # Set the seed for reproducibility
    np.random.seed(permutation + 1234)
    
    # Shuffle annotations for all data
    shuffled_annotations = df_input[cell_type].sample(frac=1).reset_index(drop=True)
    
    # Map shuffled annotations back to the DataFrame based on sorted regions
    df_input['random_annotations'] = df_input.sort_values(by=region).groupby(region, sort=False).apply(
        lambda x: shuffled_annotations.iloc[x.index].reset_index(drop=True)
    ).reset_index(level=0, drop=True)
    
    return df_input

# Example usage:
# df = pd.read_csv('your_data.csv')
# shuffled_df = shuffle_annotations(df, 'cellType', 'region', permutation=42)


In [62]:
import pandas as pd
import numpy as np

def shuffle_annotations(df_input, cell_type, region, permutation):
    # Get unique regions
    unique_regions = df_input[region].unique()
    print(f"Unique regions: {unique_regions}")

    df_shuffled_list = []

    # Set the seed for reproducibility
    np.random.seed(permutation + 1234)
    
    for region_name in unique_regions:
        # Check for the presence of the region_name in the region column
        if region_name not in df_input[region].values:
            print(f"Region {region_name} not found in the region column.")
            continue

        # Subset dataframe by region
        df_subset = df_input[df_input[region] == region_name].copy()

        # Check for NaN values in the cell_type column
        if df_subset[cell_type].isnull().values.any():
            print(f"NaN values found in the cell_type column for region {region_name}.")

        # Shuffle annotations within each subset
        shuffled_annotations = df_subset[cell_type].sample(frac=1).reset_index(drop=True)
        df_subset['random_annotations'] = shuffled_annotations
        
        # Check the result of shuffling
        if df_subset['random_annotations'].isnull().values.any():
            print(f"NaN values found in the random_annotations column after shuffling for region {region_name}.")
        
        df_shuffled_list.append(df_subset)

    # Combine all subsets back into a single DataFrame
    df_shuffled = pd.concat(df_shuffled_list, ignore_index=True)

    return df_shuffled


In [31]:
def process_region_iteration(df_input, region_name, iteration, id_col, x_pos, y_pos, cell_type):
    subset = df_input[df_input[region] == region_name].copy()
    subset['uniqueID'] = subset[id_col].astype(str) + "-" + subset[x_pos].astype(str) + "-" + subset[y_pos].astype(str)
    subset['XYcellID'] = subset[x_pos].astype(str) + "_" + subset[y_pos].astype(str)

    df_shuffled = shuffle_annotations(subset, cell_type, region, iteration)

    results = get_triangulation_distances(df_shuffled, id_col, x_pos, y_pos, "random_annotations", region_name, num_cores=1, calc_avg_distance=False)

    # Replace the below code with the appropriate pandas operations to summarize the per-cell and per-cell-type distances
    per_cell_summary = results.groupby(['celltype1_index', 'celltype1', 'celltype2']).distance.mean().reset_index()
    per_celltype_summary = per_cell_summary.groupby(['celltype1', 'celltype2']).mean_dist.mean().reset_index()
    per_celltype_summary['region'] = region_name
    per_celltype_summary['iteration'] = iteration

    return per_celltype_summary

In [32]:
def iterate_triangulation_distances(df_input, id, x_pos, y_pos, cell_type, region, num_iterations,
                                num_cores=None):
    # Check if x_pos and y_pos are integers, and if not, convert them
    if not issubclass(df_input[x_pos].dtype.type, np.integer):
        print("This function expects integer values for xy coordinates.")
        print("Class will be changed to integer. Please check the generated output!")
        df_input[x_pos] = df_input[x_pos].astype(int)
        df_input[y_pos] = df_input[y_pos].astype(int)
    
    # Get unique regions
    unique_regions = df_input[region].unique()
    
    # Select only necessary columns
    df_input = df_input[[id, x_pos, y_pos, cell_type, region]]
    
    # Set up parallelization
    if num_cores is None:
        num_cores = os.cpu_count() // 2  # default to using half of available cores
    
    # Use joblib to parallel process each combination of region and iteration
    iterative_results = Parallel(n_jobs=num_cores)(
        delayed(process_region_iteration)(df_input, region_name, num_iterations, id, x_pos, y_pos, cell_type)
        for region_name, iteration in region_iteration_combinations
    )

    # Concatenate results into a single DataFrame
    iterative_triangulation_distances = pd.concat(iterative_results)
    
    
    return triangulation_distances

In [79]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
import os

# Assuming shuffle_annotations, get_triangulation_distances, and calculate_triangulation_distances are already defined

def iterate_triangulation_distances(df_input, id_col, x_pos, y_pos, cell_type, region, num_cores=None, num_iterations=1000):
    unique_regions = df_input[region].unique()
    # Use only the necessary columns
    df_input = df_input[[id_col, x_pos, y_pos, cell_type, region]]

    if num_cores is None:
        num_cores = os.cpu_count() // 2  # Default to using half of available cores
    
    # Define a helper function to process each region and iteration
    def process_iteration(region_name, iteration):
        # Filter by region
        subset = df_input[df_input[region] == region_name]
        # Create unique IDs
        subset['uniqueID'] = subset[id_col].astype(str) + "-" + subset[x_pos].astype(str) + "-" + subset[y_pos].astype(str)
        subset['XYcellID'] = subset[x_pos].astype(str) + "_" + subset[y_pos].astype(str)
        
        # Shuffle annotations
        shuffled = shuffle_annotations(subset, cell_type, region, iteration)
  
        # Get triangulation distances
        results = get_triangulation_distances(shuffled, id_col, x_pos, y_pos, "random_annotations", region, num_cores)
        
        
        
        # Summarize results
        per_cell_summary = results.groupby(['celltype1_index', 'celltype1', 'celltype2']).distance.mean().reset_index(name='per_cell_mean_dist')
        per_celltype_summary = per_cell_summary.groupby(['celltype1', 'celltype2']).per_cell_mean_dist.mean().reset_index(name='mean_dist')
        per_celltype_summary[region] = region_name
        per_celltype_summary['iteration'] = iteration
        
        return per_celltype_summary
    
    # Parallel processing for each region and iteration
    results = Parallel(n_jobs=num_cores)(
        delayed(process_iteration)(region_name, iteration)
        for region_name in unique_regions
        for iteration in range(1, num_iterations + 1)
    )
    
    # Combine all results
    iterative_triangulation_distances = pd.concat(results, ignore_index=True)
    
    return iterative_triangulation_distances

# Example usage:
# df = pd.read_csv('your_data.csv')
# iterative_distances = iterate_triangulation_distances(
#     df_input=df,
#     id_col='cellID',
#     x_pos='x',
#     y_pos='y',
#     cell_type='cellType',
#     region='region',
#     num_cores=4,
#     num_iterations=10
# )


In [85]:
iterative_triangulation_distances = iterate_triangulation_distances(df_input=df, 
                                                                    num_iterations=2, 
                                                                    id_col='index', 
                                                                    x_pos='x', 
                                                                    y_pos='y', 
                                                                    cell_type='celltype', 
                                                                    region='unique_region',
                                                                    num_cores = 10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [86]:
iterative_triangulation_distances.head(n = 100)

Unnamed: 0,celltype1,celltype2,mean_dist,unique_region,iteration
0,B cell,B cell,,B004_CL_reg4,1
1,B cell,CD4+ T cell,,B004_CL_reg4,1
2,B cell,CD66+ Enterocyte,,B004_CL_reg4,1
3,B cell,CD8+ T cell,,B004_CL_reg4,1
4,B cell,DC,,B004_CL_reg4,1
...,...,...,...,...,...
95,DC,CD8+ T cell,57.535982,B004_CL_reg4,1
96,DC,DC,21.189620,B004_CL_reg4,1
97,DC,Endothelial,41.466869,B004_CL_reg4,1
98,DC,Enterocyte,79.515542,B004_CL_reg4,1


In [91]:
# Filter for B cells
b_cells_df = df[df['celltype'] == 'B cell']

# Count the number of B cells in each unique region
b_cells_counts = b_cells_df.groupby('unique_region').size()
b_cells_counts

unique_region
B004_CL_reg1      0
B004_CL_reg2    535
B004_CL_reg3    111
B004_CL_reg4      0
B004_SB_reg1      0
B004_SB_reg2      0
B004_SB_reg3      0
B004_SB_reg4      0
dtype: int64

In [83]:
# remove nan values from the dataframe python
iterative_triangulation_distances = iterative_triangulation_distances.dropna()

In [60]:
triangulation_distances

NameError: name 'triangulation_distances' is not defined

In [59]:
iterative_triangulation_distances

Unnamed: 0,celltype1,celltype2,mean_dist,unique_region,iteration
24,CD4+ T cell,CD4+ T cell,29.411410,B004_CL_reg4,1
25,CD4+ T cell,CD66+ Enterocyte,37.708766,B004_CL_reg4,1
26,CD4+ T cell,CD8+ T cell,84.840137,B004_CL_reg4,1
27,CD4+ T cell,DC,31.688184,B004_CL_reg4,1
28,CD4+ T cell,Endothelial,40.002309,B004_CL_reg4,1
...,...,...,...,...,...
8459,Unknown,Plasma,37.189444,B004_CL_reg1,2
8460,Unknown,Smooth muscle,53.323813,B004_CL_reg1,2
8461,Unknown,Stroma,87.354217,B004_CL_reg1,2
8462,Unknown,TA,72.641395,B004_CL_reg1,2


In [68]:
df.head

<bound method NDFrame.head of            x     y  region_num donor tissue  region   area unique_region  \
0          2    35           4  B004     CL  reg004   52.0  B004_CL_reg4   
1          4   110           4  B004     CL  reg004  104.0  B004_CL_reg4   
2          1   347           4  B004     CL  reg004   19.0  B004_CL_reg4   
3          3   705           4  B004     CL  reg004   68.0  B004_CL_reg4   
4          3  1795           4  B004     CL  reg004   54.0  B004_CL_reg4   
...      ...   ...         ...   ...    ...     ...    ...           ...   
233976  9068  3818           1  B004     CL  reg001   64.0  B004_CL_reg1   
233977  9068  8333           1  B004     CL  reg001   55.0  B004_CL_reg1   
233978  9068  5428           1  B004     CL  reg001   57.0  B004_CL_reg1   
233979  9068  5486           1  B004     CL  reg001   67.0  B004_CL_reg1   
233980  9069  5337           1  B004     CL  reg001   48.0  B004_CL_reg1   

       leiden_1 leiden_1.5 leiden_1_subcluster leiden_2  

In [None]:
import pandas as pd
import numpy as np

def shuffle_annotations(df_input, cell_type, region, permutation):
    # Set the seed for reproducibility
    np.random.seed(permutation + 1234)
    
    # Shuffle annotations for all data
    shuffled_annotations = df_input[cell_type].sample(frac=1).reset_index(drop=True)
    
    # Map shuffled annotations back to the DataFrame based on sorted regions
    df_input['random_annotations'] = df_input.sort_values(by=region).groupby(region, sort=False).apply(
        lambda x: shuffled_annotations.iloc[x.index].reset_index(drop=True)
    ).reset_index(level=0, drop=True)
    
    return df_input

# Example usage:
# df = pd.read_csv('your_data.csv')
# shuffled_df = shuffle_annotations(df, 'cellType', 'region', permutation=42)

In [71]:
import pandas as pd
import numpy as np

def shuffle_annotations(df_input, cell_type, region, permutation):
    # Set the seed for reproducibility
    np.random.seed(permutation + 1234)
    
    # Shuffle annotations for all data
    shuffled_annotations = df_input[cell_type].sample(frac=1).reset_index(drop=True)
    
    # Map shuffled annotations back to the DataFrame based on sorted regions
    df_input['random_annotations'] = df_input.sort_values(by=region).groupby(region, sort=False).apply(
        lambda x: shuffled_annotations.iloc[x.index].reset_index(drop=True)
    ).reset_index(level=0, drop=True)
    
    return df_input

# Example usage:
# df = pd.read_csv('your_data.csv')
# shuffled_df = shuffle_annotations(df, 'cellType', 'region', permutation=42)

In [73]:
import pandas as pd
import numpy as np

def shuffle_annotations(df_input, cell_type, region, permutation):
    # Set the seed for reproducibility
    np.random.seed(permutation + 1234)

    # Create a copy to avoid modifying the original dataframe
    df_shuffled = df_input.copy()

    # Shuffle annotations within each region
    for region_name in df_shuffled[region].unique():
        region_mask = df_shuffled[region] == region_name
        shuffled_values = df_shuffled.loc[region_mask, cell_type].sample(frac=1).values
        df_shuffled.loc[region_mask, 'random_annotations'] = shuffled_values

    return df_shuffled

In [84]:
test = shuffle_annotations(df_input = df, cell_type = "celltype", region = "unique_region", permutation = 1)