In [24]:
#### Agglomeration of communities ####
# Slightly altered version of Megan Cole's script.
# -Jannes Roelink

import pandas as pd 
import os 
from datetime import datetime 
from sklearn.cluster import AgglomerativeClustering 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [25]:
# File name of the per-cell neighbourhood-fraction table with cluster labels.
# NOTE(review): this is read relative to the current working directory, not
# from `data_path` below — confirm that is where the file actually lives.
clus_path = 'Rphenograph_Megan_24June_output_31clusters_k115_13ct_fractions.csv'

# Data folder lives under the current working directory. The trailing empty
# component keeps a trailing separator so `data_path + filename` keeps working.
path = os.getcwd()
data_path = os.path.join(path, 'Data', '')

# Output folder. Built with os.path.join so no doubled separator is produced,
# and created up front so the later `to_csv` calls do not fail on a fresh
# checkout. The trailing separator is kept because downstream code builds file
# names with f"{output_dir}<name>.csv".
output_dir = os.path.join(data_path, 'agglomerative_clustering_output', '')
os.makedirs(output_dir, exist_ok=True)

# The first CSV column is used as the row index.
data = pd.read_csv(clus_path, index_col=0)

# Drop the first three columns (indexing information) for the calculations;
# they are added back later by merging on 'cluster'.
df = data.iloc[:, 3:]

# Original number of clusters, taken as the largest cluster label.
# NOTE(review): assumes labels run 1..N without gaps — confirm.
clus_num = df['cluster'].max()

df

Unnamed: 0,B cells,Dendritic cells,Dendritic cells CD103,Endothelium,Epithelium,Fibroblasts,Macrophages type1,Macrophages type2,Neutrophils,T cells CD4,T cells CD8,T reg cells,Tumour,cluster
1,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.062500,0.000000,0.000000,0.000000,0.062500,0.0,0.875000,1
2,0.066667,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.800000,0.000000,0.066667,0.066667,0.0,0.000000,2
3,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.153846,0.0,0.846154,1
4,0.038462,0.000000,0.000000,0.423077,0.0,0.0,0.000000,0.000000,0.346154,0.038462,0.115385,0.0,0.038462,3
5,0.000000,0.666667,0.000000,0.047619,0.0,0.0,0.000000,0.190476,0.000000,0.000000,0.095238,0.0,0.000000,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83029,0.000000,0.000000,0.000000,0.090909,0.0,0.0,0.000000,0.181818,0.000000,0.181818,0.181818,0.0,0.363636,8
83030,0.000000,0.000000,0.000000,0.272727,0.0,0.0,0.181818,0.000000,0.000000,0.000000,0.181818,0.0,0.363636,9
83031,0.125000,0.062500,0.125000,0.312500,0.0,0.0,0.062500,0.000000,0.250000,0.062500,0.000000,0.0,0.000000,3
83032,0.142857,0.000000,0.214286,0.214286,0.0,0.0,0.071429,0.000000,0.071429,0.214286,0.000000,0.0,0.071429,2


In [27]:
# Per-cluster centroids: the mean of every fraction column over all cells that
# share a cluster label. `centroid_noclus` keeps 'cluster' as the index
# (features only); `centroids` exposes it as a regular column.
centroid_noclus = df.groupby(by='cluster').agg('mean')
centroids = centroid_noclus.reset_index(drop=False)
# Preview the first five centroids
centroid_noclus.head(5)

Unnamed: 0_level_0,B cells,Dendritic cells,Dendritic cells CD103,Endothelium,Epithelium,Fibroblasts,Macrophages type1,Macrophages type2,Neutrophils,T cells CD4,T cells CD8,T reg cells,Tumour
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0.001988,0.019817,0.001917,0.010107,0.000975,0.003138,0.005905,0.029784,0.002072,0.001032,0.051794,0.0,0.871469
2,0.008161,0.036077,0.023041,0.049599,0.002246,0.042396,0.118003,0.578763,0.009522,0.078189,0.023813,0.003267,0.026921
3,0.049823,0.010967,0.011922,0.523979,0.012604,0.008869,0.162243,0.01231,0.089208,0.04622,0.032025,0.000582,0.039247
4,0.007995,0.32262,0.009576,0.087777,0.002652,0.022823,0.060727,0.187042,0.00779,0.099831,0.060791,0.005935,0.124441
5,0.138138,0.019407,0.016897,0.13834,0.015751,0.043022,0.040688,0.020081,0.02113,0.37591,0.076142,0.051295,0.043199


In [28]:
#### Agglomerative clustering function ####

def agglomerative_clustering(average_neighbours, avg_noCluster, agglomerate_to):
    """Agglomerate cluster centroids into a smaller number of communities.

    Parameters
    ----------
    average_neighbours : pandas.DataFrame
        Centroid table to which the community label column is appended.
        Its rows must be in the same order as the rows of ``avg_noCluster``.
    avg_noCluster : array-like
        Feature matrix passed to ``AgglomerativeClustering.fit_predict``
        (one row per centroid, no cluster-label column).
    agglomerate_to : int
        Number of communities to agglomerate to.

    Returns
    -------
    pandas.DataFrame
        Copy of ``average_neighbours`` with an extra column
        ``agglomerateto_<agglomerate_to>`` holding 1-based community labels.

    Raises
    ------
    ValueError
        If the two inputs do not have the same number of rows.

    Side effects
    ------------
    Prints the resulting column names and writes the table to
    ``{output_dir}aggloclus_clus<agglomerate_to>.csv`` (``output_dir`` is a
    notebook-level variable), creating the target folder if needed.
    """
    label_col = f"agglomerateto_{agglomerate_to}"

    # Average-linkage agglomerative clustering to the chosen number of communities
    model = AgglomerativeClustering(linkage='average', n_clusters=agglomerate_to)
    # fit_predict returns 0-based labels; shift so communities are numbered from 1
    labels = model.fit_predict(avg_noCluster) + 1

    if len(labels) != len(average_neighbours):
        raise ValueError(
            f"average_neighbours has {len(average_neighbours)} rows but "
            f"avg_noCluster produced {len(labels)} labels"
        )

    # Attach labels by position rather than via an index-based join, so a
    # non-default index on `average_neighbours` cannot misalign or NaN-fill them.
    average_neighbours = average_neighbours.assign(**{label_col: labels})

    print(list(average_neighbours))

    # Save data with the added column, making sure the target folder exists
    out_file = f"{output_dir}aggloclus_clus{agglomerate_to}.csv"
    out_folder = os.path.dirname(out_file)
    if out_folder:
        os.makedirs(out_folder, exist_ok=True)
    average_neighbours.to_csv(out_file, index=False)
    print('dataset saved')

    return average_neighbours

In [30]:
# Agglomerate the cluster centroids into 18 communities
average_neighbours18 = agglomerative_clustering(
    average_neighbours=centroids,
    avg_noCluster=centroid_noclus,
    agglomerate_to=18,
)
average_neighbours18

['cluster', 'B cells', 'Dendritic cells', 'Dendritic cells CD103', 'Endothelium', 'Epithelium', 'Fibroblasts', 'Macrophages type1', 'Macrophages type2', 'Neutrophils', 'T cells CD4', 'T cells CD8', 'T reg cells', 'Tumour', 'agglomerateto_19']
dataset saved


Unnamed: 0,cluster,B cells,Dendritic cells,Dendritic cells CD103,Endothelium,Epithelium,Fibroblasts,Macrophages type1,Macrophages type2,Neutrophils,T cells CD4,T cells CD8,T reg cells,Tumour,agglomerateto_19
0,1,0.001988,0.019817,0.001917,0.010107,0.000975,0.003138,0.005905,0.029784,0.002072,0.001032,0.051794,0.0,0.871469,2
1,2,0.008161,0.036077,0.023041,0.049599,0.002246,0.042396,0.118003,0.578763,0.009522,0.078189,0.023813,0.003267,0.026921,17
2,3,0.049823,0.010967,0.011922,0.523979,0.012604,0.008869,0.162243,0.01231,0.089208,0.04622,0.032025,0.000582,0.039247,16
3,4,0.007995,0.32262,0.009576,0.087777,0.002652,0.022823,0.060727,0.187042,0.00779,0.099831,0.060791,0.005935,0.124441,14
4,5,0.138138,0.019407,0.016897,0.13834,0.015751,0.043022,0.040688,0.020081,0.02113,0.37591,0.076142,0.051295,0.043199,12
5,6,0.019226,0.002925,0.002906,0.585506,0.005333,0.007943,0.061643,0.007527,0.034399,0.013887,0.015813,0.000573,0.24232,13
6,7,0.013139,0.008654,0.003657,0.11983,0.002688,0.026588,0.030886,0.041008,0.006868,0.018154,0.013878,0.000319,0.714331,18
7,8,0.010539,0.071793,0.010968,0.059778,0.003681,0.013578,0.04322,0.084902,0.008808,0.060271,0.111063,0.001698,0.519702,1
8,9,0.029827,0.021383,0.013516,0.218726,0.007845,0.02506,0.130952,0.075006,0.037493,0.068062,0.048883,0.003233,0.320013,6
9,10,0.004248,0.007989,0.004114,0.017807,0.003204,0.00762,0.096584,0.557791,0.005469,0.031256,0.033627,0.003128,0.227165,19


In [32]:
# Add the agglomerated community label to the original per-cell data and save.
# The community count is named once so the column name and the output file name
# cannot drift apart when the number of communities is changed.
n_communities = 18
agglo_col = f'agglomerateto_{n_communities}'

# Each cell maps to exactly one centroid row; `validate` makes pandas raise if
# the centroid table ever contains a duplicated cluster label, instead of
# silently duplicating cells.
data_aggl = pd.merge(
    data,
    average_neighbours18[['cluster', agglo_col]],
    on='cluster',
    how='inner',
    validate='many_to_one',
)
# Rename the old cluster column to keep track of the original number of clusters
data_aggl = data_aggl.rename(columns={'cluster': f'cluster{clus_num}'})
data_aggl.to_csv(f'{output_dir}celldata_agglo_clus_{clus_num}to{n_communities}_240624.csv')
data_aggl

Unnamed: 0,ROI_ID,source_ID,source_cluster,B cells,Dendritic cells,Dendritic cells CD103,Endothelium,Epithelium,Fibroblasts,Macrophages type1,Macrophages type2,Neutrophils,T cells CD4,T cells CD8,T reg cells,Tumour,cluster31,agglomerateto_19
0,01_MRTX+PD1,Cell1,T cells CD8,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.062500,0.000000,0.000000,0.000000,0.062500,0.0,0.875000,1,2
1,01_MRTX+PD1,Cell10,Macrophages type2,0.066667,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.800000,0.000000,0.066667,0.066667,0.0,0.000000,2,17
2,01_MRTX+PD1,Cell100,Tumour,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.153846,0.0,0.846154,1,2
3,01_MRTX+PD1,Cell1000,T cells CD4,0.038462,0.000000,0.000000,0.423077,0.0,0.0,0.000000,0.000000,0.346154,0.038462,0.115385,0.0,0.038462,3,16
4,01_MRTX+PD1,Cell10000,Dendritic cells,0.000000,0.666667,0.000000,0.047619,0.0,0.0,0.000000,0.190476,0.000000,0.000000,0.095238,0.0,0.000000,4,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83028,08_MRTX+PD1,Cell83031,Fibroblasts,0.000000,0.000000,0.000000,0.090909,0.0,0.0,0.000000,0.181818,0.000000,0.181818,0.181818,0.0,0.363636,8,1
83029,08_MRTX+PD1,Cell83032,Macrophages type1,0.000000,0.000000,0.000000,0.272727,0.0,0.0,0.181818,0.000000,0.000000,0.000000,0.181818,0.0,0.363636,9,6
83030,08_MRTX+PD1,Cell83033,Tumour,0.125000,0.062500,0.125000,0.312500,0.0,0.0,0.062500,0.000000,0.250000,0.062500,0.000000,0.0,0.000000,3,16
83031,08_MRTX+PD1,Cell83034,Tumour,0.142857,0.000000,0.214286,0.214286,0.0,0.0,0.071429,0.000000,0.071429,0.214286,0.000000,0.0,0.071429,2,17
