In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
import os
import anndata as ad

In [None]:
# Global scanpy figure settings for the notebook: frameless axes, 100 dpi.
sc.set_figure_params(frameon=False, dpi=100)

In [None]:
# List the atlas directory (this is IPython's `ls` automagic, not Python syntax).
ls /mnt/ssd/atlases/

In [None]:
# Load the harmonised human atlas from its Zarr store; every later cell works on `adata`.
adata = ad.read_zarr('/mnt/ssd/atlases/Human_Atlas_Harmonised.zarr')

In [None]:
# Number of cells per value of the `Is_Core` annotation ('Core' is the value the
# centroid analysis below selects on).
adata.obs['Is_Core'].value_counts()

# Compute similarity scores (distance to cell-type centroids)

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import scanpy as sc

# Step 1: Extract latent space and labels
def get_core_latent_df(adata, embedding_key='scanvi_L4_emb', label_key='Level_4'):
    """Return the latent embedding of the core cells and one centroid per label.

    Parameters
    ----------
    adata
        Annotated data object. Must expose ``obs`` (with an ``Is_Core`` column and
        the ``label_key`` column), ``obsm[embedding_key]`` and ``obs_names``, and
        support subsetting with a boolean mask.
    embedding_key
        Key in ``adata.obsm`` holding the latent representation.
    label_key
        Column in ``adata.obs`` holding the cell-type label.

    Returns
    -------
    (latent_df, centroids)
        ``latent_df`` has one row per cell with ``Is_Core == 'Core'``: the
        embedding dimensions as columns plus a ``'cell_type'`` column.
        ``centroids`` is the per-label mean of the embedding columns, containing
        only labels that actually occur among the core cells.
    """
    # Build the core subset once instead of slicing `adata` twice.
    core = adata[adata.obs.Is_Core == 'Core']
    latent_df = pd.DataFrame(core.obsm[embedding_key], index=core.obs_names)
    # Index-aligned assignment: every core cell picks up its own label.
    latent_df['cell_type'] = adata.obs[label_key]
    # observed=True: with a categorical label column, levels that have no core
    # cell would otherwise yield all-NaN centroid rows, which later look like
    # valid cell types when intersecting with the extended labels.
    centroids = latent_df.groupby('cell_type', observed=True).mean()
    return latent_df, centroids

def get_extended_latent_df(adata, embedding_key='scanvi_L4_emb', label_key='Level_4'):
    """Return the latent embedding of every cell that is not flagged as core.

    Selects the cells whose ``adata.obs.Is_Core`` differs from ``'Core'``, puts
    their ``adata.obsm[embedding_key]`` rows into a DataFrame indexed by cell
    name, and adds a ``'cell_type'`` column taken from ``adata.obs[label_key]``.
    """
    non_core = adata[adata.obs.Is_Core != 'Core']
    embedding = non_core.obsm[embedding_key]
    latent_df = pd.DataFrame(embedding, index=non_core.obs_names)
    # Labels are matched to rows by cell name (pandas index alignment).
    latent_df['cell_type'] = adata.obs[label_key]
    return latent_df


# Step 2: Compute centroids
def compute_centroids(latent_df):
    """Return ``latent_df`` unchanged together with its per-label centroids.

    Parameters
    ----------
    latent_df
        DataFrame with numeric embedding columns and a ``'cell_type'`` column.

    Returns
    -------
    (latent_df, centroids)
        ``centroids`` holds the mean of every embedding column per cell type,
        indexed by cell type. Only labels that occur in ``latent_df`` get a row.
    """
    # observed=True keeps unused categorical levels from producing all-NaN
    # centroid rows (it has no effect on non-categorical label columns).
    centroids = latent_df.groupby('cell_type', observed=True).mean()
    return latent_df, centroids


def compute_euclidean_distance(latent_df, centroids, normalize=True):
    """Euclidean distance from every cell to the centroid of its own cell type.

    Parameters
    ----------
    latent_df
        DataFrame with one row per cell: numeric embedding columns plus a
        ``'cell_type'`` column.
    centroids
        DataFrame indexed by cell type whose columns are the embedding
        dimensions, in the same order as in ``latent_df``.
    normalize
        If True (default), divide all distances by the largest distance in
        this call so the result lies in [0, 1]. Pass False for raw distances.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``latent_df`` with columns ``'distance'`` and ``'cell_type'``.

    Raises
    ------
    KeyError
        If a cell carries a label that has no row in ``centroids``.
    """
    labels = latent_df['cell_type'].to_numpy()
    embedding = latent_df.drop(columns='cell_type').to_numpy(dtype=float)

    # Position of each cell's centroid row; -1 marks labels without a centroid.
    centroid_rows = centroids.index.get_indexer(labels)
    missing = centroid_rows < 0
    if missing.any():
        unknown = sorted({str(label) for label in labels[missing]})
        raise KeyError(f"No centroid available for cell type(s): {unknown}")

    # One vectorised subtraction instead of a Python loop with one pairwise
    # distance call per cell.
    centroid_matrix = centroids.to_numpy(dtype=float)[centroid_rows]
    distance = np.linalg.norm(embedding - centroid_matrix, axis=1)

    df = pd.DataFrame({'distance': distance, 'cell_type': labels}, index=latent_df.index)
    if normalize:
        max_distance = df['distance'].max()
        # Skip scaling for empty input or all-zero distances (would give NaN).
        if max_distance > 0:
            df['distance'] = df['distance'] / max_distance
    return df

def run_centroid_analysis(adata, embedding_key='scanvi_L4_emb', label_key='Level_4'):
    """Distance of core and extended cells to the core centroid of their cell type.

    Centroids are computed from the core cells only. Both cell sets are then
    restricted to the cell types that have a core centroid and also occur among
    the extended cells, and each cell's Euclidean distance to its centroid is
    computed.

    Parameters
    ----------
    adata
        Annotated data object passed on to ``get_core_latent_df`` and
        ``get_extended_latent_df``.
    embedding_key
        Key in ``adata.obsm`` holding the latent representation.
    label_key
        Column in ``adata.obs`` holding the cell-type label.

    Returns
    -------
    (distances_extended, distances_core, centroids)
        The two distance tables have columns ``'distance'`` and ``'cell_type'``.
        Distances in both tables are divided by the same maximum (taken over
        core and extended cells together), so they can be compared directly.
    """
    # Get core latent space and centroids
    latent_core_raw, centroids = get_core_latent_df(adata, embedding_key=embedding_key, label_key=label_key)
    # Get extended latent space (rows with a missing label or embedding value are dropped)
    latent_extended = get_extended_latent_df(adata, embedding_key=embedding_key, label_key=label_key).dropna()

    # A categorical label column can yield all-NaN centroid rows for levels with
    # no core cell; those are not usable centroids, so drop them first.
    centroids = centroids.dropna(how='all')

    # Restrict to common cell types
    common_cell_types = centroids.index.intersection(latent_extended['cell_type'].unique())
    centroids = centroids.loc[common_cell_types]
    latent_extended = latent_extended[latent_extended['cell_type'].isin(common_cell_types)]
    latent_core_raw = latent_core_raw[latent_core_raw['cell_type'].isin(common_cell_types)]

    # Compute distances for both sets in ONE call so they share the same
    # normalisation constant. Scaling each set by its own maximum would put
    # core and extended distances on different scales.
    n_extended = len(latent_extended)
    latent_all = pd.concat([latent_extended, latent_core_raw], axis=0)
    distances_all = compute_euclidean_distance(latent_all, centroids)
    # Split positionally (robust to duplicated cell names); copy so that callers
    # can add columns without chained-assignment warnings.
    distances_extended = distances_all.iloc[:n_extended].copy()
    distances_core = distances_all.iloc[n_extended:].copy()
    return distances_extended, distances_core, centroids




In [None]:
# Show the AnnData summary to check which keys are available before running the analysis.
adata

In [None]:
# Run the centroid analysis on the 'scanvi_extended_atlas_emb' embedding with
# 'Level_3' labels (overriding the function defaults 'scanvi_L4_emb' / 'Level_4').
distances_extended, distances_core, centroids = run_centroid_analysis(adata, embedding_key='scanvi_extended_atlas_emb', label_key='Level_3')

In [None]:
# `palette` maps the hue levels of the box plots to colours. No earlier cell of
# this notebook creates it, so fall back to a fresh dict when it is missing
# (e.g. after Restart & Run All) instead of failing with NameError.
# NOTE(review): the 'Core' colour below is only a placeholder fallback — replace
# it with the project's colour, or define `palette` explicitly near the top.
try:
    palette
except NameError:
    palette = {'Core': '#d95f02'}
palette['Extended'] = '#0b559f'

In [None]:
# Tag each distance table with its origin, then stack them (core rows first)
# into one long table for plotting.
for frame, origin in ((distances_core, 'Core'), (distances_extended, 'Extended')):
    frame['source'] = origin
dist_all = pd.concat((distances_core, distances_extended), axis=0)


In [None]:
# Store the distances on the AnnData object. The assignment is aligned on cell
# names, so cells without a row in `dist_all` end up as NaN.
adata.obs['centroid_distances'] = dist_all.distance

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# White seaborn theme at notebook scale (no background grid)
sns.set(style="white", context='notebook')

# One wide axes: a Core and an Extended box per cell type, outliers hidden
fig, ax = plt.subplots(figsize=(36, 12))
sns.boxplot(
    data=dist_all,
    x='cell_type',
    y='distance',
    hue='source',
    palette=palette,
    showfliers=False,
    ax=ax,
)

# Titles and axis labels; cell-type names are rotated so they do not overlap
ax.set_title('Euclidean Distance to Centroid: Core vs Extended Cells', fontsize=16)
ax.set_xlabel('')
ax.set_ylabel('Euclidean Distance', fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=90)

# Make sure no grid is drawn
ax.grid(False)

# Legend sits to the right of the axes, outside the plotting area
legend_options = dict(
    title='Source',
    loc='center left',
    bbox_to_anchor=(1.02, 0.5),
    borderaxespad=0,
    frameon=False,
)
ax.legend(**legend_options)

fig.tight_layout()
fig.savefig(
    "/mnt/kkf2/Cell/AG-Saur/KKF2/Daniele/pdac_atlas_figures/figure3/eucli_distance_centroids_barplot.png",
    dpi=300,
    bbox_inches="tight",
)
plt.show()


# Compute neighbor label entropies

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import entropy
from scipy.sparse import csr_matrix
from anndata import AnnData

def compute_weighted_neighbor_label_entropy(
    adata: "AnnData",
    connectivities_key: str = "connectivities",
    distances_key: str = "distances",
    label_key: str = "Level_3",
    invert_distances: bool = True,
    distance_epsilon: float = 1e-6
) -> pd.Series:
    """
    Compute Shannon entropy of neighbor labels for each cell,
    weighting each neighbor by its (inverse) distance.

    Every neighbor is paired with its own distance. The connectivities graph and
    the distance matrix generally do not share a sparsity pattern, so the two
    rows must not be zipped positionally. The distance of an edge (i, j) is read
    from the distance matrix at (i, j) or, if only the reverse direction is
    stored, at (j, i). Neighbors with no stored distance in either direction get
    no weight.

    Args:
        adata: AnnData with `.obsp[connectivities_key]` and `.obsp[distances_key]`,
              and labels in `adata.obs[label_key]`.
        connectivities_key: key in adata.obsp for the sparse neighbor graph.
                            Its stored entries define which cells are "neighbors."
        distances_key: key in adata.obsp for the sparse distance matrix
                       (distances are expected to be non-negative).
        label_key: column in adata.obs holding the label for each cell.
                   Labels are compared as strings.
        invert_distances: if True, weight = 1/(distance + epsilon);
                          if False, weight = distance.
        distance_epsilon: small constant to avoid division by zero when inverting.

    Returns:
        pd.Series of entropy values (base-2) indexed by adata.obs_names.
        Cells without any weighted neighbor get NaN.

    Raises:
        ValueError: if a key is missing or a graph is not n_obs x n_obs.
    """
    # sanity checks
    if connectivities_key not in adata.obsp:
        raise ValueError(f"Graph '{connectivities_key}' not found in adata.obsp.")
    if distances_key not in adata.obsp:
        raise ValueError(f"Distances '{distances_key}' not found in adata.obsp.")
    if label_key not in adata.obs:
        raise ValueError(f"Label column '{label_key}' not found in adata.obs.")

    n_obs = adata.n_obs
    result_name = "weighted_neighbor_label_entropy"
    if n_obs == 0:
        return pd.Series(np.zeros(0, dtype=float), index=adata.obs_names, name=result_name)

    # load sparse graphs
    conn = csr_matrix(adata.obsp[connectivities_key])
    dmat = csr_matrix(adata.obsp[distances_key])
    if conn.shape != (n_obs, n_obs) or dmat.shape != (n_obs, n_obs):
        raise ValueError(
            f"Graphs must have shape ({n_obs}, {n_obs}); got {conn.shape} for "
            f"'{connectivities_key}' and {dmat.shape} for '{distances_key}'."
        )

    # 0/1 mask of the neighbor graph: every stored entry counts as a neighbor.
    neighbor_mask = csr_matrix(
        (np.ones_like(conn.data, dtype=np.float64), conn.indices, conn.indptr),
        shape=conn.shape,
    )

    # Shift stored distances by +1 so that a genuine distance of 0 (duplicate
    # cells) is not mistaken for "no entry" during the sparse operations.
    shifted = csr_matrix(
        (dmat.data.astype(np.float64) + 1.0, dmat.indices, dmat.indptr),
        shape=dmat.shape,
    )
    # Make the distance available in both directions of every edge ...
    symmetric = shifted.maximum(shifted.T)
    # ... and keep only the edges that are neighbors in the connectivity graph.
    aligned = symmetric.multiply(neighbor_mask).tocsr()
    edge_distance = aligned.data - 1.0

    # compute weights
    if invert_distances:
        weights = 1.0 / (edge_distance + distance_epsilon)
    else:
        weights = edge_distance
    weight_graph = csr_matrix((weights, aligned.indices, aligned.indptr), shape=aligned.shape)

    # prepare labels as a sparse one-hot matrix (cells x labels)
    labels = adata.obs[label_key].astype(str).to_numpy()
    unique_labels, label_codes = np.unique(labels, return_inverse=True)
    one_hot = csr_matrix(
        (np.ones(n_obs, dtype=np.float64), (np.arange(n_obs), np.ravel(label_codes))),
        shape=(n_obs, len(unique_labels)),
    )

    # accumulate weights per label: entry (i, l) = total weight of i's neighbors with label l
    label_weights = (weight_graph @ one_hot).tocsr()
    totals = np.asarray(label_weights.sum(axis=1)).ravel()
    rows = np.repeat(np.arange(n_obs), np.diff(label_weights.indptr))

    # normalize to a probability distribution and sum -p*log2(p) per cell
    with np.errstate(divide="ignore", invalid="ignore"):
        probs = label_weights.data / totals[rows]
        positive = probs > 0
    terms = np.zeros_like(probs)
    terms[positive] = -probs[positive] * np.log2(probs[positive])
    ent = np.bincount(rows, weights=terms, minlength=n_obs).astype(float)

    # No neighbors (or zero total weight): entropy is undefined.
    ent[~(totals > 0)] = np.nan

    return pd.Series(ent, index=adata.obs_names, name=result_name)


In [None]:
# Per-cell neighbor-label entropy with the function defaults ('connectivities' /
# 'distances' graphs, 'Level_3' labels, inverse-distance weights), stored in obs.
adata.obs['label_entropy'] = compute_weighted_neighbor_label_entropy(adata)


In [None]:
# UMAP coloured by the entropy score; the colour scale is capped at 3.
sc.pl.umap(adata, color = 'label_entropy', vmax = 3)

In [None]:
# Maximum of the 'normalized_entropy' column plus 5 % headroom.
# NOTE(review): neither `df` nor a 'normalized_entropy' column is created by any
# cell visible in this notebook, so this raises NameError on a fresh kernel —
# confirm where they come from, or remove this cell.
df['normalized_entropy'].max()* 1.05

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# White seaborn theme at notebook scale (no background grid)
sns.set(style="white", context='notebook')

# One wide axes: entropy per Level_3 cell type, split by Is_Core, outliers hidden
fig, ax = plt.subplots(figsize=(36, 12))
sns.boxplot(
    data=adata.obs,
    x='Level_3',
    y='label_entropy',
    hue='Is_Core',
    palette=palette,
    showfliers=False,
    ax=ax,
)

# Titles and axis labels; cell-type names are rotated so they do not overlap
ax.set_title('Label entropy in neighbors: Core vs Extended Cells', fontsize=16)
ax.set_xlabel('')
ax.set_ylabel('Label Entropy', fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=90)

# Make sure no grid is drawn
ax.grid(False)

# Legend sits to the right of the axes, outside the plotting area
legend_options = dict(
    title='Source',
    loc='center left',
    bbox_to_anchor=(1.02, 0.5),
    borderaxespad=0,
    frameon=False,
)
ax.legend(**legend_options)
# Fixed y-range of the figure (values above 4e-3 are cut off)
ax.set_ylim(0, 4e-3)

fig.tight_layout()
fig.savefig(
    "/mnt/kkf2/Cell/AG-Saur/KKF2/Daniele/pdac_atlas_figures/figure3/neigbors_entropy_barplot.png",
    dpi=300,
    bbox_inches="tight",
)
plt.show()
