In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import anndata as ad

# Read the data

In [None]:
#mouse = sc.read_h5ad('/mnt/storage/Daniele/atlases/mouse/Mouse_Atlas_Harmonised.h5ad')
#human = sc.read_h5ad('/mnt/storage/Daniele/atlases/human/Human_Atlas_Harmonised.h5ad')

In [None]:
# Load the mouse and human atlas objects. Later cells pass `.X` of these objects
# directly to the neighbour search, so `.X` is presumably a low-dimensional
# embedding rather than expression counts — TODO confirm.
mouse = sc.read_h5ad('/mnt/storage/Daniele/atlases/mouse/Mouse_Atlas_Harmonised_embedding.h5ad')
human = sc.read_h5ad('/mnt/storage/Daniele/atlases/human/Human_Atlas_Harmonised_embedding.h5ad')

In [None]:
from pynndescent import NNDescent
from scipy import sparse
from typing import Optional, Union, Mapping, Literal
import warnings

def gaussian_kernel(d, sigma = None):
    """Turn distances into Gaussian-kernel weights ``exp(-d**2 / (2 * sigma**2))``.

    Parameters
    ----------
    d
        Array-like of distances.
    sigma
        Kernel bandwidth. When ``None`` it is set to ``max(d) / 3``.

    Returns
    -------
    Element-wise weights with the same length as ``d``.
    """
    if sigma is None:
        # An empty input has no maximum; any positive bandwidth gives the same
        # (empty) result, so avoid the ValueError raised by np.max.
        sigma = np.max(d) / 3 if np.size(d) > 0 else 1.0
    if sigma == 0:
        # Degenerate bandwidth (e.g. every distance is 0): the formula would
        # evaluate 0/0 and return NaN. Use the sigma -> 0 limit instead:
        # weight 1 at distance 0, weight 0 everywhere else.
        return np.exp(np.where(np.asarray(d) == 0, 0.0, -np.inf))
    gauss = np.exp(-0.5 * np.square(d) / np.square(sigma))
    return gauss

def nn2adj(nn,
           n1 = None,
           n2 = None,
           weight: Literal['unweighted','dist','gaussian_kernel'] = 'unweighted',
           sigma = None
          ):
    """Convert a k-nearest-neighbour result into a sparse adjacency matrix.

    Parameters
    ----------
    nn
        Pair ``(indices, distances)``; both are 2-D arrays with one row per
        queried point and one column per neighbour.
    n1
        Number of rows of the result. Defaults to the number of rows of ``indices``.
    n2
        Number of columns of the result. Defaults to ``max(indices) + 1``.
    weight
        ``'unweighted'`` stores 1 per edge, ``'gaussian_kernel'`` stores
        ``gaussian_kernel(distance, sigma)``, any other value stores the raw distance.
    sigma
        Bandwidth forwarded to ``gaussian_kernel``.

    Returns
    -------
    ``scipy.sparse.csr_matrix`` of shape ``(n1, n2)`` with entry ``(i, j)`` set
    when ``j`` is listed as a neighbour of row ``i``.
    """
    neighbor_idx = np.asarray(nn[0])
    neighbor_dist = np.asarray(nn[1])

    if n1 is None:
        n1 = neighbor_idx.shape[0]
    if n2 is None:
        # Indices are 0-based, so the column count is the largest index + 1.
        # (Using the bare maximum made the largest index fall outside the matrix.)
        n2 = int(neighbor_idx.max()) + 1 if neighbor_idx.size > 0 else 0

    # Row i is repeated once per neighbour so rows/cols/values line up element-wise.
    rows = np.repeat(np.arange(neighbor_idx.shape[0]), neighbor_idx.shape[1])
    cols = neighbor_idx.flatten()

    if weight == 'unweighted':
        values = np.repeat(1, rows.shape[0])
    else:
        values = neighbor_dist.flatten()
        if weight == 'gaussian_kernel':
            values = gaussian_kernel(values, sigma)

    adj = sparse.csr_matrix((values, (rows, cols)), shape=(n1, n2))
    return adj

def build_nn(ref,
             query = None,
             k = 100,
             weight: Literal['unweighted','dist','gaussian_kernel'] = 'unweighted',
             sigma = None
            ):
    """Find the ``k`` nearest points of ``ref`` for every point of ``query``.

    An NNDescent index is built on ``ref`` and queried with ``query`` (or with
    ``ref`` itself when ``query`` is ``None``). The result is returned as a
    sparse ``(n_query, n_ref)`` adjacency matrix built by ``nn2adj`` with the
    given ``weight`` / ``sigma``.
    """
    points = ref if query is None else query
    neighbors = NNDescent(ref).query(points, k=k)
    return nn2adj(neighbors,
                  n1 = points.shape[0],
                  n2 = ref.shape[0],
                  weight = weight,
                  sigma = sigma)

def build_mutual_nn(dat1, dat2 = None, k1 = 100, k2 = None):
    """Return the mutual-nearest-neighbour graph between ``dat1`` and ``dat2``.

    Every ``dat2`` point is queried for ``k1`` neighbours in ``dat1`` and every
    ``dat1`` point for ``k2`` neighbours in ``dat2``. The returned sparse
    ``(n_dat1, n_dat2)`` matrix is the element-wise product of the two
    unweighted graphs, i.e. non-zero only where the two points list each other.
    ``dat2`` defaults to ``dat1`` and ``k2`` to ``k1``.
    """
    dat2 = dat1 if dat2 is None else dat2
    k2 = k1 if k2 is None else k2

    # Both indexes are built before any query is run.
    index_on_1, index_on_2 = NNDescent(dat1), NNDescent(dat2)
    nn_of_2_in_1 = index_on_1.query(dat2, k=k1)
    nn_of_1_in_2 = index_on_2.query(dat1, k=k2)

    n_1, n_2 = dat1.shape[0], dat2.shape[0]
    graph_2_to_1 = nn2adj(nn_of_2_in_1, n1 = n_2, n2 = n_1)
    graph_1_to_2 = nn2adj(nn_of_1_in_2, n1 = n_1, n2 = n_2)

    return graph_1_to_2.multiply(graph_2_to_1.T)

def get_transition_prob_mat(dat, k = 50, symm = True):
    """Build a degree-normalised kNN graph of ``dat`` for random walks.

    A ``k``-nearest-neighbour graph of ``dat`` against itself is built; with
    ``symm`` it is replaced by the binarised union of the graph and its
    transpose. The return value is ``diag(1 / row_sums) @ graph.T``.
    """
    n_obs = dat.shape[0]
    neighbors = NNDescent(dat).query(dat, k = k)
    graph = nn2adj(neighbors, n1 = n_obs, n2 = n_obs)
    if symm:
        # Keep an edge when it exists in either direction.
        graph = ((graph + graph.T) > 0) + 0
    inverse_row_sums = 1 / np.array(graph.sum(1)).flatten()
    return sparse.diags(inverse_row_sums) @ graph.transpose()

def random_walk_with_restart(init, transition_prob, alpha = 0.5, num_rounds = 100):
    """Smooth ``init`` over a graph by iterating a random walk with restart.

    Starting from ``heat = init``, each of the ``num_rounds`` iterations sets
    ``heat = alpha * init + (1 - alpha) * transition_prob.T @ heat``.
    Returns the final ``heat`` as a column vector of shape ``(len(init), 1)``.
    """
    restart = np.array(init).flatten()[:, None]
    step = transition_prob.transpose()
    heat = restart
    for _ in range(num_rounds):
        heat = restart * alpha + (1 - alpha) * (step @ heat)
    return heat

def get_wknn(ref,
             query,
             ref2 = None,
             k: int = 100,
             query2ref: bool = True,
             ref2query: bool = True,
             weighting_scheme: Literal['n','top_n','jaccard','jaccard_square','gaussian','dist'] = 'jaccard_square',
             top_n: Optional[int] = None,
             sigma: Optional[float] = None,
             return_adjs: bool = False
            ):
    """Build a weighted nearest-neighbour graph between query and reference cells.

    Parameters
    ----------
    ref
        Reference representation used to build the ref-query neighbour graph.
    query
        Query representation used to build the ref-query neighbour graph.
    ref2
        Reference representation used to build the ref-ref neighbour graph.
        Defaults to ``ref``; only used by the shared-neighbour schemes.
    k
        Number of neighbours per cell.
    query2ref
        Consider query-to-ref neighbours.
    ref2query
        Consider ref-to-query neighbours.
    weighting_scheme
        How to weight edges in the ref-query neighbour graph. ``'n'`` is the
        number of reference neighbours shared by a query cell and a reference
        cell, ``'top_n'`` is 1 where that number exceeds ``top_n``,
        ``'jaccard'`` / ``'jaccard_square'`` are ``n / (2k - n)`` and its square,
        ``'dist'`` is the raw neighbour distance and ``'gaussian'`` is
        ``gaussian_kernel`` applied to that distance.
    top_n
        Threshold for ``'top_n'``; defaults to ``k // 4`` (1 when ``k <= 4``).
    sigma
        Bandwidth forwarded to ``gaussian_kernel`` for ``'gaussian'``.
    return_adjs
        Also return the intermediate adjacency matrices.

    Returns
    -------
    ``wknn``, a sparse ``(n_query, n_ref)`` matrix, or ``(wknn, adjs)`` when
    ``return_adjs`` is set. ``adjs`` has keys ``'q2r'``, ``'r2q'``, ``'knn'``
    and ``'r2r'``; ``'r2q'`` / ``'r2r'`` are ``None`` when they were not computed.
    """
    if not query2ref and not ref2query:
        warnings.warn('At least one of query2ref and ref2query should be True. Reset to default with both being True.')
        # Actually apply the reset so the ref-to-query graph is built below;
        # previously this path used adj_r2q while it was still None.
        query2ref = True
        ref2query = True

    nn_weight = 'dist' if weighting_scheme in ['gaussian', 'dist'] else 'unweighted'

    # Always needed: it also drives the shared-neighbour count further down.
    adj_q2r = build_nn(ref = ref, query = query, k = k, weight = nn_weight)

    adj_r2q = None
    if ref2query:
        adj_r2q = build_nn(ref = query, query = ref, k = k, weight = nn_weight)

    # adj_knn is oriented (n_ref, n_query).
    if query2ref and not ref2query:
        adj_knn = adj_q2r.T
    elif ref2query and not query2ref:
        adj_knn = adj_r2q
    else:
        # Union of both directions; edges present in both are counted once.
        adj_knn_shared = (adj_r2q > 0).multiply(adj_q2r.T > 0)
        adj_knn = adj_r2q + adj_q2r.T - adj_r2q.multiply(adj_knn_shared)

    # Defined up front so return_adjs works for the distance-based schemes too
    # (it used to be unbound there).
    adj_ref = None
    if weighting_scheme in ['n','top_n','jaccard','jaccard_square']:
        if ref2 is None:
            ref2 = ref
        adj_ref = build_nn(ref = ref2, k=k)
        # Entry (q, r): number of reference cells that are neighbours of both q and r.
        num_shared_neighbors = adj_q2r @ adj_ref.T
        # Keep the counts only on edges of the query-ref neighbour graph.
        num_shared_neighbors_nn = num_shared_neighbors.multiply(adj_knn.T)

        wknn = num_shared_neighbors_nn.copy()
        if weighting_scheme == 'top_n':
            if top_n is None:
                top_n = k//4 if k > 4 else 1
            wknn = (wknn > top_n) * 1
        elif weighting_scheme == "jaccard":
            wknn.data = wknn.data / (k+k-wknn.data)
        elif weighting_scheme == "jaccard_square":
            wknn.data = (wknn.data / (k+k-wknn.data)) ** 2
    else:
        wknn = adj_knn.T
        if weighting_scheme == 'gaussian':
            wknn.data = gaussian_kernel(wknn.data, sigma = sigma)

    if return_adjs:
        adjs = {'q2r' : adj_q2r,
                'r2q' : adj_r2q,
                'knn' : adj_knn,
                'r2r' : adj_ref}
        return (wknn, adjs)
    else:
        return wknn

# malignant

In [None]:
# Malignant cells only: human is the reference, mouse is the query.
human_malignant_X = human[human.obs.Level_3.str.contains('Malignant')].X
mouse_malignant_X = mouse[mouse.obs.Level_3.str.contains('Malignant')].X

# Query->reference neighbours only, weighted by squared Jaccard overlap.
wknn_scanvi_q2r, adjs_scanvi_q2r = get_wknn(
    ref = human_malignant_X,
    query = mouse_malignant_X,
    k = 1000,
    query2ref = True,
    ref2query = False,
    weighting_scheme = "jaccard_square",
    return_adjs = True,
)

# Graph over the human malignant cells, used below to smooth the scores.
trans_prob_ref = get_transition_prob_mat(human_malignant_X, k=500)




In [None]:
# mouse > human

# Model label of every malignant mouse cell (these cells are the rows of wknn_scanvi_q2r).
malignant_mouse_model = mouse[mouse.obs.Level_3.str.contains('Malignant')].obs.Model
mouse_models = mouse.obs.Model.cat.categories

# Per model: sum the weights of its cells -> one value per human malignant cell.
wknn_scanvi_q2r_per_model = []
for model_name in mouse_models:
    summed_weights = wknn_scanvi_q2r[malignant_mouse_model == model_name,:].sum(axis = 0)
    wknn_scanvi_q2r_per_model.append(np.array(summed_weights).flatten())

# Smooth each per-model vector over the human graph.
wknn_scanvi_q2r_per_model_sm = []
for model_weights in wknn_scanvi_q2r_per_model:
    wknn_scanvi_q2r_per_model_sm.append(
        random_walk_with_restart(init = model_weights, transition_prob = trans_prob_ref, alpha = 0.1)
    )

df_wknn_scanvi_q2r = pd.DataFrame(
    np.concatenate(wknn_scanvi_q2r_per_model_sm, axis=1),
    columns=mouse_models,
    index=human[human.obs.Level_3.str.contains('Malignant')].obs_names,
)

# Per column: log1p, clip to the [1st, 99th] percentile, then min-max scale.
log_scores = df_wknn_scanvi_q2r.apply(lambda col: np.log1p(col), axis=0)
clipped_scores = log_scores.apply(lambda col: np.clip(col, np.percentile(col,1), np.percentile(col,99)))
df_wknn_scanvi_q2r_norm = clipped_scores.apply(lambda col: (col-np.min(col))/(np.max(col)-np.min(col)))



In [None]:
# Score of the endogenous mouse model on the human UMAP.
# The assignment aligns on obs_names; human cells missing from the score table get NaN.
human.obs['cov_score'] = df_wknn_scanvi_q2r_norm.loc[:,'endogenous']

sc.pl.embedding(human, basis='umap', color = ['cov_score'], color_map = 'viridis', title=["endogenous"],
                vmin=0, vmax=1, show=False, frameon=False)

# Fix: this figure shows the endogenous score, so it is saved under the
# endogenous file name (it used to be written to the orthotopic file).
plt.savefig("/mnt/kkf2/Cell/AG-Saur/KKF2/Daniele/pdac_atlas_figures/figure6/umap_pres_score_endogenous.png", dpi = 300)


In [None]:
# Score of the orthotopic mouse model on the human UMAP.
# The assignment aligns on obs_names; human cells missing from the score table get NaN.
human.obs['cov_score'] = df_wknn_scanvi_q2r_norm.loc[:,'orthotopic']

sc.pl.embedding(human, basis='umap', color = ['cov_score'], color_map = 'viridis', title=["orthotopic"],
                vmin=0, vmax=1, show=False, frameon=False)
# Fix: this figure shows the orthotopic score, so it is saved under the
# orthotopic file name (it used to be written to the endogenous file).
plt.savefig("/mnt/kkf2/Cell/AG-Saur/KKF2/Daniele/pdac_atlas_figures/figure6/umap_pres_score_orthotopic.png", dpi = 300)


In [None]:
# Row-wise maximum over the per-model score columns, aligned onto human.obs by obs_names.
# NOTE(review): a later cell adds a string 'celltype' column to df_wknn_scanvi_q2r_norm,
# so this cell only behaves as intended when run before that one.
human.obs['log_num_wknn_scanvi_q2r_ds_max'] = df_wknn_scanvi_q2r_norm.max(1)


In [None]:
# Store the normalised per-model scores on the human object (aligned by obs_names).
human.obs['log_num_wknn_scanvi_q2r_ds_endogenous'] = df_wknn_scanvi_q2r_norm.loc[:, 'endogenous']
human.obs['log_num_wknn_scanvi_q2r_ds_orthotopic'] = df_wknn_scanvi_q2r_norm.loc[:, 'orthotopic']


In [None]:
# Binary "observed" flag: normalised score above 0.05.
# Cells with a missing (NaN) score compare as False.
human.obs['observed_endogenous'] = (human.obs['log_num_wknn_scanvi_q2r_ds_endogenous'] > 0.05).astype('category')
human.obs['observed_orthotopic'] = (human.obs['log_num_wknn_scanvi_q2r_ds_orthotopic'] > 0.05).astype('category')


In [None]:
# Orthotopic score on the human UMAP; cells without a score are drawn in white.
# Writes umap_pres_score_orthotopic.png, the same path an earlier cell in this section also writes.
sc.pl.umap(human, color = ['log_num_wknn_scanvi_q2r_ds_orthotopic'], color_map = 'viridis', na_color='white', outline_width = (.05,.001), add_outline=True, vmin=0, title='Coverage (orthotopic)', frameon=False, size=0.2, show=False)
plt.savefig("/mnt/kkf2/Cell/AG-Saur/KKF2/Daniele/pdac_atlas_figures/figure6/umap_pres_score_orthotopic.png", dpi = 300)


In [None]:
# Endogenous score on the human UMAP.
# NOTE(review): unlike the orthotopic plot this call sets neither na_color nor outline_width — confirm that is intended.
# Writes umap_pres_score_endogenous.png, the same path an earlier cell in this section also writes.
sc.pl.umap(human, color = ['log_num_wknn_scanvi_q2r_ds_endogenous'], color_map = 'viridis', add_outline=True, vmin=0, title='Coverage (endogenous)', frameon=False, size=0.2, show=False)
plt.savefig("/mnt/kkf2/Cell/AG-Saur/KKF2/Daniele/pdac_atlas_figures/figure6/umap_pres_score_endogenous.png", dpi = 300)


In [None]:
# Attach the Level_4 label of each cell (aligned by obs_names) for the boxplot below.
# This mutates the score table in place: it now holds a non-numeric column.
df_wknn_scanvi_q2r_norm['celltype'] = human.obs['Level_4'].astype(str)

In [None]:
# Long format: one row per (cell, condition) holding the normalised score.
df_long = (
    df_wknn_scanvi_q2r_norm
    .reset_index()
    .melt(
        id_vars=["index", "celltype"],
        value_vars=["endogenous", "orthotopic"],
        var_name="condition",
        value_name="score",
    )
)

# One fixed colour per mouse model.
palette = {"endogenous": "#ff7f0e", "orthotopic": "#2ca02c"}

plt.figure(figsize=(16, 8))
sns.boxplot(data=df_long, x='celltype', y='score', hue='condition',
            dodge=True, showfliers=False, palette=palette)

plt.title("Distribution of Scores by Cell Type and Condition")
plt.xlabel("Cell Type")
plt.ylabel("Score")
plt.xticks(rotation=90)

# Place the legend outside the axes, anchored at the top right corner.
plt.legend(title="Condition", bbox_to_anchor=(1.05, 1), loc='upper left', frameon=True)
plt.tight_layout()
plt.savefig("/mnt/kkf2/Cell/AG-Saur/KKF2/Daniele/pdac_atlas_figures/figure6/boxplot_pres_score_orthotopic.png", dpi = 300)

plt.show()

# non_malignant

In [None]:
# Non-malignant cells only: human is the reference, mouse is the query.
human_non_malignant_X = human[~human.obs.Level_3.str.contains('Malignant')].X
mouse_non_malignant_X = mouse[~mouse.obs.Level_3.str.contains('Malignant')].X

# Query->reference neighbours only, weighted by squared Jaccard overlap.
wknn_scanvi_q2r_nm, adjs_scanvi_q2r_nm = get_wknn(
    ref = human_non_malignant_X,
    query = mouse_non_malignant_X,
    k = 1000,
    query2ref = True,
    ref2query = False,
    weighting_scheme = "jaccard_square",
    return_adjs = True,
)

# Graph over the human non-malignant cells, used below to smooth the scores.
trans_prob_ref_nm = get_transition_prob_mat(human_non_malignant_X, k=500)




In [None]:
# mouse > human

# Model label of every non-malignant mouse cell (these cells are the rows of wknn_scanvi_q2r_nm).
non_malignant_mouse_model = mouse[~mouse.obs.Level_3.str.contains('Malignant')].obs.Model
mouse_models_nm = mouse.obs.Model.cat.categories

# Per model: sum the weights of its cells -> one value per human non-malignant cell.
wknn_scanvi_q2r_per_model_nm = []
for model_name in mouse_models_nm:
    summed_weights = wknn_scanvi_q2r_nm[non_malignant_mouse_model == model_name,:].sum(axis = 0)
    wknn_scanvi_q2r_per_model_nm.append(np.array(summed_weights).flatten())

# Smooth each per-model vector over the human graph.
wknn_scanvi_q2r_per_model_sm_nm = []
for model_weights in wknn_scanvi_q2r_per_model_nm:
    wknn_scanvi_q2r_per_model_sm_nm.append(
        random_walk_with_restart(init = model_weights, transition_prob = trans_prob_ref_nm, alpha = 0.1)
    )

df_wknn_scanvi_q2r_nm = pd.DataFrame(
    np.concatenate(wknn_scanvi_q2r_per_model_sm_nm, axis=1),
    columns=mouse_models_nm,
    index=human[~human.obs.Level_3.str.contains('Malignant')].obs_names,
)

# Per column: log1p, clip to the [1st, 99th] percentile, then min-max scale.
log_scores_nm = df_wknn_scanvi_q2r_nm.apply(lambda col: np.log1p(col), axis=0)
clipped_scores_nm = log_scores_nm.apply(lambda col: np.clip(col, np.percentile(col,1), np.percentile(col,99)))
df_wknn_scanvi_q2r_norm_nm = clipped_scores_nm.apply(lambda col: (col-np.min(col))/(np.max(col)-np.min(col)))



In [None]:
# Non-malignant endogenous score on the human UMAP (figure is not saved).
# `i` is not used by this cell.
i = 0
# Overwrites the 'cov_score' column written in the malignant section.
human.obs['cov_score'] = df_wknn_scanvi_q2r_norm_nm.loc[:,'endogenous']

sc.pl.embedding(human, basis='umap', color = ['cov_score'], color_map = 'Blues', title=["endogenous"],
                vmin=0, vmax=1, show=False, frameon=False)

In [None]:
# Non-malignant orthotopic score on the human UMAP (figure is not saved).
# `i` is not used by this cell.
i = 1
# Overwrites the 'cov_score' column again.
human.obs['cov_score'] = df_wknn_scanvi_q2r_norm_nm.loc[:,'orthotopic']

sc.pl.embedding(human, basis='umap', color = ['cov_score'], color_map = 'Blues', title=["orthotopic"],
                vmin=0, vmax=1, show=False, frameon=False)

In [None]:
# Row-wise maximum over the per-model non-malignant scores.
# NOTE(review): this reuses the column name written in the malignant section (no '_nm'
# suffix, unlike the cells below), so the malignant maxima are replaced and malignant
# cells become NaN. Renaming it would shift the positional `.iloc[:, -10:]` export
# further down, so confirm the intended column set before changing.
human.obs['log_num_wknn_scanvi_q2r_ds_max'] = df_wknn_scanvi_q2r_norm_nm.max(1)


In [None]:
# Store the normalised per-model non-malignant scores on the human object (aligned by obs_names).
human.obs['log_num_wknn_scanvi_q2r_ds_endogenous_nm'] = df_wknn_scanvi_q2r_norm_nm.loc[:, 'endogenous']
human.obs['log_num_wknn_scanvi_q2r_ds_orthotopic_nm'] = df_wknn_scanvi_q2r_norm_nm.loc[:, 'orthotopic']


In [None]:
# Binary "observed" flag: normalised score above 0.05.
# Cells with a missing (NaN) score compare as False.
human.obs['observed_endogenous_nm'] = (human.obs['log_num_wknn_scanvi_q2r_ds_endogenous_nm'] > 0.05).astype('category')
human.obs['observed_orthotopic_nm'] = (human.obs['log_num_wknn_scanvi_q2r_ds_orthotopic_nm'] > 0.05).astype('category')


In [None]:
# Non-malignant orthotopic score on the human UMAP, saved with the '_tme' suffix.
sc.pl.umap(human, color = ['log_num_wknn_scanvi_q2r_ds_orthotopic_nm'], color_map = 'viridis', add_outline=True, vmin=0, title='Coverage (orthotopic)', frameon=False, size=0.2, show=False)
plt.savefig("/mnt/kkf2/Cell/AG-Saur/KKF2/Daniele/pdac_atlas_figures/figure6/umap_pres_score_orthotopic_tme.png", dpi = 300)


In [None]:
# Non-malignant endogenous score on the human UMAP, saved with the '_tme' suffix.
sc.pl.umap(human, color = ['log_num_wknn_scanvi_q2r_ds_endogenous_nm'], color_map = 'viridis', add_outline=True, vmin=0, title='Coverage (endogenous)', frameon=False, size=0.2, show=False)
plt.savefig("/mnt/kkf2/Cell/AG-Saur/KKF2/Daniele/pdac_atlas_figures/figure6/umap_pres_score_endogenous_tme.png", dpi = 300)


In [None]:
# Attach the Level_4 label of each cell (aligned by obs_names) for the boxplot below.
# This mutates the score table in place: it now holds a non-numeric column.
df_wknn_scanvi_q2r_norm_nm['celltype'] = human.obs['Level_4'].astype(str)

In [None]:
# Long format: one row per (cell, condition) holding the normalised score.
# Rebinds `df_long` from the malignant section.
df_long = (
    df_wknn_scanvi_q2r_norm_nm
    .reset_index()
    .melt(
        id_vars=["index", "celltype"],
        value_vars=["endogenous", "orthotopic"],
        var_name="condition",
        value_name="score",
    )
)

# One fixed colour per mouse model.
palette = {"endogenous": "#ff7f0e", "orthotopic": "#2ca02c"}

plt.figure(figsize=(24, 8))
sns.boxplot(data=df_long, x='celltype', y='score', hue='condition',
            dodge=True, showfliers=False, palette=palette)

plt.title("Distribution of Scores by Cell Type and Condition")
plt.xlabel("Cell Type")
plt.ylabel("Score")
plt.xticks(rotation=90)

# Place the legend outside the axes, anchored at the top right corner.
plt.legend(title="Condition", bbox_to_anchor=(1.05, 1), loc='upper left', frameon=True)
plt.tight_layout()
plt.savefig("/mnt/kkf2/Cell/AG-Saur/KKF2/Daniele/pdac_atlas_figures/figure6/boxplot_pres_score_orthotopic_tme.png", dpi = 300)

plt.show()

# Joint embedding

In [None]:
# Table with one row per cell barcode (column 'Unnamed: 0') and its 'Patient_Cluster'.
leiden_patient = pd.read_csv('/mnt/storage/Daniele/atlases/patient_leiden_barcode.csv')
# barcode -> patient cluster
leiden_patient_cell_mapping = dict(zip(leiden_patient['Unnamed: 0'], leiden_patient['Patient_Cluster']))

In [None]:
# Copy the cell barcodes into a regular column so they can be mapped below.
human.obs['cell_id'] = human.obs_names

In [None]:
# Look up each cell's patient cluster; barcodes absent from the mapping become NaN.
human.obs['Patient_Cluster'] = human.obs['cell_id'].map(leiden_patient_cell_mapping)

In [None]:
# Sample_ID -> patient cluster. When a sample's cells carry different values,
# the value of the last cell of that sample wins.
leiden_patient_mapping = dict(zip(human.obs.Sample_ID, human.obs.Patient_Cluster))

In [None]:
# Stack mouse and human cells into one object; the new 'Species' column records the origin.
joint = ad.concat([mouse, human], label = 'Species', keys=['Mouse', 'Human'])

In [None]:
# Carry the mouse 'Model' label over (aligned by obs_names); human cells get NaN.
joint.obs["Model"] = mouse.obs.Model

In [None]:
# Cell-type composition per sample: count cells per (sample, Level_4) and
# express each count as a percentage of the sample's total.
cells_proportion = joint.obs.loc[:, ['Level_4', 'Sample_ID']]
counts = (
    cells_proportion
    .groupby(['Sample_ID', 'Level_4'])
    .size()
    .reset_index(name='count')
)
total_per_sample = counts.groupby('Sample_ID')['count'].transform('sum')
counts['proportion'] = counts['count'] / total_per_sample * 100


In [None]:
# Samples x cell types table of percentages; missing combinations are filled with 0.
matrix = counts.pivot(index='Sample_ID', columns='Level_4', values='proportion').fillna(0)

# Wrap it in an AnnData: one observation per sample, one variable per cell type.
sample_obs = pd.DataFrame(index=matrix.index)
celltype_var = pd.DataFrame(index=matrix.columns)
samples_adata = ad.AnnData(X=matrix.values, obs=sample_obs, var=celltype_var)

In [None]:
# Sample_ID -> species label taken from the joint object.
sample_to_species = dict(zip(joint.obs.Sample_ID, joint.obs['Species']))

In [None]:
# Replace each sample name by its species; `.values` drops the index so the
# result is assigned positionally.
samples_adata.obs['Species'] = pd.Series(samples_adata.obs_names).replace(sample_to_species).values

In [None]:
from scipy.spatial.distance import cdist
import pandas as pd

# Sample names and their species, taken from the sample-level object.
sample_names = samples_adata.obs_names
sample_to_species = dict(zip(samples_adata.obs_names, samples_adata.obs['Species']))

# Names of the mouse and of the human samples.
is_mouse_sample = samples_adata.obs['Species'] == 'Mouse'
is_human_sample = samples_adata.obs['Species'] == 'Human'
mouse_indices = samples_adata[is_mouse_sample].obs_names
human_indices = samples_adata[is_human_sample].obs_names

# Composition vectors of each group.
mouse_matrix = samples_adata[mouse_indices].X
human_matrix = samples_adata[human_indices].X

# Euclidean distance between every human sample (rows) and every mouse sample (columns).
pairwise = cdist(human_matrix, mouse_matrix, metric='euclidean')  # or 'cosine', 'correlation', etc.
distance_matrix = pd.DataFrame(pairwise, index=human_indices, columns=mouse_indices)


In [None]:
top_k = 3  # number of closest mouse samples kept per human sample

# Clear the axis names so reset_index() below produces generic column names.
distance_matrix.index.name = None
distance_matrix.columns.name = None

# For every human sample (row) keep its top_k smallest distances.
top_k_matches = distance_matrix.apply(lambda distances: distances.nsmallest(top_k), axis=1)

# Long format: one row per (human sample, matched mouse sample).
top_k_matches_df = (
    top_k_matches
    .stack()
    .reset_index()
    .set_axis(['Human_Sample', 'Mouse_Sample', 'Distance'], axis=1)
)


In [None]:
# Translate each matched mouse sample into its Model label
# (sample names missing from the dictionary are left unchanged by `replace`).
top_k_matches_df['Model'] = top_k_matches_df['Mouse_Sample'].replace({k:v for k,v in zip(mouse.obs.Sample_ID, mouse.obs.Model)})

In [None]:
# Patient cluster of each human sample; samples absent from the mapping become NaN.
top_k_matches_df['Patient_Cluster'] = top_k_matches_df['Human_Sample'].map(leiden_patient_mapping)

In [None]:
top_k_matches_df['Patient_Cluster']

In [None]:
# Store the cluster id as an integer-valued category.
# NOTE(review): the int64 cast raises if any human sample has no cluster (NaN) — confirm none are missing.
top_k_matches_df['Patient_Cluster'] = top_k_matches_df['Patient_Cluster'].astype(np.int64).astype('category')

In [None]:
# Matched mouse sample -> Model label, restricted to samples that appear in the matches.
mouse_category_map = dict(zip(top_k_matches_df['Mouse_Sample'], top_k_matches_df['Model']))

In [None]:
samples_adata.var

In [None]:
# Inspect the mesenchymal-malignant percentage of one sample.
# Fix: `sample` was never defined at this point, and the mask compared the whole
# obs table with it instead of the sample names.
sample = samples_adata.obs_names[0]
samples_adata[samples_adata.obs_names == sample, 'Malignant Cell - Mesenchymal'].X

In [None]:
top_k_matches_df

In [None]:
# Sample name -> percentage of 'Malignant Cell - Mesenchymal' cells in that sample.
mesenchymal_by_sample = {
    sample: samples_adata[samples_adata.obs_names==sample,'Malignant Cell - Mesenchymal'].X[0][0]
    for sample in samples_adata.obs_names
}
# Replace each human sample name by that percentage.
top_k_matches_df['Mesenchymal_Proportion'] = top_k_matches_df['Human_Sample'].replace(mesenchymal_by_sample)

In [None]:
# Missing proportions count as 0. Assign the result back explicitly: an
# in-place fillna on a selected column acts on an intermediate object and is
# not guaranteed to update the DataFrame (it does not under pandas Copy-on-Write).
top_k_matches_df['Mesenchymal_Proportion'] = top_k_matches_df['Mesenchymal_Proportion'].fillna(0)

In [None]:
from collections import Counter

# Step 1: for each human sample, find the most frequent Model among its matched mouse samples.
# The loop variable is deliberately not called `human`: that name holds the human
# AnnData object that later cells still use, and the loop used to overwrite it.
human_category_counts = {}

for human_sample in top_k_matches_df['Human_Sample'].unique():
    matches = top_k_matches_df[top_k_matches_df['Human_Sample'] == human_sample]
    categories = matches['Mouse_Sample'].map(lambda m: mouse_category_map.get(m, 'unknown'))
    count = Counter(categories)
    primary_category = count.most_common(1)[0][0]
    human_category_counts[human_sample] = primary_category

# Step 2: Sort humans: orthotopic -> endogenous -> unknown
humans_orthotopic = sorted([h for h, c in human_category_counts.items() if c == 'orthotopic'])
humans_endogenous = sorted([h for h, c in human_category_counts.items() if c == 'endogenous'])
humans_unknown = sorted([h for h, c in human_category_counts.items() if c not in ['orthotopic', 'endogenous']])
humans_sorted = humans_orthotopic + humans_endogenous + humans_unknown


In [None]:
# Human sample -> mesenchymal proportion (first row of each sample is kept).
human_mes_map = top_k_matches_df.drop_duplicates('Human_Sample').set_index('Human_Sample')['Mesenchymal_Proportion'].to_dict()


In [None]:
import matplotlib.pyplot as plt
import networkx as nx
import matplotlib.cm as cm
import matplotlib.colors as mcolors

# Humans ordered by mesenchymal proportion; this order fixes their vertical position.
humans_sorted = sorted(human_mes_map, key=human_mes_map.get, reverse=False)  # low → high
human_nodes = set(humans_sorted)  # constant-time membership test when colouring nodes

# Fix: `mice` and `color_map` were used below but never defined in this notebook.
# Mouse nodes are the matched mouse samples; model colours reuse the palette of
# the boxplots above.
mice = sorted(top_k_matches_df['Mouse_Sample'].unique())
color_map = {"endogenous": "#ff7f0e", "orthotopic": "#2ca02c"}

# Bipartite graph: human samples on one side, matched mouse samples on the other,
# one edge per top-k match carrying the distance as weight.
G = nx.Graph()
G.add_nodes_from(humans_sorted, bipartite=0)
G.add_nodes_from(mice, bipartite=1)

for _, row in top_k_matches_df.iterrows():
    G.add_edge(row['Human_Sample'], row['Mouse_Sample'], weight=row['Distance'])

# Layout: humans left, mice right
pos = {}
for i, node in enumerate(humans_sorted):
    pos[node] = (0, i)
for i, node in enumerate(mice):
    pos[node] = (1, i)

# Normalize the mesenchymal values between 0 and 1
norm = mcolors.Normalize(vmin=min(human_mes_map.values()), vmax=max(human_mes_map.values()))
cmap = cm.viridis  # You can also use cm.plasma, cm.coolwarm, etc.

# Node colours: humans by mesenchymal proportion, mice by model (gray when unknown).
node_colors = []
for node in G.nodes():
    if node in human_nodes:
        mes_val = human_mes_map.get(node, 0)
        node_colors.append(cmap(norm(mes_val)))
    else:
        category = mouse_category_map.get(node, 'unknown')
        node_colors.append(color_map.get(category, 'gray'))


# Edge widths: inversely proportional to the distance (closer match -> thicker edge).
edges = G.edges(data=True)
weights = [1 / (d['weight'] + 1e-2) * 10 for _, _, d in edges]

# Draw
# NOTE(review): the title mentions mouse category connections, but the humans
# are ordered by mesenchymal proportion — confirm which one is intended.
plt.figure(figsize=(12, 10))
nx.draw(
    G, pos, with_labels=False, node_size=10, width=weights,
    edge_color='gray', node_color=node_colors
)
plt.title("Human-Mouse Matching (Sorted by Mouse Category Connections)")
plt.axis("off")
plt.show()


In [None]:
# Per-sample annotations looked up from the cell-level joint object:
# each sample name is mapped to the value carried by its cells.
sample_ids = samples_adata.obs_names.to_series()
for column in ("Model", "Species"):
    sample_lookup = dict(zip(joint.obs.Sample_ID, joint.obs[column]))
    samples_adata.obs[column] = sample_ids.map(sample_lookup)

In [None]:
samples_adata.obs.Model.value_counts()

In [None]:
# Neighbour graph between samples, computed directly on the composition matrix in .X.
sc.pp.neighbors(samples_adata, use_rep = "X", n_neighbors=30)

In [None]:
# 2-D UMAP of the samples based on the neighbour graph above.
sc.tl.umap(samples_adata, min_dist = .05)

In [None]:
# Fix: samples_adata.obs is built without a 'Patient_Cluster' column, so the
# second plot could not find it. Derive it from the sample -> cluster mapping
# (samples without a cluster, e.g. mouse samples, stay NaN).
if "Patient_Cluster" not in samples_adata.obs:
    samples_adata.obs["Patient_Cluster"] = (
        samples_adata.obs_names.to_series().map(leiden_patient_mapping).astype("category")
    )

sc.pl.umap(samples_adata, color = "Species")
sc.pl.umap(samples_adata, color = "Patient_Cluster", cmap='tab20')
sc.pl.umap(samples_adata, color = "Model")

In [None]:
# Neighbour-graph metadata stored by sc.pp.neighbors.
# `neighbors` is rebound to the connectivities matrix in a later cell.
neighbors = samples_adata.uns['neighbors']


In [None]:
samples_adata

In [None]:
samples_adata

In [None]:
import scipy.sparse as sp

# Readable model labels; samples without a Model (string 'nan') are the human ones.
replace_map = {"nan": 'Human', "endogenous": "Mouse - Endogenous", "orthotopic": "Mouse - Orthotopic"}
samples_adata.obs['Model_complete'] = samples_adata.obs['Model'].astype(str).replace(replace_map).astype("category")

# Get original sparse connectivities matrix
neighbors = samples_adata.obsp['connectivities']
species = np.asarray(samples_adata.obs['Species'])

# Read row, column and weight from one COO view so the three arrays are
# guaranteed to share the same order and length. (Masking `neighbors.data`
# with indices from `nonzero()` breaks when explicit zeros are stored.)
neighbors_coo = neighbors.tocoo()
row, col = neighbors_coo.row, neighbors_coo.col

# Create mask: True only for edges between different species
mask = species[row] != species[col]

# Apply mask to keep only inter-species edges
data = neighbors_coo.data[mask]
rows = row[mask]
cols = col[mask]

# Build new sparse matrix with filtered edges
connectivities_filtered = sp.csr_matrix((data, (rows, cols)), shape=neighbors.shape)

# Symmetrize the matrix (UMAP expects symmetric connectivities)
connectivities_filtered = connectivities_filtered.maximum(connectivities_filtered.T)

# Assign filtered connectivities into a copy of the original AnnData
adata_edges_filtered = samples_adata.copy()
adata_edges_filtered.obsp['connectivities'] = connectivities_filtered
adata_edges_filtered.uns['neighbors']['connectivities'] = connectivities_filtered  # keep consistent

# Plot UMAP with filtered edges
sc.pl.umap(
    adata_edges_filtered,
    color=['Species', 'Model_complete'],
    edges=True,
    outline_width = (.05,.05),
    outline_color=('white', "black",),
    na_color = "white",
    add_outline = True,
    edges_width=0.25,
    alpha=.9,
    edges_color='black',
    frameon=False,
    show=False
)
plt.savefig("/mnt/kkf2/Cell/AG-Saur/KKF2/Daniele/pdac_atlas_figures/figure6/umap_sample_level_model.png", dpi = 300)



In [None]:
# Density of samples on the UMAP, estimated separately for each Model value.
sc.tl.embedding_density(samples_adata, basis = "umap", groupby = "Model")

In [None]:
samples_adata

In [None]:
# Plot the per-Model densities computed above and save the figure.
# NOTE(review): the file name says "pres_score_orthotopic_tme" although this is a
# sample-level density plot — confirm the intended name.
sc.pl.embedding_density(
    samples_adata, 
    basis = "umap", 
    key = "umap_density_Model",
    fg_dotsize=800, 
    bg_dotsize=350,
    frameon = False,
    show=False
)
plt.savefig("/mnt/kkf2/Cell/AG-Saur/KKF2/Daniele/pdac_atlas_figures/figure6/umap_pres_score_orthotopic_tme_densities.png", dpi = 300)


In [None]:
# One UMAP per cell type, coloured by that cell type's percentage in each sample
# (colour scale capped at the 95th percentile).
for var in samples_adata.var_names:
    # show=False keeps the figure open so savefig writes the plot; with the
    # default, scanpy shows the figure first and savefig would then write a
    # fresh, empty one.
    sc.pl.umap(samples_adata, color =  var, vmax = 'p95', frameon = False, show = False)
    plt.savefig(f"/mnt/kkf2/Cell/AG-Saur/KKF2/Daniele/pdac_atlas_figures/figure6/umap_sample_{var}_abundance.png", dpi = 300)
    # Display the figure (as before) once it has been written.
    plt.show()


In [None]:
# Neighbour graph over all mouse and human cells, computed on .X with an
# Annoy-based transformer constructed with the argument 500.
from sklearn_ann.kneighbors.annoy import AnnoyTransformer  # noqa: F401
sc.pp.neighbors(joint, use_rep = 'X', transformer = AnnoyTransformer(500))

In [None]:
sc.tl.umap(joint)

In [None]:
sc.pl.umap(joint, color = ['Species','Level_3'])

## Non-overlapping cell types

In [None]:
# Level_4 labels present in only one of the two species.
human_level4_labels = set(human.obs.Level_4.unique())
mouse_level4_labels = set(mouse.obs.Level_4.unique())
unique_cell_human = human_level4_labels - mouse_level4_labels
unique_cell_mouse = mouse_level4_labels - human_level4_labels


In [None]:
# Display the labels found only in human (the trailing '.' was a syntax error).
unique_cell_human

In [None]:
unique_cell_mouse

# misc

In [None]:
# Human UMAP coloured by Level_3, with the labels drawn on the plot.
sc.pl.umap(human, color = 'Level_3', legend_loc = 'on data' )#groups = [cell for cell in human.obs.Level_4 if 'Malignant' in cell ])

In [None]:
# Malignant Level_4 subtype composition per mouse Model.
# Note: this rebinds `counts`, which the joint-embedding section above also uses.
malignant_mouse_obs = mouse[mouse.obs.Level_4.str.contains('Malignant')].obs
counts = (
    malignant_mouse_obs
    .groupby(['Model', 'Level_4'])
    .size()
    .unstack(fill_value=0)
)
# Divide every row by its total so each Model sums to 1.
row_totals = counts.sum(axis=1)
proportions = counts.div(row_totals, axis=0)

# Stacked bars: one bar per Model, one segment per subtype.
proportions.plot(kind='bar', stacked=True, figsize=(10, 6))

plt.title('Normalized Malignant Cell Subtypes per Model')
plt.ylabel('Proportion of Malignant Cells')
plt.legend(title='Level_4', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Load the figure configuration file (safe_load builds plain Python objects only).
# `config` is not referenced by any other cell visible in this notebook.
import yaml

with open('../../figures/config.yml', 'r') as f:
    config = yaml.safe_load(f)


In [None]:
# Human UMAP highlighting only the malignant Level_4 subtypes; all other cells
# are drawn in white and left out of the legend.
malignant_cells = [cell for cell in human.obs['Level_4'].unique() if 'Malignant' in cell]
# Note: `palette` is a dict in the boxplot cells above; here it becomes a list of colours.
palette = sns.color_palette('tab20', n_colors=len(malignant_cells))

sc.pl.umap(
    human,
    color='Level_4',
    groups=malignant_cells,
    palette=palette,
    frameon=False,
    s=1,
    add_outline = True,
    #outline_color=('black', 'black'),
    na_color = 'white',
    na_in_legend=False
)


In [None]:
# Export the last ten columns of human.obs.
# NOTE(review): the selection is positional, so which columns are written depends on
# the order in which earlier cells added columns — selecting by name would be safer.
human.obs.iloc[:, -10:].to_csv('./max_presence_scores.csv', )

In [None]:
# Read the exported table back; the former index arrives as the column 'Unnamed: 0'.
df=pd.read_csv('./max_presence_scores.csv', )

In [None]:
# Use the saved cell names as the index again (the column itself is kept).
df.index = df['Unnamed: 0']

In [None]:
# Copy every column of the reloaded table into human.obs, aligned on the cell names.
# This includes the 'Unnamed: 0' column.
for col in df.columns:
    human.obs[col] = df[col]

In [None]:
# Element-wise comparison for the last column of the loop above
# (`col` still holds the final loop value).
human.obs[col] == df[col]

In [None]:
# List every mouse annotation column together with its distinct values.
for obs in mouse.obs.columns:
    print(obs)
    print(mouse.obs[obs].unique())

In [None]:
mouse.obs.Genotype.unique().tolist()

In [None]:
human, emb

In [None]:
# Build "<Dataset>_<barcode>" for the non-core cells only.
# Fix: the value used to be assigned to `.obs` of a temporary subset of `human`,
# so it was discarded; write into human.obs directly instead.
non_core_cells = human.obs.Is_Core != 'Core'
human.obs.loc[non_core_cells, 'Dataset_Barcode'] = (human.obs.Dataset.astype(str) + '_' + human.obs_names)[non_core_cells]

In [None]:
# NOTE(review): `emb` is not created anywhere in the visible notebook, and this line
# calls `.split` on an index object, which looks like it cannot run as written. The
# cells further down rename the non-core cells again — consider deleting this attempt.
emb[emb.obs.Is_Core !='Core'].obs_names = emb[emb.obs.Is_Core !='Core'].obs_names.split('_')[1:].join('')

In [None]:
# For the non-core cells, keep the name from 16 characters before the first '-1' onwards.
# presumably the 16 characters are the cell barcode — TODO confirm.
emb_extended = emb[emb.obs.Is_Core !='Core']
emb_extended_obs_names = [x[x.find('-1')-16:] for x in emb_extended.obs_names]
# NOTE(review): this writes into the array behind the index; a later cell strips the
# dataset prefix again with a different rule — make sure only one of the two is run.
emb.obs_names.values[emb.obs.Is_Core != 'Core'] = emb_extended_obs_names

In [None]:
from rapidfuzz import process

In [None]:
human.n_obs - len(emb.obs_names.intersection(human.obs_names)) 

In [None]:
emb, human

In [None]:
# Build "<Dataset>_<barcode>" for the non-core cells only (same step as an earlier cell).
# Fix: the value used to be assigned to `.obs` of a temporary subset of `human`,
# so it was discarded; write into human.obs directly instead.
non_core_cells = human.obs.Is_Core != 'Core'
human.obs.loc[non_core_cells, 'Dataset_Barcode'] = (human.obs.Dataset.astype(str) + '_' + human.obs_names)[non_core_cells]

In [None]:
import re
def clean_emb_name(name: str) -> str:
    """Return the first ``<ACGT letters>-<digit>`` fragment found in ``name``.

    When ``name`` contains no such fragment it is returned unchanged.
    """
    found = re.search(r'([ACGT]+-\d)', name)
    if found is None:
        return name
    return found.group(1)

In [None]:
from rapidfuzz import process
from rapidfuzz import fuzz
import pandas as pd
import time

# Greedy one-to-one fuzzy matching of embedding cell names to human cell names.
# NOTE(review): `non_intersecting_human_names` and `non_intersecting_emb_names` are
# not created anywhere in the visible notebook — they must exist before this runs.
matched_names = {}
c = 0

# Convert to a set for fast removal
non_intersecting_human_names = set(non_intersecting_human_names)

# Fix: start the clock once, before the loop, so the progress message reports the
# time for all processed names (it used to be reset on every iteration).
start = time.time()

# Only the first 20 names are processed.
for name in non_intersecting_emb_names[:20]:
    if not non_intersecting_human_names:
        print("No more unmatched human names left.")
        break
    
    name_cleaned = clean_emb_name(name)
    match, score, _ = process.extractOne(name_cleaned, non_intersecting_human_names, scorer=fuzz.ratio)
    
    matched_names[name] = (match, score)

    # Remove matched human name so it's not used again
    non_intersecting_human_names.remove(match)

    c += 1
    if c % 10 == 0:
        print(f"Processed {c} names...in {time.time() - start:.2f} seconds.")
        print(len(non_intersecting_human_names), "human names left to match.")

# Convert to DataFrame
matched_df = pd.DataFrame.from_dict(matched_names, orient='index', columns=['BestMatch', 'Score'])


In [None]:
matched_df

In [None]:
import re
def clean_emb_name(name: str) -> str:
    """Return the first ``<ACGT letters>-<digit>`` fragment found in ``name``.

    When ``name`` contains no such fragment it is returned unchanged.
    This repeats a definition from an earlier cell of this notebook.
    """
    found = re.search(r'([ACGT]+-\d)', name)
    return name if found is None else found.group(1)

In [None]:
clean_emb_name('Zhang_GSE197177_ACTATTCGTTAATCGC-1-1')


In [None]:
# Keep a copy of the current cell names in a column before they are rewritten below.
emb.obs['dt_bc'] = emb.obs_names

In [None]:
# Independent copy of the non-core cells.
emb_ext = emb[emb.obs.Is_Core !='Core'].copy()

In [None]:
def clean_obs_names(obs_names, datasets):
    """
    Strip the dataset prefix (everything up to and including the first
    underscore) from each name; names without an underscore are kept as is.
    Example:
    GSE211644_AAACCTGAGCTCCTTC-1_135-1 → AAACCTGAGCTCCTTC-1_135-1

    ``datasets`` is accepted but not used.
    """
    return [full_name.split('_', 1)[-1] for full_name in obs_names]

In [None]:
# Prefix-free names of the non-core cells (the second argument is ignored by the function).
obs_names_cleaned = clean_obs_names(emb_ext.obs_names, emb_ext.obs.Dataset)

In [None]:
import numpy as np

# Non-core cells are the ones whose name gets its dataset prefix removed.
mask = emb.obs["Is_Core"] != "Core"
target_indices = np.where(mask)[0]
positions_to_clean = set(target_indices.tolist())

# Rebuild the full list of names: strip everything up to the first underscore
# for the selected positions, keep all other names untouched.
obs_names = [
    current.split('_', 1)[-1] if position in positions_to_clean else current
    for position, current in enumerate(emb.obs_names.tolist())
]

# Assign updated names back
emb.obs_names = obs_names

In [None]:
len(human.obs_names.intersection(emb.obs_names))

In [None]:
set(human.obs_names) - set(emb.obs_names)

In [None]:
human.var.Manual_Genes.value_counts()

In [None]:
# Print the mouse obsm keys whose name contains 'scanvi' (case-insensitive).
for i in mouse.obsm.keys():
    if 'scanvi' in i.lower():
        print(i)

In [None]:
human.obs['Dataset']

In [None]:
human.obsm.keys()

In [None]:
# Check whether two obsm entries hold the same embedding.
import numpy as np

a = mouse.obsm["scANVI_emb_final"]
# The second key differs from the first only by a trailing space.
b = mouse.obsm["scANVI_emb_final "]

same_shape = a.shape == b.shape
# NOTE(review): np.allclose is evaluated even when the shapes differ — confirm that cannot happen here.
same_values = np.allclose(a, b)

print(f"Same shape? {same_shape}")
print(f"Same values? {same_values}")

In [None]:
# Genotype strings without a ';' separator are replaced by 'Unknown';
# all others are kept unchanged.
genotype = [gt if ';' in gt else 'Unknown' for gt in mouse.obs.Genotype]
mouse.obs['Genotype'] = genotype


In [None]:
mouse.obs['Genotype'].unique()

In [None]:
# Malignant Level_4 subtype composition per Treatment group in the human atlas.
# Note: this rebinds `counts` and `proportions` used by earlier cells.
counts = human[human.obs.Level_4.str.contains('Malignant')].obs.groupby(['Treatment', 'Level_4']).size().unstack(fill_value=0)
# Divide every row by its total so each Treatment group sums to 1.
proportions = counts.div(counts.sum(axis=1), axis=0)

# Plot
proportions.plot(kind='bar', stacked=True, figsize=(10, 6))

plt.ylabel('Proportion of Malignant Cells')
# Fix: the bars are grouped by Treatment; the title said "per Model"
# (copied from the mouse plot).
plt.title('Normalized Malignant Cell Subtypes per Treatment')
plt.legend(title='Level_4', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()