In [None]:
import matplotlib
import matplotlib.pyplot as plt 
import numpy as np
import pandas as pd
import os
import umap
import datashader as ds
import colorcet as cc
import igraph
import tqdm
from scipy import sparse
from scipy import stats
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import LatentDirichletAllocation
from statsmodels.stats.multitest import multipletests
from sklearn.cluster import KMeans
from scipy.spatial import Delaunay
import json
import itertools
import pynndescent
import time
from sklearn.svm import SVC
from sklearn import metrics

from matplotlib.collections import PolyCollection
from matplotlib.colors import ListedColormap

from dredFISH.Analysis import TissueGraph
from dredFISH.Visualization import Viz
from dredFISH.Utils.__init__plots import * 
from dredFISH.Utils import powerplots
from dredFISH.Utils import miscu
from dredFISH.Utils import tmgu
from dredFISH.Utils import basicu

import importlib
# Re-execute the already-imported project modules so that edits made to their
# source files take effect without restarting the kernel.
importlib.reload(Viz)
importlib.reload(TissueGraph)
importlib.reload(powerplots)

#### Load data

In [None]:
# Presumably the output directory for figures (TODO confirm); none of the cells
# shown in this notebook read `respath`.
respath = '/bigstore/GeneralStorage/fangming/projects/dredfish/figures/'

In [None]:
# Directory of the section being analysed. The analysis CSV, the TMG and the
# CSVs written later are all resolved against this path.
basepth = '/bigstore/GeneralStorage/Data/dredFISH/DPNMF-FR_R1_4A_UC_R2_5C_2022Nov27_Dec12_strip_tol/DPNMF-FR_R1_4A_UC_R2_5C_2022Nov27_Section5_total'
# Shell escapes: list the directory and show the first lines of TMG.json.
!ls -alhtr $basepth
!head $basepth"/TMG.json"

In [None]:
# Analysis table stored next to the TMG; the first CSV column becomes the index.
df = pd.read_csv(os.path.join(basepth, "default_analysis.csv"), index_col=0)
df

In [None]:
# Open the tissue multigraph for this section. redo=False loads the existing
# results instead of recomputing them.
TMG = TissueGraph.TissueMultiGraph(
    basepath=basepth,
    redo=False,
    quick_load_cell_obs=True,
)

In [None]:
# Pull the arrays used below out of the first TMG layer.
layer = TMG.Layers[0]

# spatial coordinates: column 0 -> x, column 1 -> y
XY = layer.XY
x = XY[:, 0]
y = XY[:, 1]

# index values of the layer's observation table
cells = layer.adata.obs.index.values

N = layer.N

# measured basis: the feature matrix the kNN graphs and classifiers are built on
ftrs_mat = layer.feature_mat

# umap_mat = umap.UMAP(n_neighbors=30, min_dist=0.1).fit_transform(ftrs_mat)




# Lateral symmetry measure

In [None]:
def build_feature_graph_knnlite(ftrs_mat, k=15, metric='cosine'):
    """Build an undirected nearest-neighbour graph over the rows of ``ftrs_mat``.

    An approximate kNN search (``pynndescent.NNDescent``) is run with
    ``n_neighbors=k``. The first column of the resulting index table is
    dropped (presumably each row's own index -- TODO confirm), so every row
    contributes ``k - 1`` unit-weight edges.

    Parameters
    ----------
    ftrs_mat : array-like
        Feature matrix; ``len(ftrs_mat)`` is taken as the number of nodes.
    k : int
        Number of neighbours requested from the kNN search.
    metric : str
        Distance metric handed to ``NNDescent``.

    Returns
    -------
    The object returned by ``tmgu.adjacency_to_igraph`` for the adjacency
    matrix, called with ``directed=False, simplify=True``.
    """
    n_rows = len(ftrs_mat)

    # approximate nearest-neighbour search
    nn_index = pynndescent.NNDescent(
        ftrs_mat,
        n_neighbors=k,
        metric=metric,
        diversify_prob=1,
        pruning_degree_multiplier=1.5,
    )
    nbr_idx, _dists = nn_index.neighbor_graph

    # edge list: each row index repeated k-1 times, paired with its neighbours
    # (column 0 of the index table is skipped)
    src = np.repeat(np.arange(n_rows), k - 1)
    dst = nbr_idx[:, 1:].reshape(-1,)
    weights = np.repeat(1, len(src))
    adjacency = sparse.coo_matrix((weights, (src, dst)), shape=(n_rows, n_rows))

    return tmgu.adjacency_to_igraph(adjacency, directed=False, simplify=True)

In [None]:
# # from meta
# f = '/bigstore/GeneralStorage/fangming/projects/dredfish/data_dump/analysis_meta_Mar31.json'
# with open(f, 'r') as fh:
#     meta = json.load(fh)

In [None]:
# Split the table by hemisphere label. The copies are independent frames, so
# columns added to them later do not appear on `df`.
df_h1 = df.loc[df['hemi'] == 0].copy()
df_h2 = df.loc[df['hemi'] == 1].copy()

# Index values of each half. They are later used to row-index `ftrs_mat`,
# which assumes they are positional integers -- TODO confirm.
cells_h1 = df_h1.index.values
cells_h2 = df_h2.index.values
print(df_h1.shape, df_h2.shape)

In [None]:
%%time
# Feature rows belonging to each hemisphere.
ftrs_mat_h1 = ftrs_mat[cells_h1]
ftrs_mat_h2 = ftrs_mat[cells_h2]

# One kNN graph per hemisphere, built with identical settings.
knn_kwargs = dict(k=15, metric='cosine')
G_h1 = build_feature_graph_knnlite(ftrs_mat_h1, **knn_kwargs)
G_h2 = build_feature_graph_knnlite(ftrs_mat_h2, **knn_kwargs)

In [None]:
# clustering half-and-half
# Cluster each hemisphere's kNN graph with miscu.leiden over a sweep of
# resolutions. Labels are stored on df_h1 / df_h2 as 'type_r<resolution>'.
resolutions = [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1,2,5,10]
clst_mat_h1 = []
clst_mat_h2 = []
# Iterate the list itself: tqdm can then read its length and show a total.
# Wrapping enumerate() hid the length, and the index was never used.
for r in tqdm.tqdm(resolutions):
    types_h1 = miscu.leiden(G_h1, cells_h1, resolution=r)
    types_h2 = miscu.leiden(G_h2, cells_h2, resolution=r)

    # string labels of the form 't<id>'
    df_h1[f'type_r{r}'] = np.char.add('t', np.array(types_h1).astype(str))
    df_h2[f'type_r{r}'] = np.char.add('t', np.array(types_h2).astype(str))
    clst_mat_h1.append(types_h1)
    clst_mat_h2.append(types_h2)

# stack the raw label vectors: one entry per resolution, in sweep order
clst_mat_h1 = np.array(clst_mat_h1)
clst_mat_h2 = np.array(clst_mat_h2)

In [None]:
# Plot each hemisphere's own clustering at every resolution.
# `output=None` is passed straight through to the plotting helper.
output = None
for i, r in enumerate(resolutions):
    hue = f'type_r{r}'
    for half_df in (df_h1, df_h2):
        powerplots.plot_type_spatial_umap(half_df, hue, output=output)

In [None]:
# # matching the other half (across modality)


# N = len(ftrs_mat)

# # kNN graph
# knn = pynndescent.NNDescent(ftrs_mat_h1,
#                             n_neighbors=15,
#                             # metric='cosine',
#                             metric='euclidean',
#                             diversify_prob=1,
#                             pruning_degree_multiplier=1.5,
#                             )
# idx, _ = knn.neighbor_graph

# # # to adj and to graph
# # i = np.repeat(np.arange(N), k-1)
# # j = idx[:,1:].reshape(-1,)
# # adj_mat = sparse.coo_matrix((np.repeat(1, len(i)), (i,j)), shape=(N,N))
# # G = tmgu.adjacency_to_igraph(adj_mat, directed=False, simplify=True)


In [None]:
def mapping_types(
    ftrs_source,
    types_source,
    ftrs_target,
    ):
    """Transfer type labels from a labelled source set onto a target set.

    An RBF-kernel SVM (``SVC(C=1, kernel='rbf')``) is fitted on
    ``ftrs_source`` / ``types_source`` and used to predict a label for every
    row of ``ftrs_target``. When the source carries a single distinct label no
    classifier is fitted; that label is returned once per target row.

    Uses SVM -- a bit slow. Open question from the original author: could a
    bipartite-graph implementation be faster?

    Parameters
    ----------
    ftrs_source : feature rows of the labelled set
    types_source : one label per row of ``ftrs_source``
    ftrs_target : feature rows to label

    Returns
    -------
    Array with ``len(ftrs_target)`` labels.
    """
    known_types = np.unique(types_source)
    if len(known_types) == 1:
        # nothing to discriminate between: broadcast the only label
        return np.repeat(known_types, len(ftrs_target))

    classifier = SVC(C=1, kernel='rbf')
    classifier.fit(ftrs_source, types_source)
    return classifier.predict(ftrs_target)

In [None]:
%%time


# 20 min long (long for fine resolution)
# For every resolution, label each hemisphere with the other hemisphere's
# clustering; the predictions are stored as 'ptype_r<resolution>'.
for r in tqdm.tqdm(resolutions):
    type_col, ptype_col = f'type_r{r}', f'ptype_r{r}'

    types_h1 = df_h1[type_col].values
    types_h2 = df_h2[type_col].values

    # h1 labels -> h2 cells
    ptypes_h2 = mapping_types(ftrs_mat_h1, types_h1, ftrs_mat_h2)
    df_h2[ptype_col] = ptypes_h2

    # reverse case: h2 labels -> h1 cells
    ptypes_h1 = mapping_types(ftrs_mat_h2, types_h2, ftrs_mat_h1)
    df_h1[ptype_col] = ptypes_h1
    

In [None]:
# Per resolution: h1 with its own clustering, then h2 with the labels
# predicted from h1.
output = None
for i, r in enumerate(resolutions):
    panels = ((df_h1, f'type_r{r}'), (df_h2, f'ptype_r{r}'))
    for frame, hue in panels:
        powerplots.plot_type_spatial_umap(frame, hue, output=output)

In [None]:
%%time
# save the df temporarily so we can develop off of it
# One file per hemisphere. Both writes previously targeted the *_h1.csv path,
# so the h2 table overwrote the h1 table.
output = os.path.join(basepth, 'lateral_symm_analysis_h1.csv')
df_h1.to_csv(output, header=True, index=True)
output = os.path.join(basepth, 'lateral_symm_analysis_h2.csv')
df_h2.to_csv(output, header=True, index=True)

!head $output | cut -d , -f 25-50

In [None]:
# organize
# Whole-section labelings on `df`. 'jtype_r<r>_h1' uses h1's clustering
# (own labels on h1 cells, transferred labels on h2 cells); 'jtype_r<r>_h2'
# is the mirror image. Rows in neither half keep the '' placeholder.
for r in resolutions:
    own, transferred = f'type_r{r}', f'ptype_r{r}'
    for tag, ref_half, other_half in (('h1', df_h1, df_h2), ('h2', df_h2, df_h1)):
        col = f'jtype_r{r}_{tag}'
        df[col] = ''
        df.loc[ref_half.index, col] = ref_half[own]
        df.loc[other_half.index, col] = other_half[transferred]

In [None]:
# Whole-section view of the h1-referenced joint labeling at every resolution.
output = None
for i, r in enumerate(resolutions):
    hue = f'jtype_r{r}_h1'
    powerplots.plot_type_spatial_umap(df, hue, output=output)

In [None]:
# Whole-section view of the plain 'type_r<r>' labeling at every resolution.
# NOTE(review): the cells above add 'type_r<r>' only to df_h1 / df_h2, so this
# assumes `df` already carries those columns from default_analysis.csv -- confirm.
output = None
for i, r in enumerate(resolutions):
    hue = f'type_r{r}'
    powerplots.plot_type_spatial_umap(df, hue, output=output)

# Lateral balance
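- For every labeling, measure how evenly each cell type is split between the two hemispheres (`hemi`), using the entropy (in bits) of the per-type hemisphere fractions averaged over types.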

In [None]:
# quantify number of cells from each half
def _mean_lateral_entropy(frame, col):
    """Summarise how evenly the types in ``frame[col]`` split across ``hemi``.

    Parameters
    ----------
    frame : pandas.DataFrame with a ``hemi`` column and the label column ``col``
    col : str, name of the label column

    Returns
    -------
    (n_types, mean_entropy)
        ``n_types`` is the number of distinct values in ``frame[col]``.
        ``mean_entropy`` is the mean over types of
        ``-p0*log2(p0) - p1*log2(p1)``, where p0 / p1 are the fractions of the
        type's cells in the first two ``hemi`` groups. For two groups this is
        0 when a type is confined to one group and 1 when it is split evenly.
    """
    n_types = len(frame[col].unique())

    # fill_value=0: a type that is absent from one hemisphere has 0 cells
    # there. Without it the entry is NaN and the type silently drops out of
    # the mean, although its entropy is 0.
    counts = frame.groupby([col, 'hemi']).size().unstack(fill_value=0)
    fracs = counts.divide(counts.sum(axis=1), axis=0)
    p = fracs.iloc[:, :2].to_numpy(dtype=float)

    # convention 0 * log2(0) = 0: evaluate the log only where p > 0
    log_p = np.log2(p, out=np.zeros_like(p), where=p > 0)
    per_type_entropy = -(p * log_p).sum(axis=1)
    return n_types, per_type_entropy.mean()


res = []
for r in resolutions:
    _res = {'r': r}

    # same statistic for the two paired labelings and the plain one
    # (`col` stays a top-level name, ending on the plain labeling, as before)
    for suffix, col in (('_h1', f'jtype_r{r}_h1'),
                        ('_h2', f'jtype_r{r}_h2'),
                        ('', f'type_r{r}')):
        n_t, entpy = _mean_lateral_entropy(df, col)
        _res[f'n_t{suffix}'] = n_t
        _res[f'entpy{suffix}'] = entpy

    res.append(_res)

res = pd.DataFrame(res)
res

In [None]:
# Type-by-hemisphere cell counts for the plain labeling at the last resolution
# of the sweep. The column is named explicitly instead of reusing whatever
# `col` a previous loop happened to leave behind.
col = f'type_r{resolutions[-1]}'
cmat = df.groupby([col, 'hemi']).size().unstack()
cmat

In [None]:
# Per-type cell counts: one line for hemi == 0 and one for hemi == 1
# (the columns of `cmat` are the `hemi` values).
plt.plot(cmat[0])
plt.plot(cmat[1])

In [None]:
# Number of rows carrying each 'type_r10' label, largest first
# (value_counts sorts in descending order by default).
plt.plot(df['type_r10'].value_counts())
# resolutions

In [None]:
# Total number of rows in the full table.
len(df)

In [None]:
# Select both columns with a list of labels. A bare `df['a', 'b']` is looked
# up as the single tuple key ('a', 'b') and raises KeyError.
df[['type_r10', 'hemi']]

In [None]:
# Mean lateral entropy against the number of types, one curve per labeling.
fig, ax = plt.subplots()
curves = (
    ('n_t', 'entpy', 'Plain', {'color': 'k'}),
    ('n_t_h1', 'entpy_h1', 'Paired_h1', {}),
    ('n_t_h2', 'entpy_h2', 'Paired_h2', {}),
)
for n_col, e_col, label, extra in curves:
    ax.plot(res[n_col], res[e_col], '-o', markersize=5, label=label, **extra)
ax.set_xlabel('Number of cell types')
ax.set_ylabel('Lateral entropy')
ax.legend(bbox_to_anchor=(1,1))
plt.show()

In [None]:
def _sorted_hemi_fractions(frame, col):
    """Return, sorted ascending, the per-type fraction of cells in the first ``hemi`` group.

    Parameters
    ----------
    frame : pandas.DataFrame with a ``hemi`` column and the label column ``col``
    col : str, name of the label column

    Returns
    -------
    numpy.ndarray with one fraction per distinct value of ``frame[col]``.
    """
    # fill_value=0: a type with no cells in a hemisphere gets fraction 0
    # there, not NaN (NaN is sorted to the end and is not drawn by the plot).
    counts = frame.groupby([col, 'hemi']).size().unstack(fill_value=0)
    fracs = counts.divide(counts.sum(axis=1), axis=0)
    return np.sort(fracs.iloc[:, 0])


# One array per resolution for the h1-paired, h2-paired and plain labelings.
propts_h1 = [_sorted_hemi_fractions(df, f'jtype_r{r}_h1') for r in resolutions]
propts_h2 = [_sorted_hemi_fractions(df, f'jtype_r{r}_h2') for r in resolutions]
propts = [_sorted_hemi_fractions(df, f'type_r{r}') for r in resolutions]

In [None]:
# One panel per resolution: sorted per-type fraction of cells in the first
# `hemi` group, for the plain and the two paired labelings.
ny = len(resolutions)
# squeeze=False keeps `axs` two-dimensional even for a single resolution
fig, axs = plt.subplots(ny, 1, figsize=(8, 5*ny), squeeze=False)
for i, r in enumerate(resolutions):
    ax = axs[i, 0]
    ax.plot(propts[i], '-o', markersize=5, label='Plain', color='k')
    ax.plot(propts_h1[i], '-o', markersize=5, label='Paired_h1')
    ax.plot(propts_h2[i], '-o', markersize=5, label='Paired_h2')

    # the labels above are only visible once a legend is drawn
    ax.legend(bbox_to_anchor=(1,1))
    ax.set_title(f'resolution = {r}')
    ax.set_xlabel('Cell types (sorted by fraction)')
    ax.set_ylabel('Fraction of cells in first hemi group')

plt.show()

In [None]:
# ARI adjusted rand score

In [None]:
# Agreement between the h1-referenced and h2-referenced joint labelings at
# every resolution, plus the number of types in each.
aris = []
nts = []
for r in resolutions:
    types_h1 = df[f'jtype_r{r}_h1'].values
    types_h2 = df[f'jtype_r{r}_h2'].values

    nt_h1, nt_h2 = (len(np.unique(labels)) for labels in (types_h1, types_h2))
    nts.append((nt_h1, nt_h2))

    ari = metrics.adjusted_rand_score(types_h1, types_h2)
    aris.append(ari)

In [None]:
# ARI against the number of types. x is the mean of the two type counts; the
# horizontal bar spans half their difference on either side.
nt_mid = [(n_a + n_b) / 2 for n_a, n_b in nts]
nt_halfspan = [np.abs(n_b - n_a) / 2 for n_a, n_b in nts]

fig, ax = plt.subplots()
ax.errorbar(nt_mid, aris, xerr=nt_halfspan, capsize=3)
ax.set_xlabel('Number of cell types')
ax.set_ylabel('Adjusted Rand Index (ARI)')
plt.show()


# Confusion matrix

In [None]:
# Cross-tabulate the two joint labelings at a single resolution.
# confusion_matrix is called without `labels`, so rows and columns both follow
# the sorted set of labels that occur in either input.
N = layer.N
r = 0.5
types_h1, types_h2 = df[f'jtype_r{r}_h1'].values, df[f'jtype_r{r}_h2'].values
confmat = metrics.confusion_matrix(types_h1, types_h2)

In [None]:
# Rearrange the confusion matrix with the project helper
# (presumably to concentrate mass on the diagonal -- TODO confirm).
# `rows` / `cols` are its second and third return values.
confmat2, rows, cols = basicu.diag_matrix(confmat)
rows, cols

In [None]:
# Heatmap of the rearranged confusion matrix (raw counts).
# NOTE(review): `sns` is not imported explicitly in this notebook; presumably
# it arrives via the star import from dredFISH.Utils.__init__plots -- confirm.
fig, ax = plt.subplots()
sns.heatmap(confmat2, ax=ax)
ax.set_aspect('equal')
plt.show()

In [None]:
# Same matrix with every row divided by its row total.
row_totals = confmat2.sum(axis=1).reshape(-1,1)
fig, ax = plt.subplots()
sns.heatmap(confmat2/row_totals, ax=ax)
ax.set_aspect('equal')
plt.show()

In [None]:
# a: each row of confmat2 divided by its row total
# b: each column of confmat2 divided by its column total
a = confmat2/confmat2.sum(axis=1).reshape(-1,1)
b = confmat2/confmat2.sum(axis=0).reshape(1,-1)

In [None]:
# Heatmap of `a` (confmat2 normalised by row totals).
fig, ax = plt.subplots()
sns.heatmap(a, ax=ax)
ax.set_aspect('equal')
plt.show()

In [None]:
# Heatmap of `b` (confmat2 normalised by column totals).
fig, ax = plt.subplots()
sns.heatmap(b, ax=ax)
ax.set_aspect('equal')
plt.show()