# 3. Cell type identification of RIBOmap

2022-10-23

In [None]:
# Import Packages

%load_ext autoreload
%autoreload 2

import os
import warnings 
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import anndata as ad
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from anndata import AnnData
from natsort import natsorted
from tqdm.notebook import tqdm

# Customized packages
import starmap.sc_util as su
# test()

## Set path

In [None]:
# Set path
base_path = 'Z:/Data/Analyzed/2022-09-05-Hu-Tissue/'

input_path = os.path.join(base_path, 'input')

# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it is safe to re-run,
# has no check-then-create race, and also creates missing parent folders.
out_path = os.path.join(base_path, 'output')
os.makedirs(out_path, exist_ok=True)

fig_path = os.path.join(base_path, 'figures')
os.makedirs(fig_path, exist_ok=True)

# scanpy writes figures requested through its `save=` argument into this folder
sc.settings.figdir = fig_path

In [None]:
# Load combined file (an earlier backup, '2022-10-26-Brain-combined-3mad-ct-bk1.h5ad',
# lives in the same output folder)
combined_h5ad = os.path.join(out_path, '2022-10-25-Brain-RIBOmap-combined-3mad-harmony.h5ad')
adata = sc.read_h5ad(combined_h5ad)
adata

In [None]:
# Load backup file (the older backup '2022-11-06-Brain-combined-3mad-ct-bk3.h5ad'
# lives in the same output folder)
backup_h5ad = os.path.join(out_path, '2022-11-20-Brain-RIBOmap-3mad-ct-final.h5ad')
adata = sc.read_h5ad(backup_h5ad)
adata

## Clustering

In [None]:
%%time
# Run Leiden clustering on adata. The resolution is kept in a variable because later
# cells write it into the log files and into the output file names; later cells read
# the resulting labels from adata.obs['leiden'].
cluster_resolution = 1.5
sc.tl.leiden(adata, resolution = cluster_resolution)

In [None]:
# Compute the UMAP embedding, then plot it coloured by Leiden cluster.
# min_dist and spread stay in variables because the log cells below record them.
min_dist, spread = 0.0001, 5
sc.tl.umap(adata, spread=spread, min_dist=min_dist)
sc.pl.umap(adata, color='leiden')

In [None]:
# Save log: clustering / embedding parameters and the resulting number of clusters
n_clusters = len(adata.obs['leiden'].unique())
log_lines = [
    "Number of neighbor: 50",
    "Number of PC: 30",
    f"Resolution: {cluster_resolution}",
    f"Min-distance: {min_dist}",
    f"Spread: {spread}",
    f"Number of clusters: {n_clusters}",
]
with open(f'{fig_path}/log_level_2.txt', 'w') as f:
    f.write("\n".join(log_lines))

# save embeddings (one row per cell, comma separated UMAP coordinates)
umap_coords = adata.obsm['X_umap']
np.savetxt(f'{fig_path}/embedding_level_2_umap.csv', umap_coords, delimiter=",")

In [None]:
# Build a "husl" palette with one colour per cluster, the matching matplotlib
# colormap, and preview the palette.
cluster_pl = sns.color_palette(palette="husl", n_colors=n_clusters)
cluster_hex = cluster_pl.as_hex()
cluster_cmap = ListedColormap(cluster_hex)
sns.palplot(cluster_pl)

In [None]:
# Plot UMAP with cluster labels drawn on the data, using the custom palette
fig, ax = plt.subplots(figsize=(10, 7))
leiden_umap_kwargs = dict(
    color='leiden',
    palette=cluster_pl,
    legend_loc='on data',
    legend_fontsize=10,
    legend_fontoutline=2,
    frameon=False,
)
sc.pl.umap(adata, ax=ax, **leiden_umap_kwargs)

In [None]:
n_clusters = len(adata.obs['leiden'].unique())

# Rank marker genes per Leiden cluster (Wilcoxon), then derive a filtered copy of the
# ranking; the export cell below reads both 'rank_genes_groups' and
# 'rank_genes_groups_filtered'.
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')
sc.tl.filter_rank_genes_groups(
    adata,
    max_out_group_fraction=0.85,
    min_in_group_fraction=0.15,
)

In [None]:
from datetime import datetime

# Results of each run go into a date-stamped folder under the figure directory.
date = datetime.today().strftime('%Y-%m-%d')
clustering_out_path = os.path.join(fig_path, f'{date}-clustering')
# exist_ok=True makes the cell re-runnable and avoids the check-then-create race
os.makedirs(clustering_out_path, exist_ok=True)

In [None]:
# Save log
log_text = "\n" + "\n".join([
    f"Resolution: {cluster_resolution}",
    f"Number of clusters: {n_clusters}",
    f"UMAP min_dist: {min_dist}",
    f"UMAP spread: {spread}",
])
with open(f'{clustering_out_path}/log_r_{cluster_resolution}.txt', 'w') as f:
    f.write(log_text)

# save results: one workbook for the full marker ranking and one for the filtered
# ranking, each with one sheet per cluster
marker_exports = (
    ('', 'rank_genes_groups'),
    ('_filtered', 'rank_genes_groups_filtered'),
)
for file_suffix, rank_key in marker_exports:
    workbook = os.path.join(clustering_out_path, f'clustering_markers_r_{cluster_resolution}{file_suffix}.xlsx')
    with pd.ExcelWriter(workbook, mode='w') as writer:
        for current_cell_type in adata.obs.leiden.cat.categories:
            current_df = sc.get.rank_genes_groups_df(adata, group=current_cell_type, key=rank_key)
            current_df.to_excel(writer, sheet_name=f'{current_cell_type}')

# save obs
obs_file = os.path.join(clustering_out_path, f'obs_r_{cluster_resolution}.csv')
adata.obs.to_csv(obs_file)

In [None]:
# check cluster distribution: share of each replicate within every Leiden cluster
import matplotlib.patches as mpatches

# Count cells per (cluster, replicate) once and reuse the result instead of
# re-running the same groupby for every derived column.
rep_counts = adata.obs.groupby('leiden')['protocol-replicate'].value_counts()
leiden_df = pd.DataFrame({
    'counts': rep_counts.values,
    'leiden': [i[0] for i in rep_counts.index],
    'protocol-replicate': [i[1] for i in rep_counts.index],
})
cluster_totals = leiden_df.groupby('leiden')['counts'].transform('sum')
leiden_df['percentage'] = leiden_df['counts'] / cluster_totals * 100

# .copy() so that overwriting 'percentage' acts on an independent frame rather than
# on a slice of leiden_df (avoids SettingWithCopyWarning / ambiguous writes).
s_df = leiden_df.loc[leiden_df['protocol-replicate'] == 'RIBOmap-rep1', :].copy()
# rep1 is drawn as a full-height background bar; rep2 is drawn on top of it, so the
# visible remainder of each bar is the rep1 share.
s_df['percentage'] = 100
r_df = leiden_df.loc[leiden_df['protocol-replicate'] == 'RIBOmap-rep2', :]

fig, ax = plt.subplots(figsize=(15, 10))
bar1 = sns.barplot(x="leiden",  y="percentage", data=s_df, color='#fcad03', ax=ax)
bar2 = sns.barplot(x="leiden", y="percentage", data=r_df, color='#03fc35', ax=ax)

# add legend
top_bar = mpatches.Patch(color='#fcad03', label='RIBOmap-rep1')
middle_bar = mpatches.Patch(color='#03fc35', label='RIBOmap-rep2')
plt.legend(handles=[top_bar, middle_bar])

# show the graph
plt.show()

In [None]:
# Side-by-side bars: percentage of each replicate within every cluster
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(
    data=leiden_df,
    x='leiden',
    y='percentage',
    hue='protocol-replicate',
)
plt.show()

In [None]:
# Distribution of total counts per cell within each cluster
plt.subplots(figsize=(10, 5))
sns.boxplot(data=adata.obs, x='leiden', y='total_counts')

In [None]:
# Median of every numeric obs column per cluster. numeric_only=True is required:
# adata.obs also holds non-numeric columns (e.g. 'protocol-replicate'), and
# pandas >= 2.0 raises a TypeError when asked for their median.
test = adata.obs.groupby(['leiden']).median(numeric_only=True)
test['leiden'] = test.index.values
plt.subplots(figsize=(10, 5))
sns.barplot(x='leiden', y='total_counts', data=test)

In [None]:
# Number of cells per cluster
test = adata.obs['leiden'].value_counts().rename('count').to_frame()
test['leiden'] = test.index.values
plt.subplots(figsize=(10, 5))
sns.barplot(data=test, x='leiden', y='count')

### Spatial map

In [None]:
# Spatial map of Leiden clusters for one replicate (y axis flipped for this sample)
current_sample = 'RIBOmap-rep2'
sample_obs = adata.obs.loc[adata.obs['protocol-replicate'] == current_sample, :]

fig, ax = plt.subplots(figsize=(15, 20))
g = sns.scatterplot(data=sample_obs, x='column', y='row', hue='leiden',
                    palette=cluster_pl, s=5, ax=ax)
g.set_title(current_sample)
g.invert_yaxis()
for axis in (g.axes.xaxis, g.axes.yaxis):
    axis.set_visible(False)

In [None]:
# Spatial map of Leiden clusters for one replicate (x axis flipped for this sample)
current_sample = 'RIBOmap-rep1'
sample_obs = adata.obs.loc[adata.obs['protocol-replicate'] == current_sample, :]

fig, ax = plt.subplots(figsize=(15, 20))
g = sns.scatterplot(data=sample_obs, x='column', y='row', hue='leiden',
                    palette=cluster_pl, s=5, ax=ax)
g.set_title(current_sample)
g.invert_xaxis()
for axis in (g.axes.xaxis, g.axes.yaxis):
    axis.set_visible(False)

In [None]:
# plot summary plot for each cluster:
#   top row    - spatial map of each replicate with the current cluster highlighted
#   bottom row - UMAP with the current cluster's cells of that replicate highlighted

# The target folder was never created, so savefig failed on a fresh output directory.
summary_path = os.path.join(clustering_out_path, 'summary')
os.makedirs(summary_path, exist_ok=True)

# (sample name, axis flip applied to that sample's spatial map)
sample_layout = [
    ('RIBOmap-rep1', 'invert_xaxis'),
    ('RIBOmap-rep2', 'invert_yaxis'),
]

# Loop-invariant work is done once instead of once per cluster.
sample_obs = {
    sample: adata.obs.loc[adata.obs['protocol-replicate'] == sample, :]
    for sample, _ in sample_layout
}
point_size = 120000 / adata.n_obs
background_pl = sns.color_palette(['#fafafa'])

for current_cluster in tqdm(sorted(adata.obs['leiden'].unique())):

    fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(13, 11))
    axs = axs.flatten()
    # colour scanpy assigned to this cluster (cluster labels are integer-like strings)
    cluster_color = sns.color_palette([adata.uns['leiden_colors'][int(current_cluster)]])

    for col, (sample, flip_axis) in enumerate(sample_layout):
        df = sample_obs[sample]

        # spatial map: all cells as a faint background ...
        g = sns.scatterplot(x='column', y='row', color='#1111',
                            data=df,
                            s=5,
                            ax=axs[col])
        g.set_title(sample)
        getattr(g, flip_axis)()
        g.axes.xaxis.set_visible(False)
        g.axes.yaxis.set_visible(False)

        # ... with the cells of the current cluster drawn on top
        sns.scatterplot(x='column', y='row', hue='leiden', legend=None,
                        palette=cluster_pl,
                        data=df.loc[df['leiden'] == current_cluster, ],
                        s=5,
                        ax=axs[col])

        # UMAP: full embedding as background, current cluster of this sample on top
        umap_ax = sc.pl.umap(adata, show=False, color=None, alpha=1, size=point_size,
                             ax=axs[col + 2], title='', palette=background_pl)
        in_cluster = (adata.obs["leiden"] == current_cluster) & (adata.obs['protocol-replicate'] == sample)
        sc.pl.umap(adata[in_cluster], color='leiden', frameon=False, ax=umap_ax, legend_loc=None,
                   size=point_size, title='', show=False, palette=cluster_color)

    fig.savefig(os.path.join(summary_path, f'cluster_{current_cluster}.jpeg'))
    plt.show()
    # Close explicitly so one open figure per cluster does not accumulate in memory.
    plt.close(fig)

## Gene spatial map

In [None]:
# Gene spatial map: expression of one gene (read from adata.raw) on a black background
sns.set(rc={'figure.facecolor':'black', 'axes.facecolor':'black'})

current_gene = 'Qk'
current_sample = 'RIBOmap-rep2'
sample_mask = adata.obs['protocol-replicate'] == current_sample
current_gene_vector = adata.raw[sample_mask, current_gene].X.flatten()
# alternative colouring: adata.obs.loc[sample_mask, 'total_counts']

fig, ax = plt.subplots(figsize=(15, 20))
g = sns.scatterplot(
    data=adata.obs.loc[sample_mask, :],
    x='column', y='row',
    hue=current_gene_vector, palette='viridis',
    s=10, alpha=.8, linewidth=0,
    legend=False,
    ax=ax,
)
g.set_title(f'{current_sample} - {current_gene}', color='w')
g.invert_yaxis()
for axis in (g.axes.xaxis, g.axes.yaxis):
    axis.set_visible(False)
sns.despine(left=True, bottom=True)

In [None]:
# Gene spatial map: expression of one gene (read from adata.raw) on a black background
# sns.set_style("dark")
sns.set(rc={'figure.facecolor':'black', 'axes.facecolor':'black'})

current_gene = 'Qk'
current_sample = 'STARmap-rep2'
sample_mask = adata.obs['protocol-replicate'] == current_sample
current_gene_vector = adata.raw[sample_mask, current_gene].X.flatten()

# The gene vector used to be overwritten by total_counts right here, so the figure
# titled '<sample> - <gene>' actually showed total counts. The override is kept only
# as an opt-in alternative; if you enable it, change the title accordingly.
# current_gene_vector = adata.obs.loc[sample_mask, 'total_counts']

fig, ax = plt.subplots(figsize=(15, 20))
g = sns.scatterplot(x='column', y='row', hue=current_gene_vector,
                    palette='viridis',
                    data=adata.obs.loc[sample_mask, :],
                    s=10,
                    alpha=.8,
                    linewidth=0,
                    legend=False,
                    ax=ax)
g.set_title(f'{current_sample} - {current_gene}', color='w')
g.invert_yaxis()
g.axes.xaxis.set_visible(False)
g.axes.yaxis.set_visible(False)
sns.despine(left=True, bottom=True)

In [None]:
# Gene spatial map: expression of one gene (read from adata.raw) on a black background
# sns.set_style("dark")
sns.set(rc={'figure.facecolor':'black', 'axes.facecolor':'black'})

current_gene = 'Qk'
current_sample = 'RIBOmap-rep1'
sample_mask = adata.obs['protocol-replicate'] == current_sample
current_gene_vector = adata.raw[sample_mask, current_gene].X.flatten()

# The gene vector used to be overwritten by total_counts right here, so the figure
# titled '<sample> - <gene>' actually showed total counts. The override is kept only
# as an opt-in alternative; if you enable it, change the title accordingly.
# current_gene_vector = adata.obs.loc[sample_mask, 'total_counts']

fig, ax = plt.subplots(figsize=(15, 20))
g = sns.scatterplot(x='column', y='row', hue=current_gene_vector,
                    palette='viridis',
                    data=adata.obs.loc[sample_mask, :],
                    s=10,
                    alpha=.8,
                    linewidth=0,
                    legend=False,
                    ax=ax)
g.set_title(f'{current_sample} - {current_gene}', color='w')
g.invert_yaxis()
g.axes.xaxis.set_visible(False)
g.axes.yaxis.set_visible(False)
sns.despine(left=True, bottom=True)

In [None]:
# Gene list supplied by Zefang (headerless CSV); the column is named 'Gene' after loading
gene_list_file = os.path.join(base_path, 'other-datasets', 'interest_t1_2.csv')
zf_df = pd.read_csv(gene_list_file, header=None)
zf_df.columns = ['Gene']

In [None]:
# gene counts > 5000: keep genes with more than 5000 raw counts in BOTH protocols
# np.asarray(...).ravel() flattens the column sums to 1-D: a scipy sparse layer
# returns a 1 x n_genes np.matrix from .sum(axis=0), which cannot be assigned to a
# var column directly; for a dense layer the call is a no-op.
adata.var['total_counts'] = np.asarray(adata.layers['raw'].sum(axis=0)).ravel()
adata.var['RIBOmap_counts'] = np.asarray(
    adata[adata.obs['protocol'] == 'RIBOmap', :].layers['raw'].sum(axis=0)).ravel()
adata.var['STARmap_counts'] = np.asarray(
    adata[adata.obs['protocol'] == 'STARmap', :].layers['raw'].sum(axis=0)).ravel()
genes_df = adata.var.loc[(adata.var['RIBOmap_counts'] > 5000) & (adata.var['STARmap_counts'] > 5000), :]

In [None]:
# For every selected gene, save a two-panel spatial expression map
# (RIBOmap-rep2 | STARmap-rep2) drawn with a colour scale shared by both panels.
sns.set(rc={'figure.facecolor':'black', 'axes.facecolor':'black'})

# The output folder was never created, so savefig failed on a fresh figure directory.
gene_map_path = os.path.join(fig_path, 'gene-expression-spatial-map-counts-5000')
os.makedirs(gene_map_path, exist_ok=True)

# Loop-invariant subsets are computed once instead of once per gene.
ribo_mask = adata.obs['protocol-replicate'] == 'RIBOmap-rep2'
star_mask = adata.obs['protocol-replicate'] == 'STARmap-rep2'
ribo_df = adata.obs.loc[ribo_mask, :]
star_df = adata.obs.loc[star_mask, :]

try:
    # for current_gene in tqdm(zf_df['Gene']):
    for current_gene in tqdm(genes_df.index):

        ribo_vector = adata.raw[ribo_mask, current_gene].X.flatten()
        star_vector = adata.raw[star_mask, current_gene].X.flatten()

        # Shared colour range. The previous `vmax=` keyword never reached seaborn's
        # hue mapping (it was forwarded to matplotlib and ignored), so each panel was
        # normalised on its own; `hue_norm` is the parameter that controls it.
        hue_norm = (min(ribo_vector.min(), star_vector.min()),
                    max(ribo_vector.max(), star_vector.max()))

        # construct plots
        fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(30, 20))
        panels = (
            ('RIBOmap-rep2', ribo_df, ribo_vector),
            ('STARmap-rep2', star_df, star_vector),
        )
        for panel_ax, (sample, sample_df, gene_vector) in zip(axs.flatten(), panels):
            g = sns.scatterplot(x='column', y='row', hue=gene_vector,
                                palette='viridis', hue_norm=hue_norm,
                                data=sample_df,
                                s=10,
                                alpha=.8,
                                linewidth=0,
                                legend=False,
                                ax=panel_ax)
            g.set_title(f'{sample} - {current_gene}', color='w')
            g.invert_yaxis()
            g.axes.xaxis.set_visible(False)
            g.axes.yaxis.set_visible(False)

        plt.tight_layout()
        fig.savefig(os.path.join(gene_map_path, f'{current_gene}.png'))
        # plt.clf() only cleared the figure; close it so one figure per gene does not
        # stay open for the whole loop.
        plt.close(fig)
finally:
    # restore the default plotting style even if the loop is interrupted
    sns.reset_orig()

In [None]:
# plot summary plot for each cluster of the subset `sdata`:
#   top row    - spatial map of each sample with the current cluster highlighted
#   bottom row - UMAP with the current cluster's cells of that sample highlighted
# NOTE: `sdata` is only created in the "Level 3 clustering" section further down, so
# this cell must be run after that section.
from tqdm.notebook import tqdm

# (sample name, axis flip applied to that sample's spatial map)
sample_layout = [
    ('RIBOmap-rep1', 'invert_xaxis'),
    ('RIBOmap-rep2', 'invert_yaxis'),
    ('STARmap-rep2', 'invert_yaxis'),
]

# Loop-invariant work is done once instead of once per cluster.
sample_obs = {
    sample: sdata.obs.loc[sdata.obs['protocol-replicate'] == sample, :]
    for sample, _ in sample_layout
}
n_samples = len(sample_layout)
point_size = 120000 / sdata.n_obs
background_pl = sns.color_palette(['#fafafa'])

for current_cluster in tqdm(sorted(sdata.obs['leiden'].unique())):

    fig, axs = plt.subplots(nrows=2, ncols=n_samples, figsize=(16, 11))
    axs = axs.flatten()
    # colour scanpy assigned to this cluster (cluster labels are integer-like strings)
    cluster_color = sns.color_palette([sdata.uns['leiden_colors'][int(current_cluster)]])

    for col, (sample, flip_axis) in enumerate(sample_layout):
        df = sample_obs[sample]

        # spatial map: all cells as a faint background ...
        g = sns.scatterplot(x='column', y='row', color='#1111',
                            data=df,
                            s=5,
                            ax=axs[col])
        g.set_title(sample)
        getattr(g, flip_axis)()
        g.axes.xaxis.set_visible(False)
        g.axes.yaxis.set_visible(False)

        # ... with the cells of the current cluster drawn on top
        sns.scatterplot(x='column', y='row', hue='leiden', legend=None,
                        palette=cluster_pl,
                        data=df.loc[df['leiden'] == current_cluster, ],
                        s=5,
                        ax=axs[col])

        # UMAP: full embedding as background, current cluster of this sample on top
        umap_ax = sc.pl.umap(sdata, show=False, color=None, alpha=1, size=point_size,
                             ax=axs[col + n_samples], title='', palette=background_pl)
        in_cluster = (sdata.obs["leiden"] == current_cluster) & (sdata.obs['protocol-replicate'] == sample)
        sc.pl.umap(sdata[in_cluster], color='leiden', frameon=False, ax=umap_ax, legend_loc=None,
                   size=point_size, title='', show=False, palette=cluster_color)

    plt.show()
    # plt.savefig(os.path.join(fig_path, 'r_2.2_summary', f'cluster_{current_cluster}.jpeg'))

## Level 2 clustering

### assign label

In [None]:
# Print the top 15 filtered marker genes of every cluster (one block per cluster,
# preceded by its positional index) and collect them all in `markers`.
markers = []
temp = pd.DataFrame(adata.uns['rank_genes_groups_filtered']['names']).head(15)
for i, (_, cluster_column) in enumerate(temp.items()):
    curr_col = cluster_column.to_list()
    markers.extend(curr_col)
    print(i)
    print(''.join(f'{j} ' for j in curr_col))

# Dot plot of a fixed panel of genes across the Leiden clusters
canonical_genes = ['Slc17a7', 'Gad1', 'Gad2', 'Plp1', 'Gfap', 'Ctss', 'Bsg', 'Vtn']
sc.pl.dotplot(adata, canonical_genes, 'leiden', dendrogram=False, standard_scale='group')

In [None]:
# Keep a backup of the Leiden labels ('orig_leiden') and seed the three annotation
# levels with the same labels; the levels are relabelled in the following cells.
for label_column in ('orig_leiden', 'level_1', 'level_2', 'level_3'):
    adata.obs[label_column] = adata.obs['leiden'].values

In [None]:
# Change cluster label to cell type label.
# One row per Leiden cluster, in numeric cluster order: (level 1, level 2, level 3).
cluster_annotations = [
    ('Neuronal cell', 'Telencephalon projecting neurons', 'NA'),  # 0
    ('Neuronal cell', 'Telencephalon projecting neurons', 'NA'),  # 1
    ('Neuronal cell', 'Cholinergic, monoaminergic and peptidergic neurons', 'NA'),  # 2
    ('Glia', 'Oligodendrocyte', 'Oligo1'),  # 3
    ('Glia', 'Astrocyte', 'NA'),  # 4
    ('Glia', 'Oligodendrocyte', 'Oligo2'),  # 5
    ('Glia', 'Vascular cells', 'NA'),  # 6
    ('Glia', 'Astrocyte', 'NA'),  # 7
    ('Neuronal cell', 'Di/Mesencephalon neurons', 'NA'),  # 8
    ('Neuronal cell', 'Telencephalon projecting neurons', 'NA'),  # 9
    ('Neuronal cell', 'Telencephalon interneurons', 'NA'),  # 10
    ('Glia', 'Vascular cells', 'NA'),  # 11
    ('Glia', 'Microglia', 'Micro'),  # 12
    ('Neuronal cell', 'Telencephalon projecting neurons', 'NA'),  # 13
    ('Neuronal cell', 'Telencephalon projecting neurons', 'NA'),  # 14
    ('Glia', 'Oligodendrocytes precursor cell', 'OPC'),  # 15
    ('Glia', 'Astroependymal cells', 'EPEN'),  # 16
    ('Neuronal cell', 'Cholinergic, monoaminergic and peptidergic neurons', 'NA'),  # 17
    ('Glia', 'Astroependymal cells', 'CHOR'),  # 18
    ('Glia', 'Perivascular macrophages', 'PVM1'),  # 19
    ('Neuronal cell', 'Telencephalon projecting neurons', 'NA'),  # 20
    ('Neuronal cell', 'Telencephalon projecting neurons', 'NA'),  # 21
    ('Glia', 'Perivascular macrophages', 'PVM2'),  # 22
]

# Per-level lists, indexed by integer cluster id
level_1_list, level_2_list, level_3_list = (list(level) for level in zip(*cluster_annotations))

# construct transfer dict: Leiden label (string) -> cell type name at each level
leiden_labels = sorted(adata.obs['leiden'].unique())
transfer_dict_l1 = {label: level_1_list[int(label)] for label in leiden_labels}
transfer_dict_l2 = {label: level_2_list[int(label)] for label in leiden_labels}
transfer_dict_l3 = {label: level_3_list[int(label)] for label in leiden_labels}

In [None]:
# Assign cell type to adata: replace the Leiden labels stored in each level column
# with the cell type names from the matching transfer dict
level_mappings = (
    ('level_1', transfer_dict_l1),
    ('level_2', transfer_dict_l2),
    ('level_3', transfer_dict_l3),
)
for level_column, transfer_dict in level_mappings:
    adata.obs = adata.obs.replace({level_column: transfer_dict})

### classify mix cells

In [None]:
# calculate the distance between each cell and its leiden cluster center (in UMAP space)
from sklearn.neighbors import NearestCentroid
from scipy.spatial.distance import cdist

umap_xy = adata.obsm['X_umap']

# Float placeholder: the column receives float distances below, and writing floats
# into an integer column triggers dtype warnings/errors in recent pandas.
adata.obs['distance2centroid_leiden'] = 9999.0

# find centroid of each leiden cluster
clf = NearestCentroid(shrink_threshold=None)
clf.fit(umap_xy, adata.obs.leiden)

# plot cluster center
fig, ax = plt.subplots(figsize=(10,7))
sc.pl.umap(adata, color='leiden', ax=ax, show=False, legend_loc='on data')
ax.scatter(clf.centroids_[:, 0], clf.centroids_[:, 1], s=10, c='r')
plt.show()

# clf.classes_ and clf.centroids_ are aligned row by row, so pairing them directly
# cannot mis-assign a centroid (the previous code assumed the sorted category list
# had the same order and length as the fitted classes).
for current_label, centroid in zip(clf.classes_, clf.centroids_):
    current_centroid = centroid.reshape([1,2])

    # cells of the current cluster, selected once per iteration
    in_cluster = (adata.obs['leiden'] == current_label).values
    cluster_xy = umap_xy[in_cluster]

    # calculate distance (cdist returns an (n, 1) array; flatten it for assignment)
    dm = cdist(cluster_xy, current_centroid).ravel()
    adata.obs.loc[in_cluster, 'distance2centroid_leiden'] = dm

    # plot distance distribution
    fig, ax = plt.subplots(figsize=(5,3))
    sns.histplot(dm, ax=ax)
    plt.title(f'cluster-{current_label}')
    plt.show()

    # visualize distance calculation: left = cluster membership, right = distance.
    # x/y are passed by keyword because seaborn >= 0.12 rejects positional x/y.
    fig, axs = plt.subplots(figsize=(20,7), nrows=1, ncols=2)
    sns.scatterplot(x=umap_xy[:, 0], y=umap_xy[:, 1], color='#1111', s=1, ax=axs[0])
    sns.scatterplot(x=cluster_xy[:, 0], y=cluster_xy[:, 1], s=1, legend=None, ax=axs[0])
    axs[0].scatter(current_centroid[:, 0], current_centroid[:, 1], s=10, c='r')

    sns.scatterplot(x=umap_xy[:, 0], y=umap_xy[:, 1], color='#1111', s=1, ax=axs[1])
    sns.scatterplot(x=cluster_xy[:, 0], y=cluster_xy[:, 1],
                    hue=adata.obs.loc[in_cluster, 'distance2centroid_leiden'], palette='viridis', s=1, legend=None, ax=axs[1])
    axs[1].scatter(current_centroid[:, 0], current_centroid[:, 1], s=10, c='r')
    plt.show()

    

In [None]:
# classify mix cells: a cell is flagged as 'Mix' when its UMAP distance to the centroid
# of its own cluster exceeds the manually chosen threshold for that cluster.
# Thresholds are listed in the lexicographic order of the cluster labels
# (the order produced by sorted() on the string categories).
manual_threshold = [
    10, # 0
    7, # 1
    7, # 10
    6, # 11
    6, # 12
    4, # 13
    4, # 14
    5, # 15
    4, # 16
    5, # 17
    5, # 18
    5, # 19
    10, # 2
    3, # 20
    5, # 21
    4, # 22
    7, # 3
    9.3, # 4?
    10, # 5
    7, # 6
    6, # 7
    6, # 8
    10, # 9
]

sorted_labels = sorted(adata.obs['leiden'].cat.categories)
if len(manual_threshold) != len(sorted_labels):
    raise ValueError(
        f'{len(manual_threshold)} thresholds given for {len(sorted_labels)} clusters')

# Declare both categories up front. Converting to 'category' inside the loop could
# freeze the categories to ['False'] (when the first cluster has no mix cell), after
# which assigning 'True' for a later cluster raises.
adata.obs['is_mix'] = pd.Categorical(['False'] * adata.n_obs, categories=['False', 'True'])

umap_xy = adata.obsm['X_umap']

# plot overall distance distribution (histplot replaces the deprecated distplot)
sns.histplot(adata.obs['distance2centroid_leiden'], kde=True, stat='density')
plt.show()

for i, current_label in enumerate(sorted_labels):
    current_centroid = clf.centroids_[i, :].reshape([1,2])
    in_cluster = (adata.obs['leiden'] == current_label).values
    cluster_xy = umap_xy[in_cluster]

    # plot distance distribution of current cluster
    fig, ax = plt.subplots(figsize=(5,3))
    sns.histplot(adata.obs.loc[in_cluster, 'distance2centroid_leiden'], ax=ax)
    ax.axvline(x=manual_threshold[i], c='r')
    plt.title(f'cluster-{current_label}')
    plt.show()

    too_far = (adata.obs['distance2centroid_leiden'] > manual_threshold[i]).values
    adata.obs.loc[in_cluster & too_far, 'is_mix'] = 'True'

    # left = cluster membership, right = mix flag.
    # x/y are passed by keyword because seaborn >= 0.12 rejects positional x/y.
    fig, axs = plt.subplots(figsize=(20,7), nrows=1, ncols=2)
    sns.scatterplot(x=umap_xy[:, 0], y=umap_xy[:, 1], color='#1111', s=1, ax=axs[0])
    sns.scatterplot(x=cluster_xy[:, 0], y=cluster_xy[:, 1], s=1, legend=None, ax=axs[0])
    axs[0].scatter(current_centroid[0, 0], current_centroid[0, 1], s=10, c='r')

    sns.scatterplot(x=umap_xy[:, 0], y=umap_xy[:, 1], color='#1111', s=1, ax=axs[1])
    sns.scatterplot(x=cluster_xy[:, 0], y=cluster_xy[:, 1],
                    hue=adata.obs.loc[in_cluster, 'is_mix'], palette='Set1', s=1, legend=None, ax=axs[1])
    axs[1].scatter(current_centroid[0, 0], current_centroid[0, 1], s=10, c='r')
    plt.show()



In [None]:
# Display how many cells carry each 'is_mix' flag value ('True' / 'False' strings)
adata.obs['is_mix'].value_counts()

In [None]:
# Relabel flagged cells as 'Mix' at every annotation level. The columns are cast to
# object first so that the new 'Mix' value can be written into them.
mix_cells = adata.obs['is_mix'] == 'True'
for level_column in ('level_1', 'level_2', 'level_3'):
    adata.obs[level_column] = adata.obs[level_column].astype(object)
    adata.obs.loc[mix_cells, level_column] = 'Mix'

### assign category

In [None]:
# Sort category: fix the display order of the level 1 / level 2 cell type labels
level_1_order = ['Neuronal cell', 'Glia', 'Mix']
level_2_order = ['Telencephalon projecting neurons', #0
                'Telencephalon interneurons', #1 
                'Cholinergic, monoaminergic and peptidergic neurons', #2
                'Di/Mesencephalon neurons', #3
                'Astrocyte', #4
                'Oligodendrocyte', #5
                'Oligodendrocytes precursor cell', #6
                'Microglia', #7
                'Vascular cells', #8
                'Astroependymal cells', #9
                'Perivascular macrophages', #10   
                 'Mix'
]

# reorder_categories() returns a new Series and must be assigned back: the
# `inplace=True` form was removed in pandas 2.0 and, before that, modified a
# temporary object rather than the stored column.
adata.obs['level_1'] = adata.obs['level_1'].astype('category').cat.reorder_categories(level_1_order)
adata.obs['level_2'] = adata.obs['level_2'].astype('category').cat.reorder_categories(level_2_order)

In [None]:
# Check color legend: one swatch per level 1 label, in category order
level_1_colors = ['#e8486d', '#4873e8', '#ebebeb']
level_1_pl = sns.color_palette(level_1_colors)
sns.palplot(level_1_pl, size=3)
tick_positions = range(len(level_1_order))
plt.xticks(tick_positions, level_1_order, size=10, rotation=45)
plt.tight_layout()
# plt.savefig(f'./figures/color_legend_top.png')
plt.show()

In [None]:
# Check color legend (old coloring scheme): one swatch per level 2 label, in category order
level_2_colors = [
    '#256b00', '#ee750a', '#f280cf', '#f24f4b',
    '#e8e879', '#a8e1eb', '#667872', '#23ccb8',
    '#395ba8', '#697491', '#8803fc', '#ebebeb',
]
level_2_pl = sns.color_palette(level_2_colors)
sns.palplot(level_2_pl, size=3)
tick_positions = range(len(level_2_order))
plt.xticks(tick_positions, level_2_order, size=10, rotation=45)
plt.tight_layout()
# plt.savefig(f'./figures/color_legend_top.png')
plt.show()

# make sure the level 2 categories follow the same order as the palette
ordered_level_2 = adata.obs['level_2'].cat.reorder_categories(level_2_order)
adata.obs['level_2'] = ordered_level_2

In [None]:
# Plot UMAP coloured by level 1 labels (not saved: save=False), first with the legend
# in the right margin, then with the labels drawn on the data.
level_1_umap_kwargs = dict(color='level_1', frameon=False, title='Level 1',
                           palette=level_1_pl, save=False)

sc.pl.umap(adata, legend_loc='right margin', legend_fontsize=12, legend_fontoutline=2,
           **level_1_umap_kwargs)

fig, ax = plt.subplots(figsize=(10,10))
sc.pl.umap(adata, legend_loc='on data', legend_fontsize=8, legend_fontoutline=1, ax=ax,
           **level_1_umap_kwargs)

In [None]:
# Plot UMAP coloured by level 2 labels: the right-margin-legend version is saved
# through scanpy (save='_level_2.pdf'), the on-data version is only displayed.
level_2_umap_kwargs = dict(color='level_2', frameon=False, title='Level 2',
                           palette=level_2_pl)

sc.pl.umap(adata, legend_loc='right margin', legend_fontsize=12, legend_fontoutline=2,
           save='_level_2.pdf', **level_2_umap_kwargs)

fig, ax = plt.subplots(figsize=(10,10))
sc.pl.umap(adata, legend_loc='on data', legend_fontsize=5, legend_fontoutline=1, ax=ax,
           save=False, **level_2_umap_kwargs)

In [None]:
# Level 2 spatial map for one replicate (y axis flipped for this sample), saved as PNG
current_sample = 'RIBOmap-rep2'
sample_obs = adata.obs.loc[adata.obs['protocol-replicate'] == current_sample, :]

fig, ax = plt.subplots(figsize=(15, 20))
g = sns.scatterplot(data=sample_obs, x='column', y='row', hue='level_2',
                    palette=level_2_pl, legend=None, s=10, ax=ax)
g.set_title(current_sample)
g.invert_yaxis()
for axis in (g.axes.xaxis, g.axes.yaxis):
    axis.set_visible(False)
out_file = os.path.join(fig_path, f'{current_sample}-level2-spatial-map.png')
plt.savefig(out_file)

In [None]:
# Level 2 spatial map for one replicate (x axis flipped for this sample), saved as PNG
current_sample = 'RIBOmap-rep1'
sample_obs = adata.obs.loc[adata.obs['protocol-replicate'] == current_sample, :]

fig, ax = plt.subplots(figsize=(15, 20))
g = sns.scatterplot(data=sample_obs, x='column', y='row', hue='level_2',
                    palette=level_2_pl, legend=None, s=10, ax=ax)
g.set_title(current_sample)
g.invert_xaxis()
for axis in (g.axes.xaxis, g.axes.yaxis):
    axis.set_visible(False)
out_file = os.path.join(fig_path, f'{current_sample}-level2-spatial-map.png')
plt.savefig(out_file)

In [None]:
# Drop the filtered marker table before writing the backup below.
# NOTE(review): presumably removed because it cannot be serialized to h5ad - confirm.
# pop(..., None) instead of `del` so that re-running the cell does not raise KeyError.
adata.uns.pop('rank_genes_groups_filtered', None)

In [None]:
# Backup: write the annotated AnnData object to the output folder. The file name is
# prefixed with `date`, the run date string computed in the clustering-output cell.
adata.write_h5ad(f"{out_path}/{date}-Brain-combined-3mad-ct-bk1.h5ad")

In [None]:
# Display the level 2 label order (valid values for `sub_id` in the next section)
level_2_order

## Level 3 clustering

In [None]:
# Subset: copy the cells of one level 2 cell type into `sdata` for sub-clustering
sub_id = 'Microglia'
curr_cells = adata.obs['level_2'].eq(sub_id)
sdata = adata[curr_cells, :].copy()
sdata

In [None]:
%%time
# Redo preprocessing (test) on the subset `sdata`, starting again from the raw counts.
# The statements below are order-dependent: each step reads the X left by the previous one.
sdata.X = sdata.layers['raw'].copy()
# Drop the layers inherited from the full dataset; they are recomputed below.
del sdata.layers['scaled']
del sdata.layers['corrected']

sc.pp.normalize_total(sdata)
sc.pp.log1p(sdata)
# Snapshot taken after normalisation + log1p, before scaling
sdata.raw = sdata
sc.pp.scale(sdata)
sdata.layers['scaled'] = sdata.X.copy()
sc.pp.regress_out(sdata, ['total_counts'])
sdata.layers['corrected'] = sdata.X.copy()

# Run PCA on the corrected matrix
sdata.X = sdata.layers['corrected'].copy()
sc.tl.pca(sdata, svd_solver='full', use_highly_variable=True)

# Plot explained variance
sc.pl.pca_variance_ratio(sdata, log=False)

# Plot PCA
sc.pl.pca(sdata, color='protocol-replicate')

# Integrate across 'protocol-replicate' with Harmony; a later cell reads the result
# from sdata.obsm['X_pca_harmony'].
import scanpy.external as sce
sce.pp.harmony_integrate(sdata, 'protocol-replicate')

In [None]:
# Embedding parameters
# Per-subset settings, keyed by the level-2 label used as sub_id.
# 'n_neighbors' / 'n_pcs' feed sc.pp.neighbors and 'cluster_resolution' feeds sc.tl.leiden.
# NOTE: the UMAP cell below overrides 'min_dist' with a fixed value, so the 'min_dist'
# entries here do not affect the embedding.
emb_dict = {
    'Di/Mesencephalon neurons': {'n_neighbors': 50, 'n_pcs': 30, 'min_dist': .1, 'cluster_resolution': .4},
    'Telencephalon projecting neurons': {'n_neighbors': 50, 'n_pcs': 30, 'min_dist': .1, 'cluster_resolution': .7},
    'Oligodendrocyte': {'n_neighbors': 50, 'n_pcs': 10, 'min_dist': .1, 'cluster_resolution': .2},
    'Astrocyte': {'n_neighbors': 50, 'n_pcs': 30, 'min_dist': .1, 'cluster_resolution': .5},
    'Vascular cells': {'n_neighbors': 50, 'n_pcs': 30, 'min_dist': .1, 'cluster_resolution': .6},
     'Telencephalon interneurons': {'n_neighbors': 50, 'n_pcs': 30, 'min_dist': .1, 'cluster_resolution': .6},
    'Cholinergic, monoaminergic and peptidergic neurons': {'n_neighbors': 50, 'n_pcs': 30, 'min_dist': .1, 'cluster_resolution': .5},
    'Microglia': {'n_neighbors': 20, 'n_pcs': 5, 'min_dist': .1, 'cluster_resolution': .5},
}

# When True, the leiden cell writes a parameter log and the UMAP coordinates to disk.
save_embedding = True

In [None]:
# Output folder for this subset's clustering results (clustering_out_path is defined earlier).
# A '/' inside a label (e.g. 'Di/Mesencephalon neurons') would be read as a path separator,
# so it is replaced by '_'. This yields the same folder name as the former special case and
# also covers any other label that contains '/'.
sub_level_fig_path = os.path.join(clustering_out_path, sub_id.replace('/', '_'))
# exist_ok makes the cell safely re-runnable.
os.makedirs(sub_level_fig_path, exist_ok=True)

### clustering

In [None]:
# replace regular pca with integrated pca
# Overwrite X_pca with the harmony-corrected PCs so that sc.pp.neighbors (which is called
# without use_rep below) builds the graph on the integrated embedding.
sdata.obsm['X_pca'] = sdata.obsm['X_pca_harmony'].copy()
# NOTE(review): only obsm['X_pca'] is replaced here; the variance-ratio plot presumably still
# shows the values stored by the original sc.tl.pca run - confirm.
sc.pl.pca_variance_ratio(sdata, log=False)
sc.pl.pca(sdata, color='protocol-replicate')

In [None]:
%%time
# Computing the neighborhood graph
n_neighbors = emb_dict[sub_id]['n_neighbors']
n_pcs = emb_dict[sub_id]['n_pcs']
min_dist = emb_dict[sub_id]['min_dist']

sc.pp.neighbors(sdata, n_neighbors=n_neighbors, n_pcs=n_pcs, random_state=0)

# Run UMAP
min_dist = 0.001
spread = 3
sc.tl.umap(sdata, min_dist=min_dist, spread=spread)

In [None]:
%%time
# Run leiden cluster
cluster_resolution = emb_dict[sub_id]['cluster_resolution']
sc.tl.leiden(sdata, resolution = cluster_resolution)

# Plot UMAP with cluster labels 
sc.pl.umap(sdata, color='leiden')
n_clusters = sdata.obs['leiden'].unique().shape[0]

if save_embedding:
    # Save log
    with open(f'{sub_level_fig_path}/log_{sub_id}.txt', 'w') as f:
        f.write(f"""Number of neighbor: {n_neighbors}
    Number of PC: {n_pcs}
    Resolution: {cluster_resolution}
    Min-distance: {min_dist}
    Number of clusters: {n_clusters}""")

    # save embeddings
    np.savetxt(f'{sub_level_fig_path}/embedding_{sub_id}_umap.csv', sdata.obsm['X_umap'], delimiter=",")

In [None]:
# Get colormap
# One "hls" colour per leiden cluster; cluster_pl is reused by the UMAP and spatial plots below.
cluster_pl = sns.color_palette("hls", n_clusters)
cluster_cmap = ListedColormap(cluster_pl.as_hex())
# Preview the palette.
sns.palplot(cluster_pl)

In [None]:
# Show Tmem119 expression on the subset UMAP.
sc.pl.umap(sdata, color='Tmem119')

In [None]:
# Plot UMAP with cluster labels w/ new color
# Passing palette= also stores the colours in sdata.uns['leiden_colors'], which later cells read.
fig, ax = plt.subplots(figsize=(10,7))
sc.pl.umap(sdata, color='leiden', legend_loc='on data',
           legend_fontsize=12, legend_fontoutline=2, frameon=False, 
           title='Sub level clustering (leiden)', palette=cluster_pl, save=False, ax=ax)

In [None]:
# Copy the subset's leiden labels onto the full object for a visual check;
# every cell outside the subset is labelled 'NA'.
adata.obs['level_3_temp'] = 'NA'
# Align on the cell index (as the "update adata" cells do) instead of relying on a
# recomputed level_2 mask having the same row order as sdata.
adata.obs.loc[sdata.obs.index, 'level_3_temp'] = sdata.obs['leiden'].astype(str).values
# Category order = leiden clusters followed by 'NA'. Declaring the categories explicitly
# also works when a category happens to be empty (reorder_categories would raise).
temp_order = sdata.obs['leiden'].cat.categories.to_list() + ['NA']
adata.obs['level_3_temp'] = pd.Categorical(adata.obs['level_3_temp'], categories=temp_order)
# Cluster colours as used on the subset UMAP, plus light grey for 'NA'.
# list(...) guards against the colours being stored as a numpy array.
temp_pl = sns.color_palette(list(sdata.uns['leiden_colors']) + ['#ebebeb'])

In [None]:
# Plot UMAP with all cell embedding
# The subset's leiden clusters are coloured on the full-data UMAP; all other cells use the
# last palette entry (light grey, 'NA').
fig, ax = plt.subplots(figsize=(10,7))
sc.pl.umap(adata, color='level_3_temp', legend_loc='on data',
           legend_fontsize=12, legend_fontoutline=2, frameon=False, 
           title='Sub level clustering (leiden)', palette=temp_pl, save=False, ax=ax)

In [None]:
# Add log layer
# sdata.layers['log_raw'] = np.log1p(sdata.layers['raw'])
# sc.pp.normalize_total(sdata, layer='log_raw')
# sc.tl.rank_genes_groups(sdata, 'leiden', method='wilcoxon', layer='log_raw', pts=True, use_raw=False, n_genes=adata.shape[1])

# Find gene markers for each cluster
# use_raw=True tests on sdata.raw (set in the preprocessing cell); n_genes is the number of
# variables, so every gene is kept in the result.
sc.tl.rank_genes_groups(sdata, 'leiden', method='wilcoxon', pts=True, use_raw=True, n_genes=adata.shape[1])

# Filter markers
# The filtered table is read from sdata.uns['rank_genes_groups_filtered'] by the cells below.
sc.tl.filter_rank_genes_groups(sdata, min_fold_change=.1, min_in_group_fraction=0.2, max_out_group_fraction=0.8)

In [None]:
# Inspect the top-ranked (unfiltered) markers of one leiden cluster; edit the id to browse.
current_cell_type = '2'

current_df = sc.get.rank_genes_groups_df(sdata, group=current_cell_type, key='rank_genes_groups')
current_df.head(10)

In [None]:
# Dot plot logfoldchanges
# Top 5 genes per cluster from the unfiltered ranking, coloured by log fold change
# (colour scale clipped to [-5, 5]).
sc.pl.rank_genes_groups_dotplot(sdata, key='rank_genes_groups', n_genes=5, values_to_plot='logfoldchanges', min_logfoldchange=1, vmax=5, vmin=-5, cmap='bwr', dendrogram=False)

In [None]:
# Dot plot mean expression (##)
# Same layout as above, using the filtered marker table and the default values_to_plot.
sc.pl.rank_genes_groups_dotplot(sdata, key='rank_genes_groups_filtered', n_genes=5, dendrogram=False)

In [None]:
# Print markers
# For each column of the filtered marker table (top 15 rows): print the column position,
# then its gene names on one line separated by spaces. All names are collected in `markers`.
markers = []
temp = pd.DataFrame(sdata.uns['rank_genes_groups_filtered']['names']).head(15)
for i, (_, column) in enumerate(temp.items()):
    curr_col = column.to_list()
    markers.extend(curr_col)
    print(i)
    print(''.join(f'{j} ' for j in curr_col))

In [None]:
# # gene_list = ['Slc17a7', 'Gad1', 'Gad2', 'Sst', 'Pvalb', 'Npy', 'Vip', 'Pcp4', 'Cux2', 'Kif5a', 'Slc32a1', 'Nrgn', 'Sncg', 'Rorb', 'Tmsb4x']
# # gene_list = ['Aqp4', 'Gfap', 'Plp1', 'Mbp', 'Mobp', 'Slc1a3', 'Pdgfra', 'Bsg', 'Vtn', 'Vim']
# gene_list = ['Pvalb', 'Sst', 'Vip', 'Npy']

# fig, axs = plt.subplots(nrows=3, ncols=5, figsize=(25, 12))
# axs = axs.flatten()
# for i, gene in enumerate(gene_list):
#     ax = sc.pl.umap(sdata, color=gene, title=gene, ax=axs[i], show=False)
    
# plt.show()

In [None]:
# Spatial map of the subset's leiden clusters for one replicate, drawn over all cells in grey.
current_sample = 'RIBOmap-rep2'
background = adata.obs.loc[adata.obs['protocol-replicate'] == current_sample, :]
foreground = sdata.obs.loc[sdata.obs['protocol-replicate'] == current_sample, :]

fig, ax = plt.subplots(figsize=(15, 20))
# Grey layer: every cell of the replicate.
sns.scatterplot(data=background, x='column', y='row', color='#ebebeb', s=5, ax=ax)
# Coloured layer: the subset's cells, one colour per leiden cluster.
g = sns.scatterplot(data=foreground, x='column', y='row',
                    hue='leiden', palette=cluster_pl, s=5, ax=ax)

g.set_title(current_sample)
g.invert_yaxis()
for hidden_axis in (g.axes.xaxis, g.axes.yaxis):
    hidden_axis.set_visible(False)

In [None]:
# Spatial map of the subset's leiden clusters for one replicate, drawn over all cells in grey.
current_sample = 'RIBOmap-rep1'
background = adata.obs.loc[adata.obs['protocol-replicate'] == current_sample, :]
foreground = sdata.obs.loc[sdata.obs['protocol-replicate'] == current_sample, :]

fig, ax = plt.subplots(figsize=(15, 20))
# Grey layer: every cell of the replicate.
sns.scatterplot(data=background, x='column', y='row', color='#ebebeb', s=5, ax=ax)
# Coloured layer: the subset's cells, one colour per leiden cluster.
g = sns.scatterplot(data=foreground, x='column', y='row',
                    hue='leiden', palette=cluster_pl, s=5, ax=ax)

g.set_title(current_sample)
g.invert_xaxis()
for hidden_axis in (g.axes.xaxis, g.axes.yaxis):
    hidden_axis.set_visible(False)

In [None]:
# plot summary plot for each cluster
# One 3x2 figure per leiden cluster, saved as cluster_<id>.jpeg:
#   row 1: spatial position of the cluster in rep1 / rep2 over all cells,
#   row 2: the cluster on the subset UMAP, split by replicate,
#   row 3: the cluster on the full-data UMAP, split by replicate.
sub_level_sum_path = os.path.join(sub_level_fig_path, f'r_{cluster_resolution}_summary_repp')
os.makedirs(sub_level_sum_path, exist_ok=True)

# Inputs that do not depend on the cluster are built once, not once per iteration.
size_factor = 200000
replicates = ['RIBOmap-rep1', 'RIBOmap-rep2']
all_cells_by_rep = {rep: adata.obs.loc[adata.obs['protocol-replicate'] == rep, :] for rep in replicates}
sub_cells_by_rep = {rep: sdata.obs.loc[sdata.obs['protocol-replicate'] == rep, :] for rep in replicates}
# (object, label column, first panel index) for the two UMAP rows.
umap_rows = [(sdata, 'leiden', 2), (adata, 'level_3_temp', 4)]

for current_cluster in tqdm(sorted(sdata.obs['leiden'].unique())):

    fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(12, 16))
    axs = axs.flatten()

    # Row 1: spatial maps, all cells as background and the current cluster on top.
    for panel, rep in enumerate(replicates):
        g = sns.scatterplot(x='column', y='row', color='#1111',
                            data=all_cells_by_rep[rep],
                            s=5,
                            ax=axs[panel])
        g.set_title(rep)
        # Same orientation as the single-replicate maps: rep1 flips x, rep2 flips y.
        if rep == 'RIBOmap-rep1':
            g.invert_xaxis()
        else:
            g.invert_yaxis()
        g.axes.xaxis.set_visible(False)
        g.axes.yaxis.set_visible(False)

        rep_cells = sub_cells_by_rep[rep]
        sns.scatterplot(x='column', y='row', hue='leiden', legend=None,
                        palette=cluster_pl,
                        data=rep_cells.loc[rep_cells['leiden'] == current_cluster, ],
                        s=7,
                        ax=axs[panel])

    # Rows 2-3: UMAPs, pale background of all points with the current cluster highlighted.
    for obj, label_key, first_panel in umap_rows:
        point_size = size_factor / obj.n_obs
        # Colour stored for this cluster by the earlier sc.pl.umap(..., palette=...) call.
        highlight_pl = sns.color_palette([obj.uns[f'{label_key}_colors'][int(current_cluster)]])
        for offset, rep in enumerate(replicates):
            ax = sc.pl.umap(obj, show=False, color=None, alpha=1, size=point_size,
                            ax=axs[first_panel + offset], title='',
                            palette=sns.color_palette(['#fafafa']))
            selected = (obj.obs[label_key] == current_cluster) & (obj.obs['protocol-replicate'] == rep)
            sc.pl.umap(obj[selected], color=label_key, frameon=False, ax=ax, legend_loc=None,
                       size=point_size, title='', show=False, palette=highlight_pl)

    plt.savefig(os.path.join(sub_level_sum_path, f'cluster_{current_cluster}.jpeg'))

### assign label

In [None]:
# Plot UMAP with cluster labels w/ new color
# Reference view of the numbered leiden clusters while choosing cell-type names below.
fig, ax = plt.subplots(figsize=(10,7))
sc.pl.umap(sdata, color='leiden', legend_loc='on data',
           legend_fontsize=12, legend_fontoutline=2, frameon=False, 
           title=f'Level 3 {sub_id}', palette=cluster_pl, save=False, ax=ax)

In [None]:
# Print markers
# For each column of the filtered marker table (top 15 rows): print the column position,
# then its gene names on one line separated by spaces. All names are collected in `markers`.
markers = []
temp = pd.DataFrame(sdata.uns['rank_genes_groups_filtered']['names']).head(15)
for i, (_, column) in enumerate(temp.items()):
    curr_col = column.to_list()
    markers.extend(curr_col)
    print(i)
    print(''.join(f'{j} ' for j in curr_col))

In [None]:
# Inspect the top-ranked (unfiltered) markers of one leiden cluster; edit the id to browse.
current_cell_type = '6'

current_df = sc.get.rank_genes_groups_df(sdata, group=current_cell_type, key='rank_genes_groups')
current_df.head(10)

In [None]:
# Change cluster label to cell type label
# Manual annotation per level-2 subset: entry k of a list names leiden cluster 'k'.
# Repeating a name merges several clusters into one level-3 type.
level_3_labels = {
    'Astrocyte': ['Astro1', #0
                  'Astro2', #1
                  'Astro3', #2
                  'Astro4', #3
                 ],
    'Vascular cells': ['Peri/VEC1', #0
                       'Peri/VEC2', #1
                       'VLMC', #2
                       'VSMC', #3
                      ],
    'Telencephalon interneurons': ['Inh Sst', #0
                                   'Inh Pvalb 1', #1
                                   'Inh Pvalb 2', #2
                                  ],
    'Cholinergic, monoaminergic and peptidergic neurons': ['TECHO 1', #0
                                                           'TECHO 1', #1
                                                           'TECHO 2', #2
                                                           'DECHO', #3
                                                           'HYPEP', #4
                                                          ],
    'Di/Mesencephalon neurons': ['DEGLU 1', #0
                                 'DEGLU 2', #1
                                ],
    # Excitatory neuron
    'Telencephalon projecting neurons': ['TEGLU 1', #0
                                         'TEGLU 1', #1
                                         'TEGLU 1', #2
                                         'MSN', #3
                                         'DGGRC', #4
                                         'TEGLU 2', #5
                                         'TEGLU 1', #6
                                         'TEGLU 3', #7
                                        ],
}

# Fail loudly for a subset that has no annotation yet, instead of silently reusing a
# level_3_list left over from a previously annotated subset (or hitting a NameError).
if sub_id not in level_3_labels:
    raise KeyError(f"No level-3 annotation defined for sub_id {sub_id!r}; add it to level_3_labels")
level_3_list = level_3_labels[sub_id]

# The annotation must cover every leiden cluster found in the current clustering.
leiden_ids = sorted(sdata.obs['leiden'].unique())
if max(int(i) for i in leiden_ids) >= len(level_3_list):
    raise IndexError(f"{sub_id!r}: {len(leiden_ids)} leiden clusters but only "
                     f"{len(level_3_list)} level-3 labels")

# leiden id (string) -> level-3 cell-type name
transfer_dict = {i: level_3_list[int(i)] for i in leiden_ids}

In [None]:
# Print markers
# Top 10 unfiltered markers per column of the ranking table. Each header line shows the
# column position and the level-3 name at that position in level_3_list, followed by the
# gene names separated by commas. All names are collected in `markers`.
markers = []
temp = pd.DataFrame(sdata.uns['rank_genes_groups']['names']).head(10)
for i, (_, column) in enumerate(temp.items()):
    curr_col = column.to_list()
    markers.extend(curr_col)
    print(f"{i} - {level_3_list[i]}")
    print(''.join(f'{j},' for j in curr_col))

In [None]:
# Assign cell type to sdata
# Map leiden ids to cell-type names through plain strings: several clusters may share one
# name, and a many-to-one replace on a categorical column is deprecated in recent pandas.
sdata.obs['level_3'] = sdata.obs['leiden'].astype(str).map(transfer_dict).astype('category')

# Sort category
level_3_order = natsorted(set(level_3_list))
# reorder_categories(..., inplace=True) was removed in pandas 2.0 (and acted on a temporary
# Series); assign the result back, as the level_3_temp cell already does.
sdata.obs['level_3'] = sdata.obs['level_3'].cat.reorder_categories(level_3_order)

In [None]:
# Check color legend
# One "hls" colour per level-3 label, in level_3_order; shown as a labelled strip.
level_3_pl = sns.color_palette("hls", len(level_3_order))
sns.palplot(level_3_pl, size=3)
plt.xticks(range(len(level_3_order)), level_3_order, size=10, rotation=45)
plt.tight_layout()
# plt.savefig(f'./figures/color_legend_top.png')
plt.show()

In [None]:
# Plot UMAP coloured by the assigned level-3 labels (save=False: shown only, not written to disk).
sc.pl.umap(sdata, color='level_3', legend_loc='right margin',
           legend_fontsize=12, legend_fontoutline=2, frameon=False, 
           title=f'Level 3 {sub_id}', palette=level_3_pl, save=False)

In [None]:
# Spatial map of the subset's level-3 labels for one replicate, drawn over all cells in grey.
current_sample = 'RIBOmap-rep2'
background = adata.obs.loc[adata.obs['protocol-replicate'] == current_sample, :]
foreground = sdata.obs.loc[sdata.obs['protocol-replicate'] == current_sample, :]

fig, ax = plt.subplots(figsize=(15, 20))
# Grey layer: every cell of the replicate.
sns.scatterplot(data=background, x='column', y='row', color='#ebebeb', s=10, ax=ax)
# Coloured layer: the subset's cells, one colour per level-3 label.
g = sns.scatterplot(data=foreground, x='column', y='row',
                    hue='level_3', palette=level_3_pl, s=10, ax=ax)

g.set_title(current_sample)
g.invert_yaxis()
for hidden_axis in (g.axes.xaxis, g.axes.yaxis):
    hidden_axis.set_visible(False)

In [None]:
# Spatial map of the subset's level-3 labels for one replicate, drawn over all cells in grey.
current_sample = 'RIBOmap-rep1'
background = adata.obs.loc[adata.obs['protocol-replicate'] == current_sample, :]
foreground = sdata.obs.loc[sdata.obs['protocol-replicate'] == current_sample, :]

fig, ax = plt.subplots(figsize=(15, 20))
# Grey layer: every cell of the replicate.
sns.scatterplot(data=background, x='column', y='row', color='#ebebeb', s=10, ax=ax)
# Coloured layer: the subset's cells, one colour per level-3 label.
g = sns.scatterplot(data=foreground, x='column', y='row',
                    hue='level_3', palette=level_3_pl, s=10, ax=ax)

g.set_title(current_sample)
g.invert_xaxis()
for hidden_axis in (g.axes.xaxis, g.axes.yaxis):
    hidden_axis.set_visible(False)

In [None]:
# Find gene markers for each level-3 cell type
# Same settings as the leiden ranking above; this overwrites sdata.uns['rank_genes_groups'].
sc.tl.rank_genes_groups(sdata, 'level_3', method='wilcoxon', pts=True, use_raw=True, n_genes=adata.shape[1])

# Filter markers
sc.tl.filter_rank_genes_groups(sdata, min_fold_change=.1, min_in_group_fraction=0.2, max_out_group_fraction=0.8)

In [None]:
# Dot plot of the top 5 markers per level-3 type (genes on rows via swap_axes).
# use_raw=False takes values from sdata.X rather than sdata.raw; colours are clipped to [-1, 1].
sc.pl.rank_genes_groups_dotplot(sdata, n_genes=5, groupby='level_3', min_logfoldchange=1, use_raw=False, swap_axes=True, 
                                vmin=-1, vmax=1, cmap='bwr', dendrogram=False, save=False)

In [None]:
# Expression of selected genes on the subset UMAP; the commented lines are gene sets
# used for other subsets.
# sc.pl.umap(sdata, color=['Dcn', 'Bsg', 'Flt1', 'Myl9'])
# sc.pl.umap(sdata, color=['Gfap', 'Calm1', 'Shank1', 'Mbp'])
# sc.pl.umap(sdata, color=['Gad1', 'Sst', 'Pvalb', 'Npy', 'Vip'])
# sc.pl.umap(sdata, color=['Scg2', 'Dlk1', 'Gabbr1', 'Cplx2', 'Camkv'])
sc.pl.umap(sdata, color=['Scg2', 'Dlk1', 'Gabbr1', 'Cplx2', 'Camkv'])

### update adata

In [None]:
# Map to original obj
# adata.obs['level_3'] must already exist. It is cast to object so that label names which
# are not yet among its values can be written.
adata.obs['level_3'] = adata.obs['level_3'].astype(object)
# Write the subset's labels back, aligned on the cell index.
adata.obs.loc[sdata.obs.index, 'level_3'] = sdata.obs['level_3'].values
adata.obs['level_3'].unique()

In [None]:
# backup obs
# Cell metadata only, as CSV; overwrites the previous "bk1" CSV written on the same date.
adata.obs.to_csv(f"{out_path}/{date}-Brain-combined-3mad-ct-bk1.csv")

In [None]:
# backup
# Full object; overwrites the previous "bk1" .h5ad written on the same date.
adata.write_h5ad(f"{out_path}/{date}-Brain-combined-3mad-ct-bk1.h5ad")

## TEGLU clustering

In [None]:
# Subset
# Copy the 'Telencephalon projecting neurons' cells into an independent AnnData object (sdata).
# sub_id also selects the parameters in emb_dict and names the output folder/files below.
sub_id = 'Telencephalon projecting neurons'
curr_cells = adata.obs['level_2'] == sub_id
sdata = adata[curr_cells, :].copy()
sdata

In [None]:
%%time
# redo pp (test)
# Re-run preprocessing on the subset only, starting again from the raw counts layer.
sdata.X = sdata.layers['raw'].copy()
# Drop the layers inherited from the parent object; they are rebuilt for this subset below.
del sdata.layers['scaled']
del sdata.layers['corrected']

sc.pp.normalize_total(sdata)
sc.pp.log1p(sdata)
# Snapshot the log-normalised values in .raw before scaling overwrites X.
sdata.raw = sdata
sc.pp.scale(sdata)
sdata.layers['scaled'] = sdata.X.copy()
sc.pp.regress_out(sdata, ['total_counts'])
sdata.layers['corrected'] = sdata.X.copy()

# Run PCA
sdata.X = sdata.layers['corrected'].copy()
# NOTE(review): use_highly_variable=True needs sdata.var['highly_variable'];
# presumably inherited from adata - confirm.
sc.tl.pca(sdata, svd_solver='full', use_highly_variable=True)

# Plot explained variance 
sc.pl.pca_variance_ratio(sdata, log=False)

# Plot PCA
sc.pl.pca(sdata, color='protocol-replicate')

# Integrate the two replicates on the PCs; the clustering cells below read the
# result from sdata.obsm['X_pca_harmony'].
import scanpy.external as sce
sce.pp.harmony_integrate(sdata, 'protocol-replicate')

In [None]:
# Embedding parameters
# Per-subset settings, keyed by the level-2 label used as sub_id.
# 'n_neighbors' / 'n_pcs' feed sc.pp.neighbors and 'cluster_resolution' feeds sc.tl.leiden.
# NOTE: the UMAP cell below overrides 'min_dist' with a fixed value, so the 'min_dist'
# entries here do not affect the embedding.
# NOTE(review): this copy differs from the emb_dict in the "Level 3 clustering" section
# (no 'Microglia' entry; 'Telencephalon interneurons' resolution .63 instead of .6) - confirm
# which values are intended.
emb_dict = {
    'Di/Mesencephalon neurons': {'n_neighbors': 50, 'n_pcs': 30, 'min_dist': .1, 'cluster_resolution': .4},
    'Telencephalon projecting neurons': {'n_neighbors': 50, 'n_pcs': 30, 'min_dist': .1, 'cluster_resolution': .7},
    'Oligodendrocyte': {'n_neighbors': 50, 'n_pcs': 10, 'min_dist': .1, 'cluster_resolution': .2},
    'Astrocyte': {'n_neighbors': 50, 'n_pcs': 30, 'min_dist': .1, 'cluster_resolution': .5},
    'Vascular cells': {'n_neighbors': 50, 'n_pcs': 30, 'min_dist': .1, 'cluster_resolution': .6},
    'Telencephalon interneurons': {'n_neighbors': 50, 'n_pcs': 30, 'min_dist': .1, 'cluster_resolution': .63},
    'Cholinergic, monoaminergic and peptidergic neurons': {'n_neighbors': 50, 'n_pcs': 30, 'min_dist': .1, 'cluster_resolution': .5},
}

# When True, the leiden cell writes a parameter log and the UMAP coordinates to disk.
save_embedding = True

In [None]:
# Output folder for this subset's clustering results (clustering_out_path is defined earlier).
# A '/' inside a label (e.g. 'Di/Mesencephalon neurons') would be read as a path separator,
# so it is replaced by '_'. This yields the same folder name as the former special case and
# also covers any other label that contains '/'.
sub_level_fig_path = os.path.join(clustering_out_path, sub_id.replace('/', '_'))
# exist_ok makes the cell safely re-runnable.
os.makedirs(sub_level_fig_path, exist_ok=True)

### clustering

In [None]:
# replace regular pca with integrated pca
# Overwrite X_pca with the harmony-corrected PCs so that sc.pp.neighbors (which is called
# without use_rep below) builds the graph on the integrated embedding.
sdata.obsm['X_pca'] = sdata.obsm['X_pca_harmony'].copy()
# NOTE(review): only obsm['X_pca'] is replaced here; the variance-ratio plot presumably still
# shows the values stored by the original sc.tl.pca run - confirm.
sc.pl.pca_variance_ratio(sdata, log=False)
sc.pl.pca(sdata, color='protocol-replicate')

In [None]:
%%time
# Computing the neighborhood graph
n_neighbors = emb_dict[sub_id]['n_neighbors']
n_pcs = emb_dict[sub_id]['n_pcs']
min_dist = emb_dict[sub_id]['min_dist']

sc.pp.neighbors(sdata, n_neighbors=n_neighbors, n_pcs=n_pcs, random_state=0)

# Run UMAP
min_dist = 0.001
spread = 3
sc.tl.umap(sdata, min_dist=min_dist, spread=spread)

In [None]:
%%time
# Run leiden cluster
cluster_resolution = emb_dict[sub_id]['cluster_resolution']
sc.tl.leiden(sdata, resolution = cluster_resolution)

# Plot UMAP with cluster labels 
sc.pl.umap(sdata, color='leiden')
n_clusters = sdata.obs['leiden'].unique().shape[0]

if save_embedding:
    # Save log
    with open(f'{sub_level_fig_path}/log_{sub_id}.txt', 'w') as f:
        f.write(f"""Number of neighbor: {n_neighbors}
    Number of PC: {n_pcs}
    Resolution: {cluster_resolution}
    Min-distance: {min_dist}
    Number of clusters: {n_clusters}""")

    # save embeddings
    np.savetxt(f'{sub_level_fig_path}/embedding_{sub_id}_umap.csv', sdata.obsm['X_umap'], delimiter=",")

In [None]:
# Get colormap
# One "hls" colour per leiden cluster; cluster_pl is reused by the UMAP plots below.
cluster_pl = sns.color_palette("hls", n_clusters)
cluster_cmap = ListedColormap(cluster_pl.as_hex())
# Preview the palette.
sns.palplot(cluster_pl)

In [None]:
# Plot UMAP with cluster labels w/ new color
# Passing palette= also stores the colours in sdata.uns['leiden_colors'], which later cells read.
fig, ax = plt.subplots(figsize=(10,7))
sc.pl.umap(sdata, color='leiden', legend_loc='on data',
           legend_fontsize=12, legend_fontoutline=2, frameon=False, 
           title='Sub level clustering (leiden)', palette=cluster_pl, save=False, ax=ax)

In [None]:
# Copy the subset's leiden labels onto the full object for a visual check;
# every cell outside the subset is labelled 'NA'.
adata.obs['level_3_temp'] = 'NA'
# Align on the cell index (as the "update adata" cells do) instead of relying on a
# recomputed level_2 mask having the same row order as sdata.
adata.obs.loc[sdata.obs.index, 'level_3_temp'] = sdata.obs['leiden'].astype(str).values
# Category order = leiden clusters followed by 'NA'. Declaring the categories explicitly
# also works when a category happens to be empty (reorder_categories would raise).
temp_order = sdata.obs['leiden'].cat.categories.to_list() + ['NA']
adata.obs['level_3_temp'] = pd.Categorical(adata.obs['level_3_temp'], categories=temp_order)
# Cluster colours as used on the subset UMAP, plus light grey for 'NA'.
# list(...) guards against the colours being stored as a numpy array.
temp_pl = sns.color_palette(list(sdata.uns['leiden_colors']) + ['#ebebeb'])

In [None]:
# Plot UMAP with all cell embedding
# The subset's leiden clusters are coloured on the full-data UMAP; all other cells use the
# last palette entry (light grey, 'NA').
fig, ax = plt.subplots(figsize=(10,7))
sc.pl.umap(adata, color='level_3_temp', legend_loc='on data',
           legend_fontsize=12, legend_fontoutline=2, frameon=False, 
           title='Sub level clustering (leiden)', palette=temp_pl, save=False, ax=ax)

### assign label

In [None]:
# Plot UMAP with cluster labels w/ new color
# Reference view of the numbered leiden clusters while choosing cell-type names below.
fig, ax = plt.subplots(figsize=(10,7))
sc.pl.umap(sdata, color='leiden', legend_loc='on data',
           legend_fontsize=12, legend_fontoutline=2, frameon=False, 
           title=f'Level 3 {sub_id}', palette=cluster_pl, save=False, ax=ax)

In [None]:
# Change cluster label to cell type label
# This annotation only applies to the telencephalon projecting neuron subset. Stop here for
# any other sub_id rather than silently reusing a level_3_list from an earlier run.
if sub_id != 'Telencephalon projecting neurons':
    raise KeyError(f"No level-3 annotation defined in this cell for sub_id {sub_id!r}")

# Excitatory neuron: entry k names leiden cluster 'k'; repeated names merge clusters.
level_3_list = ['TEGLU 1', #0
                'TEGLU 1', #1
                'TEGLU 1', #2
                'MSN', #3
                'DGGRC', #4
                'TEGLU 2', #5
                'TEGLU 1', #6
                'TEGLU 3', #7
               ]

# The annotation must cover every leiden cluster found in the current clustering.
leiden_ids = sorted(sdata.obs['leiden'].unique())
if max(int(i) for i in leiden_ids) >= len(level_3_list):
    raise IndexError(f"{sub_id!r}: {len(leiden_ids)} leiden clusters but only "
                     f"{len(level_3_list)} level-3 labels")

# leiden id (string) -> level-3 cell-type name
transfer_dict = {i: level_3_list[int(i)] for i in leiden_ids}

In [None]:
# Assign cell type to sdata
# Map leiden ids to cell-type names through plain strings: several clusters share one name,
# and a many-to-one replace on a categorical column is deprecated in recent pandas.
sdata.obs['level_3'] = sdata.obs['leiden'].astype(str).map(transfer_dict).astype('category')

# Sort category
level_3_order = natsorted(set(level_3_list))
# reorder_categories(..., inplace=True) was removed in pandas 2.0 (and acted on a temporary
# Series); assign the result back, as the level_3_temp cell already does.
sdata.obs['level_3'] = sdata.obs['level_3'].cat.reorder_categories(level_3_order)

In [None]:
# Check color legend
# One "hls" colour per level-3 label, in level_3_order; shown as a labelled strip.
level_3_pl = sns.color_palette("hls", len(level_3_order))
sns.palplot(level_3_pl, size=3)
plt.xticks(range(len(level_3_order)), level_3_order, size=10, rotation=45)
plt.tight_layout()
# plt.savefig(f'./figures/color_legend_top.png')
plt.show()

In [None]:
# Plot UMAP coloured by the assigned level-3 labels (save=False: shown only, not written to disk).
sc.pl.umap(sdata, color='level_3', legend_loc='right margin',
           legend_fontsize=12, legend_fontoutline=2, frameon=False, 
           title=f'Level 3 {sub_id}', palette=level_3_pl, save=False)

In [None]:
# Spatial map of the subset's level-3 labels for one replicate, drawn over all cells in grey.
current_sample = 'RIBOmap-rep2'
background = adata.obs.loc[adata.obs['protocol-replicate'] == current_sample, :]
foreground = sdata.obs.loc[sdata.obs['protocol-replicate'] == current_sample, :]

fig, ax = plt.subplots(figsize=(15, 20))
# Grey layer: every cell of the replicate.
sns.scatterplot(data=background, x='column', y='row', color='#ebebeb', s=10, ax=ax)
# Coloured layer: the subset's cells, one colour per level-3 label.
g = sns.scatterplot(data=foreground, x='column', y='row',
                    hue='level_3', palette=level_3_pl, s=10, ax=ax)

g.set_title(current_sample)
g.invert_yaxis()
for hidden_axis in (g.axes.xaxis, g.axes.yaxis):
    hidden_axis.set_visible(False)

In [None]:
# Spatial map of the subset's level-3 labels for one replicate, drawn over all cells in grey.
current_sample = 'RIBOmap-rep1'
background = adata.obs.loc[adata.obs['protocol-replicate'] == current_sample, :]
foreground = sdata.obs.loc[sdata.obs['protocol-replicate'] == current_sample, :]

fig, ax = plt.subplots(figsize=(15, 20))
# Grey layer: every cell of the replicate.
sns.scatterplot(data=background, x='column', y='row', color='#ebebeb', s=10, ax=ax)
# Coloured layer: the subset's cells, one colour per level-3 label.
g = sns.scatterplot(data=foreground, x='column', y='row',
                    hue='level_3', palette=level_3_pl, s=10, ax=ax)

g.set_title(current_sample)
g.invert_xaxis()
for hidden_axis in (g.axes.xaxis, g.axes.yaxis):
    hidden_axis.set_visible(False)

### update adata

In [None]:
# Map to original obj
# adata.obs['level_3'] must already exist. It is cast to object so that label names which
# are not yet among its values can be written.
adata.obs['level_3'] = adata.obs['level_3'].astype(object)
# Write the subset's labels back, aligned on the cell index.
adata.obs.loc[sdata.obs.index, 'level_3'] = sdata.obs['level_3'].values
adata.obs['level_3'].unique()

In [None]:
# Number of distinct level-3 labels now present on the full object.
adata.obs['level_3'].unique().shape

### classify mix cells

In [None]:
# calculate the distance between each cell and the centroid of its level-3 cluster (UMAP space)
# Float placeholder: the column receives float distances below, so it must not start as int.
sdata.obs['distance2centroid_leiden_teglu'] = 9999.0

# find centroid of each level-3 cluster
from sklearn.neighbors import NearestCentroid
from scipy.spatial.distance import cdist
clf = NearestCentroid(shrink_threshold=None)
clf.fit(sdata.obsm['X_umap'], sdata.obs.level_3)

# plot cluster center
fig, ax = plt.subplots(figsize=(10,7))
sc.pl.umap(sdata, color='level_3', ax=ax, show=False, legend_loc='on data')
ax.scatter(clf.centroids_[:, 0], clf.centroids_[:, 1], s=10, c='r')
plt.show()

umap_xy = sdata.obsm['X_umap']

# Row i of clf.centroids_ belongs to clf.classes_[i]; iterating over classes_ keeps labels
# and centroids paired even if a category has no cells.
for i, current_label in enumerate(clf.classes_):
    current_centroid = clf.centroids_[i, :].reshape([1,2])
    in_cluster = (sdata.obs['level_3'] == current_label).to_numpy()
    cluster_xy = umap_xy[in_cluster]

    # check centroid location 
    # fig, ax = plt.subplots(figsize=(10,7))
    # sc.pl.umap(adata, color='leiden', ax=ax, show=False, legend_loc='on data', title=f'cluster-{current_label}')
    # ax.scatter(current_centroid[:, 0], current_centroid[:, 1], s=10, c='r')
    # plt.show()

    # calculate distance (cdist returns an (n, 1) array; flatten it to one value per cell)
    dm = cdist(cluster_xy, current_centroid).ravel()
    sdata.obs.loc[in_cluster, 'distance2centroid_leiden_teglu'] = dm

    # plot distance distribution
    fig, ax = plt.subplots(figsize=(5,3))
    sns.histplot(dm, ax=ax)
    plt.title(f'cluster-{current_label}')
    plt.show()

    # visualize distance calculation
    # x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
    fig, axs = plt.subplots(figsize=(20,7), nrows=1, ncols=2)
    # left: all cells (dark, translucent), current cluster on top, centroid in red
    sns.scatterplot(x=umap_xy[:, 0], y=umap_xy[:, 1], color='#1111', s=1, ax=axs[0])
    sns.scatterplot(x=cluster_xy[:, 0], y=cluster_xy[:, 1], s=1, legend=None, ax=axs[0])
    axs[0].scatter(current_centroid[:, 0], current_centroid[:, 1], s=10, c='r')

    # right: same view, cluster cells coloured by their distance to the centroid
    sns.scatterplot(x=umap_xy[:, 0], y=umap_xy[:, 1], color='#1111', s=1, ax=axs[1])
    sns.scatterplot(x=cluster_xy[:, 0], y=cluster_xy[:, 1],
                    hue=dm, palette='viridis', s=1, legend=None, ax=axs[1])
    axs[1].scatter(current_centroid[:, 0], current_centroid[:, 1], s=10, c='r')
    plt.show()

    

In [None]:
# classify mix cells
# A cell is flagged as "mixed" when its UMAP distance to the centroid of its own level-3
# cluster exceeds a manually chosen, per-cluster threshold.

# leiden
# manual_threshold = [
#     6, # 0
#     6, # 1
#     6.5, # 2
#     4, # 3
#     3, # 4
#     3, # 5
#     2, # 6
#     2, # 7
# ]

# One threshold per level-3 label, in the order of clf.classes_ (sorted labels).
manual_threshold = [
    2.4, # DGGRC
    4, # MSN
    10, # TEGLU1
    2.5, # TEGLU2
    2, # TEGLU3
]

if len(manual_threshold) != len(clf.classes_):
    raise ValueError(f"{len(manual_threshold)} thresholds given for {len(clf.classes_)} level-3 labels")

# Declare both categories up front. Converting to 'category' inside the loop would freeze the
# categories to the values seen so far, and a later assignment of 'True' would then fail if
# the first cluster had no cell beyond its threshold.
sdata.obs['is_mix_teglu'] = pd.Categorical(['False'] * sdata.n_obs, categories=['False', 'True'])

# plot overall distance distribution (histplot replaces the deprecated sns.distplot)
fig, ax = plt.subplots()
sns.histplot(sdata.obs['distance2centroid_leiden_teglu'], kde=True, stat='density', ax=ax)

umap_xy = sdata.obsm['X_umap']

# Row i of clf.centroids_ belongs to clf.classes_[i].
for i, current_label in enumerate(clf.classes_):
    current_centroid = clf.centroids_[i, :].reshape([1,2])
    in_cluster = (sdata.obs['level_3'] == current_label).to_numpy()
    cluster_xy = umap_xy[in_cluster]
    cluster_dist = sdata.obs.loc[in_cluster, 'distance2centroid_leiden_teglu']

    # plot distance distribution of current cluster, with the threshold as a red line
    fig, ax = plt.subplots(figsize=(5,3))
    sns.histplot(cluster_dist, ax=ax)
    ax.axvline(x=manual_threshold[i], c='r')
    plt.title(f'cluster-{current_label}')
    plt.show()

    too_far = (sdata.obs['distance2centroid_leiden_teglu'] > manual_threshold[i]).to_numpy()
    sdata.obs.loc[in_cluster & too_far, 'is_mix_teglu'] = 'True'

    # x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
    fig, axs = plt.subplots(figsize=(20,7), nrows=1, ncols=2)
    # left: all cells (dark, translucent), current cluster on top, centroid in red
    sns.scatterplot(x=umap_xy[:, 0], y=umap_xy[:, 1], color='#1111', s=1, ax=axs[0])
    sns.scatterplot(x=cluster_xy[:, 0], y=cluster_xy[:, 1], s=1, legend=None, ax=axs[0])
    axs[0].scatter(current_centroid[0, 0], current_centroid[0, 1], s=10, c='r')

    # right: same view, cluster cells coloured by the mixed / not-mixed flag
    sns.scatterplot(x=umap_xy[:, 0], y=umap_xy[:, 1], color='#1111', s=1, ax=axs[1])
    sns.scatterplot(x=cluster_xy[:, 0], y=cluster_xy[:, 1],
                    hue=sdata.obs.loc[in_cluster, 'is_mix_teglu'].values, palette='Set1', s=1, legend=None, ax=axs[1])
    axs[1].scatter(current_centroid[0, 0], current_centroid[0, 1], s=10, c='r')



In [None]:
# Number of cells flagged as mixed ('True') versus kept as reference ('False').
sdata.obs['is_mix_teglu'].value_counts()

In [None]:
# reclassify mix cells
# Split the subset into reference cells (flag 'False') and mixed cells (flag 'True') and
# collect, for each group, the cell ids, UMAP coordinates and current level-3 labels.
mix_flag = sdata.obs['is_mix_teglu']
ref_cells = sdata.obs.index[mix_flag == 'False']
mix_cells = sdata.obs.index[mix_flag == 'True']

ref_cell_loc, mix_cell_loc = (sdata[cells, :].obsm['X_umap'] for cells in (ref_cells, mix_cells))

ref_cell_annot, mix_cell_orig = (sdata.obs.loc[cells, 'level_3'].values for cells in (ref_cells, mix_cells))

In [None]:
# Re-label each mixed cell by majority vote of its 20 nearest reference cells in UMAP space.
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=20)
neigh.fit(ref_cell_loc, ref_cell_annot)
mix_cell_predicted = neigh.predict(mix_cell_loc)

In [None]:
# Confusion table of the mixed cells: kNN-predicted label versus original (leiden-derived) label.
fig, ax = plt.subplots(figsize=(10,10))
a = pd.crosstab(mix_cell_predicted, mix_cell_orig)
sns.heatmap(a, annot=True, fmt='d', ax=ax)
# pd.crosstab(index, columns): rows (y axis) are the predicted labels and columns (x axis)
# the original labels. The axis titles were previously swapped.
plt.xlabel('leiden-label')
plt.ylabel('predicted-label')
plt.show()

In [None]:
# update to adata
# Overwrite the level-3 label of the mixed cells on the full object with the kNN prediction.
adata.obs.loc[mix_cells, 'level_3'] = mix_cell_predicted

In [None]:
# backup obs
# Cell metadata only, as CSV; overwrites the previous "bk1" CSV written on the same date.
adata.obs.to_csv(f"{out_path}/{date}-Brain-combined-3mad-ct-bk1.csv")

In [None]:
# backup
# Full object; overwrites the previous "bk1" .h5ad written on the same date.
adata.write_h5ad(f"{out_path}/{date}-Brain-combined-3mad-ct-bk1.h5ad")

In [None]:
# Plot UMAP coloured by level-3 labels (save=False: shown only, not written to disk).
# NOTE: sdata.obs['level_3'] is only updated with the kNN predictions in the next cell, so
# this plot still shows the labels from before the reclassification.
sc.pl.umap(sdata, color='level_3', legend_loc='right margin',
           legend_fontsize=12, legend_fontoutline=2, frameon=False, 
           title=f'Level 3 {sub_id}', palette=level_3_pl, save=False)

In [None]:
# Apply the kNN predictions to the subset as well, then redraw the spatial map for one replicate.
sdata.obs.loc[mix_cells, 'level_3'] = mix_cell_predicted

current_sample = 'RIBOmap-rep1'
background = adata.obs.loc[adata.obs['protocol-replicate'] == current_sample, :]
foreground = sdata.obs.loc[sdata.obs['protocol-replicate'] == current_sample, :]

fig, ax = plt.subplots(figsize=(15, 20))
# Grey layer: every cell of the replicate.
sns.scatterplot(data=background, x='column', y='row', color='#ebebeb', s=10, ax=ax)
# Coloured layer: the subset's cells, one colour per level-3 label.
g = sns.scatterplot(data=foreground, x='column', y='row',
                    hue='level_3', palette=level_3_pl, s=10, ax=ax)

g.set_title(current_sample)
g.invert_xaxis()
for hidden_axis in (g.axes.xaxis, g.axes.yaxis):
    hidden_axis.set_visible(False)

In [None]:
# sns.scatterplot('umap_x', 'umap_y', data=sdata.obs)
# sns.scatterplot('umap_x', 'umap_y', data=sdata.obs.loc[sdata.obs['orig_index'] == '19530', :])

In [None]:
# sdata.obs['umap_x'] = sdata.obsm['X_umap'][:, 0]
# sdata.obs['umap_y'] = sdata.obsm['X_umap'][:, 1]

# sdata.obs.loc[sdata.obs['orig_index'] == '19072', :]

In [None]:
# import plotly.express as px
# fig = px.scatter(sdata.obs.loc[sdata.obs['protocol-replicate'] == 'RIBOmap-rep2', :], x="column", y="row", color="level_3",
#                 hover_data=['orig_index'])
# fig.update_layout(
#     autosize=False,
#     width=1500,
#     height=1500,)

# fig.show()

## Level 4 clustering (TEGLU)

Level-3 types re-clustered at this level: TEGLU 1, TEGLU 2, MSN, TEGLU CA3 (the keys of `emb_dict` in this section).

In [None]:
# Subset
# Level 4 works on one level-3 type: copy its cells into an independent AnnData object (sdata).
# sub_id also selects the parameters in emb_dict and names the output folder/files below.
sub_id = 'TEGLU 1'
curr_cells = adata.obs['level_3'] == sub_id
sdata = adata[curr_cells, :].copy()
sdata

In [None]:
%%time
# redo pp (test)
# Re-run preprocessing on the subset only, starting again from the raw counts layer.
sdata.X = sdata.layers['raw'].copy()
# Drop the layers inherited from the parent object; they are rebuilt for this subset below.
del sdata.layers['scaled']
del sdata.layers['corrected']

sc.pp.normalize_total(sdata)
sc.pp.log1p(sdata)
# Snapshot the log-normalised values in .raw before scaling overwrites X.
sdata.raw = sdata
sc.pp.scale(sdata)
sdata.layers['scaled'] = sdata.X.copy()
sc.pp.regress_out(sdata, ['total_counts'])
sdata.layers['corrected'] = sdata.X.copy()

# Run PCA
sdata.X = sdata.layers['corrected'].copy()
# NOTE(review): use_highly_variable=True needs sdata.var['highly_variable'];
# presumably inherited from adata - confirm.
sc.tl.pca(sdata, svd_solver='full', use_highly_variable=True)

# Plot explained variance 
sc.pl.pca_variance_ratio(sdata, log=False)

# Plot PCA
sc.pl.pca(sdata, color='protocol-replicate')

# Integrate the two replicates on the PCs; the clustering cells below read the
# result from sdata.obsm['X_pca_harmony'].
import scanpy.external as sce
sce.pp.harmony_integrate(sdata, 'protocol-replicate')

In [None]:
# Embedding parameters
# One entry per subset label (sub_id); the clustering cells look up
# 'n_neighbors', 'n_pcs', 'min_dist' and 'cluster_resolution' from emb_dict[sub_id].
emb_dict = {
    'TEGLU 1': {'n_neighbors': 20, 'n_pcs': 30, 'min_dist': .1, 'cluster_resolution': 1.1},
    'TEGLU 2': {'n_neighbors': 50, 'n_pcs': 30, 'min_dist': .1, 'cluster_resolution': .3},
    'MSN': {'n_neighbors': 50, 'n_pcs': 10, 'min_dist': .1, 'cluster_resolution': .3},
    'TEGLU CA3': {'n_neighbors': 50, 'n_pcs': 15, 'min_dist': .1, 'cluster_resolution': .4},
}

# When True, the leiden cell writes a parameter log and the UMAP coordinates to disk
save_embedding = True

In [None]:
# Output folder for the current subset, created under clustering_out_path.
# A '/' inside a label (e.g. 'Di/Mesencephalon neurons') would be read as a path
# separator, so it is replaced with '_' for every label rather than for one
# hard-coded label only.
sub_level_fig_path = os.path.join(clustering_out_path, sub_id.replace('/', '_'))
# exist_ok avoids the separate existence check and makes re-running the cell safe
os.makedirs(sub_level_fig_path, exist_ok=True)

### clustering

In [None]:
# replace regular pca with integrated pca 
# Overwrites obsm['X_pca'] with the harmony-corrected coordinates.
sdata.obsm['X_pca'] = sdata.obsm['X_pca_harmony'].copy()
# NOTE(review): only obsm['X_pca'] is overwritten here, so the variance-ratio plot
# presumably still shows the values from the original sc.tl.pca run -- confirm.
sc.pl.pca_variance_ratio(sdata, log=False)
sc.pl.pca(sdata, color='protocol-replicate')

In [None]:
%%time
# Computing the neighborhood graph
# Graph parameters come from the per-subset table.
n_neighbors = emb_dict[sub_id]['n_neighbors']
n_pcs = emb_dict[sub_id]['n_pcs']

sc.pp.neighbors(sdata, n_neighbors=n_neighbors, n_pcs=n_pcs, random_state=0)

# Run UMAP
# The layout uses these fixed values for every subset. The 'min_dist' entry of
# emb_dict was read here before but was always overwritten before use, so the
# dead lookup is removed; min_dist/spread below are what the log cell records.
min_dist = 0.001
spread = 3
sc.tl.umap(sdata, min_dist=min_dist, spread=spread)

In [None]:
%%time
# Run leiden cluster
cluster_resolution = emb_dict[sub_id]['cluster_resolution']
sc.tl.leiden(sdata, resolution=cluster_resolution)

# Plot UMAP with cluster labels
sc.pl.umap(sdata, color='leiden')
n_clusters = sdata.obs['leiden'].unique().shape[0]

if save_embedding:
    # '/' in a subset label would be read as a directory separator in the file names
    file_tag = sub_id.replace('/', '_')

    # Save log
    # Built line by line so the source indentation does not leak into the file;
    # same fields as the top-level clustering log, including the UMAP spread.
    log_text = "\n".join([
        f"Number of neighbor: {n_neighbors}",
        f"Number of PC: {n_pcs}",
        f"Resolution: {cluster_resolution}",
        f"Min-distance: {min_dist}",
        f"Spread: {spread}",
        f"Number of clusters: {n_clusters}",
    ])
    with open(f'{sub_level_fig_path}/log_{file_tag}.txt', 'w') as f:
        f.write(log_text)

    # save embeddings
    np.savetxt(f'{sub_level_fig_path}/embedding_{file_tag}_umap.csv', sdata.obsm['X_umap'], delimiter=",")

In [None]:
# Get colormap
# One "hls" colour per leiden cluster; n_clusters is set in the leiden cell.
cluster_pl = sns.color_palette("hls", n_clusters)
cluster_cmap = ListedColormap(cluster_pl.as_hex())  # same colours as a matplotlib colormap
sns.palplot(cluster_pl)  # preview the palette

In [None]:
# Plot UMAP with cluster labels w/ new color
# Cluster ids are drawn on the embedding itself (legend_loc='on data').
fig, ax = plt.subplots(figsize=(10,7))
sc.pl.umap(sdata, color='leiden', legend_loc='on data',
           legend_fontsize=12, legend_fontoutline=2, frameon=False, 
           title='Sub level clustering (leiden)', palette=cluster_pl, save=False, ax=ax)

In [None]:
# Copy the subset's leiden labels onto the full object (column 'level_3_temp') to check them
adata.obs['level_3_temp'] = 'NA'  # placeholder for every cell outside the subset
# Positional assignment: assumes sdata rows are in the same order as the adata rows
# selected by this mask (sdata was built from the same level_3 == sub_id selection).
adata.obs.loc[adata.obs['level_3'] == sub_id, 'level_3_temp'] = sdata.obs.leiden.values
adata.obs['level_3_temp'] = adata.obs['level_3_temp'].astype('category')
# Category order: leiden clusters first, 'NA' last
temp_order = sdata.obs.leiden.cat.categories.to_list()
temp_order.append('NA')
adata.obs['level_3_temp'] = adata.obs['level_3_temp'].cat.reorder_categories(temp_order)
# Palette: the subset's leiden colours followed by light grey for 'NA'
temp_pl = sns.color_palette(sdata.uns['leiden_colors'] + ['#ebebeb'])

In [None]:
# Plot UMAP with all cell embedding
# Full-dataset UMAP coloured by the temporary labels; cells outside the subset are 'NA'.
fig, ax = plt.subplots(figsize=(10,7))
sc.pl.umap(adata, color='level_3_temp', legend_loc='on data',
           legend_fontsize=12, legend_fontoutline=2, frameon=False, 
           title='Sub level clustering (leiden)', palette=temp_pl, save=False, ax=ax)

In [None]:
# Add log layer
# sdata.layers['log_raw'] = np.log1p(sdata.layers['raw'])
# sc.pp.normalize_total(sdata, layer='log_raw')
# sc.tl.rank_genes_groups(sdata, 'leiden', method='wilcoxon', layer='log_raw', pts=True, use_raw=False, n_genes=adata.shape[1])

# Find gene markers for each cluster
# Wilcoxon test per leiden cluster on sdata.raw; n_genes=adata.shape[1] requests every gene.
sc.tl.rank_genes_groups(sdata, 'leiden', method='wilcoxon', pts=True, use_raw=True, n_genes=adata.shape[1])

# Filter markers
# The print/dot-plot cells read the filtered result from uns['rank_genes_groups_filtered'].
sc.tl.filter_rank_genes_groups(sdata, min_fold_change=.1, min_in_group_fraction=0.2, max_out_group_fraction=0.8)

In [None]:
# Inspect the top-ranked genes of a single leiden cluster (unfiltered ranking)
current_cell_type = '8'

current_df = sc.get.rank_genes_groups_df(sdata, group=current_cell_type, key='rank_genes_groups')
current_df.head(10)

In [None]:
# Dot plot logfoldchanges
# Top 5 genes per cluster from the unfiltered ranking, coloured by log fold change clipped to [-5, 5].
sc.pl.rank_genes_groups_dotplot(sdata, key='rank_genes_groups', n_genes=5, values_to_plot='logfoldchanges', min_logfoldchange=1, vmax=5, vmin=-5, cmap='bwr', dendrogram=False)

In [None]:
# Dot plot mean expression (##)
# Top 5 genes per cluster taken from the filtered ranking.
sc.pl.rank_genes_groups_dotplot(sdata, key='rank_genes_groups_filtered', n_genes=5, dendrogram=False)

In [None]:
# Print markers
# For each cluster column: its position, then the top 15 filtered marker names on one line.
markers = []
temp = pd.DataFrame(sdata.uns['rank_genes_groups_filtered']['names']).head(15)
for col_pos, (_, col_values) in enumerate(temp.items()):
    gene_names = col_values.to_list()
    markers.extend(gene_names)
    print(col_pos)
    # every name is followed by a single space, then the line is terminated
    print(''.join(f'{gene} ' for gene in gene_names))

In [None]:
# # gene_list = ['Slc17a7', 'Gad1', 'Gad2', 'Sst', 'Pvalb', 'Npy', 'Vip', 'Pcp4', 'Cux2', 'Kif5a', 'Slc32a1', 'Nrgn', 'Sncg', 'Rorb', 'Tmsb4x']
# # gene_list = ['Aqp4', 'Gfap', 'Plp1', 'Mbp', 'Mobp', 'Slc1a3', 'Pdgfra', 'Bsg', 'Vtn', 'Vim']
# gene_list = ['Pvalb', 'Sst', 'Vip', 'Npy']

# fig, axs = plt.subplots(nrows=3, ncols=5, figsize=(25, 12))
# axs = axs.flatten()
# for i, gene in enumerate(gene_list):
#     ax = sc.pl.umap(sdata, color=gene, title=gene, ax=axs[i], show=False)
    
# plt.show()

In [None]:
# Spatial map of one section: every cell in grey, subset cells coloured by leiden cluster
current_sample = 'RIBOmap-rep2'
background_obs = adata.obs.loc[adata.obs['protocol-replicate'] == current_sample, :]
subset_obs = sdata.obs.loc[sdata.obs['protocol-replicate'] == current_sample, :]

fig, ax = plt.subplots(figsize=(15, 20))

# background layer: all cells of this section
g = sns.scatterplot(data=background_obs, x='column', y='row',
                    color='#ebebeb', s=5, ax=ax)

# foreground layer: cells of the current subset
g = sns.scatterplot(data=subset_obs, x='column', y='row',
                    hue='leiden', palette=cluster_pl, s=5, ax=ax)

g.set_title(current_sample)
g.invert_yaxis()
for hidden_axis in (g.axes.xaxis, g.axes.yaxis):
    hidden_axis.set_visible(False)

In [None]:
# Spatial map of one section: every cell in grey, subset cells coloured by leiden cluster
current_sample = 'RIBOmap-rep1'
background_obs = adata.obs.loc[adata.obs['protocol-replicate'] == current_sample, :]
subset_obs = sdata.obs.loc[sdata.obs['protocol-replicate'] == current_sample, :]

fig, ax = plt.subplots(figsize=(15, 20))

# background layer: all cells of this section
g = sns.scatterplot(data=background_obs, x='column', y='row',
                    color='#ebebeb', s=5, ax=ax)

# foreground layer: cells of the current subset
g = sns.scatterplot(data=subset_obs, x='column', y='row',
                    hue='leiden', palette=cluster_pl, s=5, ax=ax)

g.set_title(current_sample)
g.invert_xaxis()
for hidden_axis in (g.axes.xaxis, g.axes.yaxis):
    hidden_axis.set_visible(False)

In [None]:
# plot summary plot for each cluster
# For every leiden cluster of the subset, save one 3x2 figure:
#   row 1: spatial maps of RIBOmap-rep1 / RIBOmap-rep2 with the cluster highlighted
#   row 2: subset UMAP with the cluster's cells from rep1 / rep2 highlighted
#   row 3: full-dataset UMAP with the cluster's cells from rep1 / rep2 highlighted
sub_level_sum_path = os.path.join(sub_level_fig_path, f'r_{cluster_resolution}_summary_repp')
if not os.path.exists(sub_level_sum_path):
    os.mkdir(sub_level_sum_path)

for i, current_cluster in enumerate(tqdm(sorted(sdata.obs['leiden'].unique()))):
    
    # get dfs 
    # (subset cells split by replicate; these do not depend on current_cluster)
    df1 = sdata.obs.loc[sdata.obs['protocol-replicate'] == 'RIBOmap-rep1', :]
    df2 = sdata.obs.loc[sdata.obs['protocol-replicate'] == 'RIBOmap-rep2', :]

    fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(12, 16))
    axs = axs.flatten()


    # plot1
    # background: every cell of rep1 from the full object
    g1 = sns.scatterplot(x='column', y='row', color='#1111', 
                        data=adata.obs.loc[adata.obs['protocol-replicate'] == 'RIBOmap-rep1', :], 
                        s=5,
                        ax=axs[0])

    g1.set_title('RIBOmap-rep1')
    g1.invert_xaxis()
    g1.axes.xaxis.set_visible(False)
    g1.axes.yaxis.set_visible(False)


    # overlay: rep1 cells of the current cluster
    h1 = sns.scatterplot(x='column', y='row', hue='leiden', legend=None,
                        palette=cluster_pl,
                        data=df1.loc[df1['leiden'] == current_cluster, ], 
                        s=7,
                        ax=axs[0])

    # plot2
    # background: every cell of rep2 from the full object
    g2 = sns.scatterplot(x='column', y='row', color='#1111', 
                        data=adata.obs.loc[adata.obs['protocol-replicate'] == 'RIBOmap-rep2', :], 
                        s=5,
                        ax=axs[1])

    g2.set_title('RIBOmap-rep2')
    g2.invert_yaxis()
    g2.axes.xaxis.set_visible(False)
    g2.axes.yaxis.set_visible(False)

    # overlay: rep2 cells of the current cluster
    h2 = sns.scatterplot(x='column', y='row', hue='leiden', legend=None,
                        palette=cluster_pl,
                        data=df2.loc[df2['leiden'] == current_cluster, ], 
                        s=7,
                        ax=axs[1])


    # point size is scaled by the number of cells in the plotted object
    size_factor = 200000
    # umap1
    # grey background of the whole subset, then the cluster's rep1 cells in the cluster colour;
    # the colour is looked up by converting the leiden label to an integer index
    ax = sc.pl.umap(sdata, show=False, color=None, alpha=1, size=(size_factor / sdata.n_obs), ax=axs[2], title='', palette=sns.color_palette(['#fafafa']))
    sc.pl.umap(sdata[(sdata.obs["leiden"] == current_cluster) & (sdata.obs['protocol-replicate'] == 'RIBOmap-rep1')], color='leiden', frameon=False, ax=ax, legend_loc=None, size=(size_factor / sdata.n_obs),
           title='', show=False, palette=sns.color_palette([sdata.uns['leiden_colors'][int(current_cluster)]]))

    # umap2
    # same as umap1 for rep2
    ax = sc.pl.umap(sdata, show=False, color=None, alpha=1, size=(size_factor / sdata.n_obs), ax=axs[3], title='', palette=sns.color_palette(['#fafafa']))
    sc.pl.umap(sdata[(sdata.obs["leiden"] == current_cluster) & (sdata.obs['protocol-replicate'] == 'RIBOmap-rep2')], color='leiden', frameon=False, ax=ax, legend_loc=None, size=(size_factor / sdata.n_obs),
           title='', show=False, palette=sns.color_palette([sdata.uns['leiden_colors'][int(current_cluster)]]))

    # umap3
    # full-dataset embedding; the cluster is selected through the 'level_3_temp' column of adata
    ax = sc.pl.umap(adata, show=False, color=None, alpha=1, size=(size_factor / adata.n_obs), ax=axs[4], title='', palette=sns.color_palette(['#fafafa']))
    sc.pl.umap(adata[(adata.obs["level_3_temp"] == current_cluster) & (adata.obs['protocol-replicate'] == 'RIBOmap-rep1')], color='level_3_temp', frameon=False, ax=ax, legend_loc=None, size=(size_factor / adata.n_obs),
           title='', show=False, palette=sns.color_palette([adata.uns['level_3_temp_colors'][int(current_cluster)]]))

    # umap4
    # same as umap3 for rep2
    ax = sc.pl.umap(adata, show=False, color=None, alpha=1, size=(size_factor / adata.n_obs), ax=axs[5], title='', palette=sns.color_palette(['#fafafa']))
    sc.pl.umap(adata[(adata.obs["level_3_temp"] == current_cluster) & (adata.obs['protocol-replicate'] == 'RIBOmap-rep2')], color='level_3_temp', frameon=False, ax=ax, legend_loc=None, size=(size_factor / adata.n_obs),
           title='', show=False, palette=sns.color_palette([adata.uns['level_3_temp_colors'][int(current_cluster)]]))
    
    # NOTE(review): the figure is not closed after saving, so one figure per cluster stays open.
    plt.savefig(os.path.join(sub_level_sum_path, f'cluster_{current_cluster}.jpeg'))

### assign label

In [None]:
# Plot UMAP with cluster labels w/ new color
# Reference plot of the leiden clusters, shown before cell-type names are assigned to them.
fig, ax = plt.subplots(figsize=(10,7))
sc.pl.umap(sdata, color='leiden', legend_loc='on data',
           legend_fontsize=12, legend_fontoutline=2, frameon=False, 
           title=f'Level 3 {sub_id}', palette=cluster_pl, save=False, ax=ax)

In [None]:
# Print markers
# For each cluster column: its position, then the top 15 filtered marker names on one line.
markers = []
temp = pd.DataFrame(sdata.uns['rank_genes_groups_filtered']['names']).head(15)
for col_pos, (_, col_values) in enumerate(temp.items()):
    gene_names = col_values.to_list()
    markers.extend(gene_names)
    print(col_pos)
    # every name is followed by a single space, then the line is terminated
    print(''.join(f'{gene} ' for gene in gene_names))

In [None]:
# Inspect the top-ranked genes of a single leiden cluster (unfiltered ranking)
current_cell_type = '6'

current_df = sc.get.rank_genes_groups_df(sdata, group=current_cell_type, key='rank_genes_groups')
current_df.head(10)

In [None]:
# Change cluster label to cell type label
# One list per subset; position k holds the cell-type name given to leiden cluster k.
level_3_labels_by_subset = {
    'TEGLU 1': ['TEGLU L6a', #0
                'TEGLU L2/3', #1
                'TEGLU L5', #2
                'TEGLU L1/2/3', #3
                'TEGLU L2/3', #4
                'TEGLU PIR', #5
                'TEGLU L6', #6
                'TEGLU Mix', #7
                'TEGLU Unknown', #8
               ],
    'TEGLU 2': ['TEGLU CA1', #0
                'TEGLU CA3', #1
               ],
    'TEGLU CA3': ['TEGLU CA3', #0
                  'TEGLU CA2', #1
                  'TEGLU CA3', #2
                 ],
}

# Fail loudly for a subset without labels (e.g. 'MSN') instead of silently
# reusing a level_3_list left over from a previously processed subset.
if sub_id not in level_3_labels_by_subset:
    raise KeyError(f"No level_3 labels defined for sub_id '{sub_id}'")
level_3_list = level_3_labels_by_subset[sub_id]

# leiden label (string) -> cell-type name
transfer_dict = {}
for i in sorted(sdata.obs['leiden'].unique()):
    transfer_dict[i] = level_3_list[int(i)]

In [None]:
# Print markers
# For each cluster column: "<position> - <assigned label>", then the top 10 gene names comma-terminated.
markers = []
temp = pd.DataFrame(sdata.uns['rank_genes_groups']['names']).head(10)
for col_pos, (_, col_values) in enumerate(temp.items()):
    gene_names = col_values.to_list()
    markers.extend(gene_names)
    print(f"{col_pos} - {level_3_list[col_pos]}")
    # every name is followed by a comma, then the line is terminated
    print(''.join(f'{gene},' for gene in gene_names))

In [None]:
# Assign cell type to sdata
# Map each leiden label (string) to its cell-type name. Mapping a plain string
# column avoids DataFrame.replace on a categorical column, which is deprecated.
sdata.obs['level_3'] = sdata.obs['leiden'].astype(str).map(transfer_dict)

# Sort category
level_3_order = natsorted(list(set(level_3_list)))
sdata.obs['level_3'] = sdata.obs['level_3'].astype('category')
# reorder_categories no longer accepts inplace=True (removed in pandas 2.0); assign the result back
sdata.obs['level_3'] = sdata.obs['level_3'].cat.reorder_categories(level_3_order)

In [None]:
# Check color legend
# One "hls" colour per assigned cell type, previewed with the type names as tick labels.
level_3_pl = sns.color_palette("hls", len(level_3_order))
sns.palplot(level_3_pl, size=3)
plt.xticks(range(len(level_3_order)), level_3_order, size=10, rotation=45)
plt.tight_layout()
# plt.savefig(f'./figures/color_legend_top.png')
plt.show()

In [None]:
# Plot UMAP with the assigned cell-type labels w/ new color
# (save=False: the figure is displayed only, nothing is written to disk)
sc.pl.umap(sdata, color='level_3', legend_loc='right margin',
           legend_fontsize=12, legend_fontoutline=2, frameon=False, 
           title=f'Level 3 {sub_id}', palette=level_3_pl, save=False)

In [None]:
# Spatial map of one section: every cell in grey, subset cells coloured by assigned cell type
current_sample = 'RIBOmap-rep2'
background_obs = adata.obs.loc[adata.obs['protocol-replicate'] == current_sample, :]
subset_obs = sdata.obs.loc[sdata.obs['protocol-replicate'] == current_sample, :]

fig, ax = plt.subplots(figsize=(15, 20))

# background layer: all cells of this section
g = sns.scatterplot(data=background_obs, x='column', y='row',
                    color='#ebebeb', s=10, ax=ax)

# foreground layer: cells of the current subset
g = sns.scatterplot(data=subset_obs, x='column', y='row',
                    hue='level_3', palette=level_3_pl, s=10, ax=ax)

g.set_title(current_sample)
g.invert_yaxis()
for hidden_axis in (g.axes.xaxis, g.axes.yaxis):
    hidden_axis.set_visible(False)

In [None]:
# Spatial map of one section: every cell in grey, subset cells coloured by assigned cell type
current_sample = 'RIBOmap-rep1'
background_obs = adata.obs.loc[adata.obs['protocol-replicate'] == current_sample, :]
subset_obs = sdata.obs.loc[sdata.obs['protocol-replicate'] == current_sample, :]

fig, ax = plt.subplots(figsize=(15, 20))

# background layer: all cells of this section
g = sns.scatterplot(data=background_obs, x='column', y='row',
                    color='#ebebeb', s=10, ax=ax)

# foreground layer: cells of the current subset
g = sns.scatterplot(data=subset_obs, x='column', y='row',
                    hue='level_3', palette=level_3_pl, s=10, ax=ax)

g.set_title(current_sample)
g.invert_xaxis()
for hidden_axis in (g.axes.xaxis, g.axes.yaxis):
    hidden_axis.set_visible(False)

In [None]:
# Find gene markers for each assigned cell type
# Re-runs the ranking grouped by 'level_3'; this overwrites the leiden-based result
# stored under the default key.
sc.tl.rank_genes_groups(sdata, 'level_3', method='wilcoxon', pts=True, use_raw=True, n_genes=adata.shape[1])

# Filter markers
sc.tl.filter_rank_genes_groups(sdata, min_fold_change=.1, min_in_group_fraction=0.2, max_out_group_fraction=0.8)

In [None]:
# Dot plot of the top 5 markers per cell type (use_raw=False, colour range clipped to [-1, 1])
# NOTE(review): with use_raw=False the plotted values presumably come from the scaled/corrected
# sdata.X (hence the original "z-score" wording) -- confirm.
sc.pl.rank_genes_groups_dotplot(sdata, n_genes=5, groupby='level_3', min_logfoldchange=1, use_raw=False, swap_axes=True, 
                                vmin=-1, vmax=1, cmap='bwr', dendrogram=False, save=False)

In [None]:
# Expression of selected genes on the subset UMAP (alternative gene panels kept commented out)
# sc.pl.umap(sdata, color=['Dcn', 'Bsg', 'Flt1', 'Myl9'])
# sc.pl.umap(sdata, color=['Gfap', 'Calm1', 'Shank1', 'Mbp'])
# sc.pl.umap(sdata, color=['Gad1', 'Sst', 'Pvalb', 'Npy', 'Vip'])
# sc.pl.umap(sdata, color=['Scg2', 'Dlk1', 'Gabbr1', 'Cplx2', 'Camkv'])
sc.pl.umap(sdata, color=['Scg2', 'Dlk1', 'Gabbr1', 'Cplx2', 'Camkv'])

### update adata

In [None]:
# Map to original obj
# Cast to object first so that label names not yet among the categories can be written.
adata.obs['level_3'] = adata.obs['level_3'].astype(object)
# Rows are matched by cell index: sdata's index is a subset of adata's.
adata.obs.loc[sdata.obs.index, 'level_3'] = sdata.obs['level_3'].values
adata.obs['level_3'].unique()

In [None]:
# Number of distinct level_3 labels after the update
adata.obs['level_3'].unique().shape

In [None]:
# backup obs
# NOTE(review): `date` must already be defined when this cell runs; within this section it is
# only assigned further down (the "generate folders" cell) -- confirm it is set earlier.
adata.obs.to_csv(f"{out_path}/{date}-Brain-combined-3mad-ct-bk2.csv")

In [None]:
# backup 
# Writes the full AnnData object; note the suffix is 'bk3' while the obs CSV above uses 'bk2'.
adata.write_h5ad(f"{out_path}/{date}-Brain-combined-3mad-ct-bk3.h5ad")

In [None]:
# Full-dataset UMAP coloured by the updated level_3 labels
sc.pl.umap(adata, color='level_3')

In [None]:
# Number of distinct level_3 labels
adata.obs['level_3'].unique().shape

In [None]:
# List the distinct level_3 labels
adata.obs['level_3'].unique()

## Reclassify mix cells

In [None]:
# (n_cells, n_obs_columns) of the cells labelled 'Mix' at level 1
adata.obs.loc[adata.obs['level_1'] == 'Mix', :].shape

In [None]:
# (n_cells, n_obs_columns) of the cells labelled 'Mix' at level 2
adata.obs.loc[adata.obs['level_2'] == 'Mix', :].shape

In [None]:
# (n_cells, n_obs_columns) of the cells labelled 'Mix' at level 3
adata.obs.loc[adata.obs['level_3'] == 'Mix', :].shape

In [None]:
# reclassify mix cells
# Split the cell index by the level_1 label: 'Mix' cells get re-labelled,
# every other cell serves as the reference set.
is_mix = (adata.obs['level_1'] == 'Mix').to_numpy()
ref_cells = adata.obs.index[~is_mix]
mix_cells = adata.obs.index[is_mix]

# UMAP coordinates of both groups
mix_cell_loc = adata[mix_cells, :].obsm['X_umap']
ref_cell_loc = adata[ref_cells, :].obsm['X_umap']

In [None]:
# Labels at the chosen annotation level: training labels for the reference cells,
# and the current labels of the mix cells (kept for the comparison below)
current_level = 'level_3'
ref_cell_annot = adata.obs.loc[ref_cells, current_level].values
mix_cell_orig = adata.obs.loc[mix_cells, current_level].values

In [None]:
# k-nearest-neighbour classifier (k=50) fitted on the reference cells' UMAP coordinates,
# then used to predict a label for every mix cell from its UMAP position
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=50)
neigh.fit(ref_cell_loc, ref_cell_annot)
mix_cell_predicted = neigh.predict(mix_cell_loc)

In [None]:
# Contingency table of predicted (rows) vs. original (columns) labels of the mix cells
fig, ax = plt.subplots(figsize=(10,10))
a = pd.crosstab(mix_cell_predicted, mix_cell_orig)
sns.heatmap(a, annot=True, fmt='d', ax=ax)
plt.xlabel('original-label')
plt.ylabel('predicted-label')
plt.show()

In [None]:
# update to adata
# Only the mix cells receive a value in 'level_3_predicted'; this cell does not set it for other cells.
adata.obs.loc[mix_cells, 'level_3_predicted'] = mix_cell_predicted

In [None]:
# UMAP coloured by the kNN-predicted labels
sc.pl.umap(adata, color='level_3_predicted')

In [None]:
# UMAP coloured by expression of the gene Sst
sc.pl.umap(adata, color='Sst')

In [None]:
# UMAP coloured by the current level_3 labels, for comparison with the predictions
sc.pl.umap(adata, color='level_3')

In [None]:
# backup obs
# NOTE(review): relies on `date` having been defined by an earlier cell -- confirm.
adata.obs.to_csv(f"{out_path}/{date}-Brain-combined-3mad-ct-v1.csv")

In [None]:
# backup 
# Writes the full AnnData object next to the obs CSV of the same version ('v1').
adata.write_h5ad(f"{out_path}/{date}-Brain-combined-3mad-ct-v1.h5ad")

## Generate files for cell type verification

In [None]:
# generate files for the current level of annotations 
current_level = 'level_3'
n_clusters = adata.obs[current_level].unique().shape[0]

# Get markers for each cluster
# Wilcoxon ranking per label of current_level on the full object, followed by the
# expression-fraction filter (result stored under uns['rank_genes_groups_filtered']).
sc.tl.rank_genes_groups(adata, current_level, method='wilcoxon', pts=True)
sc.tl.filter_rank_genes_groups(adata, min_in_group_fraction=0.15, max_out_group_fraction=0.85)

In [None]:
# generate folders
# Layout: <fig_path>/<YYYY-MM-DD>-clustering/<current_level>
from datetime import datetime
date = datetime.today().strftime('%Y-%m-%d')
clustering_out_path = os.path.join(fig_path, f'{date}-clustering')
level_out_path = os.path.join(clustering_out_path, current_level)

# A single makedirs call creates the dated folder and the per-level folder, and
# does nothing when they already exist (no separate existence checks needed).
os.makedirs(level_out_path, exist_ok=True)

In [None]:
# current_cell_type = 'Astrocyte'

# current_df = sc.get.rank_genes_groups_df(adata, group=current_cell_type, key='rank_genes_groups_filtered')
# current_df.head(10)

In [None]:
# gene_markers_summary (filtered ranking)
# Builds one row per cell type holding the list of its top-20 filtered marker names.
cell_types = []
markers = []
temp = pd.DataFrame(adata.uns['rank_genes_groups_filtered']['names']).head(20)
# column i of the ranking is paired with category i of current_level
level_categories = adata.obs[current_level].cat.categories.to_list()
for i in range(temp.shape[1]):
    # Filtered-out genes appear as missing values (or the string 'nan'); drop both
    # and keep plain Python strings. Testing for missing values directly also handles
    # a column in which every entry was filtered out.
    curr_col = [str(j) for j in temp.iloc[:, i] if pd.notna(j) and str(j) != 'nan']

    cell_types.append(level_categories[i])
    markers.append(curr_col)

gene_markers_summary_filtered = pd.DataFrame({'cell type': cell_types, 'markers': markers})
# the 'Mix' group is excluded from the summary
gene_markers_summary_filtered = gene_markers_summary_filtered.loc[gene_markers_summary_filtered['cell type'] != 'Mix', :]

In [None]:
# gene_markers_summary (unfiltered ranking)
# Builds one row per cell type holding the list of its top-20 marker names.
cell_types = []
markers = []
temp = pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(20)
# column i of the ranking is paired with category i of current_level
level_categories = adata.obs[current_level].cat.categories.to_list()
for i in range(temp.shape[1]):
    # Drop missing entries (or the string 'nan') and keep plain Python strings,
    # testing for missing values directly instead of via an array conversion.
    curr_col = [str(j) for j in temp.iloc[:, i] if pd.notna(j) and str(j) != 'nan']

    cell_types.append(level_categories[i])
    markers.append(curr_col)

gene_markers_summary = pd.DataFrame({'cell type': cell_types, 'markers': markers})
# the 'Mix' group is excluded from the summary
gene_markers_summary = gene_markers_summary.loc[gene_markers_summary['cell type'] != 'Mix', :]

In [None]:
# save gene markers
gene_markers_summary.to_csv(os.path.join(level_out_path, f'gene_markers_summary.csv'))
gene_markers_summary_filtered.to_csv(os.path.join(level_out_path, f'gene_markers_summary_filtered.csv'))

# One workbook per ranking (unfiltered, then filtered), one sheet per cell type.
workbook_specs = (
    (f'gene_markers_{current_level}.xlsx', 'rank_genes_groups'),
    (f'gene_markers_{current_level}_filtered.xlsx', 'rank_genes_groups_filtered'),
)
for workbook_name, ranking_key in workbook_specs:
    with pd.ExcelWriter(os.path.join(level_out_path, workbook_name), mode='w') as writer:
        for current_cell_type in adata.obs[current_level].cat.categories:
            current_df = sc.get.rank_genes_groups_df(adata, group=current_cell_type, key=ranking_key)
            # '/' in a label is replaced with '_' for the sheet name
            sheet_label = current_cell_type.replace('/', '_')
            current_df.to_excel(writer, sheet_name=f'{sheet_label}')

# save obs
adata.obs.to_csv(os.path.join(level_out_path, f'obs_backup.csv'))

In [None]:
# Dot plots of the top 10 markers per cell type, saved as PDF
# Redirects scanpy's figure directory to the per-level folder; the setting stays
# changed for any later cell that saves through scanpy.
sc.settings.figdir = level_out_path

sc.pl.rank_genes_groups_dotplot(adata, key='rank_genes_groups', n_genes=10, min_logfoldchange=1, dendrogram=False, save=f'{current_level}_expr.pdf')
sc.pl.rank_genes_groups_dotplot(adata, key='rank_genes_groups_filtered', n_genes=10, min_logfoldchange=1, dendrogram=False, save=f'{current_level}_expr_filtered.pdf')

In [None]:
# generate spatial maps
# For every label of current_level, save one JPEG with both sections side by side:
# all cells as a faint background, cells of the label highlighted in red.

current_sm_path = os.path.join(level_out_path, 'spatial-maps')
os.makedirs(current_sm_path, exist_ok=True)

# get dfs
# The per-replicate tables do not depend on the cluster, so they are built once.
df1 = adata.obs.loc[adata.obs['protocol-replicate'] == 'RIBOmap-rep1', :]
df2 = adata.obs.loc[adata.obs['protocol-replicate'] == 'RIBOmap-rep2', :]

for i, current_cluster in enumerate(tqdm(sorted(adata.obs[current_level].unique()))):

    fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(40, 20))
    axs = axs.flatten()

    # plot1: background (all rep1 cells), then the highlighted label
    g1 = sns.scatterplot(x='column', y='row', color='#1111',
                         data=df1,
                         s=10,
                         ax=axs[0])

    g1.set_title('RIBOmap-rep1')
    g1.invert_xaxis()
    g1.axes.xaxis.set_visible(False)
    g1.axes.yaxis.set_visible(False)

    h1 = sns.scatterplot(x='column', y='row', legend=None, # hue=current_level,
                         color='#d93321',
                         data=df1.loc[df1[current_level] == current_cluster, ],
                         s=10,
                         ax=axs[0])

    # plot2: background (all rep2 cells), then the highlighted label
    g2 = sns.scatterplot(x='column', y='row', color='#1111',
                         data=df2,
                         s=10,
                         ax=axs[1])

    g2.set_title('RIBOmap-rep2')
    g2.invert_yaxis()
    g2.axes.xaxis.set_visible(False)
    g2.axes.yaxis.set_visible(False)

    h2 = sns.scatterplot(x='column', y='row', legend=None, # hue=current_level,
                         color='#d93321',
                         data=df2.loc[df2[current_level] == current_cluster, ],
                         s=10,
                         ax=axs[1])

    plt.tight_layout()
    # '/' in a label would be read as a directory separator in the file name
    current_cluster = current_cluster.replace('/', '_')
    plt.savefig(os.path.join(current_sm_path, f'cluster_{current_cluster}.jpeg'))
    # Release the figure once it is on disk; otherwise one 40x20 inch figure per
    # label stays open until the cell finishes.
    plt.close(fig)
    

## modify labels

In [None]:
# load adata
# Reload a saved "final" object from out_path before editing its labels.
# adata = sc.read_h5ad(os.path.join(out_path, '2022-11-14-Brain-combined-3mad-ct-final.h5ad'))
adata = sc.read_h5ad(os.path.join(out_path, '2022-11-13-Brain-RIBOmap-3mad-ct-final.h5ad'))
adata

In [None]:
# Print the level_3 category names, one per line, in category order
for i in adata.obs['level_3'].cat.categories:
    print(i)

In [None]:
# adata.obs = adata.obs.drop(columns=['level_3_temp', 'level_3_predicted'])
# Work on a plain object column so that new label names can be written freely.
adata.obs.level_3 = adata.obs.level_3.astype(object)

# # change TEGLU Unknown to TEGLU mix
# adata.obs.loc[adata.obs['level_3'] == 'TEGLU Unknown', 'level_3'] = 'TEGLU Mix'

# # change TEGLU L5 to TEGLU L4/5
# adata.obs.loc[adata.obs['level_3'] == 'TEGLU L5', 'level_3'] = 'TEGLU L4/5'

# # change TEGLU 3 to TEGLU COA
# adata.obs.loc[adata.obs['level_3'] == 'TEGLU 3', 'level_3'] = 'TEGLU COA'

# The three renames below are order-dependent: 'TEGLU L2/3' must be moved away
# before 'TEGLU L1/2/3' is renamed to 'TEGLU L2/3', otherwise the two groups would merge.

# change TEGLU L2/3 to TEGLU L2/3/4
adata.obs.loc[adata.obs['level_3'] == 'TEGLU L2/3', 'level_3'] = 'TEGLU L2/3/4'

# change TEGLU L1/2/3 to TEGLU L2/3
adata.obs.loc[adata.obs['level_3'] == 'TEGLU L1/2/3', 'level_3'] = 'TEGLU L2/3'

# change TEGLU L4/5 to TEGLU L5
adata.obs.loc[adata.obs['level_3'] == 'TEGLU L4/5', 'level_3'] = 'TEGLU L5'

# Back to categorical; the categories are rebuilt from the renamed values.
adata.obs.level_3 = adata.obs.level_3.astype('category')

In [None]:
# Remove the filtered ranking from uns before the object is written in the next cell.
# NOTE(review): presumably dropped because this entry does not serialise cleanly to h5ad -- confirm.
del adata.uns['rank_genes_groups_filtered']

In [None]:
# Write the relabelled object to out_path, stamped with today's date (YYYY-MM-DD)
from datetime import datetime
date = datetime.today().strftime('%Y-%m-%d')
adata.write_h5ad(f"{out_path}/{date}-Brain-RIBOmap-3mad-ct-final.h5ad")

## Test

In [None]:
# Sort category
# Display order of the labels at each annotation level. Each list must contain
# exactly the categories present in the corresponding obs column, otherwise
# reorder_categories raises.
level_1_order = ['Neuronal cell', 'Glia', 'Mix']
level_2_order = ['Telencephalon projecting neurons', #0
                 'Telencephalon interneurons', #1
                 'Cholinergic, monoaminergic and peptidergic neurons', #2
                 'Di/Mesencephalon neurons', #3
                 'Astrocyte', #4
                 'Oligodendrocyte', #5
                 'Oligodendrocytes precursor cell', #6
                 'Microglia', #7
                 'Vascular cells', #8
                 'Astroependymal cells', #9
                 'Perivascular macrophages', #10
                 'Mix'
]


level_3_order = [
    'Astro1',
    'Astro2',
    'Astro3',
    'Astro4',
    'CHOR',
    'DECHO',
    'DEGLU 1',
    'DEGLU 2',
    'DGGRC',
    'EPEN',
    'HYPEP',
    'Inh Pvalb 1',
    'Inh Pvalb 2',
    'Inh Sst',
    'MSN',
    'Micro',
    'Mix',
    'OPC',
    'Oligo1',
    'Oligo2',
    'PVM1',
    'PVM2',
    'Peri/VEC1',
    'Peri/VEC2',
    'TECHO 1',
    'TECHO 2',
    'TEGLU COA',
    'TEGLU CA1',
    'TEGLU CA2',
    'TEGLU CA3',
    'TEGLU L1/2/3',
    'TEGLU L2/3',
    'TEGLU L4/5',
    'TEGLU L6',
    'TEGLU L6a',
    'TEGLU Mix',
    'TEGLU PIR',
    'VLMC',
    'VSMC',
]

# reorder_categories returns a new Series; its inplace=True option was removed in
# pandas 2.0, so the result is assigned back (same form as the palette cells below).
adata.obs['level_1'] = adata.obs['level_1'].astype('category')
adata.obs['level_1'] = adata.obs['level_1'].cat.reorder_categories(level_1_order)

adata.obs['level_2'] = adata.obs['level_2'].astype('category')
adata.obs['level_2'] = adata.obs['level_2'].cat.reorder_categories(level_2_order)

adata.obs['level_3'] = adata.obs['level_3'].astype('category')
adata.obs['level_3'] = adata.obs['level_3'].cat.reorder_categories(level_3_order)

In [None]:
# Check color legend
# Level-1 palette: one colour per entry of level_1_order, in the same order.
level_1_pl = sns.color_palette(['#e8486d', '#4873e8', '#ebebeb'])
sns.palplot(level_1_pl, size=3)
plt.xticks(range(len(level_1_order)), level_1_order, size=10, rotation=45)
plt.tight_layout()
# plt.savefig(f'./figures/color_legend_top.png')
plt.show()

In [None]:
# Check color legend (old coloring scheme)
# Level-2 palette: colour k belongs to level_2_order[k] (12 colours for 12 labels).
level_2_colors = ['#256b00', 
                  '#ee750a', 
                  '#f280cf', 
                  '#f24f4b', 
                  '#e8e879', 
                  '#a8e1eb', 
                  '#667872', 
                  '#23ccb8', 
                  '#395ba8', 
                  '#697491',
                  '#8803fc',
                  '#ebebeb'
                 ]
level_2_pl = sns.color_palette(level_2_colors)
sns.palplot(level_2_pl, size=3)
plt.xticks(range(len(level_2_order)), level_2_order, size=10, rotation=45)
plt.tight_layout()
# plt.savefig(f'./figures/color_legend_top.png')
plt.show()

# Make sure the categories follow level_2_order so that colours and labels line up
adata.obs['level_2'] = adata.obs['level_2'].cat.reorder_categories(level_2_order)

In [None]:
# Check color legend (old coloring scheme)
# Level-3 palette: colour k belongs to level_3_order[k] (39 colours for 39 labels).
level_3_colors = [
    
'#eaeaa2',
'#bcbc5e',
'#a6a64d',
'#737337',
'#7f52a9',
'#b5368e',
'#fa8380',
'#ed5e5b',
'#295029',
'#c4b0d4',
'#96066a',
'#ee750a',
'#b76319',
'#fead65',
'#7aecf8',
'#8597c6',
'#cccccc',
'#667872',
'#a8e1eb',
'#9ed9e4',
'#b274e8',
'#501087',
'#d3a59c',
'#c49c94',
'#f29ed8',
'#e883c9',
'#317531',
'#77ed8f',
'#28330b',
'#ffd438',
'#c4ff45',
'#9ee800',
'#32a630',
'#316e10',
'#406e27',
'#c5fcc5',
'#82ad2d',
'#1f76b3',
'#774d44',
                 ]
level_3_pl = sns.color_palette(level_3_colors)
sns.palplot(level_3_pl, size=3)
plt.xticks(range(len(level_3_order)), level_3_order, size=10, rotation=45)
plt.tight_layout()
# plt.savefig(f'./figures/color_legend_top.png')
plt.show()

# Make sure the categories follow level_3_order so that colours and labels line up
adata.obs['level_3'] = adata.obs['level_3'].cat.reorder_categories(level_3_order)

In [None]:
# Full-dataset UMAP coloured by level_3 with the custom palette (display only, save=False)
sc.pl.umap(adata, color='level_3', legend_loc='right margin',
           legend_fontsize=12, legend_fontoutline=2, frameon=False, 
           title=f'Level 3', palette=level_3_pl, save=False)

In [None]:
# Full-dataset UMAP coloured by level_2 with the custom palette (display only, save=False)
sc.pl.umap(adata, color='level_2', legend_loc='right margin',
           legend_fontsize=12, legend_fontoutline=2, frameon=False, 
           title=f'Level 2', palette=level_2_pl, save=False)

In [None]:
# Full-dataset UMAP coloured by level_1 with the custom palette (display only, save=False)
sc.pl.umap(adata, color='level_1', legend_loc='right margin',
           legend_fontsize=12, legend_fontoutline=2, frameon=False, 
           title=f'Level 1', palette=level_1_pl, save=False)

In [None]:
# Total raw read count per gene for every level_2 cell type within one sample.
# NOTE(review): the cells above load a RIBOmap-only file; confirm that the loaded
# object actually contains 'STARmap-rep2' cells, otherwise every column is zero.
sdata = adata[adata.obs['protocol-replicate'] == 'STARmap-rep2', ]

reads_per_gene_df = pd.DataFrame(index=sdata.var.index)
for current_type in sdata.obs.level_2.cat.categories:

    current_vec = sdata[sdata.obs['level_2'] == current_type, ].layers['raw'].sum(axis=0)
    # A sparse layer returns a 1 x n_genes matrix from .sum(axis=0); flatten it so the
    # values always form a 1-D vector of length n_genes (no change for a dense layer).
    reads_per_gene_df[f'{current_type}'] = np.asarray(current_vec).ravel()
    

In [None]:
# Per-cell-type raw read totals for the gene Ctss
reads_per_gene_df.loc['Ctss', :]

In [None]:
# Save the gene x cell-type read-count table to out_path, stamped with `date`
reads_per_gene_df.to_csv(os.path.join(out_path, f'{date}-Brain-STARmap-gene-total-count-level2.csv'))