# Label transfer

2022-11-10

In [None]:
# Import Packages

%load_ext autoreload
%autoreload 2

import os
import warnings 
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import anndata as ad
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from anndata import AnnData
from natsort import natsorted
from tqdm.notebook import tqdm

# Customized packages
import starmap.sc_util as su
# test()

## Set path

In [None]:
# Set path
# Root folder of the analysis; the input/output/figure folders are derived from it.
base_path = 'Z:/Data/Analyzed/2022-09-05-Hu-Tissue/'

input_path = os.path.join(base_path, 'input')
out_path = os.path.join(base_path, 'output')
fig_path = os.path.join(base_path, 'figures')

# makedirs(exist_ok=True) is a no-op for folders that already exist and avoids
# the check-then-create race of `if not os.path.exists(p): os.mkdir(p)`.
for result_dir in (out_path, fig_path):
    os.makedirs(result_dir, exist_ok=True)

# Point scanpy's figure directory setting at the same figure folder
sc.settings.figdir = fig_path

In [None]:
# Load the combined dataset from the output folder
combined_h5ad = os.path.join(out_path, '2022-10-23-Brain-combined-3mad-harmony.h5ad')
cdata = sc.read_h5ad(combined_h5ad)
cdata

In [None]:
# Load the RIBOmap dataset from the output folder
ribomap_h5ad = os.path.join(out_path, '2022-11-13-Brain-RIBOmap-3mad-ct-final.h5ad')
rdata = sc.read_h5ad(ribomap_h5ad)
rdata

In [None]:
# Number of cells per protocol in the combined dataset
protocol_counts = cdata.obs['protocol'].value_counts()
protocol_counts

## Copy ribo label

In [None]:
# Every cell starts as 'NA'; the RIBOmap rows are then overwritten with the
# labels stored in rdata.
# NOTE(review): the copy is positional (.values), so it assumes rdata.obs is in
# the same row order as the RIBOmap rows of cdata.obs - confirm.
is_ribomap = cdata.obs['protocol'] == 'RIBOmap'
for level in ('level_1', 'level_2', 'level_3'):
    cdata.obs[level] = 'NA'
    cdata.obs.loc[is_ribomap, level] = rdata.obs[level].values

In [None]:
# reclassify starmap cells
# Reference = RIBOmap cells, query = STARmap cells
is_ref = cdata.obs['protocol'] == 'RIBOmap'
is_query = cdata.obs['protocol'] == 'STARmap'
ref_cells = cdata.obs.index[is_ref.values]
query_cells = cdata.obs.index[is_query.values]

# UMAP coordinates of both groups (feature space of the classifier)
ref_cell_loc = cdata[ref_cells].obsm['X_umap']
query_cell_loc = cdata[query_cells].obsm['X_umap']

# level_3 labels of the reference cells
ref_cell_annot = cdata.obs.loc[ref_cells, 'level_3'].values

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Transfer every annotation level from the reference cells to the query cells.
# Previously only level_3 was predicted, which left level_1/level_2 of the
# query cells at the 'NA' placeholder; the later level_2 category reordering
# and the level_2 plots of the STARmap cells need real labels there.
# NOTE(review): each level is predicted by its own, independent KNN vote in
# UMAP space, so the three predicted levels of one cell are not guaranteed to
# be hierarchically consistent - confirm this matches the intended workflow.
label_levels = ['level_1', 'level_2', 'level_3']
predicted_by_level = {}
for level in label_levels:
    neigh = KNeighborsClassifier(n_neighbors=100)
    neigh.fit(ref_cell_loc, cdata.obs.loc[ref_cells, level].values)
    predicted_by_level[level] = neigh.predict(query_cell_loc)
    cdata.obs.loc[query_cells, level] = predicted_by_level[level]

# Kept for the following cell, which writes the level_3 prediction to cdata.obs
# (`neigh` is left fitted on level_3, as before).
query_cell_predicted = predicted_by_level['level_3']

In [None]:
# update to adata
# Write the predicted level_3 labels onto the query (STARmap) cells of cdata.obs
cdata.obs.loc[query_cells, 'level_3'] = query_cell_predicted

In [None]:
# Check color legend (old coloring scheme)

# level_2 label -> hex color; the insertion order is the category/legend order
level_2_color_map = {
    'Telencephalon projecting neurons': '#256b00',                    # 0
    'Telencephalon interneurons': '#ee750a',                          # 1
    'Cholinergic, monoaminergic and peptidergic neurons': '#f280cf',  # 2
    'Di/Mesencephalon neurons': '#f24f4b',                            # 3
    'Astrocyte': '#e8e879',                                           # 4
    'Oligodendrocyte': '#a8e1eb',                                     # 5
    'Oligodendrocytes precursor cell': '#667872',                     # 6
    'Microglia': '#23ccb8',                                           # 7
    'Vascular cells': '#395ba8',                                      # 8
    'Astroependymal cells': '#697491',                                # 9
    'Perivascular macrophages': '#8803fc',                            # 10
    'Mix': '#ebebeb',
}
level_2_order = list(level_2_color_map.keys())
level_2_colors = list(level_2_color_map.values())

# Draw the palette strip with the label names as tick labels
level_2_pl = sns.color_palette(level_2_colors)
sns.palplot(level_2_pl, size=3)
plt.xticks(range(len(level_2_order)), level_2_order, size=10, rotation=45)
plt.tight_layout()
# plt.savefig(f'./figures/color_legend_top.png')
plt.show()

# Rebuild level_2 as a categorical from its current values, then put the
# categories in palette order. reorder_categories raises a ValueError when the
# labels present are not exactly the labels in level_2_order.
level_2_labels = cdata.obs['level_2'].astype(object).astype('category')
cdata.obs['level_2'] = level_2_labels.cat.reorder_categories(level_2_order)

In [None]:
fig, ax = plt.subplots(figsize=(7,5))
point_size = 120000 / cdata.n_obs
background_pl = sns.color_palette(['#fafafa'])

# Layer 1: every cell in a single near-white color
ax = sc.pl.umap(cdata, color=None, palette=background_pl, alpha=1, size=point_size,
                title='', ax=ax, show=False)

# Layer 2: RIBOmap cells only, colored by their level_2 label
is_ribomap = cdata.obs['protocol'] == 'RIBOmap'
sc.pl.umap(cdata[is_ribomap], color='level_2', palette=level_2_pl, size=point_size,
           legend_loc='right margin', frameon=True, title='', ax=ax, show=False)

In [None]:
fig, ax = plt.subplots(figsize=(7,5))
point_size = 120000 / cdata.n_obs
background_pl = sns.color_palette(['#fafafa'])

# Layer 1: every cell in a single near-white color
ax = sc.pl.umap(cdata, color=None, palette=background_pl, alpha=1, size=point_size,
                title='', ax=ax, show=False)

# Layer 2: STARmap cells only, colored by their level_2 label
is_starmap = cdata.obs['protocol'] == 'STARmap'
sc.pl.umap(cdata[is_starmap], color='level_2', palette=level_2_pl, size=point_size,
           legend_loc='right margin', frameon=True, title='', ax=ax, show=False)

In [None]:
# check cluster distribution
import matplotlib.patches as mpatches

# Count cells per (level_2, protocol-replicate) pair once and reuse the result
pair_counts = cdata.obs.groupby('level_2')['protocol-replicate'].value_counts()
leiden_df = pd.DataFrame({
    'counts': pair_counts.values,
    'level_2': [pair[0] for pair in pair_counts.index],
    'protocol-replicate': [pair[1] for pair in pair_counts.index],
})

# Percentage of each level_2 class that comes from each sample
class_totals = leiden_df.groupby('level_2')['counts'].transform('sum')
leiden_df['percentage'] = leiden_df['counts'] / class_totals * 100

# Explicit copies: the percentage column of these per-sample frames is
# overwritten below, which must not happen on a slice of leiden_df.
s_df = leiden_df.loc[leiden_df['protocol-replicate'] == 'STARmap-rep3', :].copy()
r_df = leiden_df.loc[leiden_df['protocol-replicate'] == 'RIBOmap-rep3', :].copy()
r_2_df = leiden_df.loc[leiden_df['protocol-replicate'] == 'RIBOmap-rep2', :].copy()

# Stacked bars are emulated by over-plotting cumulative heights:
# full bar (100 %) -> RIBOmap-rep2 + RIBOmap-rep3 -> RIBOmap-rep3.
s_df['percentage'] = 100
# Add the rep3 share by level_2 label instead of by row position, so a class
# that is missing (or ordered differently) in one sample cannot shift values.
rep3_pct = r_df.set_index('level_2')['percentage']
r_2_df['percentage'] = r_2_df['percentage'].values + r_2_df['level_2'].map(rep3_pct).fillna(0).values

# Same x order for all three layers (order of first appearance in leiden_df)
level_order = list(dict.fromkeys(leiden_df['level_2']))

fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(x="level_2", y="percentage", data=s_df, color='#fcad03', order=level_order, ax=ax)
sns.barplot(x="level_2", y="percentage", data=r_2_df, color='#03fc35', order=level_order, ax=ax)
sns.barplot(x="level_2", y="percentage", data=r_df, color='#035efc', order=level_order, ax=ax)

# add legend
top_bar = mpatches.Patch(color='#fcad03', label='STARmap-rep3')
middle_bar = mpatches.Patch(color='#035efc', label='RIBOmap-rep3')
bottom_bar = mpatches.Patch(color='#03fc35', label='RIBOmap-rep2')
plt.legend(handles=[top_bar, middle_bar, bottom_bar])

# show the graph
plt.xticks(rotation=45)
plt.show()

In [None]:
# Side-by-side bars: percentage of each level_2 class per sample
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(data=leiden_df, x='level_2', y='percentage', hue='protocol-replicate', ax=ax)
plt.xticks(rotation=45)
plt.show()

In [None]:
current_sample = 'RIBOmap-rep3'

# Cells of the selected sample, plotted at their column/row position and
# colored by level_2
sample_obs = cdata.obs[cdata.obs['protocol-replicate'] == current_sample]

fig, ax = plt.subplots(figsize=(15, 20))
g = sns.scatterplot(data=sample_obs, x='column', y='row', hue='level_2',
                    palette=level_2_pl, legend=None, s=10, ax=ax)
g.set_title(current_sample)
g.invert_yaxis()
# Hide both axes
for hidden_axis in (g.axes.xaxis, g.axes.yaxis):
    hidden_axis.set_visible(False)
plt.show()
# plt.savefig(os.path.join(fig_path, f'{current_sample}-level2-spatial-map.png'))

In [None]:
current_sample = 'RIBOmap-rep2'

# Cells of the selected sample, plotted at their column/row position and
# colored by level_2
sample_obs = cdata.obs[cdata.obs['protocol-replicate'] == current_sample]

fig, ax = plt.subplots(figsize=(15, 20))
g = sns.scatterplot(data=sample_obs, x='column', y='row', hue='level_2',
                    palette=level_2_pl, legend=None, s=10, ax=ax)
g.set_title(current_sample)
# This sample flips the x axis (the other spatial maps flip the y axis)
g.invert_xaxis()
# Hide both axes
for hidden_axis in (g.axes.xaxis, g.axes.yaxis):
    hidden_axis.set_visible(False)
plt.show()
# plt.savefig(os.path.join(fig_path, f'{current_sample}-level2-spatial-map.png'))

In [None]:
current_sample = 'STARmap-rep3'

# Cells of the selected sample, plotted at their column/row position and
# colored by level_2
sample_obs = cdata.obs[cdata.obs['protocol-replicate'] == current_sample]

fig, ax = plt.subplots(figsize=(15, 20))
g = sns.scatterplot(data=sample_obs, x='column', y='row', hue='level_2',
                    palette=level_2_pl, legend=None, s=10, ax=ax)
g.set_title(current_sample)
g.invert_yaxis()
# Hide both axes
for hidden_axis in (g.axes.xaxis, g.axes.yaxis):
    hidden_axis.set_visible(False)
plt.show()
# plt.savefig(os.path.join(fig_path, f'{current_sample}-level2-spatial-map.png'))

In [None]:
# backup 
# Save the annotated combined object to the output folder; the file name is
# prefixed with today's date (YYYY-MM-DD), so a second run on the same day
# overwrites the earlier file.
from datetime import datetime
date = datetime.today().strftime('%Y-%m-%d')
cdata.write_h5ad(f"{out_path}/{date}-Brain-combined-3mad-ct-final.h5ad")

## Copy region label 

In [None]:
# Load the saved combined dataset (fixed 2022-11-13 file) from the output folder
combined_h5ad = os.path.join(out_path, '2022-11-13-Brain-combined-3mad-ct-final.h5ad')
cdata = sc.read_h5ad(combined_h5ad)
cdata

In [None]:
cdata.obs['region'] = 'NA'
cdata.obs['region_color'] = 'NA'

cdata.obs.loc[cdata.obs['replicate'] == 'rep3', 'region'] = region_df['region'].values
cdata.obs.loc[cdata.obs['replicate'] == 'rep3', 'region_color'] = region_df['region_color'].values

In [None]:
# load zefang's file
region_df = pd.read_csv('Z:/Data/Analyzed/2022-09-05-Hu-Tissue/output/2022-11-11-Brain-combined-3mad-ct-v2_region_label_obs.csv', index_col=0)
region_df

In [None]:
# backup 
# Save the combined object again, now including the region columns; the file
# name is prefixed with today's date (YYYY-MM-DD).
from datetime import datetime
date = datetime.today().strftime('%Y-%m-%d')
cdata.write_h5ad(f"{out_path}/{date}-Brain-combined-3mad-ct-final.h5ad")

In [None]:
# Subset of the combined data restricted to the RIBOmap-rep2 cells
is_ribomap_rep2 = cdata.obs['protocol-replicate'] == 'RIBOmap-rep2'
sdata = cdata[is_ribomap_rep2, :]
sdata

In [None]:
# Total raw reads per gene, one column per sample.
# The table is built from scratch for all three samples: previously it was
# never created in this notebook (the creation line was commented out) and only
# the RIBOmap-rep2 column was filled, although the next cell selects three.
reads_per_gene_df = pd.DataFrame(index=cdata.var_names)
for sample_name in ['RIBOmap-rep2', 'RIBOmap-rep3', 'STARmap-rep3']:
    sample_data = cdata[cdata.obs['protocol-replicate'] == sample_name, :]
    # Sum over cells (axis 0). np.asarray(...).ravel() gives a flat vector
    # whether the layer is a dense array or a sparse matrix (whose sum is 2-D).
    gene_totals = np.asarray(sample_data.layers['raw'].sum(axis=0)).ravel()
    reads_per_gene_df[f'total_reads_{sample_name}'] = gene_totals
reads_per_gene_df

In [None]:
# Keep only the three per-sample total-read columns, in this order
total_read_columns = ['total_reads_RIBOmap-rep2', 'total_reads_RIBOmap-rep3', 'total_reads_STARmap-rep3']
reads_per_gene_df = reads_per_gene_df[total_read_columns]

In [None]:
# Write the per-gene totals to the output folder (file name prefixed with `date`)
gene_count_csv = os.path.join(out_path, f'{date}-Brain-combined-gene-total-count.csv')
reads_per_gene_df.to_csv(gene_count_csv)

## STARmap manual clustering

In [None]:
# Load the filtered STARmap-rep3 dataset
starmap_h5ad = os.path.join(out_path, '2022-10-23-Brain-STARmap-rep3-3mad-filtered.h5ad')
sdata = sc.read_h5ad(starmap_h5ad)

# Sample id of the form "<protocol>-<replicate>"
protocol_str = sdata.obs['protocol'].astype(str)
replicate_str = sdata.obs['replicate'].astype(str)
sdata.obs['protocol-replicate'] = protocol_str + '-' + replicate_str
sdata

In [None]:
%%time
# preprocessing
# Flag genes whose 'max_counts_sample' value exceeds 2 as highly variable;
# the flag is consumed by the PCA call below (use_highly_variable=True).
sdata.var['highly_variable'] = sdata.var['max_counts_sample'] > 2
# NOTE: not the last expression of the cell, so this count is not displayed
sdata.var['highly_variable'].value_counts()

# Start again from the raw layer
sdata.X = sdata.layers['raw'].copy()

# Normalize total counts per cell, then log1p-transform
sc.pp.normalize_total(sdata)
sc.pp.log1p(sdata)
# Keep the normalized, log-transformed state in .raw before scaling
sdata.raw = sdata
# Scale the values and keep a copy in the 'scaled' layer
sc.pp.scale(sdata)
sdata.layers['scaled'] = sdata.X.copy()
# Regress out 'total_counts' and keep a copy in the 'corrected' layer
sc.pp.regress_out(sdata, ['total_counts'])
sdata.layers['corrected'] = sdata.X.copy()

# Run PCA
# PCA runs on the 'corrected' values, restricted to the flagged genes
sdata.X = sdata.layers['corrected'].copy()
sc.tl.pca(sdata, svd_solver='full', use_highly_variable=True)

# Plot explained variance 
sc.pl.pca_variance_ratio(sdata, log=False)

# Plot PCA
sc.pl.pca(sdata, color='protocol-replicate')

In [None]:
# Computing the neighborhood graph
neighbor_params = dict(n_neighbors=50, n_pcs=30, random_state=0)
sc.pp.neighbors(sdata, **neighbor_params)

# Compute the UMAP embedding; min_dist and spread stay module-level names
# because the clustering log written later records them
min_dist = 0.0001
spread = 5
sc.tl.umap(sdata, spread=spread, min_dist=min_dist)

In [None]:
%%time
# Run leiden cluster
# cluster_resolution is reused below in the names of the exported files/folders
cluster_resolution = 3
sc.tl.leiden(sdata, resolution = cluster_resolution)
# UMAP colored by the resulting 'leiden' cluster label
sc.pl.umap(sdata, color='leiden')

In [None]:
from datetime import datetime

# Dated sub-folder of the figure folder that receives the clustering outputs
date = datetime.today().strftime('%Y-%m-%d')
clustering_out_path = os.path.join(fig_path, f'{date}-clustering-starmap')
# exist_ok=True: re-running the cell on the same day is a no-op, without the
# check-then-create race of os.path.exists() + os.mkdir()
os.makedirs(clustering_out_path, exist_ok=True)

In [None]:
# Get colormap
# One "husl" color per distinct leiden cluster
n_clusters = len(sdata.obs['leiden'].unique())

cluster_pl = sns.color_palette("husl", n_colors=n_clusters)
cluster_cmap = ListedColormap(cluster_pl.as_hex())
sns.palplot(cluster_pl)

In [None]:
# Plot UMAP with cluster labels w/ new color
fig, ax = plt.subplots(figsize=(10, 7))
# Cluster names are drawn on the embedding itself
on_data_labels = dict(legend_loc='on data', legend_fontsize=10, legend_fontoutline=2)
sc.pl.umap(sdata, color='leiden', palette=cluster_pl, frameon=False, ax=ax, **on_data_labels)

In [None]:
# Get markers for each cluster
# Rank genes per leiden cluster with the Wilcoxon method
sc.tl.rank_genes_groups(sdata, 'leiden', method='wilcoxon')
# Filtered copy of the ranking; it is read back below under the key
# 'rank_genes_groups_filtered'
sc.tl.filter_rank_genes_groups(sdata, min_in_group_fraction=0.15, max_out_group_fraction=0.85)

In [None]:
# Save log
# The parameters of this clustering run, one per line (leading empty line kept)
log_lines = [
    '',
    f'Resolution: {cluster_resolution}',
    f'Number of clusters: {n_clusters}',
    f'UMAP min_dist: {min_dist}',
    f'UMAP spread: {spread}',
]
with open(f'{clustering_out_path}/log_r_{cluster_resolution}.txt', 'w') as f:
    f.write('\n'.join(log_lines))

# save results
# One workbook per ranking (unfiltered, then filtered), one sheet per cluster
marker_workbooks = [
    ('rank_genes_groups', f'clustering_markers_r_{cluster_resolution}.xlsx'),
    ('rank_genes_groups_filtered', f'clustering_markers_r_{cluster_resolution}_filtered.xlsx'),
]
for ranking_key, workbook_name in marker_workbooks:
    with pd.ExcelWriter(os.path.join(clustering_out_path, workbook_name), mode='w') as writer:
        for current_cell_type in sdata.obs.leiden.cat.categories:
            current_df = sc.get.rank_genes_groups_df(sdata, group=current_cell_type, key=ranking_key)
            current_df.to_excel(writer, sheet_name=f'{current_cell_type}')

# save obs
obs_csv = os.path.join(clustering_out_path, f'obs_r_{cluster_resolution}.csv')
sdata.obs.to_csv(obs_csv)

In [None]:
# plot summary plot for each cluster
# For every leiden cluster one two-panel figure is saved: the spatial map with
# the cluster highlighted (left) and the UMAP with the cluster highlighted (right).
sm_out_path = os.path.join(clustering_out_path, f'r_{cluster_resolution}_summary')
os.makedirs(sm_out_path, exist_ok=True)

# Loop-invariant inputs, computed once instead of once per cluster
df1 = sdata.obs.loc[sdata.obs['protocol-replicate'] == 'STARmap-rep3', :]
umap_point_size = 120000 / sdata.n_obs
background_pl = sns.color_palette(['#fafafa'])

for current_cluster in tqdm(sorted(sdata.obs['leiden'].unique())):

    fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(13, 5))
    axs = axs.flatten()

    # plot1: all cells of the sample as background ...
    g1 = sns.scatterplot(x='column', y='row', color='#1111',
                         data=df1,
                         s=5,
                         ax=axs[0])

    g1.set_title('STARmap-rep3')
    g1.invert_yaxis()
    g1.axes.xaxis.set_visible(False)
    g1.axes.yaxis.set_visible(False)

    # ... with the cells of the current cluster drawn on top
    h1 = sns.scatterplot(x='column', y='row', hue='leiden', legend=None,
                         palette=cluster_pl,
                         data=df1.loc[df1['leiden'] == current_cluster, ],
                         s=5,
                         ax=axs[0])

    # umap1: near-white background of all cells, current cluster on top.
    # NOTE(review): indexing uns['leiden_colors'] with int(current_cluster)
    # assumes the cluster names are the integers 0..n-1 in category order.
    ax = sc.pl.umap(sdata, show=False, color=None, alpha=1, size=umap_point_size, ax=axs[1], title='', palette=background_pl)
    sc.pl.umap(sdata[(sdata.obs["leiden"] == current_cluster) & (sdata.obs['protocol-replicate'] == 'STARmap-rep3')], color='leiden', frameon=False, ax=ax, legend_loc=None, size=umap_point_size,
               title='', show=False, palette=sns.color_palette([sdata.uns['leiden_colors'][int(current_cluster)]]))

    fig.savefig(os.path.join(sm_out_path, f'cluster_{current_cluster}.jpeg'))

    # Display the figure, then release it: without the close, one open figure
    # per cluster accumulates in memory for the whole loop.
    plt.show()
    plt.close(fig)

In [None]:
current_sample = 'STARmap-rep3'

# Cells of the selected sample, plotted at their column/row position and
# colored by leiden cluster (legend shown)
sample_obs = sdata.obs[sdata.obs['protocol-replicate'] == current_sample]

fig, ax = plt.subplots(figsize=(15, 20))
g = sns.scatterplot(data=sample_obs, x='column', y='row', hue='leiden',
                    palette=cluster_pl, s=5, ax=ax)
g.set_title(current_sample)
g.invert_yaxis()
# Hide both axes
for hidden_axis in (g.axes.xaxis, g.axes.yaxis):
    hidden_axis.set_visible(False)

# Save next to the per-cluster summary figures
overview_jpeg = os.path.join(sm_out_path, f'r_{cluster_resolution}.jpeg')
plt.savefig(overview_jpeg)

In [None]:
# UMAP colored by 'H2-Aa' (presumably a gene name in sdata - confirm)
sc.pl.umap(sdata, color='H2-Aa')