In [50]:
import math
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
from matplotlib.backends.backend_pdf import PdfPages
#from .autonotebook import tqdm as notebook_tqdm

In [53]:
# Path to the AnnData (.h5ad) file loaded below; the file name suggests it is
# the output of a SATURN run (seed 0) — confirm provenance before reuse.
h5ad_file = "saturn_results/test256_data_at_ne_os_pt_sm_ss_v2_org_saturn_seed_0.h5ad"

In [54]:
# Load the atlas from the .h5ad file into an AnnData object
atlas_ad = sc.read_h5ad(h5ad_file)

In [57]:
# Preview the first rows of the per-cell annotation table
atlas_ad.obs.head()

Unnamed: 0,labels,labels2,ref_labels,species
shoot1_AAACCTGAGCGTCAAG,at_EC Pavement Cell,EC Pavement Cell,EC Pavement Cell,at
shoot1_AAACCTGAGGTTCCTA,at_EC Pavement Cell,EC Pavement Cell,EC Pavement Cell,at
shoot1_AAACCTGAGTAATCCC,at_MC,MC,MC,at
shoot1_AAACCTGAGTACCGGA,at_EC Pavement Cell,EC Pavement Cell,EC Pavement Cell,at
shoot1_AAACCTGAGTCGATAA,at_VC Xylem Tracheary Element,VC Xylem Tracheary Element,VC Xylem Tracheary Element,at


# Data preprocessing

In [None]:
# Run PCA on atlas_ad, then build the neighbourhood graph with scanpy's
# default settings (no arguments are overridden here).
sc.pp.pca(atlas_ad)
sc.pp.neighbors(atlas_ad)

In [None]:
# Compute a two-dimensional UMAP embedding of the atlas
sc.tl.umap(atlas_ad, n_components=2)

In [None]:
## Clustering
# Leiden clustering at resolution 0.1; later cells read the result from the
# 'leiden' column of atlas_ad.obs.
sc.tl.leiden(atlas_ad, resolution=0.1)

In [None]:
# Save result
# Create the output directory first so the write does not depend on it
# already existing (e.g. on a fresh checkout).
Path("output").mkdir(parents=True, exist_ok=True)
atlas_ad.write_h5ad("output/atlas.h5ad")

# Visualize data distribution

In [None]:
# Set global figure parameters (resolution, default colormap, figure size),
# lower scanpy's logging verbosity, and print the scanpy version header.
sc.set_figure_params(dpi=72, color_map = 'viridis_r',figsize=[8,8] )
sc.settings.verbosity = 1
sc.logging.print_header()

## By species

In [None]:
# UMAP coloured by species, using the 'Set1' palette
sc.pl.umap(atlas_ad, color="species", projection="2d", palette='Set1')

## Visualize each species individually

In [None]:
# One UMAP panel per species (that species in red, all others in gray),
# written to a multi-page PDF.

# All distinct species present in the atlas
species = atlas_ad.obs['species'].unique()

# Grid layout: a fixed number of columns and at most `max_rows_per_page` rows
# per PDF page, so a large number of panels spills onto further pages instead
# of producing a single arbitrarily tall figure.
num_species = len(species)
num_cols = 4
max_rows_per_page = 5
panels_per_page = num_cols * max_rows_per_page

# The PDF is written under output/; create the directory if it is missing
Path('output').mkdir(parents=True, exist_ok=True)

# Passing `palette=` to sc.pl.umap stores the colours in
# atlas_ad.uns['species_colors']. Remember the current value so the red/gray
# highlight colours do not leak into later plots coloured by species.
saved_colors = atlas_ad.uns.get('species_colors')
if saved_colors is not None:
    saved_colors = list(saved_colors)

# NOTE: no sc.set_figure_params(...) call here — it would re-apply scanpy's
# defaults for every argument not passed and so undo the global figure
# settings chosen earlier; a continuous colormap is not needed because an
# explicit categorical palette is supplied to every panel.
try:
    with PdfPages('output/species_plots.pdf') as pdf:
        for i in range(0, num_species, panels_per_page):
            page_species = species[i:i + panels_per_page]
            num_rows = math.ceil(len(page_species) / num_cols)

            # squeeze=False always yields a 2-D axes array, whatever the grid shape
            fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, num_rows * 3), squeeze=False)
            axes = axes.flatten()

            for j, highlight_species in enumerate(page_species):
                # Highlighted species in red, every other species in gray
                color_map = {sp: ("red" if sp == highlight_species else "gray") for sp in species}
                sc.pl.umap(atlas_ad, color='species', title=f'{highlight_species}', palette=color_map, legend_loc=None, show=False, ax=axes[j])

            # Hide the unused panels of the last row
            for k in range(len(page_species), len(axes)):
                axes[k].axis('off')

            # Tidy the spacing, append the page to the PDF, and free the figure
            fig.tight_layout()
            pdf.savefig(fig)
            plt.close(fig)
finally:
    # Put the original species colours back (or remove the key if there were none)
    if saved_colors is None:
        atlas_ad.uns.pop('species_colors', None)
    else:
        atlas_ad.uns['species_colors'] = saved_colors


## By original labels

In [None]:
# UMAP coloured by the 'labels2' annotation column
sc.pl.umap(atlas_ad, color="labels2")

## Visualize each cluster individually

In [None]:
# One UMAP panel per value of obs['labels'] (that label in red, all others in
# gray), written to a multi-page PDF.

# All distinct labels and species
clusters = atlas_ad.obs['labels'].unique()
species = atlas_ad.obs['species'].unique()

# Labels to draw, as a plain list
all_clusters = clusters.to_list()

# Grid layout: a fixed number of columns and at most `max_rows_per_page` rows
# per PDF page, so a large number of labels is split across pages instead of
# being drawn into one extremely tall figure.
num_clusters = len(all_clusters)
num_cols = 4
max_rows_per_page = 5
panels_per_page = num_cols * max_rows_per_page

# The PDF is written under output/; create the directory if it is missing
Path('output').mkdir(parents=True, exist_ok=True)

# Passing `palette=` to sc.pl.umap stores the colours in
# atlas_ad.uns['labels_colors']. Remember the current value so the red/gray
# highlight colours do not leak into later plots coloured by 'labels'.
saved_colors = atlas_ad.uns.get('labels_colors')
if saved_colors is not None:
    saved_colors = list(saved_colors)

# NOTE: no sc.set_figure_params(...) call here — it would re-apply scanpy's
# defaults for every argument not passed and so undo the global figure
# settings chosen earlier; a continuous colormap is not needed because an
# explicit categorical palette is supplied to every panel.
try:
    with PdfPages('output/cluster_plots.pdf') as pdf:
        for i in range(0, num_clusters, panels_per_page):
            page_clusters = all_clusters[i:i + panels_per_page]
            num_rows = math.ceil(len(page_clusters) / num_cols)

            # squeeze=False always yields a 2-D axes array, whatever the grid shape
            fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, num_rows * 3), squeeze=False)
            axes = axes.flatten()

            for j, highlight_cluster in enumerate(page_clusters):
                # Highlighted label in red, every other label in gray
                color_map = {cluster: ("red" if cluster == highlight_cluster else "gray") for cluster in all_clusters}
                sc.pl.umap(atlas_ad, color='labels', title=f'{highlight_cluster}', palette=color_map, legend_loc=None, show=False, ax=axes[j])

            # Hide the unused panels of the last row
            for k in range(len(page_clusters), len(axes)):
                axes[k].axis('off')

            # Tidy the spacing, append the page to the PDF, and free the figure
            fig.tight_layout()
            pdf.savefig(fig)
            plt.close(fig)
finally:
    # Put the original label colours back (or remove the key if there were none)
    if saved_colors is None:
        atlas_ad.uns.pop('labels_colors', None)
    else:
        atlas_ad.uns['labels_colors'] = saved_colors


## Species composition in different clusters

In [None]:
# UMAP coloured by Leiden cluster, with cluster labels drawn on the embedding
sc.pl.umap(adata=atlas_ad, color='leiden', legend_loc='on data')

In [None]:
# Stacked bar chart of the species composition of every Leiden cluster.

# Per-cell annotation table (already a DataFrame, so no re-wrapping is needed)
data = atlas_ad.obs

# Number of cells for every (cluster, species) pair. `observed=False` is
# passed explicitly so the result does not depend on the pandas default for
# categorical group keys, which newer pandas versions warn is changing.
count_table = (
    data
    .groupby(['leiden', 'species'], observed=False)
    .size()
    .unstack(fill_value=0)
)

# Convert the counts to within-cluster proportions (each row sums to 1)
proportion_table = count_table.div(count_table.sum(axis=1), axis=0)

# Draw the stacked bars
ax = proportion_table.plot(kind='bar', stacked=True, figsize=(10, 6), width=.8)

# Title and axis labels
ax.set_title('Proportion of Species in Each Cluster')
ax.set_xlabel('Cluster')
ax.set_ylabel('Proportion')
ax.grid(False)
# Legend placed outside the plotting area, to the right
ax.legend(title='Species', bbox_to_anchor=(1.05, 1), loc='upper left')

# Fit the layout of this figure (rather than whichever figure pyplot considers
# current) and display it
ax.figure.tight_layout()
plt.show()


# Similarity evaluation

In [106]:
%run scripts/utils.py

In [111]:
# cross_species_knn_all is not defined in this notebook; presumably it is
# provided by scripts/utils.py (loaded with %run) — confirm its semantics and
# return format there. The call is keyed on obs column 'labels' with species 'at'.
all_nbrs = cross_species_knn_all(atlas_ad, col='labels', metric='cosine', space='raw', k=5,
                                     species='at', consider_same_species=False)

In [115]:
# Tabulate the neighbour results as a two-column frame ('at', 'other')
df = pd.DataFrame(all_nbrs, columns=['at','other'])

In [118]:
# Order the rows by the 'at' column
df = df.sort_values('at')

# Macrogene differential expression

In [126]:
import pickle

In [127]:
# Load the gene-to-macrogene weights saved next to the atlas.
# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
with open("saturn_results/test256_data_at_ne_os_pt_sm_ss_v2_org_saturn_seed_0_genes_to_macrogenes.pkl", "rb") as f:
    macrogene_weights = pickle.load(f)

In [128]:
# macrogene_weights is a dictionary mapping (species_{gene name}) to that
# gene's gene-to-macrogene weights (1x2000); its length is the number of genes.
len(macrogene_weights)

48000

In [104]:
# Create a copy of the adata with macrogenes as the X values.
# The annotation table is copied so that later changes to macrogene_adata.obs
# cannot alias or modify atlas_ad.obs.
macrogene_adata = sc.AnnData(atlas_ad.obsm["macrogenes"], obs=atlas_ad.obs.copy())


In [131]:
# Rows are cells, columns are macrogenes, each value corresponds to the gene weight
# (number of cells, number of macrogenes)
macrogene_adata.shape

(98396, 2000)

In [None]:
# Inspect the per-cell annotations carried over from atlas_ad
macrogene_adata.obs

In [134]:
# Differential analysis based on specified group
# Wilcoxon rank-sum test on macrogenes for Leiden cluster "11" only; the
# same cluster id is used again when the results table is extracted below.
sc.tl.rank_genes_groups(macrogene_adata, groupby="leiden", groups=["11"], method="wilcoxon")

  return reduction(axis=axis, out=out, **passkwargs)


In [None]:
# Plot the top-ranked macrogenes from the test above
sc.pl.rank_genes_groups(macrogene_adata)

In [None]:
# Dot plot of the ranked macrogenes, with the axes swapped
sc.pl.rank_genes_groups_dotplot(macrogene_adata,swap_axes=True)

In [None]:
# First 20 rows of the differential-expression table for cluster "11";
# the "names" column is used later as the macrogene index.
de_df = sc.get.rank_genes_groups_df(macrogene_adata, group="11").head(20)
de_df

In [137]:
def get_scores(macrogene):
    '''
    Look up one macrogene's weight for every gene.

    `macrogene` is the macrogene index (anything `int()` accepts, e.g. 891 or
    "891"). Returns a dict mapping each key of the module-level
    `macrogene_weights` to the entry of its weight vector at that index.
    '''
    return {
        gene_name: weight_vector[int(macrogene)]
        for gene_name, weight_vector in macrogene_weights.items()
    }

In [138]:
# Gene weights for a single macrogene (index 891), largest weight first
macrogene = 891
df = (
    pd.DataFrame(get_scores(macrogene).items(), columns=["gene", "weight"])
    .sort_values("weight", ascending=False)
)

In [None]:
# For every macrogene in the differential-expression table, show the 20 genes
# with the largest weights for that macrogene.
for macrogene in de_df["names"]:
    print(f"Macrogene {macrogene}")
    gene_weight_pairs = get_scores(macrogene).items()
    df = pd.DataFrame(gene_weight_pairs, columns=["gene", "weight"])
    df = df.sort_values("weight", ascending=False)
    display(df.head(20))

