# cNMF on myeloid cells in atlas

FROM CNMF PAPER:
https://github.com/dylkot/cNMF/blob/main/Tutorials/analyze_pbmc_example_data.ipynb,
https://github.com/dylkot/cNMF/blob/main/Tutorials/analyze_batcheffectcorrect_BaronEtAl.ipynb

FROM GLIOMA PAPER:
https://github.com/BernsteinLab/Myeloid-Glioma/tree/main/Identifying%20recurrent%20programs%20in%20Myeloid%20Cells%20in%20Gliomas%20(Related%20to%20Figure%201)


## 1. Import

In [None]:
import os
import contextlib
import pickle

import pandas as pd
import numpy as np
import scanpy as sc
from cnmf import cNMF, Preprocess
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, fcluster
from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler
from matplotlib.patches import Patch

np.random.seed(14)

In [None]:
# Root folder for every artefact written by this notebook.
output_directory = 'cNMF_w_filtered_genes'
# makedirs(exist_ok=True) is idempotent on re-run and avoids the
# check-then-create race of os.path.exists() + os.mkdir().
os.makedirs(output_directory, exist_ok=True)

## 2. Load filtered counts

In [None]:
# Path of the filtered-counts AnnData file (despite the `_dir` suffix, this is a
# file path, not a directory).
filtered_counts_dir = os.path.join(output_directory, 'filtered_counts.h5ad')
# `adata` is the notebook-wide AnnData handle; it is rebound in section 5.1.
adata = sc.read_h5ad(filtered_counts_dir)

## 3. Correcting counts for batch effect

In [None]:
# raw counts needed
adata.layers['norm_counts'] = adata.X.copy()
adata.X = adata.layers['counts'].copy()

In [None]:
# Number of genes passed as `n_top_rna_genes` to preprocess_for_cnmf below
# (the same name is assigned again in section 5.1).
numhvgenes=2000

In [None]:
# Seeded preprocessing helper from the cnmf package.
p = Preprocess(random_seed=14)

# Harmony integration is driven by the 'dataset' column; the call returns the
# corrected AnnData, a second AnnData (named tp10k here) and the selected genes.
adata_c, adata_tp10k, hvgs = p.preprocess_for_cnmf(
    adata,
    harmony_vars='dataset',
    n_top_rna_genes=numhvgenes,
    max_scaled_thresh=None,
    quantile_thresh=.9999,
    makeplots=False,
    save_output_base=os.path.join(output_directory, "results"),
)

## 4. Save

### 4.1 Save counts.h5ad objects

In [None]:
# Persist the preprocessed object returned by preprocess_for_cnmf; section 5.1
# reloads it from this same path.
corrected_count_adat_fn = os.path.join(output_directory, 'corrected_counts.h5ad')
sc.write(corrected_count_adat_fn, adata_c)

### 4.2 Create a counts.h5ad object for each cohort

In [None]:
# One cohort per distinct value of the 'dataset' column.
cohorts = adata_c.obs['dataset'].unique()

In [None]:
# Write one counts.h5ad per cohort so cNMF can be run on each cohort separately.
for cohort in cohorts:
    # Directory for the cohort; exist_ok keeps the cell safe to re-run and
    # avoids the check-then-create race of os.path.exists() + os.mkdir().
    cohort_dir = os.path.join(output_directory, cohort)
    os.makedirs(cohort_dir, exist_ok=True)

    # .copy() turns the boolean-mask selection into a standalone object before writing.
    adata_cohort = adata_c[adata_c.obs['dataset'] == cohort].copy()

    output_h5ad_path = os.path.join(cohort_dir, "counts.h5ad")
    adata_cohort.write(output_h5ad_path)

    print(f"✔️ Saved: {output_h5ad_path}")

## 5. cNMF cohort wise

### 5.1 Set parameters

In [None]:
# Number of NMF replicates (passed to cNMF.prepare as n_iter)
numiter=150 

# Number of over-dispersed genes used for the factorizations
# (passed to cNMF.prepare as num_highvar_genes)
numhvgenes=2000 

# Candidate numbers of components K: the integers 7, 8, ..., 19
# (np.arange excludes the upper bound)
Ks = np.arange(7,20)

# Seed for pseudorandom number generation, for reproducibility
seed = 14 

# cNMF run name: used as the sub-folder and file prefix inside each cohort directory
name='cNMF_2000hvg'

# Number of workers passed to factorize_multi_process
numworkers = 28

In [None]:
# Reload the corrected counts written in section 4.1.
# From here on `adata` refers to this object, not to the filtered counts
# loaded in section 2.
corrected_count_adat_fn = os.path.join(output_directory, 'corrected_counts.h5ad')
adata = sc.read_h5ad(corrected_count_adat_fn)

In [None]:
# Recompute the cohort list from the reloaded object so this section can run
# without re-executing section 4.
cohorts = adata.obs['dataset'].unique()

### 5.2 Compute cNMF cohort-wise

If any cell has zero counts across the selected over-dispersed genes, the cNMF `prepare` step raises an error.
The wrapper below catches that error, removes those cells, and runs `prepare` again:

In [None]:
def prepare_cnmf_with_filtering(count_adat_fn, cohort_dir, Ks, numiter, seed, numhvgenes, run_name=None):
    """Run ``cNMF.prepare`` and retry once after dropping cells with no HVG counts.

    If the first ``prepare`` call fails with a message containing
    "zero counts of overdispersed genes", the counts file is reloaded, cells
    whose summed expression over the saved over-dispersed gene list is zero are
    removed, the result is written to ``<cohort_dir>/counts_filtered.h5ad`` and
    ``prepare`` is called again on that file. Any other exception is re-raised.

    Parameters
    ----------
    count_adat_fn : str
        Path to the cohort's counts ``.h5ad`` file.
    cohort_dir : str
        Output directory of the cNMF run for this cohort.
    Ks : sequence of int
        Numbers of components, passed as ``components``.
    numiter : int
        Number of NMF replicates, passed as ``n_iter``.
    seed : int
        Random seed passed to ``prepare``.
    numhvgenes : int
        Number of over-dispersed genes, passed as ``num_highvar_genes``.
    run_name : str, optional
        cNMF run name. Defaults to the notebook-level ``name`` so existing calls
        keep working; pass it explicitly to avoid depending on that global.

    Returns
    -------
    cNMF
        The prepared cNMF object.

    Raises
    ------
    FileNotFoundError
        If the over-dispersed gene list is missing when the retry path is taken.
    """
    # Resolve the run name once: later notebook cells may rebind the global
    # `name`, so it must not be looked up repeatedly inside the function.
    run_name = name if run_name is None else run_name
    cnmf_obj = cNMF(output_dir=cohort_dir, name=run_name)
    prepare_kwargs = dict(components=Ks, n_iter=numiter, seed=seed, num_highvar_genes=numhvgenes)

    try:
        cnmf_obj.prepare(counts_fn=count_adat_fn, **prepare_kwargs)
    except Exception as e:
        # The condition is recognised by its message text only; everything else
        # propagates unchanged (bare `raise` keeps the original traceback).
        if "zero counts of overdispersed genes" not in str(e):
            raise

        print("Some cells have zero expression in HVGs. Filtering them using saved HVG list...")

        # Step 1 – load data
        adata_cohort = sc.read_h5ad(count_adat_fn)
        print(f"Original shape: {adata_cohort.shape}")

        # Step 2 – read the HVG list that the failed prepare() call left on disk
        hvg_file = os.path.join(cohort_dir, run_name, f"{run_name}.overdispersed_genes.txt")
        if not os.path.isfile(hvg_file):
            raise FileNotFoundError(f"HVG list not found: {hvg_file}")
        with open(hvg_file) as f:
            stripped_lines = (line.strip() for line in f)
            hvg_genes = [gene for gene in stripped_lines if gene in adata_cohort.var_names]

        print(f"✔️ Loaded {len(hvg_genes)} HVGs from {hvg_file}")

        # Step 3 – per-cell sum over the HVGs. Summing the matrix directly works
        # for both sparse and dense storage and avoids densifying the whole matrix.
        hvg_counts = adata_cohort[:, hvg_genes].X
        cell_sums = np.asarray(hvg_counts.sum(axis=1)).ravel()
        has_hvg_counts = cell_sums != 0

        print(f"Found {int((~has_hvg_counts).sum())} cells with zero HVG expression.")

        # Step 4 – filter and save (.copy() so a standalone object, not a view, is written)
        adata_filtered = adata_cohort[has_hvg_counts].copy()
        print(f"Filtered shape: {adata_filtered.shape}")

        filtered_fn = os.path.join(cohort_dir, "counts_filtered.h5ad")
        adata_filtered.write(filtered_fn)
        print(f"✔️ Saved filtered file to {filtered_fn}")

        # Step 5 – retry prepare on the filtered file
        cnmf_obj.prepare(counts_fn=filtered_fn, **prepare_kwargs)

    print("✔️ Prepare completed.")
    return cnmf_obj


In [None]:
# Looping on cohorts
# Looping on cohorts: prepare -> factorize -> combine -> K-selection plot
for cohort in cohorts:
    print(f"\n--- Computing cNMF for cohort: {cohort} ---")

    # Directory for the cohort
    cohort_dir = os.path.join(output_directory, cohort)
    count_adat_fn = os.path.join(cohort_dir, "counts.h5ad")

    # Prepare the data, i.e. subset to high-variance genes and variance normalize.
    # The helper builds and returns the cNMF object itself, so no separate
    # cNMF(...) construction is needed here.
    cnmf_obj = prepare_cnmf_with_filtering(
        count_adat_fn=count_adat_fn,
        cohort_dir=cohort_dir,
        Ks=Ks,
        numiter=numiter,
        seed=seed,
        numhvgenes=numhvgenes
    )

    # Factorization output written to stdout/stderr by this process is
    # redirected to a per-cohort log file to keep the notebook output short.
    log_path = os.path.join(cohort_dir, "factorize.log")
    with open(log_path, "w") as log:
        with contextlib.redirect_stdout(log), contextlib.redirect_stderr(log):
            cnmf_obj.factorize_multi_process(total_workers=numworkers)
    print("✔️ Factorize-module completed")

    # Combine the individual factorization replicates so that the consensus estimate can be taken
    cnmf_obj.combine()
    print("✔️ Combine-module completed")

    # Compute the stability and error at each choice of K to see if a clear choice jumps out
    cnmf_obj.k_selection_plot(close_fig=False)

    print(f"✔️ cNMF completed for cohort {cohort}. Results in: {cohort_dir}")


#### 5.2.1 Optimal K

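The optimal K is chosen as a compromise between low reconstruction (prediction) error and high stability (silhouette score): both metrics are min-max normalized, combined into a weighted score, and the K with the highest score is selected.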

In [None]:
def load_k_selection_df(npz_path):
    """Rebuild a DataFrame from an ``.npz`` archive with 'data', 'columns' and 'index' arrays.

    Parameters
    ----------
    npz_path : str
        Path to the ``.npz`` file.

    Returns
    -------
    pandas.DataFrame
        Frame whose values, column labels and row labels come from the
        'data', 'columns' and 'index' entries of the archive.
    """
    # NOTE: allow_pickle=True can execute arbitrary code when loading; only use
    # this on files produced by your own cNMF runs.
    # The context manager closes the underlying file handle, which np.load
    # otherwise keeps open for the lifetime of the NpzFile object.
    with np.load(npz_path, allow_pickle=True) as npz:
        data = npz['data']
        columns = npz['columns']
        index = npz['index']

    # Rebuild the DataFrame
    df = pd.DataFrame(data, columns=columns)
    df.index = index
    return df

In [None]:
def select_best_k_combined(df, cohort, weight_silhouette=0.5, plot=True):
    """Pick the K with the best weighted combination of silhouette and prediction error.

    Silhouette and the negated prediction error are each min-max scaled to
    [0, 1] and combined as
    ``weight_silhouette * silhouette + (1 - weight_silhouette) * error``.
    The K of the row with the highest combined score is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        K-selection statistics with columns 'k', 'silhouette' and
        'prediction_error'. It is not modified.
    cohort : str
        Cohort label, used only in the plot title and the printed message.
    weight_silhouette : float, default 0.5
        Weight of the silhouette term; the error term gets ``1 - weight_silhouette``.
    plot : bool, default True
        Whether to plot the combined score against K.

    Returns
    -------
    int
        The selected K.
    """
    # Work on a copy so the helper columns do not leak into the caller's frame.
    scored = df.copy()

    # Normalize silhouette (higher is better)
    scored["silhouette_norm"] = MinMaxScaler().fit_transform(scored[["silhouette"]]).ravel()

    # Negate the prediction error so that higher is better, then normalize
    scored["error_inv"] = -scored["prediction_error"]
    scored["error_norm"] = MinMaxScaler().fit_transform(scored[["error_inv"]]).ravel()

    # Combined score
    scored["score"] = (
        weight_silhouette * scored["silhouette_norm"]
        + (1 - weight_silhouette) * scored["error_norm"]
    )

    # Select best k
    best_row = scored.loc[scored["score"].idxmax()]
    best_k = int(best_row["k"])

    if plot:
        plt.figure(figsize=(8,5))
        plt.plot(scored["k"], scored["score"], label="Combined Score", marker="*")
        plt.axvline(best_k, color="red", linestyle="--", label=f"Best k = {best_k}")
        plt.xlabel("k")
        plt.ylabel("Combined score")
        plt.title(f"Automatic K selection - cohort {cohort}")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    print(f"✔️ Best k for cohort {cohort}: {best_k}")
    return best_k

In [None]:
# For each cohort: load the K-selection statistics, choose K, and store the
# choice next to the cNMF results so later cells can read it back.
for cohort in cohorts:
    print(f"\n--- Finding best K for cohort: {cohort} ---")

    # Both the statistics file and selected_K.txt live in the cNMF run folder.
    run_dir = os.path.join(output_directory, cohort, name)
    stats_path = os.path.join(run_dir, f"{name}.k_selection_stats.df.npz")
    df = load_k_selection_df(stats_path)

    selected_K = select_best_k_combined(df, cohort, weight_silhouette=0.5, plot=True)

    with open(os.path.join(run_dir, "selected_K.txt"), "w") as f:
        f.write(f"{selected_K}\n")

In [None]:
# Density threshold passed to cNMF.consensus for every cohort.
density_threshold = 0.1

for cohort in cohorts:
    print(f"\n--- Running consensus for cohort {cohort} ---")

    cohort_dir = os.path.join(output_directory, cohort)

    # Read back the K chosen by the selection cell.
    selected_K_path = os.path.join(cohort_dir, name, "selected_K.txt")
    with open(selected_K_path) as f:
        selected_K = int(f.read().strip())

    # A fresh cNMF handle pointing at the existing run folder.
    cnmf_obj = cNMF(output_dir=cohort_dir, name=name)
    cnmf_obj.consensus(
        k=selected_K,
        density_threshold=density_threshold,
        show_clustering=True,
        close_clustergram_fig=False,
    )

    print(f"✔️ Consensus done for {cohort}")

## 6. Compute Consensus Programs with cosine similarity

### 6.1 Cosine similarity matrix (GEPs x GEPs)

In [None]:
all_spectra = []     # one Series (indexed by gene) per program
program_labels = []  # matching "<cohort>_k<i>" label per program
density_threshold = 0.1

# File names encode the density threshold with '.' replaced by '_'.
dt_tag = str(density_threshold).replace('.', '_')

for cohort in cohorts:

    # locate the spectra-score file of the K selected for this cohort
    cohort_dir = os.path.join(output_directory, cohort)
    with open(os.path.join(cohort_dir, name, "selected_K.txt")) as f:
        selected_K = int(f.read().strip())
    spectra_path = os.path.join(
        cohort_dir, name, f"{name}.gene_spectra_score.k_{selected_K}.dt_{dt_tag}.txt"
    )

    # transpose after loading so that rows are genes and columns are programs
    df = pd.read_csv(spectra_path, sep="\t", index_col=0).T
    for i in range(df.shape[1]):
        program_labels.append(f"{cohort}_k{i}")  # positional, 0-based index
        all_spectra.append(df.iloc[:, i])

In [None]:
# convert geps x genes
spectra_matrix = pd.DataFrame(all_spectra).T
spectra_matrix.columns = program_labels

# cosine similarity (gep x gep)
cos_sim_matrix = cosine_similarity(spectra_matrix.T)
cos_sim_df = pd.DataFrame(cos_sim_matrix, index=program_labels, columns=program_labels)

In [None]:
# save
# Folder for the cross-cohort outputs; exist_ok keeps the cell safe to re-run
# and avoids the check-then-create race of os.path.exists() + os.mkdir().
directory = os.path.join(output_directory, "2000hvg")
os.makedirs(directory, exist_ok=True)

# Similarity matrix, program labels (one per line) and the spectra matrix
np.save(f"{directory}/cosine_similarity_matrix.npy", cos_sim_matrix)
with open(f"{directory}/program_labels.txt", "w") as f:
    f.writelines(f"{label}\n" for label in program_labels)
spectra_matrix.to_csv(f"{directory}/spectra_matrix.csv")

### 6.2 Hierarchical clustering 

In [None]:
# Set negative similarities to 0.
cos_sim_matrix = np.clip(cos_sim_matrix, 0, None)
# Rebuild the labelled frame from the clipped values. Relying on the in-place
# edit of the array to reach cos_sim_df only works when pandas shares memory
# with the array, which depends on the pandas version / copy-on-write mode.
cos_sim_df = pd.DataFrame(cos_sim_matrix, index=program_labels, columns=program_labels)

# NOTE(review): scipy's linkage() interprets a 2-D input as observation vectors,
# not as a precomputed distance matrix (that requires the condensed form, e.g.
# via scipy.spatial.distance.squareform). Each row of (1 - similarity) is thus
# treated as a feature vector here. Left unchanged because the threshold and the
# consensus labels used in section 6.3 were chosen for this tree — confirm intent.
linkage_matrix = linkage(1 - cos_sim_matrix, method='ward')

# Heatmap, rows and columns ordered by the same tree
sns.clustermap(cos_sim_df, row_linkage=linkage_matrix, col_linkage=linkage_matrix,
               cmap="Reds", figsize=(22, 22))

plt.suptitle("Cosine similarity between GEPs across cohorts (negatives zeroed)")
plt.show()

### 6.3 Cophenetic distance threshold

In [None]:
# Distance at which the tree is cut into flat clusters.
threshold = 3
clusters = fcluster(linkage_matrix, t=threshold, criterion='distance')

# cluster id -> labels of the programs assigned to it
# (clusters[i] is the id of program_labels[i])
cluster_dict = defaultdict(list)
for position, clust_id in enumerate(clusters):
    cluster_dict[clust_id].append(program_labels[position])

In [None]:
# extract cohort name using '_k' as stop string
cohort_names = [col.split("_k")[0] for col in cos_sim_df.columns]

# dictionary: unique color for each cohort
unique_cohorts = sorted(set(cohort_names))
palette = sns.color_palette("tab20", len(unique_cohorts))  
cohort_colors_dict = {cohort: palette[i] for i, cohort in enumerate(unique_cohorts)}

# apply colors
cohort_colors = [cohort_colors_dict[cohort] for cohort in cohort_names]

g = sns.clustermap(cos_sim_df,
                   row_linkage=linkage_matrix,
                   col_linkage=linkage_matrix,
                   row_colors=cohort_colors,
                   col_colors=cohort_colors,
                   cmap="Reds",
                   figsize=(13, 13),
                   dendrogram_ratio=(0.0001, 0.0001),  # remove dendrograms
                   cbar_pos=None  # disabilita la colorbar automatica
                  )

g.ax_heatmap.set_xticks([])
g.ax_heatmap.set_yticks([])

# legend
legend_handles = [Patch(facecolor=cohort_colors_dict[c], label=c) for c in unique_cohorts]
g.ax_heatmap.legend(handles=legend_handles, title="Cohort", loc='lower right', bbox_to_anchor=(-0.05, 0.0), fontsize=12)

# horizontal lines for consensus programs
ordered_row_indices = g.dendrogram_row.reordered_ind
ordered_clusters = np.array(clusters)[ordered_row_indices]
boundaries = np.where(np.diff(ordered_clusters) != 0)[0]
for b in boundaries:
    g.ax_heatmap.hlines(b + 1, *g.ax_heatmap.get_xlim(), colors='black', linewidth=1.5)

# Consensus programs labels
boundaries_ext = np.concatenate([[-1], boundaries, [len(ordered_clusters) - 1]])
centers = [(boundaries_ext[i] + boundaries_ext[i + 1]) / 2 +1 for i in range(len(boundaries_ext) - 1)]
consensus_labels = ['- Cell cycle (activity)', #1
                   '- Immunosuppressive response - TAMs/LAMs (activity)', #2
                   '- Immunosuppressive response - M2-like Macrophages (identity)', #3
                   '- Immunosuppressive Macrophage Metabolism (activity)', #4
                   '- Mast cell (identity)', #5
                   '- Plasma / B cell activation (activity)', #6
                   '- Inflammatory - Neutrophils & Monocytes (mixed)', #7
                   '- Epithelial interaction (activity)', #8
                   '- cDC (identity)', #9
                   '- T cells activation (activity)', #10
                   '- Metabolism (activity)', #11
                   '- HS-UPR-stress response (activity)', #12
                   '- IFN response (activity)', #13
                   '- Hypoxia - metallic stress (activity)', #14
                   '- EMT (activity)', #15
                   '- DC mature (identity)', #16
                   '- pDC (identity)'] #17

for y, label in zip(centers, consensus_labels):
    g.ax_heatmap.text(x=g.ax_heatmap.get_xlim()[1] + 2,
                      y=y,
                      s=label,
                      va='center',
                      ha='left',
                      fontsize=12)


# colorbar
norm = plt.Normalize(vmin=cos_sim_df.values.min(), vmax=cos_sim_df.values.max())
sm = plt.cm.ScalarMappable(cmap="Reds", norm=norm)
sm.set_array([])
cbar_ax = g.fig.add_axes([-0.09, 0.55, 0.02, 0.25]) 
cb = g.fig.colorbar(sm, cax=cbar_ax, orientation='vertical')
cbar_ax.tick_params(labelsize=10)
cbar_ax.set_title("Cosine similarity", fontsize=12, pad=15)
g.fig.colorbar(sm, cax=cbar_ax, orientation='vertical')

# layout 
g.fig.subplots_adjust(left=0.05, right=0.95, top=0.93, bottom=0.1)
plt.show()

g.fig.savefig(f"{directory}/clustermap.pdf", format="pdf", bbox_inches='tight')
g.fig.savefig(f"{directory}/clustermap.svg", format="svg", bbox_inches='tight')

### 6.4 Top genes in GEPs

In [None]:
# Consensus spectrum of a cluster = per-gene mean over its member programs.
# Clusters with a single program are skipped.
consensus_programs = {
    f"consensus_{cluster_id}": spectra_matrix[programs].mean(axis=1)
    for cluster_id, programs in cluster_dict.items()
    if len(programs) >= 2
}

In [None]:
top_genes_dict = {}
# The loop variable is deliberately not called `name`: that would overwrite the
# notebook-level cNMF run name that other cells use to build file paths.
for program_name, consensus in consensus_programs.items():
    # Top 100 genes by descending consensus score
    top_genes = consensus.sort_values(ascending=False).head(100).index.tolist()
    top_genes_dict[program_name] = top_genes

# Transform into a DataFrame with one column per consensus program
consensus_top_genes_df = pd.DataFrame.from_dict(
    top_genes_dict, orient="index", columns=[f"gene_{i+1}" for i in range(100)]
)
consensus_top_genes_df = consensus_top_genes_df.T.copy()

In [None]:
# map ENSG -> GeneSymbol
# Mapping from the var index to the 'GeneSymbol' column
# (presumably ENSG id -> gene symbol — confirm against adata.var)
gene_map = adata.var["GeneSymbol"].to_dict()

top_genes_dict = {}
# The loop variable is deliberately not called `name`: that would overwrite the
# notebook-level cNMF run name that other cells use to build file paths.
for program_name, consensus in consensus_programs.items():
    # top 100 by descending consensus score
    top_ensg = consensus.sort_values(ascending=False).head(100).index.tolist()

    # if symbol not found, keep the original identifier
    top_symbols = [gene_map.get(ensg, ensg) for ensg in top_ensg]
    top_genes_dict[program_name] = top_symbols

# One row per consensus program here; transposed for display and for saving.
consensus_top_genes_df = pd.DataFrame.from_dict(
    top_genes_dict, orient="index", columns=[f"gene_{i+1}" for i in range(100)]
)

consensus_top_genes_df.T.head(20)

In [None]:
# save
# Saved transposed: one column per consensus program, one row per gene rank.
consensus_top_genes_df.T.to_csv(f"{directory}/cos_similarity_consensus_top_genes_df.csv")

In [None]:
# map ENSG in symbols
# Same consensus spectra, re-indexed through gene_map (for GSEA);
# identifiers missing from the map are kept unchanged.
consensus_programs_symbols = {
    program: series.rename(index=lambda ensg: gene_map.get(ensg, ensg))
    for program, series in consensus_programs.items()
}

In [None]:
# save
# Serialize the dict of symbol-indexed Series. Pickle files should only be
# loaded back from trusted locations.
with open(f"{directory}/consensus_programs_symbols.pkl", "wb") as f:
    pickle.dump(consensus_programs_symbols, f)

## 7. Show programs usage

In [None]:
all_usage = []  # one row-normalized cells x GEPs DataFrame per cohort
density_threshold = 0.1
name = 'cNMF_2000hvg'
directory = os.path.join(output_directory, "2000hvg")

# File names encode the density threshold with '.' replaced by '_'.
dt_tag = str(density_threshold).replace('.', '_')

for cohort in cohorts:
    cohort_dir = os.path.join(output_directory, cohort)
    with open(os.path.join(cohort_dir, name, "selected_K.txt")) as f:
        selected_K = int(f.read().strip())

    usage_path = os.path.join(
        cohort_dir, name, f"{name}.usages.k_{selected_K}.dt_{dt_tag}.consensus.txt"
    )

    df = pd.read_csv(usage_path, sep="\t", index_col=0)  # cells x geps
    # Same positional "<cohort>_k<i>" labels as used for the spectra.
    df.columns = [f"{cohort}_k{i}" for i in range(df.shape[1])]
    # Scale every cell (row) so that its usages sum to 1.
    all_usage.append(df.div(df.sum(axis=1), axis=0))

In [None]:
# Sanity check: one usage table per cohort, each of shape (cells, GEPs).
print(f"Number of DataFrames: {len(all_usage)}")

for i, df in enumerate(all_usage):
    print(f"DataFrame {i}: shape = {df.shape}")


In [None]:
# Cluster id -> member GEPs, keyed by the "consensus_<id>" naming used elsewhere.
consensus_to_gep = defaultdict(list)
for k, v in cluster_dict.items():
    consensus_to_gep[f"consensus_{int(k)}"] = v

# Invert dict: GEP -> consensus
gep_to_consensus = {
    gep: consensus
    for consensus, geps in consensus_to_gep.items()
    for gep in geps
}

# List of the dataframes cells x consensus
aggregated_dfs = []

for df in all_usage:
    df_renamed = df.copy()
    # Rename columns with consensus ID (raises KeyError for a GEP without a cluster)
    df_renamed.columns = [gep_to_consensus[col] for col in df_renamed.columns]
    # Sum columns with same consensus. groupby(..., axis=1) is deprecated in
    # recent pandas, so group the transposed frame by its labels instead.
    df_grouped = df_renamed.T.groupby(level=0).sum().T
    aggregated_dfs.append(df_grouped)

# Merge in a dataframe: rows are cells of all cohorts, columns the union of
# consensus IDs. A consensus absent from a cohort shows up as NaN.
usage_consensus = pd.concat(aggregated_dfs, axis=0)
# Column labels are already unique after the concat, so the former column-wise
# groupby-sum only turned those NaN into 0; do that explicitly.
usage_consensus_grouped = usage_consensus.fillna(0)

# Order according consensus ID (numeric, not lexicographic)
usage_consensus_grouped = usage_consensus_grouped.reindex(
    sorted(usage_consensus_grouped.columns, key=lambda x: int(x.split('_')[1])),
    axis=1
)

# Row sum to 1
usage_consensus_grouped_normalized = usage_consensus_grouped.div(usage_consensus_grouped.sum(axis=1), axis=0)

### 7.1 Cell type usage

#### 7.1.1 Fine labels

In [None]:
# Long format: one row per (consensus program, cell) pair, annotated with the
# cell's fine label and dataset.
usage_norm_forplot = usage_consensus_grouped_normalized.unstack().reset_index()
usage_norm_forplot = pd.merge(left=usage_norm_forplot, right=adata.obs[['cell_type_fine', 'dataset']], left_on='level_1', right_index=True)
# NOTE(review): the 'Donor' column (and legend title) holds the `dataset` value.
usage_norm_forplot.columns = ['GEP', 'cell', 'Usage', 'Cell Type', 'Donor']

cluster_order = adata.obs['cell_type_fine'].unique().tolist()

# squeeze=False always returns a 2-D array of axes, so the loop also works
# when there is a single cell type.
(fig,axes) = plt.subplots(len(cluster_order),1, figsize=(20,20), dpi=200, gridspec_kw={'hspace':0.8}, squeeze=False)
axes = axes[:, 0]
plt.rcParams.update({'font.size': 14})

last_panel = len(cluster_order) - 1
for i,k in enumerate(cluster_order):
    ax = axes[i]
    subset = usage_norm_forplot.loc[usage_norm_forplot['Cell Type'] == k, :]
    sns.boxplot(x='GEP', y='Usage', hue='Donor', ax=ax, data=subset, fliersize=1)
    ax.set_title(k, fontsize=16)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_ylim(0, 1)
    ax.tick_params(axis='y', labelsize=10)

    # Program names only under the bottom panel
    if i != last_panel:
        ax.set_xticklabels([])
    else:
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right', fontsize=14)

    # Keep a single legend (top panel); drop the ones seaborn adds elsewhere
    if i == 0:
        ax.legend(bbox_to_anchor=(1,1))
    else:
        panel_legend = ax.get_legend()
        if panel_legend is not None:
            panel_legend.remove()

fig.text(0.5, 0.02, 'Consensus Program', ha='center', fontsize=16)
fig.text(0.06, 0.5, 'Normalized Usage', va='center', rotation='vertical', fontsize=16)

# Save
fig.savefig(f"{directory}/usage_by_celltype_fine.pdf", format="pdf", bbox_inches="tight")
fig.savefig(f"{directory}/usage_by_celltype_fine.svg", format="svg", bbox_inches="tight")

#### 7.1.2 Middle labels

In [None]:
# Long format: one row per (consensus program, cell) pair, annotated with the
# cell's middle-level label and dataset.
usage_norm_forplot_middle = usage_consensus_grouped_normalized.unstack().reset_index()
usage_norm_forplot_middle = pd.merge(left=usage_norm_forplot_middle, right=adata.obs[['cell_type_middle', 'dataset']], left_on='level_1', right_index=True)
# NOTE(review): the 'Donor' column (and legend title) holds the `dataset` value.
usage_norm_forplot_middle.columns = ['GEP', 'cell', 'Usage', 'Cell Type', 'Donor']

cluster_order = adata.obs['cell_type_middle'].unique().tolist()

# squeeze=False always returns a 2-D array of axes, so the loop also works
# when there is a single cell type.
(fig,axes) = plt.subplots(len(cluster_order),1, figsize=(20,10), dpi=200, gridspec_kw={'hspace':.8}, squeeze=False)
axes = axes[:, 0]

last_panel = len(cluster_order) - 1
for i,k in enumerate(cluster_order):
    ax = axes[i]
    subset = usage_norm_forplot_middle.loc[usage_norm_forplot_middle['Cell Type'] == k, :]
    sns.boxplot(x='GEP', y='Usage', hue='Donor', ax=ax, data=subset, fliersize=1)
    ax.set_title(k)
    ax.set_xlabel('')
    ax.set_ylabel('')

    # Program names only under the bottom panel
    if i != last_panel:
        ax.set_xticklabels([])
    else:
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

    # Keep a single legend (top panel); drop the ones seaborn adds elsewhere
    if i == 0:
        ax.legend(bbox_to_anchor=(1,1))
    else:
        panel_legend = ax.get_legend()
        if panel_legend is not None:
            panel_legend.remove()

### 7.2 Cumulative usage

In [None]:
# Column sums of the row-normalized usage: total usage of each consensus
# program over all cells, largest first.
total_usage = usage_consensus_grouped_normalized.sum(axis=0)
total_usage_sorted = total_usage.sort_values(ascending=False)

plt.figure(figsize=(12, 6))
ax = sns.barplot(x=total_usage_sorted.index, y=total_usage_sorted.values, palette='viridis')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_ylabel('Total normalized usage across all cells')
ax.set_xlabel('Consensus programs')
ax.set_title('Cumulative usage of consensus programs')

plt.tight_layout()
plt.show()

### 7.3 UMAP

In [None]:
# order cells as in adata
consensus_df_aligned = usage_consensus_grouped_normalized.loc[adata.obs_names]

# add consensus in .obs
for col in consensus_df_aligned.columns:
    adata.obs[col] = consensus_df_aligned[col]

In [None]:
# NOTE(review): `adata` here is the object loaded from corrected_counts.h5ad in
# section 5.1. The return values are unused, so these calls presumably modify
# adata.X in place; re-running the cell would then transform the data a second
# time — run it once per kernel session.
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

In [None]:
# Keep a copy of the transformed matrix in its own layer.
adata.layers["logcounts"] = adata.X.copy()

In [None]:
# PCA with a fixed random state for reproducibility.
sc.tl.pca(adata, random_state=1)

In [None]:
# Neighbourhood graph with 30 neighbours per cell and a fixed random state.
sc.pp.neighbors(adata, n_neighbors=30, random_state=1)

In [None]:
# UMAP embedding with a fixed random state.
sc.tl.umap(adata, min_dist=0.1, spread=1, random_state=1)

In [None]:
# UMAP coloured by dataset and by fine cell type (two panels).
sc.pl.umap(adata, color=['dataset','cell_type_fine'], wspace=0.5)

In [None]:
# One UMAP panel per consensus program, coloured by the usage columns that were
# added to adata.obs above.
sc.pl.umap(adata, color=consensus_df_aligned.columns)