In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
# import multimil as mtm
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import anndata as ad

In [2]:
# Global scanpy plotting defaults: frameless axes, 100 dpi for displayed figures.
sc.set_figure_params(frameon=False, dpi=100)

In [4]:
# import cellrank as cr

In [86]:
# Load the full atlas as an AnnData object. The path is relative to the current
# working directory, so this must run before any os.chdir() call.
adata = sc.read_h5ad('../../Human_Atlas_Harmonised_genes_filtered.h5ad')

In [6]:
# Subset the atlas to cells whose Level_3 label contains "T Cell".
# na=False makes cells with a missing Level_3 label evaluate to False instead of
# NaN, so the mask is always a clean boolean vector usable for indexing.
is_t_cell = adata.obs.Level_3.str.contains('T Cell', na=False)
t_cells = adata[is_t_cell].copy()  # .copy() detaches the subset from the parent atlas

In [10]:
t_cells

In [13]:
# Drop the stored Level_4 palette from .uns.
# presumably so scanpy regenerates colors for the categories left in the subset — TODO confirm
# pop(..., None) instead of `del` keeps this cell re-runnable: a second run is a
# no-op rather than a KeyError.
t_cells.uns.pop('Level_4_colors', None)

In [14]:
t_cells.obs.Level_4.value_counts()

In [17]:
sc.pl.umap(t_cells, color='Level_4')

In [20]:
# PCA of the T-cell subset, computed from the 'log_norm' layer rather than .X.
sc.tl.pca(t_cells, layer='log_norm')

In [21]:
sc.pl.pca_variance_ratio(t_cells)

In [22]:
sc.pl.pca(t_cells, color='Technology')

In [18]:
# Rebuild the neighbor graph on the T-cell subset from the
# 'scanvi_extended_atlas_emb' representation (100 neighbors, cosine distance),
# then recompute the UMAP on that graph and color it by Level_4 label.
sc.pp.neighbors(t_cells, use_rep='scanvi_extended_atlas_emb', n_neighbors=100, metric='cosine')
sc.tl.umap(t_cells)
sc.pl.umap(t_cells, color='Level_4')

In [None]:
import os

# Save the T-cell subset to the exact path the "Reload" section reads back
# ('Doub_Pos_T_Cells/t_cells.h5ad'). Writing to the bare working directory would
# leave that read without a file after a fresh Restart & Run All.
os.makedirs('Doub_Pos_T_Cells', exist_ok=True)
t_cells.write('Doub_Pos_T_Cells/t_cells.h5ad')

# Reload

In [92]:
# Plot/display configuration for the reloaded session:
# frameless axes, 100 dpi on screen, 300 dpi for figures scanpy writes to disk.
sc.set_figure_params(frameon=False, dpi=100, dpi_save=300)
# Limit how much of a DataFrame is rendered in cell outputs.
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 20)

In [4]:
# Reload the saved T-cell subset; the path is relative to the current working directory.
t_cells = sc.read_h5ad('Doub_Pos_T_Cells/t_cells.h5ad')

In [33]:
t_cells.obs.columns

In [5]:
t_cells[t_cells.obs.Level_4.str.contains('Doub')].obs.TreatmentType.value_counts()

In [34]:
t_cells[t_cells.obs.Level_4.str.contains('Doub')].obs.Condition.value_counts()

In [5]:
# Rank genes for every Level_4 group using expression from the 'log_norm' layer.
# Results are stored in t_cells.uns['rank_genes_groups'].
# NOTE(review): no `method` is passed, so scanpy's default test is used — confirm
# which test that is for the installed version and consider pinning it explicitly.
sc.tl.rank_genes_groups(t_cells, groupby='Level_4', layer='log_norm')

In [None]:
# Export the ranked gene names (one column per Level_4 group) to CSV.
# Only the names are written — scores, fold changes and p-values are not included.
pd.DataFrame(t_cells.uns['rank_genes_groups']['names']).to_csv('T_Cells_DGE.csv')

In [18]:
pd.DataFrame(t_cells.uns['rank_genes_groups']['names'])['Double Positive CD4+CD8+ T Cell'].head(100)

In [7]:
t_cells

In [8]:
# Gene panel passed as `var_names` to the T-cell matrix plot and dot plot.
genes = "CD4 CD8B IL7R CCR7 TCF7 LEF1 TXNIP SESN3 ATP5D LDHB".split()

In [9]:
t_cells.obs.Level_4.unique()

In [73]:
# Build the volcano-plot table for one Level_4 group from the rank_genes_groups
# results stored on `t_cells`.
target_group = "Double Positive CD4+CD8+ T Cell"
genes_of_interest = ["CD4","CD8B","IL7R","CCR7","TCF7","LEF1","TXNIP","SESN3","ATP5D","LDHB"]

alpha = 0.05        # adjusted p-value cut-off for the significance flag
l2fc_thresh = 0.65  # minimum |log2 fold change| for the significance flag
clip_at = 5         # |log2FC| beyond this is clipped, for plotting only

df = sc.get.rank_genes_groups_df(t_cells, group=target_group).rename(
    columns={"names": "gene", "pvals_adj": "pval_adj", "logfoldchanges": "logfc"}
)

# Case-insensitive match against the highlighted gene panel.
selected_upper = {g.upper() for g in genes_of_interest}
df["selected"] = df["gene"].str.upper().isin(selected_upper)

# scanpy reports `logfoldchanges` as log2 fold changes already, so no change of
# base is applied. Dividing by ln(2) here would inflate every value by ~1.44x.
df["log2fc"] = df["logfc"]

# Keep every gene; extreme fold changes are clipped for display rather than dropped.
df_plot = df.copy()
df_plot["log2fc_clip"] = df_plot["log2fc"].clip(-clip_at, clip_at)

# y-axis and significance flag (use *unclipped* log2fc for significance)
# Adjusted p-values of exactly 0 are replaced by the smallest positive float so
# that -log10 stays finite.
df_plot["pval_adj"] = df_plot["pval_adj"].replace(0, np.nextafter(0, 1))
df_plot["neglog10_padj"] = -np.log10(df_plot["pval_adj"])
df_plot["sig"] = (df_plot["pval_adj"] < alpha) & (df_plot["log2fc"].abs() >= l2fc_thresh)

In [76]:
# Keep only genes whose clipped log2FC is strictly above -5; rows sitting at or
# below that value are removed from the plotting table.
above_floor = df_plot["log2fc_clip"] > -5
df_plot = df_plot.loc[above_floor].copy()

In [77]:
# Volcano plot of `df_plot`: all genes, significant genes, and the labelled
# genes of interest, with dashed guides at the significance thresholds.

# subsets
sig = df_plot[df_plot["sig"]]
sel = df_plot[df_plot["selected"]]

# plot (explicit figure/axes so the same figure object is saved and shown)
fig, ax = plt.subplots(figsize=(9, 7))
ax.scatter(df_plot["log2fc_clip"], df_plot["neglog10_padj"], s=8, alpha=0.25, label="All genes")
ax.scatter(sig["log2fc_clip"], sig["neglog10_padj"], s=12, alpha=0.8, label="Significant")
ax.scatter(sel["log2fc_clip"], sel["neglog10_padj"], s=40, alpha=0.95,
           label="Selected", edgecolor="black")

# Label each highlighted gene next to its point.
for gene, x, y in zip(sel["gene"], sel["log2fc_clip"], sel["neglog10_padj"]):
    ax.annotate(gene, (x, y), xytext=(4, 4), textcoords="offset points", fontsize=9)

# Threshold guides: adjusted p-value cut-off and +/- log2FC cut-off.
ax.axhline(-np.log10(alpha), linestyle="--", linewidth=1)
ax.axvline(-l2fc_thresh, linestyle="--", linewidth=1)
ax.axvline( l2fc_thresh, linestyle="--", linewidth=1)

ax.set_xlabel(f"log2FoldChange ({target_group} vs rest)")
ax.set_xlim(-4, 4)
ax.set_ylabel(r"$-\log_{10}$(adj. p-value)")
ax.set_title("Volcano with clipped log2FC (selected genes highlighted)")
ax.legend(frameon=False, loc="upper left", bbox_to_anchor=(1.02, 1.0), borderaxespad=0)
ax.grid(visible=False)
fig.tight_layout()

# Save BEFORE plt.show(): showing can close the current figure (the notebook
# inline backend does), after which a pyplot-level savefig writes an empty canvas.
# bbox_inches="tight" keeps the legend, anchored outside the axes, in the image.
fig.savefig('Doub_Pos_T_Cells/Volcano_Doub_T.png', dpi=300, bbox_inches="tight")
plt.show()

In [81]:
import os
# Switch the process-wide working directory: every relative path used after this
# cell is resolved against Doub_Pos_T_Cells/.
# NOTE(review): not re-runnable as-is — a second run looks for a nested
# Doub_Pos_T_Cells/ inside the current directory.
os.chdir('Doub_Pos_T_Cells/')

In [93]:
# Matrix plot of the `genes` panel per Level_4 group, from the 'log_norm' layer.
# standard_scale='var' rescales each gene; swap_axes=True swaps the genes/groups axes.
# NOTE(review): `save=` is a filename suffix handled by scanpy — verify the final
# file name and directory (presumably under ./figures) on disk.
sc.pl.matrixplot(t_cells, groupby='Level_4', var_names=genes, layer='log_norm', standard_scale='var', swap_axes=True, save='matrixplot.png')

In [94]:
# Dot plot of the same `genes` panel per Level_4 group, with the same layer,
# scaling and axis settings as the matrix plot.
# NOTE(review): `save=` is a filename suffix handled by scanpy — verify the output path.
sc.pl.dotplot(t_cells, groupby='Level_4', var_names=genes, layer='log_norm', standard_scale='var', swap_axes=True, save='dotplot.png')

# Macrophages

In [97]:
import os

# Create the macrophage output directory next to the current one and switch
# into it. exist_ok=True keeps this cell re-runnable: without it, makedirs
# raises FileExistsError as soon as the directory already exists.
os.makedirs('../CD3_Macrophages/', exist_ok=True)
os.chdir('../CD3_Macrophages/')

In [98]:
pwd

In [99]:
# Subset the atlas to cells whose Level_3 label contains "Macro".
# na=False makes cells with a missing Level_3 label evaluate to False instead of
# NaN, so the mask is always a clean boolean vector usable for indexing.
is_macrophage = adata.obs.Level_3.str.contains('Macro', na=False)
macro = adata[is_macrophage].copy()  # .copy() detaches the subset from the parent atlas

In [100]:
macro.obs.Level_4.value_counts()

In [101]:
# Drop the stored Level_4 palette from .uns.
# presumably so scanpy regenerates colors for the categories left in the subset — TODO confirm
# pop(..., None) instead of `del` keeps this cell re-runnable: a second run is a
# no-op rather than a KeyError.
macro.uns.pop('Level_4_colors', None)

In [102]:
sc.pl.umap(macro, color='Level_4')

In [24]:
# Rebuild the neighbor graph on the macrophage subset from the
# 'scanvi_extended_atlas_emb' representation (100 neighbors, cosine distance),
# then recompute the UMAP on that graph and color it by Level_4 label.
# NOTE(review): this cell's execution count is far lower than the cells around
# it, so its output may come from an earlier run — re-execute in order to confirm.
sc.pp.neighbors(macro, use_rep='scanvi_extended_atlas_emb', n_neighbors=100, metric='cosine')
sc.tl.umap(macro)
sc.pl.umap(macro, color='Level_4')

In [103]:
# Rank genes for every Level_4 macrophage group using expression from the
# 'log_norm' layer. Results are stored in macro.uns['rank_genes_groups'].
# NOTE(review): no `method` is passed, so scanpy's default test is used — confirm
# which test that is for the installed version and consider pinning it explicitly.
sc.tl.rank_genes_groups(macro, groupby='Level_4', layer='log_norm')

In [104]:
# Export the ranked gene names (one column per Level_4 group) to CSV in the
# current working directory. Only the names are written, not the statistics.
pd.DataFrame(macro.uns['rank_genes_groups']['names']).to_csv('Macro_DGE.csv')

In [107]:
pd.DataFrame(macro.uns['rank_genes_groups']['names'])['Macrophage - CD3+ TAM'].head(100)

In [108]:
# Save the macrophage subset, including the rank_genes_groups results in .uns,
# to the current working directory.
macro.write('macrophages.h5ad')

In [125]:
# Gene panel highlighted in the macrophage volcano plot and passed as
# `var_names` to the macrophage matrix plot and dot plot.
genes_of_interest = "CD3D CD3E CD3G TRAC TRBC1 TRBC2 GZMA CCL5 NKG7".split()

In [148]:
# Build the volcano-plot table for one Level_4 group from the rank_genes_groups
# results stored on `macro`.
target_group = "Macrophage - CD3+ TAM"
alpha = 0.05      # adjusted p-value cut-off for the significance flag
l2fc_thresh = 1   # minimum |log2 fold change| for the significance flag
clip_at = 10      # |log2FC| beyond this is clipped, for plotting only

df = sc.get.rank_genes_groups_df(macro, group=target_group).rename(
    columns={"names": "gene", "pvals_adj": "pval_adj", "logfoldchanges": "logfc"}
)

# Case-insensitive match against the highlighted gene panel.
selected_upper = {g.upper() for g in genes_of_interest}
df["selected"] = df["gene"].str.upper().isin(selected_upper)

# scanpy reports `logfoldchanges` as log2 fold changes already, so no change of
# base is applied. Dividing by ln(2) here would inflate every value by ~1.44x.
df["log2fc"] = df["logfc"]

# Keep every gene; extreme fold changes are clipped for display rather than dropped.
df_plot = df.copy()
df_plot["log2fc_clip"] = df_plot["log2fc"].clip(-clip_at, clip_at)

# y-axis and significance flag (use *unclipped* log2fc for significance)
# Adjusted p-values of exactly 0 are replaced by the smallest positive float so
# that -log10 stays finite.
df_plot["pval_adj"] = df_plot["pval_adj"].replace(0, np.nextafter(0, 1))
df_plot["neglog10_padj"] = -np.log10(df_plot["pval_adj"])
df_plot["sig"] = (df_plot["pval_adj"] < alpha) & (df_plot["log2fc"].abs() >= l2fc_thresh)

In [114]:
pd.set_option('display.max_rows', 10)

In [149]:
# Keep only genes whose clipped log2FC is strictly above -7.5; rows at or below
# that value are removed from the plotting table.
above_floor = df_plot["log2fc_clip"] > -7.5
df_plot = df_plot.loc[above_floor].copy()

In [150]:
df_plot[df_plot.selected]

In [151]:
# Volcano plot of `df_plot`: all genes, significant genes, and the labelled
# genes of interest, with dashed guides at the significance thresholds.

# subsets
sig = df_plot[df_plot["sig"]]
sel = df_plot[df_plot["selected"]]

# plot (explicit figure/axes so the same figure object is saved and shown)
fig, ax = plt.subplots(figsize=(9, 7))
ax.scatter(df_plot["log2fc_clip"], df_plot["neglog10_padj"], s=8, alpha=0.25, label="All genes")
ax.scatter(sig["log2fc_clip"], sig["neglog10_padj"], s=12, alpha=0.8, label="Significant")
ax.scatter(sel["log2fc_clip"], sel["neglog10_padj"], s=40, alpha=0.95,
           label="Selected", edgecolor="black")

# Label each highlighted gene next to its point.
for gene, x, y in zip(sel["gene"], sel["log2fc_clip"], sel["neglog10_padj"]):
    ax.annotate(gene, (x, y), xytext=(4, 4), textcoords="offset points", fontsize=9)

# Threshold guides: adjusted p-value cut-off and +/- log2FC cut-off.
ax.axhline(-np.log10(alpha), linestyle="--", linewidth=1)
ax.axvline(-l2fc_thresh, linestyle="--", linewidth=1)
ax.axvline( l2fc_thresh, linestyle="--", linewidth=1)

ax.set_xlabel(f"log2FoldChange ({target_group} vs rest)")
ax.set_xlim(-10, 10)
ax.set_ylabel(r"$-\log_{10}$(adj. p-value)")
ax.set_title("Volcano with clipped log2FC (selected genes highlighted)")
ax.legend(frameon=False, loc="upper left", bbox_to_anchor=(1.02, 1.0), borderaxespad=0)
ax.grid(visible=False)
fig.tight_layout()

# Save BEFORE plt.show(): showing can close the current figure (the notebook
# inline backend does), after which a pyplot-level savefig writes an empty canvas.
# bbox_inches="tight" keeps the legend, anchored outside the axes, in the image.
fig.savefig('Volcano_CD3_Macrophages.png', dpi=300, bbox_inches="tight")
plt.show()

In [152]:
# Matrix plot of `genes_of_interest` per Level_4 macrophage group, from the 'log_norm' layer.
# standard_scale='var' rescales each gene; swap_axes=True swaps the genes/groups axes.
# NOTE(review): `save=` is a filename suffix handled by scanpy — verify the final
# file name and directory (presumably under ./figures) on disk.
sc.pl.matrixplot(macro, groupby='Level_4', var_names=genes_of_interest, layer='log_norm', standard_scale='var', swap_axes=True, save='matrixplot.png')

In [153]:
# Dot plot of `genes_of_interest` per Level_4 macrophage group, with the same
# layer, scaling and axis settings as the matrix plot.
# NOTE(review): `save=` is a filename suffix handled by scanpy — verify the output path.
sc.pl.dotplot(macro, groupby='Level_4', var_names=genes_of_interest, layer='log_norm', standard_scale='var', swap_axes=True, save='dotplot.png')