# Marker Determination

In [None]:
import sklearn as sk
import anndata as ad
import scanpy as sc 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import joblib

# Parallelism setting for scanpy (-1 presumably means "use all cores",
# following the joblib convention -- TODO confirm for the installed version).
sc.settings.n_jobs = -1

# Every relative path used below (data/, spreadsheets/, figures/) is resolved
# against this project directory.
os.chdir("/project/hipaa_ycheng11lab/atlas/CAMR2024/")

# Number of top markers per class; it is interpolated into the file names of
# the marker spreadsheets read further down.
number_of_features = 20

# Override the DotPlot class-level defaults for the whole session.
sc.plotting.DotPlot.DEFAULT_SAVE_PREFIX = "figures/3_dotplot_"
sc.plotting.DotPlot.DEFAULT_LARGEST_DOT = 200.0

In [None]:
# Load the modeling input written by the previous step of the pipeline.
adata = ad.read_h5ad("data/2_camr_modeling_input.h5ad")

# Gene symbols of the modeled features, as plain strings.
gene_names = adata.var["feature_name"].astype(str)

# For every gene of the raw matrix: is it also one of the modeled features?
highly_variable = adata.raw.var["feature_name"].isin(list(gene_names))

adata

## Major Class Markers

In [None]:
# Analysis level; interpolated into the spreadsheet and figure names below.
analysis_name = 'majorclass'

In [None]:
# Top one-vs-rest logistic-regression markers for the current analysis level.
top_features_log_reg = pd.read_csv(
    f'spreadsheets/2_ovr_LogReg_{analysis_name}_AbsTop{number_of_features}Markers.txt',
    sep="\t",
)

# Keep positive coefficients only (negative markers are less useful) and index
# the rows by gene while leaving the "Gene" column in place.
top_features_log_reg_pos = (
    top_features_log_reg
    .loc[top_features_log_reg["Coefficient"] > 0]
    .set_index("Gene", drop=False)
)
top_features_log_reg_pos

In [None]:
# Mean raw expression table for this analysis level; the first column of the
# file becomes the row index.
raw_mean_expression = pd.read_csv(
    f'spreadsheets/raw_meanExpression_{analysis_name}.txt',
    sep="\t",
    index_col=0,
)
raw_mean_expression

In [None]:
# Gene-intrinsic filter: a gene is kept when it is a positive regression
# marker AND its annotated feature length is at least 960.
in_regression = (
    adata.var["feature_name"]
    .astype(str)
    .isin(top_features_log_reg_pos["Gene"])
)
long_enough = adata.var["feature_length"].astype(int) >= 960  # deliberately conservative cut-off

keep_genes = in_regression & long_enough
# Apply the mask positionally (as a plain boolean array) to the gene symbols.
kept_gene_names = gene_names.loc[keep_genes.to_numpy()].tolist()
print(len(kept_gene_names), kept_gene_names) # 216 genes

In [None]:
# Expression-based filter.
# adata.obs.library_platform.unique() reports a mix of 4 chemistries, so
# possibly the strictest thresholds should be used?

# count_limit = 0.1 # Absolute detection limit
count_lowcluster = 4
count_highcluster = 100

# A gene is detectable when at least one row of the mean-expression table
# reaches the low threshold ...
detectable_genes = (raw_mean_expression >= count_lowcluster).any(axis=0)
# ... and is rejected when any row exceeds the high threshold.
optical_crowding_genes = (raw_mean_expression > count_highcluster).any(axis=0)

is_expression_candidate = detectable_genes & ~optical_crowding_genes

# NOTE(review): the mask is applied positionally, which assumes the columns of
# raw_mean_expression are in the same order as adata.var -- confirm.
expression_candidates = gene_names.loc[is_expression_candidate.to_numpy()].tolist()
print(len(expression_candidates), expression_candidates) # 428 genes

In [None]:
# Genes that pass both the gene-metadata filter and the expression filter
# (np.intersect1d returns the sorted, de-duplicated common values).
final_candidates = np.intersect1d(
    expression_candidates,
    kept_gene_names,
)
print(len(final_candidates), final_candidates) # 172 genes

In [None]:
# Marker rows of every gene that survived the filters, ordered by major class
# and then by class name, written out as a tab-separated table (the gene
# index is not written; the "Gene" column carries the same values).
ordered_features = (
    top_features_log_reg_pos
    .loc[final_candidates.tolist()]
    .sort_values(by=["Major_Name", "Name"])
)
ordered_features.to_csv(
    f'spreadsheets/3_ovr_LogReg_{analysis_name}_xeniumFiltered.txt',
    sep="\t",
    index=False,
)
ordered_features

In [None]:
# Dot plot of the filtered major-class markers, one row per major class.
# A gene that is a positive marker for more than one class occurs several
# times in ordered_features, and sc.pl.dotplot does not cope with repeated
# genes. Index.unique() keeps the first occurrence of each gene, so the
# class-sorted order of the markers is preserved.
# The colour scale runs from half the detection threshold up to the threshold.
sc.pl.dotplot(adata,
              var_names = ordered_features.index.unique(),
              gene_symbols="feature_name",
              groupby = 'majorclass',
              categories_order = adata.obs["majorclass"].cat.categories.sort_values(),
              vmax = count_lowcluster,
              vmin = count_lowcluster / 2,
              save = f"mouseRetina_{analysis_name}_xeniumFiltered." +
                     f"{count_lowcluster}-{count_highcluster}.pdf")

## Subtype Markers

In [None]:
# Switch the analysis level to subtypes; interpolated into the spreadsheet and
# figure names below.
analysis_name = 'minorclass'

In [None]:
# Top one-vs-rest logistic-regression markers at the subtype level.
top_features_log_reg_sub = pd.read_csv(
    f'spreadsheets/2_ovr_LogReg_{analysis_name}_AbsTop{number_of_features}Markers.txt',
    sep="\t",
)

# Positive coefficients only, indexed by gene with the "Gene" column retained.
top_features_log_reg_pos_sub = (
    top_features_log_reg_sub
    .loc[top_features_log_reg_sub["Coefficient"] > 0]
    .set_index("Gene", drop=False)
)
top_features_log_reg_pos_sub

In [None]:
# Mean raw expression table at the subtype level; the first column of the
# file becomes the row index.
raw_mean_minorclass_expression = pd.read_csv(
    f'spreadsheets/raw_meanExpression_{analysis_name}.txt',
    sep="\t",
    index_col=0,
)
raw_mean_minorclass_expression

In [None]:
# Gene-intrinsic filter for the subtype markers: positive regression marker
# AND annotated feature length of at least 960.
in_regression = (
    adata.var["feature_name"]
    .astype(str)
    .isin(top_features_log_reg_pos_sub["Gene"])
)
long_enough = adata.var["feature_length"].astype(int) >= 960  # deliberately conservative cut-off

keep_genes = in_regression & long_enough
# Apply the mask positionally (as a plain boolean array) to the gene symbols.
kept_gene_names = gene_names.loc[keep_genes.to_numpy()].tolist()
print(len(kept_gene_names), kept_gene_names) # 680 genes

In [None]:
# Expression-based filter at the subtype level.
# adata.obs.library_platform.unique() # mix of 4 chemistries...

# count_limit = 0.1 # Absolute detection limit
count_lowcluster = 4
count_highcluster = 100

# Detectable: at least one row of the table reaches the low threshold.
detectable_genes = (raw_mean_minorclass_expression >= count_lowcluster).any(axis=0)
# Rejected: any row exceeds the high threshold.
optical_crowding_genes = (raw_mean_minorclass_expression > count_highcluster).any(axis=0)

is_expression_candidate = detectable_genes & ~optical_crowding_genes

# NOTE(review): positional masking assumes the table's columns are ordered
# like adata.var -- confirm.
expression_candidates = gene_names.loc[is_expression_candidate.to_numpy()].tolist()
print(len(expression_candidates), expression_candidates) # 669 genes

In [None]:
# Subtype genes that pass both the gene-metadata and the expression filter
# (sorted and de-duplicated by np.intersect1d).
final_candidates = np.intersect1d(
    expression_candidates,
    kept_gene_names,
)
print(len(final_candidates), final_candidates) # 479 genes

In [None]:
# Marker rows of the surviving subtype genes, ordered by major class and then
# by class name, written out as a tab-separated table without the index.
ordered_features_sub = (
    top_features_log_reg_pos_sub
    .loc[final_candidates.tolist()]
    .sort_values(by=["Major_Name", "Name"])
)
ordered_features_sub.to_csv(
    f'spreadsheets/3_ovr_LogReg_{analysis_name}_xeniumFiltered.txt',
    sep="\t",
    index=False,
)
ordered_features_sub

In [None]:
# Memory cleanup for plotting

# Drop the intermediate tables and masks of the subtype filtering; the plot
# below only needs adata and ordered_features_sub.
del top_features_log_reg_sub, raw_mean_minorclass_expression, in_regression, long_enough, keep_genes, kept_gene_names, detectable_genes, optical_crowding_genes, is_expression_candidate, expression_candidates, final_candidates

# Strip adata down to what the dot plot reads: no raw layer, one obs column
# to group by, and the two gene-annotation columns.
adata.raw = None
adata.obs = adata.obs.loc[:, ["author_cell_type"]]
adata.var = adata.var.loc[:, ["gene_symbols", "feature_name"]]

import gc
import ctypes

gc.collect() # Free memory

# malloc_trim is a glibc extension, so "libc.so.6" only exists on glibc-based
# Linux. Trimming is purely an optimisation: skip it elsewhere instead of
# aborting the cell.
try:
    libc = ctypes.CDLL("libc.so.6") # clearing cache
    libc.malloc_trim(0)
except (OSError, AttributeError):
    print("malloc_trim is not available on this platform; skipping heap trim")

In [None]:
# Memory-hungry: every filtered subtype marker is drawn in one very wide
# figure, grouped by the author-provided cell type. The colour scale runs from
# half the detection threshold up to the threshold.
sc.pl.dotplot(
    adata,
    var_names=ordered_features_sub.index.unique(),
    groupby="author_cell_type",
    gene_symbols="feature_name",
    vmin=count_lowcluster / 2,
    vmax=count_lowcluster,
    figsize=(40, 2),
    save=f"mouseRetina_{analysis_name}_xeniumFiltered.{count_lowcluster}-{count_highcluster}.png",
)

### Subtype Marker Better Plots

Requirements:

* adata
* final_candidates_ordered
* final_candidates_ordered_sub
* count_lowcluster
* count_highcluster

In [27]:
# Run this if starting from scratch!
# Rebuilds everything the cells below need (working directory, plot settings,
# thresholds, adata and the filtered marker tables) so that a fresh kernel
# behaves like a top-to-bottom run of the notebook.

import sklearn as sk
import anndata as ad
import scanpy as sc 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import joblib

sc.settings.n_jobs = -1

# The relative data/ and spreadsheets/ paths below only resolve from the
# project root, so set it here exactly as the first cell of the notebook does.
os.chdir("/project/hipaa_ycheng11lab/atlas/CAMR2024/")

# Same DotPlot defaults as the first cell, so figures produced after a restart
# get the same file prefix and dot size as those of a full run.
sc.plotting.DotPlot.DEFAULT_SAVE_PREFIX = "figures/3_dotplot_"
sc.plotting.DotPlot.DEFAULT_LARGEST_DOT = 200.0

# Thresholds and analysis level; they are only used for plot scaling and
# output file names from here on.
count_lowcluster = 4
count_highcluster = 100
analysis_name = 'minorclass'

adata = ad.read_h5ad('data/2_camr_modeling_input.h5ad')
adata.var.index = adata.var["feature_name"] # subset on genes instead of booleans

# Filtered marker tables written by the two sections above.
final_candidates_ordered = pd.read_csv('spreadsheets/3_ovr_LogReg_majorclass_xeniumFiltered.txt', sep = '\t')
final_candidates_ordered_sub = pd.read_csv('spreadsheets/3_ovr_LogReg_minorclass_xeniumFiltered.txt', sep = '\t')

In [None]:
# Distinct class names that own at least one filtered marker, in order of
# first appearance in each table.
majorclass = pd.unique(final_candidates_ordered["Name"])
minorclass = pd.unique(final_candidates_ordered_sub["Name"])  # not needed further down
print(majorclass, minorclass)

In [28]:
# Conversion table with one author_cell_type / majorclass pair per row; the
# first column of the file becomes the index.
subtype_to_type = pd.read_csv(
    'spreadsheets/conversion_tables/2_minorToMajorClass.txt',
    sep="\t",
    index_col=0,
)
subtype_to_type

  subtype_to_type = adata.obs[["author_cell_type", "majorclass"]].groupby("author_cell_type").head(1)
  subtype_to_type.groupby("majorclass").agg("count").to_csv('spreadsheets/number_of_subtypes.csv')


Unnamed: 0,author_cell_type,majorclass
10x3_Ms_WT_P14_AAACCCAAGGGATCTG-1,RBC,BC
10x3_Ms_WT_P14_AAACCCAGTAGCTAAA-1,Rod,Rod
10x3_Ms_WT_P14_AAACCCATCGGCCCAA-1,MG,MG
10x3_Ms_WT_P14_AAACGAAAGACTCTAC-1,BC6,BC
10x3_Ms_WT_P14_AAACGAACAATGTTGC-1,BC5A,BC
...,...,...
rgc_T8_MPTK_2-CATCGTCGTACAAGCG-1,AC_54,AC
MouseACS2_CCGTGGAGTGCAACGA-1,AC_53,AC
MouseACS5_GGTGTTACAGGTCCAC-1,AC_62,AC
CTRLC57AllOther1_GCATGTACAGTCGTGC-1,Astrocyte,Astrocyte


In [None]:
# Strip adata down to what the per-class dot plots read: no raw layer, the two
# grouping columns, and the two gene-annotation columns.
adata.raw = None
adata.obs = adata.obs.loc[:, ["majorclass", "author_cell_type"]]
adata.var = adata.var.loc[:, ["gene_symbols", "feature_name"]]

# Index genes by symbol so adata can be subset with gene names, then make
# repeated symbols unique.
adata.var.index = adata.var["feature_name"] # subset on genes instead of booleans
adata.var_names_make_unique()

import gc
import ctypes

gc.collect() # Free memory

# malloc_trim is a glibc extension, so "libc.so.6" only exists on glibc-based
# Linux. Trimming is purely an optimisation: skip it elsewhere instead of
# aborting the cell.
try:
    libc = ctypes.CDLL("libc.so.6") # clearing cache
    libc.malloc_trim(0)
except (OSError, AttributeError):
    print("malloc_trim is not available on this platform; skipping heap trim")

In [None]:
# One dot plot per major class: the cells of that class, grouped by subtype,
# against the markers of the class itself followed by the markers of its
# subtypes.
for cell in majorclass:

    # Markers of the major class. The rows are selected through the "Name"
    # column and the gene symbols are taken from "Gene": masking the whole
    # frame with a boolean frame only NaN-fills it and keeps every row, so its
    # .index would be the full integer row index rather than marker genes.
    cell_markers = final_candidates_ordered.loc[
        final_candidates_ordered["Name"] == cell, "Gene"
    ]

    # Subtypes belonging to this major class, and the markers listed for them.
    # NOTE(review): assumes "Name" in the subtype table uses the same labels
    # as author_cell_type in the conversion table -- confirm.
    subtypes = subtype_to_type.loc[subtype_to_type.majorclass == cell, "author_cell_type"].tolist()
    subtype_markers = final_candidates_ordered_sub.loc[
        final_candidates_ordered_sub["Name"].isin(subtypes), "Gene"
    ]

    markers = cell_markers.tolist() + subtype_markers.tolist()

    # sc.pl.dotplot throws a fit if there are duplicates; dict.fromkeys drops
    # repeats while keeping the first-seen order.
    unique_markers = list(dict.fromkeys(markers))

    sc.pl.dotplot(adata[adata.obs.majorclass == cell, unique_markers],
                  unique_markers,
                  gene_symbols="feature_name",
                  groupby = 'author_cell_type',
                  vmax = count_lowcluster * 3,
                  vmin = count_lowcluster - 1,
                  show = False,
                  save = f"mouseRetina_{analysis_name}-{cell}_xeniumFiltered." +
                         f"{count_lowcluster}-{count_highcluster}.pdf")

# Scratch