# How Do I Match Markers To Their Cells?

In [1]:
import sklearn as sk
import anndata as ad
import scanpy as sc 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import joblib

sc.settings.n_jobs = -1

In [2]:
adata = ad.read_h5ad('data/camr_modeling_input.h5ad')
gene_names = adata.var["feature_name"].astype(str)

In [3]:
top_features_log_reg = pd.read_csv('spreadsheets/ovr_top_20_genes_by_cell_type_reproduction.csv')
top_features_log_reg_pos = top_features_log_reg[top_features_log_reg['Coefficient'] > 0]
top_features_log_reg_pos.index = top_features_log_reg_pos.Gene
top_features_log_reg_pos

Unnamed: 0_level_0,Cell Type,Gene,Coefficient
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Synpr,AC,Synpr,0.593647
Asic2,AC,Asic2,0.561311
Frmd5,AC,Frmd5,0.557747
Slc32a1,AC,Slc32a1,0.551331
Gad1,AC,Gad1,0.517364
...,...,...,...
Epb41,Rod,Epb41,0.272485
Pde6a,Rod,Pde6a,0.272482
Dmd,Rod,Dmd,0.270124
Tmem108,Rod,Tmem108,0.264541


In [4]:
# Average normalized expression
feature_expression_pd = pd.DataFrame(adata.X.toarray(), columns = gene_names.tolist())
feature_expression_pd["majorclass"] = adata.obs["majorclass"].tolist()
feature_expression_pd_mean = feature_expression_pd.groupby("majorclass").agg("mean")

In [6]:
# highly_variable = adata.raw.var['feature_name'].isin(adata.var['feature_name'])
raw_feature_expression_pd_mean = pd.read_csv('spreadsheets/raw_mean_variable_genes.csv', index_col=0)

## Filter

In [7]:
in_regression = adata.var["feature_name"].astype(str).isin(top_features_log_reg_pos["Gene"])
long_enough = adata.var["feature_length"].astype(int) >= 960 # It's a conservative filter

keep_genes = long_enough & in_regression
kept_gene_names = gene_names[keep_genes].tolist()
print(len(kept_gene_names), kept_gene_names) # 218 genes still

218 ['6330411D24Rik', 'Abca8a', 'Abhd2', 'Acsl3', 'Adgrl4', 'Aldoc', 'Anks1b', 'Apoe', 'Aqp4', 'Arhgap15', 'Arhgap42', 'Arl15', 'Arr3', 'Asic2', 'Atf3', 'Atp1a2', 'B3galt2', 'Bsg', 'C1qa', 'C1qb', 'C1qc', 'C1ql1', 'Calb1', 'Calb2', 'Cald1', 'Cartpt', 'Ccn1', 'Ccn2', 'Cd52', 'Cd53', 'Cd74', 'Cdh11', 'Cdh18', 'Cdr2', 'Chgb', 'Cldn5', 'Clstn2', 'Clu', 'Cmss1', 'Cnga1', 'Cngb3', 'Col8a1', 'Col9a1', 'Coro1a', 'Creb5', 'Crip1', 'Csmd3', 'Csrp1', 'Ctla2a', 'Ctsd', 'Ctss', 'Ctsz', 'Cxcl14', 'D130079A08Rik', 'Dbi', 'Dkk3', 'Dlc1', 'Dlgap1', 'Dmd', 'Dnajb1', 'Dock4', 'Ebf1', 'Eftud2', 'Egfem1', 'Egr1', 'Epb41', 'Epha6', 'Flt1', 'Fos', 'Fosb', 'Frmd5', 'Frmpd4', 'Gabra1', 'Gabrr2', 'Gad1', 'Galnt18', 'Galntl6', 'Gfap', 'Glra1', 'Glul', 'Gm20754', 'Gm32442', 'Gnat2', 'Gnb3', 'Gng5', 'Gngt1', 'Gngt2', 'Gpr37', 'Gria4', 'Grik1', 'Grik2', 'Grm5', 'Grm6', 'Grm8', 'Gsta4', 'Hexb', 'Hmgn1', 'Hs6st3', 'Hspa1a', 'Id1', 'Igfbp7', 'Il1rapl1', 'Il33', 'Itm2a', 'Junb', 'Kcnd2', 'Kcne2', 'Kctd16', 'Klf2', 'Lap

In [8]:
# count_limit = 0.1 # Absolute detection limit
count_lowcluster = 4
count_highcluster = 100

detectable_genes = (raw_feature_expression_pd_mean >= count_lowcluster).sum(axis=0) >= 1
optical_crowding_genes = (raw_feature_expression_pd_mean >= count_highcluster).sum(axis=0) > 0

is_expression_candidate = detectable_genes & (~optical_crowding_genes)

gene_names.index = is_expression_candidate.index # necessary step apparently
expression_candidates = gene_names[is_expression_candidate].tolist()
print(len(expression_candidates), expression_candidates) # 428 genes

428 ['4930447C04Rik', 'Abca8a', 'Abcb1a', 'Abcc9', 'Abcg2', 'Acsl3', 'Actb', 'Adamtsl1', 'Adarb2', 'Adgrl4', 'Aipl1', 'Aldoc', 'Anks1b', 'Aqp4', 'Arhgap15', 'Arhgap31', 'Arl15', 'Arr3', 'Asic2', 'Atf3', 'Atp10a', 'Atp1a2', 'Atp1b1', 'Atp2b1', 'Atrnl1', 'B2m', 'Basp1', 'Bsg', 'Btg2', 'C130073E24Rik', 'C1qa', 'C1qb', 'C1qc', 'C1ql1', 'Cabp5', 'Cacna2d1', 'Cacna2d3', 'Cacnb2', 'Calb1', 'Calb2', 'Cald1', 'Car14', 'Car2', 'Cartpt', 'Ccdc141', 'Ccl12', 'Ccl3', 'Ccl4', 'Ccn1', 'Cd63', 'Cd74', 'Cd81', 'Cd9', 'Cdh18', 'Cdk14', 'Cdkn1a', 'Cebpd', 'Cfh', 'Chgb', 'Cldn10', 'Cldn5', 'Clic4', 'Clu', 'Cmss1', 'Cngb3', 'Cnn3', 'Cntn4', 'Cntn5', 'Cntnap2', 'Col23a1', 'Col8a1', 'Col9a1', 'Cox4i2', 'Cox8b', 'Cp', 'Crb1', 'Creb5', 'Crim1', 'Crip1', 'Crip2', 'Cryab', 'Crym', 'Csf1r', 'Csrp1', 'Ctla2a', 'Ctsd', 'Ctss', 'Ctsz', 'Cx3cr1', 'Cxcl12', 'Cyba', 'Dach1', 'Dapl1', 'Dbi', 'Dct', 'Dgkb', 'Dkk3', 'Dlc1', 'Dlgap1', 'Dmd', 'Dnajb1', 'Dock4', 'Dscam', 'Dscaml1', 'Dusp1', 'Ebf1', 'Eftud2', 'Egfem1', 'Egr1'

In [9]:
final_candidates = np.intersect1d(expression_candidates, kept_gene_names)
print(len(final_candidates), final_candidates) # 173 genes

173 ['Abca8a' 'Acsl3' 'Adgrl4' 'Aldoc' 'Anks1b' 'Aqp4' 'Arhgap15' 'Arl15'
 'Arr3' 'Asic2' 'Atf3' 'Atp1a2' 'Bsg' 'C1qa' 'C1qb' 'C1qc' 'C1ql1' 'Calb1'
 'Calb2' 'Cald1' 'Cartpt' 'Ccn1' 'Cd74' 'Cdh18' 'Chgb' 'Cldn5' 'Clu'
 'Cmss1' 'Cngb3' 'Col8a1' 'Col9a1' 'Creb5' 'Crip1' 'Csrp1' 'Ctla2a' 'Ctsd'
 'Ctss' 'Ctsz' 'Dbi' 'Dkk3' 'Dlc1' 'Dlgap1' 'Dmd' 'Dnajb1' 'Dock4' 'Ebf1'
 'Eftud2' 'Egfem1' 'Egr1' 'Epb41' 'Flt1' 'Fos' 'Fosb' 'Frmd5' 'Frmpd4'
 'Galnt18' 'Galntl6' 'Gfap' 'Glul' 'Gm20754' 'Gm32442' 'Gnat2' 'Gnb3'
 'Gng5' 'Gngt1' 'Gngt2' 'Gpr37' 'Gria4' 'Grik1' 'Grm5' 'Hexb' 'Hmgn1'
 'Hs6st3' 'Hspa1a' 'Id1' 'Igfbp7' 'Il1rapl1' 'Il33' 'Itm2a' 'Junb' 'Kcnd2'
 'Kctd16' 'Klf2' 'Laptm5' 'Lrfn2' 'Ly6a' 'Ly6c1' 'Ly86' 'Marchf1' 'Mecom'
 'Megf11' 'Mfge8' 'Mgp' 'Mlc1' 'Mt3' 'Myl9' 'Mylk' 'Nckap5' 'Ndrg1'
 'Ndufa4l2' 'Nebl' 'Nefl' 'Nefm' 'Nrn1' 'Opn1mw' 'Opn1sw' 'Pals2' 'Pcdh15'
 'Pcdh9' 'Pcp4' 'Pde3a' 'Pde6c' 'Pde6g' 'Pde6h' 'Pdgfrb' 'Plcb1' 'Plcl1'
 'Pltp' 'Ppp1r15a' 'Prdx6' 'Prkg1' 'Ptprb' 'Ptprk' 'Ptprm

## Match Gene to Cell by Regression Origin

In [46]:
marker_to_cell = top_features_log_reg_pos.loc[final_candidates.tolist()[:22] + final_candidates.tolist()[23:]] # Cd74 not here???
marker_to_cell = marker_to_cell.sort_values('Cell Type')
marker_to_cell

Unnamed: 0,Cell Type,Gene,Coefficient
56,AC,Frmd5,0.557747
130,AC,Ptprk,0.389845
18,AC,C1ql1,0.441690
128,AC,Prkg1,0.348293
103,AC,Nckap5,0.364320
...,...,...,...
107,Rod,Nefl,0.368925
109,Rod,Nefm,0.291039
120,Rod,Pde6g,0.260567
29,Rod,Cmss1,0.544045


In [47]:
marker_to_cell.to_csv('spreadsheets/majorclass_matching_regression_sensitive.csv')

## Match Gene to Cell by Max Raw Expression

In [52]:
raw_feature_expression_pd_mean[final_candidates].idxmax().sort_values().to_csv('spreadsheets/majorclass_matching_raw_sensitive.csv')

Previous, but one step at a time

In [57]:
feature_expression_pd_mean[final_candidates]

Unnamed: 0_level_0,Abca8a,Acsl3,Adgrl4,Aldoc,Anks1b,Aqp4,Arhgap15,Arl15,Arr3,Asic2,...,Trpm1,Ttyh1,Utrn,Vcl,Vim,Vtn,Vwc2,Wwtr1,Zfp804b,Zswim6
majorclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AC,0.001757,0.213868,0.000459,0.707288,1.290151,0.007209,0.056967,0.278053,0.018036,1.862911,...,0.011852,0.159402,0.184185,0.030723,0.019784,0.006225,0.266861,0.007082,0.13079,0.169734
Astrocyte,0.03219,0.295942,0.000174,2.915245,0.076975,0.014318,0.012014,0.226003,0.007978,0.126728,...,0.005684,0.434535,0.185642,1.537174,1.31819,0.057783,0.010822,0.968706,0.072854,1.022675
BC,0.003442,0.141044,0.000385,0.555694,0.365289,0.008587,0.118854,0.226542,0.004,0.721724,...,1.585373,1.057017,0.041472,0.046724,0.002497,0.004846,0.037726,0.001378,2.645283,0.033238
Cone,0.002382,0.472915,0.001108,0.384925,0.012393,0.001459,0.001167,0.475651,3.269826,0.011055,...,0.012538,0.60911,0.002084,0.096838,0.005645,1.00549,0.001886,0.041378,0.022923,0.14009
Endothelial,0.015475,0.109504,1.879265,0.037845,0.015184,0.002801,0.020438,1.797341,0.011568,0.068536,...,0.028287,0.031312,1.638214,0.766303,0.985518,0.946504,0.004481,0.812678,0.052171,0.593766
HC,0.003905,0.207566,0.0,0.940799,1.73008,0.006518,1.796901,0.333649,0.010651,0.400141,...,0.0,0.174357,1.657383,0.001924,1.895057,0.017481,1.225136,0.0,0.016755,0.390676
MG,1.615902,2.092352,0.001171,1.326876,0.020443,1.409581,0.003521,0.29168,0.009806,0.035057,...,0.054212,1.152544,0.379727,0.32495,1.354838,0.044211,0.00352,0.30002,0.943655,0.511716
Microglia,0.009387,0.03431,0.0,0.444454,0.021339,0.006794,0.363914,0.295934,0.008774,0.037735,...,0.007883,0.03248,0.060542,0.077744,0.095845,0.049134,0.00838,0.035542,0.010477,0.778234
Pericyte,0.057426,0.094924,0.01132,0.060428,0.056436,0.000754,0.065203,0.229881,0.024272,0.226256,...,0.080812,0.040915,1.773055,1.034753,0.528396,3.338531,0.007196,0.504892,0.084926,0.293976
RGC,0.000678,0.326206,0.0003,1.052668,0.704993,0.001665,0.006029,0.190931,0.000846,1.391859,...,0.00187,0.148508,0.187978,0.064205,0.004872,0.001247,0.124912,0.003813,0.795493,0.172287


In [58]:
feature_expression_pd_mean[final_candidates].idxmax()

Abca8a              MG
Acsl3               MG
Adgrl4     Endothelial
Aldoc        Astrocyte
Anks1b              HC
              ...     
Vtn           Pericyte
Vwc2                HC
Wwtr1        Astrocyte
Zfp804b             BC
Zswim6       Astrocyte
Length: 173, dtype: object