In [1]:
import sys
import scanpy as sc
import anndata
import pandas as pd
import numpy as np
import os

# Presumably the floating-point dtype intended for downstream use; it is not
# referenced again in the cells visible in this section.
data_type = 'float32'

# Put a specific cell2location checkout on the module search path (position 1)
# before the import below, so the import can resolve to that checkout.
sys.path.insert(1, '/nfs/team205/vk7/sanger_projects/BayraktarLab/cell2location/')
import cell2location

import matplotlib as mpl
from matplotlib import rcParams
import matplotlib.pyplot as plt
import seaborn as sns

# silence scanpy that prints a lot of warnings
# NOTE: the filter has no category or module restriction, so it hides warnings
# from every library for the rest of the session, not only those from scanpy.
import warnings
warnings.filterwarnings('ignore')

Get subset of highly expressed genes used for cell2location:

In [2]:
# Folder of the regression-model run; the trailing slash matters because the
# file name is appended to this string directly.
reg_path = "/lustre/scratch117/cellgen/team283/Kidney-Nanostring/scrna/regression_model/RegressionGeneBackgroundCoverageTorch_46covariates_40268cells_10882genes/"

# Load the AnnData object stored in that folder.
adata_snrna_raw = sc.read(reg_path + 'sc.h5ad')

In [3]:
from re import escape, sub

# Column name containing cell type annotations
covariate_col_names = 'celltype'

# The signature columns selected below are named '<prefix><cell type>'.
signature_prefix = f'means_cov_effect_{covariate_col_names}_'

# Look up the annotated cell types once and reuse them for the selection.
cell_types = adata_snrna_raw.obs[covariate_col_names].unique()

# Extract a pd.DataFrame with signatures from anndata object
inf_aver = adata_snrna_raw.raw.var.copy()
inf_aver = inf_aver.loc[:, [f'{signature_prefix}{cell_type}' for cell_type in cell_types]]

# Rename the columns to the bare cell type names by stripping the prefix from
# the actual column names. The prefix is anchored and escaped so that only the
# literal prefix is removed, even if a cell type name contains regex
# metacharacters.
inf_aver.columns = [sub(f'^{escape(signature_prefix)}', '', col) for col in inf_aver.columns]

# Order the columns alphabetically by cell type name.
inf_aver = inf_aver.iloc[:, inf_aver.columns.argsort()]

# normalise by average experiment scaling factor (corrects for sequencing depth)
inf_aver = inf_aver * adata_snrna_raw.uns['regression_mod']['post_sample_means']['sample_scaling'].mean()

In [4]:
# Row index of the signature table; used below to select the same genes from
# the raw-count reference.
gene_subset = inf_aver.index

Use the arithmetic mean rather than the c2l regression model to get expression signatures:

In [5]:
# Folder for regression-model results (not referenced by the cells visible in
# this section).
results_folder = '/nfs/team283/aa16/data/Kidney-Nanostring/scrna/regression_model/'

# Folder holding the single-nucleus reference data; the trailing slash matters
# because file names are appended to this string directly.
sc_data_folder = '/lustre/scratch117/cellgen/team283/Kidney-Nanostring/scrna/'

In [6]:
## snRNA reference (raw counts)
adata_snrna_raw = anndata.read_h5ad(f'{sc_data_folder}mature_adata.h5ad')

# Replace gene symbols with ENSEMBL gene ids.
# Order matters: the current var_names are saved in the 'SYMBOL' column first,
# and only then are var_names overwritten with the 'ID' column.
adata_snrna_raw.var['SYMBOL'] = adata_snrna_raw.var_names
adata_snrna_raw.var_names = adata_snrna_raw.var['ID']
# The new index would otherwise carry the source column's name; clear it.
adata_snrna_raw.var_names.name = None

In [7]:
# Use the 'counts' layer as the main expression matrix.
adata_snrna_raw.X = adata_snrna_raw.layers['counts']

In [8]:
# Display a summary of the loaded reference object.
adata_snrna_raw

AnnData object with n_obs × n_vars = 40268 × 33694
    obs: 'Barcode', 'Short_Sample', 'Project', 'Experiment', 'celltype', 'compartment', 'broad_celltype'
    var: 'ID', 'Symbol', 'SYMBOL'
    obsm: 'X_umap'
    layers: 'corrected_counts', 'counts'

In [9]:
# Keep only the genes listed in `gene_subset`.
# Slicing an AnnData object returns a view of the original; the cells that
# follow modify the result (var_names, .raw), so take an explicit copy here
# instead of relying on an implicit view-to-copy conversion.
adata_snrna_raw = adata_snrna_raw[:, gene_subset].copy()

In [10]:
# Switch the gene names back to the values saved earlier in var['SYMBOL'].
# NOTE(review): these values are not checked for uniqueness here — confirm
# that duplicate symbols are acceptable downstream.
adata_snrna_raw.var_names = adata_snrna_raw.var['SYMBOL']

In [11]:
# Store the current (gene-subset) object as .raw.
# Presumably get_cluster_averages reads its data from .raw — TODO confirm.
adata_snrna_raw.raw = adata_snrna_raw

In [12]:
# Compute average expression profiles with cell2location's helper, using the
# 'celltype' annotation to define the groups.
aver = cell2location.cluster_averages.cluster_averages.get_cluster_averages(adata_snrna_raw, 'celltype')

In [13]:
# Show the resulting table (genes as rows, cell types as columns).
aver

Unnamed: 0_level_0,Ascending vasa recta endothelium,B cell,CD4 T cell,CD8 T cell,Connecting tubule,Descending vasa recta endothelium,Distinct proximal tubule 1,Distinct proximal tubule 2,Epithelial progenitor cell,Fibroblast,...,Peritubular capillary endothelium 2,Plasmacytoid dendritic cell,Podocyte,Principal cell,Proliferating Proximal Tubule,Proximal tubule,Thick ascending limb of Loop of Henle,Transitional urothelium,Type A intercalated cell,Type B intercalated cell
SYMBOL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FO538757.2,0.160000,0.092308,0.075975,0.086781,1.904459,0.130208,0.384615,0.072848,0.567227,0.304348,...,0.145946,0.391304,1.507937,0.852273,0.071839,0.039422,0.056296,0.200000,0.327273,0.43200
AP006222.2,0.063333,0.046154,0.031622,0.037336,0.343949,0.093750,0.184615,0.066225,0.222689,0.173913,...,0.113513,0.173913,0.261905,0.181818,0.043103,0.031204,0.056296,0.177778,0.141818,0.16000
SAMD11,0.030000,0.002051,0.002053,0.001009,0.286624,0.005208,0.123077,0.000000,0.126050,0.173913,...,0.000000,0.000000,1.309524,0.227273,0.002874,0.000982,0.000000,0.022222,0.021818,0.01600
NOC2L,0.123333,0.103590,0.087474,0.083754,0.636942,0.177083,0.476923,0.086093,0.403361,0.217391,...,0.243243,0.260870,0.380952,0.363636,0.054598,0.033640,0.031111,0.066667,0.345454,0.29600
PLEKHN1,0.010000,0.004103,0.010678,0.004036,0.012739,0.000000,0.076923,0.013245,0.105042,0.043478,...,0.000000,0.000000,0.023810,0.011364,0.000000,0.000218,0.000000,0.044444,0.003636,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MT-ND6,0.746666,0.181539,0.168377,0.152371,3.605096,0.833333,2.969232,0.887417,2.537816,0.391304,...,1.567568,1.913043,0.825397,2.931818,0.681035,0.610190,0.127407,0.355556,2.479998,3.70400
MT-CYB,16.646688,8.846169,5.924083,5.547921,167.292999,19.130213,76.276917,22.178801,99.033638,23.608694,...,30.740572,24.913044,30.642849,89.090904,38.669533,19.791323,3.321491,54.577782,123.709106,104.64798
AC011043.1,0.013333,0.003077,0.004928,0.003027,0.898089,0.026042,0.292308,0.019868,0.218487,0.217391,...,0.010811,0.000000,0.190476,0.227273,0.043103,0.016875,0.026667,0.000000,0.167273,0.12000
AC007325.4,0.030000,0.003077,0.008214,0.004036,0.216561,0.078125,0.138462,0.066225,0.201681,0.000000,...,0.108108,0.043478,0.103175,0.045455,0.025862,0.046077,0.090370,0.022222,0.109091,0.02400


In [14]:
# Save the averaged expression profiles as a CSV file.
aver.to_csv(path_or_buf='/nfs/team283/aa16/data/BenS_cellProfiles.csv')