# Preparing MFI atlas data for CellPhone

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import seaborn as sns
import scanpy as sc
from glob import iglob
import anndata
import os
#import sklearn
from sklearn.linear_model import LogisticRegression
import matplotlib as mpl
import scipy
import matplotlib.pyplot as plt
import pickle

In [2]:
# Fix NumPy's global random seed so that any NumPy-based random step gives the same result on re-run
np.random.seed(0)

In [3]:
%%bash

pip freeze

absl-py==0.12.0
aiohttp==3.7.4.post0
airr==1.3.1
alabaster==0.7.12
alembic @ file:///home/conda/feedstock_root/build_artifacts/alembic_1613901514078/work
anndata @ file:///home/conda/feedstock_root/build_artifacts/anndata_1605539061264/work
anndata2ri==1.0.6
annoy @ file:///home/conda/feedstock_root/build_artifacts/python-annoy_1610271511811/work
anyio @ file:///home/conda/feedstock_root/build_artifacts/anyio_1614388751160/work/dist
arboreto==0.1.6
argon2-cffi @ file:///home/conda/feedstock_root/build_artifacts/argon2-cffi_1610522574055/work
async-generator==1.10
async-timeout==3.0.1
attrs @ file:///home/conda/feedstock_root/build_artifacts/attrs_1605083924122/work
Babel @ file:///home/conda/feedstock_root/build_artifacts/babel_1605182336601/work
backcall @ file:///home/conda/feedstock_root/build_artifacts/backcall_1592338393461/work
backports.functools-lru-cache==1.6.1
bbknn @ file:///opt/conda/conda-bld/bbknn_1616434096000/work
bleach @ file:///home/conda/feedstock_root/build_artifac

In [4]:
# Configure scanpy logging and figure defaults, and print the versions of the loaded packages
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()  # version table is kept in the cell output as a record of the environment
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) yields small inline figures



-----
anndata     0.7.5
scanpy      1.7.1
sinfo       0.3.1
-----
PIL                 8.1.2
anndata             0.7.5
anyio               NA
attr                20.3.0
babel               2.9.0
backcall            0.2.0
brotli              NA
cairo               1.20.0
certifi             2021.10.08
cffi                1.14.5
chardet             4.0.0
cloudpickle         1.6.0
colorama            0.4.4
cycler              0.10.0
cython_runtime      NA
cytoolz             0.11.0
dask                2021.03.1
dateutil            2.8.1
decorator           4.4.2
fsspec              0.8.7
get_version         2.1
google              NA
h5py                3.1.0
idna                2.10
igraph              0.8.3
ipykernel           5.5.0
ipython_genutils    0.2.0
ipywidgets          7.6.3
jedi                0.18.0
jinja2              2.11.3
joblib              1.0.1
json5               NA
jsonschema          3.2.0
jupyter_server      1.4.1
jupyterlab_server   2.3.0
kiwisolver          1.3.1


In [5]:
# Directory holding the filtered AnnData object and the final annotation table that are read below
save_path = '/lustre/scratch117/cellgen/team292/aa22/adata_objects/202111_MFI_sc_sn_new_scVI_analysis/'

## Reading raw filtered data

In [6]:
# raw count values for filtered droplets and filtered genes (after initial QC)
adata = sc.read(save_path + 'adata_raw_filtered.h5ad')

In [7]:
# Load the final annotation table; the first CSV column is used as the index
annot_csv = save_path + 'final_annotation_all_cells_and_nulcei_20211123.csv'
final_annot = pd.read_csv(annot_csv, index_col=0)

# Number of annotated barcodes per cell type
final_annot['cell_type'].value_counts()

dS2               65746
dS1               42586
SCT               28177
VCT               23472
uSMC              17043
dEpi_secretory    16834
dM1               14099
dS3               13197
dNK2              12487
HOFB              10733
dNK1              10450
fF1               10420
PV                 9980
dT_cells           9740
dM2                9221
Endo_M             6877
VCT_p              6679
EVT_1              5289
NK                 5187
iEVT               3676
VCT_CCC            3373
MO                 3050
dNK3               2952
ILC3               2511
EVT_2              2358
T_cells            2126
VCT_fusing         1971
dT_regs            1862
Endo_L             1761
Endo_F             1466
fF2                1396
M3                 1299
B_cells             774
DC                  697
dDC                 694
Plasma              255
Granulocytes        195
dEpi_lumenal        135
eEVT                 28
GC                   19
Name: cell_type, dtype: int64

In [8]:
# Total number of rows (annotated barcodes) in the annotation table
final_annot.shape[0]

350815

In [9]:
# Export the annotation table as a tab-separated file - already done, so this step can be skipped.
# The call is kept commented out for reference:
#final_annot.to_csv('/lustre/scratch117/cellgen/team292/aa22/with_Luz/202111_MFI_CellPhone/meta_updated_20211123.tsv',sep='\t')

In [9]:
# Subset adata to only the final cells + nuclei, i.e. the barcodes present in the annotation table.
# `.copy()` turns the subset into a standalone AnnData object: plain indexing returns a *view*
# of the full object, and assigning to `.obs` on a view forces an implicit copy
# ("Trying to set attribute `.obs` of view, copying.").
adata = adata[final_annot.index, :].copy()

# Transfer the annotations, aligned on barcode
adata.obs['cell_type'] = final_annot.loc[adata.obs_names, 'cell_type']

  res = method(*args, **kwargs)
Trying to set attribute `.obs` of view, copying.


In [10]:
# Inspect the annotated AnnData object (dimensions plus the available .obs / .var columns).
# NOTE(review): the previous comment here spoke of generating a table of M/F origin assignments,
# but this cell only displays `adata` - presumably a leftover from another notebook; confirm.
adata

AnnData object with n_obs × n_vars = 350815 × 30800
    obs: 'n_genes', 'sample', 'technology', 'tissue', 'dev_age', 'donor', 'dataset', 'run', 'number_of_individuals_multiplexed', 'batch', 'percent_mito', 'n_counts', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'is_doublet', 'cell_type'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-10', 'feature_types-10', 'genome-10', 'n_cells-10', 'gene_ids-11', 'feature_types-11', 'genome-11', 'n_cells-11', 'gene_ids-12', 'feature_types-12', 'genome-12', 'n_cells-12', 'gene_ids-13', 'feature_types-13', 'genome-13', 'n_cells-13', 'gene_ids-14', 'feature_types-14', 'genome-14', 'n_cells-14', 'gene_ids-15', 'feature_types-15', 'genome-15', 'n_cells-15', 'gene_ids-16', 'feature_types-16', 'genome-16', 'n_cells-16', 'gene_ids-17', 'feature_types-17', 'genome-17', 'n_cells-17', 'gene_ids-18', 'feature_types-18', 'genome-18', 'n_cells-18', 'gene_ids-19', 'fe

In [10]:
# normalise count data and save
# cellphone works best on normalised counts
# Each cell is scaled to a total of 1e4 counts; no log-transform is applied in the cells shown here.
# NOTE(review): scanpy documents normalize_per_cell as superseded by sc.pp.normalize_total -
# consider switching once the output (including the 'n_counts' column it writes) is confirmed equivalent.
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)

normalizing by total count per cell
    finished (0:00:16): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)


In [12]:
# Save the annotated, normalised object; the next cell reloads it from this same path
adata.write('/lustre/scratch117/cellgen/team292/aa22/with_Luz/202111_MFI_CellPhone/20211123_adata_MFI_normalised_counts.h5ad')


... storing 'cell_type' as categorical


In [7]:
# Reload the normalised object from the path written above
# (presumably the entry point when resuming in a fresh kernel - the cells above can then be skipped)
adata = sc.read('/lustre/scratch117/cellgen/team292/aa22/with_Luz/202111_MFI_CellPhone/20211123_adata_MFI_normalised_counts.h5ad')


# Preparing a table of expression proportions - for parsing cellphone results later

Matrix of genes (rows) by cell types (columns) containing the proportion [0-1] of cells of each cell type that express the gene

In [11]:
# Number of cells per cell type in the loaded object (labels and counts returned as two arrays).
# NOTE(review): this comment used to say "using the downsampled data here", but the counts printed
# below are identical to the value_counts of the full annotation table - confirm whether
# downsampling was intended.
np.unique(adata.obs['cell_type'], return_counts=True)

(array(['B_cells', 'DC', 'EVT_1', 'EVT_2', 'Endo_F', 'Endo_L', 'Endo_M',
        'GC', 'Granulocytes', 'HOFB', 'ILC3', 'M3', 'MO', 'NK', 'PV',
        'Plasma', 'SCT', 'T_cells', 'VCT', 'VCT_CCC', 'VCT_fusing',
        'VCT_p', 'dDC', 'dEpi_lumenal', 'dEpi_secretory', 'dM1', 'dM2',
        'dNK1', 'dNK2', 'dNK3', 'dS1', 'dS2', 'dS3', 'dT_cells', 'dT_regs',
        'eEVT', 'fF1', 'fF2', 'iEVT', 'uSMC'], dtype=object),
 array([  774,   697,  5289,  2358,  1466,  1761,  6877,    19,   195,
        10733,  2511,  1299,  3050,  5187,  9980,   255, 28177,  2126,
        23472,  3373,  1971,  6679,   694,   135, 16834, 14099,  9221,
        10450, 12487,  2952, 42586, 65746, 13197,  9740,  1862,    28,
        10420,  1396,  3676, 17043]))

In [13]:
# normalised count values, densified and transposed to genes x cells
# NOTE(review): for the 350815 x 30800 object shown above the dense array holds ~1.1e10 values,
# i.e. tens of GB of RAM depending on dtype - make sure the machine has enough memory.
t = adata.X.toarray().T
# Set cell ids as column index and gene ids as row index
df_expr_matrix = pd.DataFrame(data=t, columns= adata.obs.index, index=adata.var_names)

In [14]:
# Preview the genes x cells table of normalised values (pandas truncates the display)
df_expr_matrix

barcode_sample,AAACGGGCATTGGCGC-1_FCA7167219,AAACGGGTCGCGATCG-1_FCA7167219,AAAGATGAGCAATATG-1_FCA7167219,AAAGATGAGTTCGCGC-1_FCA7167219,AAAGATGCATGTCGAT-1_FCA7167219,AAAGATGGTCTCGTTC-1_FCA7167219,AAAGCAATCATAACCG-1_FCA7167219,AAATGCCTCAAGGTAA-1_FCA7167219,AAATGCCTCCCTTGCA-1_FCA7167219,AACACGTGTAGATTAG-1_FCA7167219,...,TTTGTGTTCAGCTAAC-1_Pla_Camb10714920,TTTGTGTTCAGGATGA-1_Pla_Camb10714920,TTTGTGTTCCGTTATT-1_Pla_Camb10714920,TTTGTGTTCGAAGTAG-1_Pla_Camb10714920,TTTGTGTTCGCTCACT-1_Pla_Camb10714920,TTTGTGTTCGTCAAGT-1_Pla_Camb10714920,TTTGTGTTCTTAGTCT-1_Pla_Camb10714920,TTTGTTGGTCACAGCG-1_Pla_Camb10714920,TTTGTTGGTTTACTTG-1_Pla_Camb10714920,TTTGTTGGTTTGAGCA-1_Pla_Camb10714920
A1BG,0.0,0.000000,0.000000,0.0000,0.0,0.00000,0.0,0.0,1.051414,0.0,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
A1BG-AS1,0.0,0.000000,0.000000,0.0000,0.0,0.00000,0.0,0.0,0.000000,0.0,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
A1CF,0.0,0.000000,0.000000,0.0000,0.0,0.00000,0.0,0.0,0.000000,0.0,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
A2M,0.0,0.000000,0.000000,0.0000,0.0,0.00000,0.0,0.0,0.000000,0.0,...,5.042017,0.00000,0.000000,0.000000,0.000000,1.913143,0.000000,0.000000,0.000000,0.0
A2M-AS1,0.0,0.000000,2.246181,0.0000,0.0,0.00000,0.0,0.0,0.000000,0.0,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,0.0,0.000000,0.000000,0.0000,0.0,0.00000,0.0,0.0,0.000000,0.0,...,0.000000,4.05515,0.000000,4.800768,7.010165,0.000000,0.000000,2.571025,0.000000,0.0
ZYG11A,0.0,0.000000,0.000000,0.0000,0.0,0.00000,0.0,0.0,0.000000,0.0,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
ZYG11B,0.0,0.000000,0.000000,0.0000,0.0,0.00000,0.0,0.0,0.000000,0.0,...,0.000000,0.00000,1.984915,0.000000,0.000000,3.826287,1.150814,0.000000,0.000000,0.0
ZYX,0.0,5.673759,0.000000,1.4518,0.0,3.61729,0.0,0.0,2.102828,0.0,...,1.680672,0.00000,0.000000,0.000000,0.000000,1.913143,0.000000,0.000000,0.498058,0.0


In [15]:
# Split the genes x cells expression table into one sub-table per cell type
# (dict: cell type label -> DataFrame of genes x cells of that type).
df_expr_matrix_per_cell_type = {}

# Look the labels up once. Barcodes are then selected with a boolean mask on obs_names
# instead of slicing the AnnData object, which built a throw-away view on every
# iteration only to read its obs_names.
cell_type_labels = adata.obs['cell_type']

for ct in np.unique(cell_type_labels):
    print(ct)
    is_curr_cell_type = (cell_type_labels == ct).to_numpy()
    curr_subset_of_barcodes = list(adata.obs_names[is_curr_cell_type])
    df_expr_matrix_per_cell_type[ct] = df_expr_matrix.loc[:, curr_subset_of_barcodes]
    print(len(curr_subset_of_barcodes), 'cells of this cell type')
    print('subsetted a table of shape', df_expr_matrix_per_cell_type[ct].shape, '\n')

B_cells


  res = method(*args, **kwargs)


774 cells of this cell type
subsetted a table of shape (30800, 774) 

DC
697 cells of this cell type
subsetted a table of shape (30800, 697) 

EVT_1
5289 cells of this cell type
subsetted a table of shape (30800, 5289) 

EVT_2
2358 cells of this cell type
subsetted a table of shape (30800, 2358) 

Endo_F
1466 cells of this cell type
subsetted a table of shape (30800, 1466) 

Endo_L
1761 cells of this cell type
subsetted a table of shape (30800, 1761) 

Endo_M
6877 cells of this cell type
subsetted a table of shape (30800, 6877) 

GC
19 cells of this cell type
subsetted a table of shape (30800, 19) 

Granulocytes
195 cells of this cell type
subsetted a table of shape (30800, 195) 

HOFB
10733 cells of this cell type
subsetted a table of shape (30800, 10733) 

ILC3
2511 cells of this cell type
subsetted a table of shape (30800, 2511) 

M3
1299 cells of this cell type
subsetted a table of shape (30800, 1299) 

MO
3050 cells of this cell type
subsetted a table of shape (30800, 3050) 

NK
5

In [18]:
# Per gene (row): share of DC cells with a non-zero value, as a proportion between 0 and 1
df_expr_matrix_per_cell_type['DC'].astype(bool).mean(axis=1)

A1BG        0.197991
A1BG-AS1    0.034433
A1CF        0.000000
A2M         0.271162
A2M-AS1     0.012912
              ...   
ZXDC        0.080344
ZYG11A      0.001435
ZYG11B      0.091822
ZYX         0.615495
ZZEF1       0.139168
Length: 30800, dtype: float64

In [20]:
# Empty genes (rows) x cell types (columns) table that will hold the proportion of cells
# expressing each gene. It is created as float (NaN-filled) rather than the default object
# dtype, so the columns filled in afterwards are numeric regardless of the pandas version.
df_percentage_expressed = pd.DataFrame(index=df_expr_matrix.index,
                                       columns=np.unique(adata.obs['cell_type']),
                                       dtype=float)

In [21]:
# Fill one column per cell type with the proportion (0-1) of its cells in which each gene is non-zero.
for col in df_percentage_expressed.columns:
    print(col)
    expr_of_cell_type = df_expr_matrix_per_cell_type[col]
    # The mean of the boolean "is non-zero" table along the cell axis equals
    # (number of non-zero cells) / (number of cells).
    # Assigning the column directly replaces it with a float Series, whereas `.loc[:, col] = ...`
    # writes into the existing column and keeps an object-dtype placeholder as object in pandas >= 2.0.
    df_percentage_expressed[col] = expr_of_cell_type.astype(bool).mean(axis=1)

B_cells
DC
EVT_1
EVT_2
Endo_F
Endo_L
Endo_M
GC
Granulocytes
HOFB
ILC3
M3
MO
NK
PV
Plasma
SCT
T_cells
VCT
VCT_CCC
VCT_fusing
VCT_p
dDC
dEpi_lumenal
dEpi_secretory
dM1
dM2
dNK1
dNK2
dNK3
dS1
dS2
dS3
dT_cells
dT_regs
eEVT
fF1
fF2
iEVT
uSMC


In [22]:
# Preview the genes x cell types table of expression proportions (pandas truncates the display)
df_percentage_expressed

Unnamed: 0,B_cells,DC,EVT_1,EVT_2,Endo_F,Endo_L,Endo_M,GC,Granulocytes,HOFB,...,dS1,dS2,dS3,dT_cells,dT_regs,eEVT,fF1,fF2,iEVT,uSMC
A1BG,0.121447,0.197991,0.168652,0.077608,0.098226,0.107325,0.078523,0.000000,0.158974,0.025249,...,0.214977,0.244182,0.570205,0.116222,0.107411,0.071429,0.213340,0.254298,0.035909,0.046823
A1BG-AS1,0.018088,0.034433,0.059747,0.041137,0.027285,0.039750,0.019921,0.000000,0.020513,0.007360,...,0.059597,0.085937,0.229370,0.014168,0.016112,0.107143,0.049808,0.049427,0.031828,0.046823
A1CF,0.000000,0.000000,0.000000,0.001272,0.001364,0.003975,0.003199,0.000000,0.000000,0.000280,...,0.000657,0.000350,0.000303,0.000000,0.000000,0.000000,0.000480,0.002865,0.002176,0.001408
A2M,0.089147,0.271162,0.036302,0.049618,0.645975,0.624077,0.771848,0.105263,0.117949,0.623311,...,0.456558,0.447084,0.543911,0.037269,0.049409,0.142857,0.101536,0.437679,0.081338,0.425219
A2M-AS1,0.002584,0.012912,0.001513,0.000424,0.011596,0.007382,0.007998,0.000000,0.061538,0.007826,...,0.010003,0.005871,0.014094,0.017967,0.010741,0.000000,0.002879,0.007163,0.000816,0.005809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,0.054264,0.080344,0.081301,0.102629,0.193724,0.197615,0.164025,0.052632,0.046154,0.152613,...,0.126943,0.107839,0.108206,0.025873,0.026316,0.285714,0.084357,0.100287,0.220620,0.239512
ZYG11A,0.001292,0.001435,0.015504,0.010178,0.004093,0.015900,0.005671,0.000000,0.000000,0.002702,...,0.007021,0.003042,0.000909,0.000308,0.001611,0.035714,0.014875,0.006447,0.016594,0.014141
ZYG11B,0.054264,0.091822,0.207223,0.178117,0.278990,0.250426,0.218700,0.105263,0.051282,0.213733,...,0.200770,0.220774,0.367887,0.030595,0.041890,0.500000,0.255854,0.284384,0.273667,0.298422
ZYX,0.102067,0.615495,0.519380,0.291349,0.347203,0.334469,0.356115,0.263158,0.138462,0.345849,...,0.525337,0.497840,0.693794,0.152053,0.201933,0.357143,0.247313,0.268625,0.287541,0.348178


In [23]:
# NOTE(review): same preview of df_expr_matrix as displayed earlier; none of the cells shown
# in between assign to it, so this cell looks redundant - confirm before removing.
df_expr_matrix

barcode_sample,AAACGGGCATTGGCGC-1_FCA7167219,AAACGGGTCGCGATCG-1_FCA7167219,AAAGATGAGCAATATG-1_FCA7167219,AAAGATGAGTTCGCGC-1_FCA7167219,AAAGATGCATGTCGAT-1_FCA7167219,AAAGATGGTCTCGTTC-1_FCA7167219,AAAGCAATCATAACCG-1_FCA7167219,AAATGCCTCAAGGTAA-1_FCA7167219,AAATGCCTCCCTTGCA-1_FCA7167219,AACACGTGTAGATTAG-1_FCA7167219,...,TTTGTGTTCAGCTAAC-1_Pla_Camb10714920,TTTGTGTTCAGGATGA-1_Pla_Camb10714920,TTTGTGTTCCGTTATT-1_Pla_Camb10714920,TTTGTGTTCGAAGTAG-1_Pla_Camb10714920,TTTGTGTTCGCTCACT-1_Pla_Camb10714920,TTTGTGTTCGTCAAGT-1_Pla_Camb10714920,TTTGTGTTCTTAGTCT-1_Pla_Camb10714920,TTTGTTGGTCACAGCG-1_Pla_Camb10714920,TTTGTTGGTTTACTTG-1_Pla_Camb10714920,TTTGTTGGTTTGAGCA-1_Pla_Camb10714920
A1BG,0.0,0.000000,0.000000,0.0000,0.0,0.00000,0.0,0.0,1.051414,0.0,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
A1BG-AS1,0.0,0.000000,0.000000,0.0000,0.0,0.00000,0.0,0.0,0.000000,0.0,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
A1CF,0.0,0.000000,0.000000,0.0000,0.0,0.00000,0.0,0.0,0.000000,0.0,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
A2M,0.0,0.000000,0.000000,0.0000,0.0,0.00000,0.0,0.0,0.000000,0.0,...,5.042017,0.00000,0.000000,0.000000,0.000000,1.913143,0.000000,0.000000,0.000000,0.0
A2M-AS1,0.0,0.000000,2.246181,0.0000,0.0,0.00000,0.0,0.0,0.000000,0.0,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,0.0,0.000000,0.000000,0.0000,0.0,0.00000,0.0,0.0,0.000000,0.0,...,0.000000,4.05515,0.000000,4.800768,7.010165,0.000000,0.000000,2.571025,0.000000,0.0
ZYG11A,0.0,0.000000,0.000000,0.0000,0.0,0.00000,0.0,0.0,0.000000,0.0,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
ZYG11B,0.0,0.000000,0.000000,0.0000,0.0,0.00000,0.0,0.0,0.000000,0.0,...,0.000000,0.00000,1.984915,0.000000,0.000000,3.826287,1.150814,0.000000,0.000000,0.0
ZYX,0.0,5.673759,0.000000,1.4518,0.0,3.61729,0.0,0.0,2.102828,0.0,...,1.680672,0.00000,0.000000,0.000000,0.000000,1.913143,0.000000,0.000000,0.498058,0.0


In [24]:
# save the genes x cell types table of expression proportions as CSV (for parsing cellphone results later)
df_percentage_expressed.to_csv('/lustre/scratch117/cellgen/team292/aa22/with_Luz/202111_MFI_CellPhone/20211123_PercentExpressed_for_cellphone.csv')