**Author:** Elias Rafael Ruiz-Morales

**Institution:** Wellcome Sanger Institute

**July, 2023**

---

## Generate CellPhoneDB input files for making dot plots (infected vs uninfected)

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import os
import sys
sys.executable

'/opt/conda/bin/python'

In [3]:
# Print the versions of scanpy, anndata and the other loaded packages.
sc.logging.print_versions()



-----
anndata     0.7.5
scanpy      1.7.1
sinfo       0.3.1
-----
PIL                 8.1.2
anndata             0.7.5
anyio               NA
attr                20.3.0
babel               2.9.0
backcall            0.2.0
brotli              NA
cairo               1.20.0
certifi             2020.12.05
cffi                1.14.5
chardet             4.0.0
cloudpickle         1.6.0
colorama            0.4.4
cycler              0.10.0
cython_runtime      NA
cytoolz             0.11.0
dask                2021.03.1
dateutil            2.8.1
decorator           4.4.2
fsspec              0.8.7
get_version         2.1
google              NA
h5py                3.1.0
idna                2.10
igraph              0.8.3
ipykernel           5.5.0
ipython_genutils    0.2.0
jedi                0.18.0
jinja2              2.11.3
joblib              1.0.1
json5               NA
jsonschema          3.2.0
jupyter_server      1.4.1
jupyterlab_server   2.3.0
kiwisolver          1.3.1
legacy_api_wrap     0.0.0


In [2]:
def grouped_obs_percent(adata, group_key, layer=None, gene_symbols=None):
    """Fraction of cells in each group that express each gene.

    A gene is counted as expressed in a cell when its value is greater
    than 0.01. Fractions are rounded to two decimals.

    Parameters
    ----------
    adata
        AnnData-like object providing ``obs``, ``var``, ``var_names``,
        ``shape``, ``X``/``layers`` and row indexing by integer positions.
    group_key
        Column of ``adata.obs`` used to group the cells.
    layer
        Name of the layer to read values from; ``adata.X`` when ``None``.
    gene_symbols
        Column of ``adata.var`` whose values label the rows of the result;
        ``adata.var_names`` is used when ``None``.

    Returns
    -------
    pandas.DataFrame
        One row per gene and one column per group.
    """
    if layer is not None:
        getX = lambda x: x.layers[layer]
    else:
        getX = lambda x: x.X
    if gene_symbols is not None:
        # Label rows with the requested ``var`` column (the previous code
        # referenced an undefined name here and always failed).
        new_idx = adata.var[gene_symbols].to_numpy()
    else:
        new_idx = adata.var_names

    grouped = adata.obs.groupby(group_key)
    out = pd.DataFrame(
        np.zeros((adata.shape[1], len(grouped)), dtype=np.float64),
        columns=list(grouped.groups.keys()),
        index=new_idx
    )

    for group, idx in grouped.indices.items():
        X = getX(adata[idx])
        # Build a fresh boolean mask instead of overwriting ``X.data``:
        # this never touches the stored matrix and works for both scipy
        # sparse matrices and dense arrays.
        expressed = X > 0.01
        n_expressing = np.asarray(expressed.sum(axis=0)).reshape(-1)
        out[group] = np.round(n_expressing / X.shape[0], 2)
    return out

def grouped_obs_mean(adata, group_key, layer=None, gene_symbols=None):
    """Mean value of each gene within each group of cells.

    Parameters
    ----------
    adata
        AnnData-like object providing ``obs``, ``var``, ``var_names``,
        ``shape``, ``X``/``layers`` and row indexing by integer positions.
    group_key
        Column of ``adata.obs`` used to group the cells.
    layer
        Name of the layer to read values from; ``adata.X`` when ``None``.
    gene_symbols
        Column of ``adata.var`` whose values label the rows of the result;
        ``adata.var_names`` is used when ``None``.

    Returns
    -------
    pandas.DataFrame
        One row per gene and one column per group, holding float64 means.
    """
    if layer is not None:
        getX = lambda x: x.layers[layer]
    else:
        getX = lambda x: x.X
    if gene_symbols is not None:
        # Label rows with the requested ``var`` column (the previous code
        # referenced an undefined name here and always failed).
        new_idx = adata.var[gene_symbols].to_numpy()
    else:
        new_idx = adata.var_names

    grouped = adata.obs.groupby(group_key)
    out = pd.DataFrame(
        np.zeros((adata.shape[1], len(grouped)), dtype=np.float64),
        columns=list(grouped.groups.keys()),
        index=new_idx
    )

    for group, idx in grouped.indices.items():
        X = getX(adata[idx])
        # Column-wise mean over the cells of this group; np.ravel flattens
        # the (1, n_genes) result that sparse matrices return.
        out[group] = np.ravel(X.mean(axis=0, dtype=np.float64))
    return out

## Loading data

In [13]:
# Read the integrated dataset (.h5ad) from the scVI results directory.
adata = sc.read ('../../data_integration/results/scVI/rna8_scVIintegrated_latent30_All_20230707.h5ad')

In [14]:
# Display the AnnData summary (dimensions and annotation keys).
adata

AnnData object with n_obs × n_vars = 113028 × 36601
    obs: 'sample', 'stage', 'hpi', 'infection', 'percent_mito', 'n_counts', 'sample_barcode', 'assignment_SoC', 'donor_id', 'scrublet_score', 'scrublet_cluster_score', 'zscore', 'bh_pval', 'bonf_pval', 'S_score', 'G2M_score', 'phase', 'n_genes_by_counts', 'total_counts', 'total_counts_hs', 'pct_counts_hs', 'total_counts_tg', 'pct_counts_tg', 'Tg_infected', 'n_genes', '_scvi_batch', '_scvi_labels', '_scvi_local_l_mean', '_scvi_local_l_var', 'leiden_scvi', 'celltype_predictions', 'probabilities', 'scrublet_doublet', 'cell_type_2022', 'cell_type', 'souporcell_MFgenotype', 'MFgenotype', 'cell_type_broad', 'umap_density_Tg_infected', 'stage_perInfection', 'celltype-Stage', 'Tg_intracellular', 'celltype-Intracellular', 'Dev_Stage'
    var: 'gene_ids', 'feature_types', 'mean-0', 'std-0', 'mean-1', 'std-1', 'mean-2', 'std-2', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'Dev_Stage_colors', 'MFgeno

In [15]:
# Tabulate cell counts and frequencies per donor_id category.
adata.obs['donor_id'].values.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Hrv124,10731,0.094941
Hrv135,7325,0.064807
Hrv136,6047,0.0535
Hrv168,15769,0.139514
Hrv232,4809,0.042547
Hrv236,34929,0.30903
scDonor_Tg1,7957,0.070398
scDonor_Tg2,9468,0.083767
scDonor_Tg3,7071,0.06256
scDonor_Tg4,8922,0.078936


In [16]:
# Tabulate cell counts and frequencies per stage_perInfection category.
adata.obs['stage_perInfection'].values.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Lm_24h,17778,0.157288
Pf_24h,27100,0.239764
Tg_24h,16806,0.148689
UI_Lm_24h,13057,0.11552
UI_Pf_24h,21675,0.191767
UI_Tg_24h,16612,0.146972


In [None]:
# Keep only the cells from the 24h experiments.
# A vectorised boolean mask replaces the per-element Python list, and
# .copy() turns the subset into a standalone object so that the in-place
# preprocessing applied afterwards does not have to copy a view implicitly.
adata = adata[adata.obs['hpi'] == '24h'].copy()

In [18]:
# Print a small slice of the expression matrix to inspect its stored values.
print(adata.X[3:10,3:10])

  (3, 1)	1.0
  (3, 6)	3.0
  (4, 1)	4.0
  (4, 2)	3.0
  (4, 6)	1.0
  (5, 6)	1.0
  (6, 1)	1.0
  (6, 2)	1.0


In [19]:
# Normalise each cell, then log1p-transform; both calls modify `adata` in place.
# NOTE(review): scanpy documents normalize_per_cell as deprecated in favour of
# normalize_total — confirm the outputs are equivalent before switching.
sc.pp.normalize_per_cell(adata)
sc.pp.log1p(adata)
# Scaling step left disabled:
# sc.pp.scale(adata, max_value=10)

Trying to set attribute `.obs` of view, copying.


In [20]:
# Save the log-normalised dataset to disk as .h5ad.
adata.write ('../data/lognormalised_24h.h5ad')

In [47]:
# Build "<cell_type>.<stage_perInfection>" labels so that the group names
# match the DEG cluster names.
# NOTE: this overwrites `cell_type`; re-running the cell appends the suffix again.
adata.obs['cell_type'] = (
    adata.obs['cell_type'].astype('string')
    + '.'
    + adata.obs['stage_perInfection'].astype('string')
)

In [49]:
# Per-group fraction of expressing cells and per-group mean expression,
# written as CSV inputs for the infected-vs-uninfected dot plots.
percent = grouped_obs_percent(adata, 'cell_type')
percent.to_csv('../data/inf_percent_24h.csv')

means = grouped_obs_mean(adata, 'cell_type')
means.to_csv('../data/inf_average_log_24h.csv')

  res = method(*args, **kwargs)


In [46]:
# Reload the log-normalised 24h dataset from '../data/lognormalised_24h.h5ad'.
adata= sc.read('../data/lognormalised_24h.h5ad')

In [48]:
# Display the per-cell `cell_type` labels.
# NOTE(review): the saved output shows combined "<cell_type>.<stage>" labels,
# which only appear if the relabelling step ran after the reload — the
# execution counts in this part of the notebook are out of order.
adata.obs['cell_type']

Pla_HDBR13007974_AAACCCAAGCGTTGTT             F.UI_Tg_24h
Pla_HDBR13007974_AAACCCAAGTAGTCAA    VCT_fusing.UI_Tg_24h
Pla_HDBR13007974_AAACCCACAATGAACA           HBC.UI_Tg_24h
Pla_HDBR13007974_AAACCCACAGAGAGGG           HBC.UI_Tg_24h
Pla_HDBR13007974_AAACCCACAGTAGAAT           HBC.UI_Tg_24h
                                             ...         
Pla_HDBR13661572_TTTGTTGAGGTTCATC              HBC.Lm_24h
Pla_HDBR13661572_TTTGTTGCAAATTAGG           Endo_f.Lm_24h
Pla_HDBR13661572_TTTGTTGCAATAGGGC                F.Lm_24h
Pla_HDBR13661572_TTTGTTGGTGCTTATG           Endo_f.Lm_24h
Pla_HDBR13661572_TTTGTTGGTGGTAACG            PAMM1.Lm_24h
Name: cell_type, Length: 113028, dtype: string

# Malaria

In [23]:
# Load the malaria-only 24h dataset; this rebinds `adata`.
adata = sc.read('../../data_integration/results/scVI/malaria_singleCell_24h.h5ad')

In [24]:
# Tabulate cell counts and frequencies per cell_type category.
adata.obs['cell_type'].values.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
EVT_1,301,0.006171
EVT_2,701,0.014372
Endo_f,5788,0.118667
F,9059,0.18573
F_p,566,0.011604
F_sm,483,0.009903
HBC,9000,0.184521
HBC_p,11,0.000226
PAMM1,2217,0.045454
PV,5606,0.114936


In [25]:
# Normalise each cell, then log1p-transform; both calls modify `adata` in place.
# NOTE(review): scanpy documents normalize_per_cell as deprecated in favour of
# normalize_total — confirm the outputs are equivalent before switching.
sc.pp.normalize_per_cell(adata)
sc.pp.log1p(adata)
# Scaling step left disabled:
# sc.pp.scale(adata, max_value=10)

  res = method(*args, **kwargs)


In [26]:
# Build "<cell_type>.<infection>" labels so that the group names match the
# DEG cluster names.
# NOTE: this overwrites `cell_type`; re-running the cell appends the suffix again.
adata.obs['cell_type'] = (
    adata.obs['cell_type'].astype('string')
    + '.'
    + adata.obs['infection'].astype('string')
)

In [27]:
# Per-group fraction of expressing cells and per-group mean expression,
# written as CSV inputs for the infected-vs-uninfected dot plots.
percent = grouped_obs_percent(adata, 'cell_type')
percent.to_csv('../data/malaria_inf_percent_24h.csv')

means = grouped_obs_mean(adata, 'cell_type')
means.to_csv('../data/malaria_inf_average_log_24h.csv')

  res = method(*args, **kwargs)


In [29]:
# Tabulate cell counts and frequencies per infection category.
adata.obs['infection'].values.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Pf,27100,0.555613
UI,21675,0.444387


# Listeria

In [30]:
# Load the listeria-only 24h dataset; this rebinds `adata`.
adata = sc.read('../../data_integration/results/scVI/listeria_singleCell_24h.h5ad')

In [31]:
# Tabulate cell counts and frequencies per cell_type category.
adata.obs['cell_type'].values.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
EVT_1,106,0.003438
EVT_2,602,0.019523
Endo_f,5714,0.185309
F,5017,0.162705
F_p,334,0.010832
F_sm,71,0.002303
HBC,7948,0.257759
HBC_p,18,0.000584
PAMM1,2055,0.066645
PV,3441,0.111594


In [32]:
# Normalise each cell, then log1p-transform; both calls modify `adata` in place.
# NOTE(review): scanpy documents normalize_per_cell as deprecated in favour of
# normalize_total — confirm the outputs are equivalent before switching.
sc.pp.normalize_per_cell(adata)
sc.pp.log1p(adata)
# Scaling step left disabled:
# sc.pp.scale(adata, max_value=10)

  res = method(*args, **kwargs)


In [33]:
# Build "<cell_type>.<infection>" labels so that the group names match the
# DEG cluster names.
# NOTE: this overwrites `cell_type`; re-running the cell appends the suffix again.
adata.obs['cell_type'] = (
    adata.obs['cell_type'].astype('string')
    + '.'
    + adata.obs['infection'].astype('string')
)

In [34]:
# Per-group fraction of expressing cells and per-group mean expression,
# written as CSV inputs for the infected-vs-uninfected dot plots.
percent = grouped_obs_percent(adata, 'cell_type')
percent.to_csv('../data/listeria_inf_percent_24h.csv')

means = grouped_obs_mean(adata, 'cell_type')
means.to_csv('../data/listeria_inf_average_log_24h.csv')

  res = method(*args, **kwargs)


In [35]:
# Tabulate cell counts and frequencies per infection category.
adata.obs['infection'].values.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Lm,17778,0.576553
UI,13057,0.423447


# Toxoplasma

In [36]:
# Load the toxoplasma-only 24h dataset; this rebinds `adata`.

adata = sc.read('../../data_integration/results/scVI/toxoplasma_singleCell_24h.h5ad')

In [37]:
# Tabulate cell counts and frequencies per cell_type category.
adata.obs['cell_type'].values.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
EVT_1,223,0.006673
EVT_2,606,0.018134
Endo_f,3520,0.105332
F,4956,0.148303
F_p,420,0.012568
F_sm,309,0.009247
HBC,11481,0.343557
HBC_p,120,0.003591
PAMM1,1402,0.041953
PV,2635,0.07885


In [38]:
# Normalise each cell, then log1p-transform; both calls modify `adata` in place.
# NOTE(review): scanpy documents normalize_per_cell as deprecated in favour of
# normalize_total — confirm the outputs are equivalent before switching.
sc.pp.normalize_per_cell(adata)
sc.pp.log1p(adata)
# Scaling step left disabled:
# sc.pp.scale(adata, max_value=10)

  res = method(*args, **kwargs)


In [39]:
# Build "<cell_type>.<infection>" labels so that the group names match the
# DEG cluster names.
# NOTE: this overwrites `cell_type`; re-running the cell appends the suffix again.
adata.obs['cell_type'] = (
    adata.obs['cell_type'].astype('string')
    + '.'
    + adata.obs['infection'].astype('string')
)

In [40]:
# Per-group fraction of expressing cells and per-group mean expression,
# written as CSV inputs for the infected-vs-uninfected dot plots.
percent = grouped_obs_percent(adata, 'cell_type')
percent.to_csv('../data/toxoplasma_inf_percent_24h.csv')

means = grouped_obs_mean(adata, 'cell_type')
means.to_csv('../data/toxoplasma_inf_average_log_24h.csv')

  res = method(*args, **kwargs)


In [41]:
# Tabulate cell counts and frequencies per infection category.
adata.obs['infection'].values.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Tg,16806,0.502903
UI,16612,0.497097
