## Generate CellPhoneDB input files for making dot plots (infected vs uninfected)

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import os
import sys
# Display the path of the Python interpreter this kernel is running on.
sys.executable

'/opt/conda/bin/python'

In [2]:
def grouped_obs_percent(adata, group_key, layer=None, gene_symbols=None):
    """Return, per group of cells, the fraction of cells expressing each gene.

    A gene counts as expressed in a cell when its value is greater than 0.01.

    Parameters
    ----------
    adata
        AnnData-like object exposing ``obs``, ``var``, ``var_names``,
        ``shape``, ``X``/``layers`` and positional row indexing.
    group_key
        Column of ``adata.obs`` used to group the cells.
    layer
        Name of the layer to read; ``adata.X`` is used when ``None``.
    gene_symbols
        Column of ``adata.var`` to use as the row index of the result;
        ``adata.var_names`` is used when ``None``.

    Returns
    -------
    pandas.DataFrame
        Genes x groups table of fractions, rounded to 2 decimals.
    """
    if layer is not None:
        getX = lambda x: x.layers[layer]
    else:
        getX = lambda x: x.X
    if gene_symbols is not None:
        new_idx = pd.Index(adata.var[gene_symbols])
    else:
        new_idx = adata.var_names

    grouped = adata.obs.groupby(group_key)
    out = pd.DataFrame(
        np.zeros((adata.shape[1], len(grouped)), dtype=np.float64),
        columns=list(grouped.groups.keys()),
        index=new_idx,
    )

    for group, idx in grouped.indices.items():
        X = getX(adata[idx])
        # Compare without writing into X: works for dense arrays and sparse
        # matrices alike and never touches the matrix's stored values.
        expressed = X > 0.01
        perc = np.asarray(expressed.sum(axis=0)).reshape(-1) / X.shape[0]
        out[group] = np.round(perc, 2)
    return out

def grouped_obs_mean(adata, group_key, layer=None, gene_symbols=None):
    """Return the mean value of each gene within each group of cells.

    Parameters
    ----------
    adata
        AnnData-like object exposing ``obs``, ``var``, ``var_names``,
        ``shape``, ``X``/``layers`` and positional row indexing.
    group_key
        Column of ``adata.obs`` used to group the cells.
    layer
        Name of the layer to read; ``adata.X`` is used when ``None``.
    gene_symbols
        Column of ``adata.var`` to use as the row index of the result;
        ``adata.var_names`` is used when ``None``.

    Returns
    -------
    pandas.DataFrame
        Genes x groups table of float64 means.
    """
    if layer is not None:
        getX = lambda x: x.layers[layer]
    else:
        getX = lambda x: x.X
    if gene_symbols is not None:
        new_idx = pd.Index(adata.var[gene_symbols])
    else:
        new_idx = adata.var_names

    grouped = adata.obs.groupby(group_key)
    out = pd.DataFrame(
        np.zeros((adata.shape[1], len(grouped)), dtype=np.float64),
        columns=list(grouped.groups.keys()),
        index=new_idx,
    )

    for group, idx in grouped.indices.items():
        X = getX(adata[idx])
        # ravel flattens the 1 x n_genes result a sparse matrix returns.
        out[group] = np.ravel(X.mean(axis=0, dtype=np.float64))
    return out

In [3]:
# Load the AnnData object stored under the scVI data-integration results.
adata = sc.read ('../../data_integration/results/scVI/rna8_scVIintegrated_latent30_All_20230707.h5ad')

In [4]:
# Keep good-quality cells: require at least 2000 detected genes per cell.
# Return values are not captured, so both filters are relied on to modify `adata` directly.
sc.pp.filter_cells(adata, min_genes=2000)
# Remove lowly expressed genes: require at least 3 counts per gene.
sc.pp.filter_genes(adata, min_counts=3)

  res = method(*args, **kwargs)


In [5]:
# Display the AnnData summary (dimensions and annotation fields) after filtering.
adata

AnnData object with n_obs × n_vars = 165128 × 32505
    obs: 'sample', 'stage', 'hpi', 'infection', 'percent_mito', 'n_counts', 'sample_barcode', 'assignment_SoC', 'donor_id', 'scrublet_score', 'scrublet_cluster_score', 'zscore', 'bh_pval', 'bonf_pval', 'S_score', 'G2M_score', 'phase', 'n_genes_by_counts', 'total_counts', 'total_counts_hs', 'pct_counts_hs', 'total_counts_tg', 'pct_counts_tg', 'Tg_infected', 'n_genes', '_scvi_batch', '_scvi_labels', '_scvi_local_l_mean', '_scvi_local_l_var', 'leiden_scvi', 'scrublet_doublet', 'identity_2022', 'cell_type', 'souporcell_MFgenotype', 'MFgenotype', 'cell_type_broad', 'umap_density_Tg_infected', 'stage_perInfection', 'celltype-Stage', 'Tg_intracellular', 'celltype-Intracellular', 'celltype_predictions', 'probabilities', 'Dev_Stage'
    var: 'gene_ids', 'feature_types', 'mean-0', 'std-0', 'mean-1', 'std-1', 'mean-2', 'std-2', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'n_counts'
    uns: 'Dev_Stage_color

In [6]:
# Cell counts and frequencies per donor.
adata.obs['donor_id'].values.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Hrv124,10440,0.063224
Hrv135,9140,0.055351
Hrv136,12686,0.076825
Hrv168,15012,0.090911
Hrv225,508,0.003076
Hrv232,8369,0.050682
Hrv236,48259,0.292252
scDonor_Tg1,13387,0.08107
scDonor_Tg2,15450,0.093564
scDonor_Tg3,14111,0.085455


In [7]:
# Cell counts and frequencies per `stage_perInfection` label.
adata.obs['stage_perInfection'].values.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Lm_24h,15740,0.09532
Lm_48h,12013,0.07275
Pf_24h,26298,0.159258
Pf_48h,2572,0.015576
Tg_24h,16635,0.10074
Tg_48h,12750,0.077213
UI_Lm_24h,12952,0.078436
UI_Lm_48h,10578,0.064059
UI_Pf_24h,21090,0.127719
UI_Pf_48h,3171,0.019203


In [8]:
# Keep only the cells whose `hpi` value is '24h'. The vectorised comparison
# replaces a Python-level loop over every cell, and `.copy()` makes the subset
# an independent AnnData object rather than a view, so later in-place steps do
# not trigger an implicit "copying view" conversion.
adata = adata[adata.obs['hpi'] == '24h'].copy()

  res = method(*args, **kwargs)


In [9]:
# Peek at a small block of the expression matrix before normalisation
# (the printed entries are whole numbers).
print(adata.X[3:10,3:10])

  (0, 0)	3.0
  (0, 1)	1.0
  (1, 0)	3.0
  (2, 0)	19.0
  (2, 1)	2.0
  (2, 2)	2.0
  (3, 0)	9.0
  (3, 1)	5.0
  (3, 3)	1.0
  (3, 5)	1.0
  (4, 0)	2.0
  (5, 0)	10.0
  (5, 5)	1.0


In [10]:
# Normalise counts per cell, then apply log1p. Both calls are used for their
# effect on `adata`; their return values are discarded.
# NOTE(review): scanpy documents `normalize_per_cell` as deprecated in favour of
# `normalize_total` — confirm the exported values are unchanged before switching.
sc.pp.normalize_per_cell(adata)
sc.pp.log1p(adata)
# Scaling is left disabled:
# sc.pp.scale(adata, max_value=10)

Trying to set attribute `.obs` of view, copying.


In [11]:
# Save the log-normalised 24h subset before the `cell_type` labels are rewritten below.
adata.write ('../data/lognormalised_24h.h5ad')

In [12]:
# Suffix every cell-type label with its `stage_perInfection` value so the
# cluster names match the ones used for the DEG clusters.
# This overwrites `cell_type`: re-running the cell appends the suffix again.
adata.obs['cell_type'] = (
    adata.obs['cell_type'].astype('string')
    + '.'
    + adata.obs['stage_perInfection'].astype('string')
)

In [13]:
# Dot-plot inputs for infected vs uninfected control, one column per
# `cell_type` label: fraction of expressing cells and mean expression.
percent = grouped_obs_percent(adata, 'cell_type')
percent.to_csv('../data/inf_percent_24h.csv')

means = grouped_obs_mean(adata, 'cell_type')
means.to_csv('../data/inf_average_log_24h.csv')

  res = method(*args, **kwargs)


## Malaria

In [14]:
# Load the malaria 24h single-cell dataset; this replaces the previous `adata`.
adata = sc.read('../../diffGeneExpression/macrophages/data/malaria_singleCell_24h.h5ad')

In [15]:
# Cell counts and frequencies per cell type in the malaria dataset.
adata.obs['cell_type'].values.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
EVT_1,298,0.005914
EVT_2,771,0.015302
Endo_f,5781,0.114732
F,9138,0.181356
F_p,246,0.004882
F_sm,689,0.013674
HBC,10025,0.19896
HBC_p,8,0.000159
PAMM1,2603,0.05166
PV,5649,0.112112


In [16]:
# Normalise counts per cell, then apply log1p; both calls are used for their
# effect on `adata`. Assumes the loaded matrix holds raw counts — TODO confirm.
# NOTE(review): scanpy documents `normalize_per_cell` as deprecated in favour of
# `normalize_total` — confirm the exported values are unchanged before switching.
sc.pp.normalize_per_cell(adata)
sc.pp.log1p(adata)
# Scaling is left disabled:
# sc.pp.scale(adata, max_value=10)

  res = method(*args, **kwargs)


In [17]:
# Suffix every cell-type label with its `stage_perInfection` value so the
# cluster names match the ones used for the DEG clusters.
# This overwrites `cell_type`: re-running the cell appends the suffix again.
adata.obs['cell_type'] = (
    adata.obs['cell_type'].astype('string')
    + '.'
    + adata.obs['stage_perInfection'].astype('string')
)

In [18]:
# Dot-plot inputs for infected vs uninfected control (malaria), one column per
# `cell_type` label: fraction of expressing cells and mean expression.
percent = grouped_obs_percent(adata, 'cell_type')
percent.to_csv('../data/malaria_inf_percent_24h.csv')

means = grouped_obs_mean(adata, 'cell_type')
means.to_csv('../data/malaria_inf_average_log_24h.csv')

  res = method(*args, **kwargs)


In [19]:
# Cell counts and frequencies per `infection` label in the malaria dataset.
adata.obs['infection'].values.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Pf,27971,0.555123
UI,22416,0.444877


## Listeria

In [20]:
# Load the listeria 24h single-cell dataset; this replaces the previous `adata`.
adata = sc.read('../../diffGeneExpression/macrophages/data/listeria_singleCell_24h.h5ad')

In [21]:
# Cell counts and frequencies per cell type in the listeria dataset.
adata.obs['cell_type'].values.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
EVT_1,107,0.003658
EVT_2,730,0.024958
Endo_f,4833,0.165236
F,4987,0.170502
F_p,91,0.003111
F_sm,108,0.003692
HBC,7953,0.271907
HBC_p,5,0.000171
PAMM1,1976,0.067558
PV,3093,0.105747


In [22]:
# Normalise counts per cell, then apply log1p; both calls are used for their
# effect on `adata`. Assumes the loaded matrix holds raw counts — TODO confirm.
# NOTE(review): scanpy documents `normalize_per_cell` as deprecated in favour of
# `normalize_total` — confirm the exported values are unchanged before switching.
sc.pp.normalize_per_cell(adata)
sc.pp.log1p(adata)
# Scaling is left disabled:
# sc.pp.scale(adata, max_value=10)

  res = method(*args, **kwargs)


In [23]:
# Suffix every cell-type label with its `stage_perInfection` value so the
# cluster names match the ones used for the DEG clusters.
# This overwrites `cell_type`: re-running the cell appends the suffix again.
adata.obs['cell_type'] = (
    adata.obs['cell_type'].astype('string')
    + '.'
    + adata.obs['stage_perInfection'].astype('string')
)

In [24]:
# Dot-plot inputs for infected vs uninfected control (listeria), one column per
# `cell_type` label: fraction of expressing cells and mean expression.
percent = grouped_obs_percent(adata, 'cell_type')
percent.to_csv('../data/listeria_inf_percent_24h.csv')

means = grouped_obs_mean(adata, 'cell_type')
means.to_csv('../data/listeria_inf_average_log_24h.csv')

  res = method(*args, **kwargs)


In [25]:
# Cell counts and frequencies per `infection` label in the listeria dataset.
adata.obs['infection'].values.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Lm,16050,0.548737
UI,13199,0.451263


## Toxoplasma

In [26]:
# Load the toxoplasma 24h single-cell dataset; this replaces the previous `adata`.
adata = sc.read('../../diffGeneExpression/macrophages/data/toxoplasma_singleCell_24h.h5ad')

In [27]:
# Cell counts and frequencies per cell type in the toxoplasma dataset.
adata.obs['cell_type'].values.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
EVT_1,140,0.007858
EVT_2,437,0.024527
Endo_f,1375,0.077173
F,2273,0.127575
F_p,72,0.004041
F_sm,405,0.022731
HBC,6926,0.38873
HBC_p,90,0.005051
PAMM1,624,0.035023
PV,985,0.055284


In [28]:
# Normalise counts per cell, then apply log1p; both calls are used for their
# effect on `adata`. Assumes the loaded matrix holds raw counts — TODO confirm.
# NOTE(review): scanpy documents `normalize_per_cell` as deprecated in favour of
# `normalize_total` — confirm the exported values are unchanged before switching.
sc.pp.normalize_per_cell(adata)
sc.pp.log1p(adata)
# Scaling is left disabled:
# sc.pp.scale(adata, max_value=10)

  res = method(*args, **kwargs)


In [29]:
# Suffix every cell-type label with its `stage_perInfection` value so the
# cluster names match the ones used for the DEG clusters.
# This overwrites `cell_type`: re-running the cell appends the suffix again.
adata.obs['cell_type'] = (
    adata.obs['cell_type'].astype('string')
    + '.'
    + adata.obs['stage_perInfection'].astype('string')
)

In [30]:
# Dot-plot inputs for infected vs uninfected control (toxoplasma), one column
# per `cell_type` label: fraction of expressing cells and mean expression.
percent = grouped_obs_percent(adata, 'cell_type')
percent.to_csv('../data/toxoplasma_inf_percent_24h.csv')

means = grouped_obs_mean(adata, 'cell_type')
means.to_csv('../data/toxoplasma_inf_average_log_24h.csv')

  res = method(*args, **kwargs)


In [31]:
# Cell counts and frequencies per `infection` label in the toxoplasma dataset.
adata.obs['infection'].values.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Tg,9134,0.512656
UI,8683,0.487344
