In [4]:
import warnings

# Ignore DeprecationWarnings whose originating module name matches "pertpy".
# The filter is installed before pertpy is imported below.
warnings.filterwarnings("ignore", category=DeprecationWarning, module="pertpy")

import numpy as np
import pandas as pd
import pertpy as pt
import scanpy as sc

# Directory the notebook reads its reference CSVs from
# (X_pca_<celltype>.csv and <celltype>_cca_scores.csv).
path = './cca_output'

In [5]:
# Load pertpy's DIALOGUE example dataset and display its AnnData summary.
adata = pt.dt.dialogue_example()
adata

AnnData object with n_obs × n_vars = 5374 × 6329
    obs: 'nCount_RNA', 'nFeature_RNA', 'cellQ', 'gender', 'location', 'clinical.status', 'cell.subtypes', 'pathology', 'origin', 'subset', 'sample', 'path_str'
    var: 'name'

Parse out the `sample` variable used in the DIALOGUE example, found here: https://github.com/livnatje/DIALOGUE/wiki/Example

In [6]:
# Sample label = cell barcode with its last '.'-separated field dropped and the
# remaining fields re-joined with '_'. Stored as a categorical obs column.
sample_labels = ['_'.join(barcode.split('.')[:-1]) for barcode in adata.obs.index]
adata.obs['dlg_sample'] = pd.Categorical(sample_labels)

Run DIALOGUE with parameters matching the reference R example.

In [7]:
# Read the per-cell-type PCA components exported from the R object, stack them,
# and reorder the rows by barcode so they line up with adata's cells.
pcas = [
    pd.read_csv(f'{path}/X_pca_{ct}.csv', index_col=0)
    for ct in ['A', 'B', 'C']
]
stacked_pca = pd.concat(pcas)
adata.obsm['X_pca'] = stacked_pca.loc[adata.obs.index].to_numpy()

In [8]:
# ensure that every cell type is represented in every sample, filter out samples which are missing one
isecs = pd.crosstab(adata.obs["subset"], adata.obs["dlg_sample"])
adata = adata[adata.obs['dlg_sample'].isin(list(isecs.columns[isecs.all(axis=0)]))]

In [9]:
# Configure DIALOGUE: samples come from the 'dlg_sample' column, cell types from
# 'subset', and per-cell counts from 'nCount_RNA'.
# NOTE(review): n_mpcs=3 presumably sets the number of multicellular programs
# (the resulting score arrays have 3 columns) — confirm against the pertpy docs.
dl = pt.tl.Dialogue(
    sample_id="dlg_sample",
    celltype_key="subset",
    n_counts_key="nCount_RNA",
    n_mpcs=3,
)
# Returns the AnnData object, the per-cell-type MCP scores (`mcps`), and two
# further results (`ws`, `ct_subs`) that are not used elsewhere in this notebook.
# NOTE(review): n_components=30 presumably refers to the PCA components taken
# from adata.obsm['X_pca'] — confirm.
adata, mcps, ws, ct_subs = dl.calculate_multifactor_PMD(
    adata, ct_order=['A', 'B', 'C'], normalize=True, n_components=30)

In [10]:
# Display the MCP scores: a dict keyed by cell type, each value a 2-D array with
# one column per MCP.
mcps

{'A': array([[-5.7669056 ,  6.8349498 , -5.04845148],
        [ 2.08005192,  6.96710363, -1.3403043 ],
        [ 4.21167163,  4.21897857,  1.40072115],
        ...,
        [-2.70325964, -2.7417633 , -2.33825855],
        [-3.59714067, -1.77806464, -6.28650686],
        [-4.6564212 , -2.30038976, -0.84388061]]),
 'B': array([[ 3.11130825,  0.69955729,  0.85266283],
        [ 2.54079313,  3.04932612,  3.98226391],
        [ 0.21602093,  1.48820717,  5.04206781],
        ...,
        [ 3.98506992, -2.59069026, -0.58767398],
        [-1.86507028,  0.44118853,  0.98714811],
        [-0.09791248, -1.40341592, -2.00320109]]),
 'C': array([[ 0.26233966,  3.52190007,  7.06018598],
        [ 2.09766818,  3.22548099,  2.72945513],
        [-0.39920822,  5.80544514,  6.392281  ],
        ...,
        [ 0.2515091 , -3.87723146, -3.8069468 ],
        [-0.53177246, -3.87148126, -4.10433737],
        [-0.7606454 , -1.9749182 , -2.32549458]])}

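Verify that the MCP scores match the reference scores by computing, for each cell type and each MCP, the Pearson correlation between the two.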

In [11]:
from scipy.stats import pearsonr

In [12]:
# Correlate every pertpy MCP with the reference scores stored in
# `<celltype>_cca_scores.csv`, printing each result and collecting the
# correlation coefficients.
# NOTE: `r2s` holds Pearson r values (not r squared); the name is kept because
# a later cell reads it.
r2s = []
for ct in ['A', 'B', 'C']:
    print('cell type', ct)
    orig_mcps = pd.read_csv(f'{path}/{ct}_cca_scores.csv', index_col=0)

    # Barcodes of this cell type in adata order. This is the same for all MCPs,
    # so it is computed once per cell type and taken directly from obs rather
    # than by building an AnnData view on every iteration.
    ct_barcodes = adata.obs.index[(adata.obs.subset == ct).to_numpy()]
    ref_scores = orig_mcps.loc[ct_barcodes]  # match on barcode

    # Assumes the rows of mcps[ct] follow adata's cell order within the cell
    # type, as the barcode-based alignment here has always relied on.
    for i in range(3):
        result = pearsonr(mcps[ct][:, i], ref_scores[f'MCP{i+1}'].to_numpy())
        print(result)
        r2s.append(result.statistic)

cell type A
PearsonRResult(statistic=0.9682998608107244, pvalue=0.0)
PearsonRResult(statistic=0.9275317696781453, pvalue=0.0)
PearsonRResult(statistic=0.960676822624977, pvalue=0.0)
cell type B
PearsonRResult(statistic=0.988343185699627, pvalue=0.0)
PearsonRResult(statistic=0.9403670983030641, pvalue=0.0)
PearsonRResult(statistic=0.9016949114728543, pvalue=0.0)
cell type C
PearsonRResult(statistic=0.9790323905196536, pvalue=0.0)
PearsonRResult(statistic=0.9649246065949145, pvalue=0.0)
PearsonRResult(statistic=0.9661366587676552, pvalue=0.0)


In [15]:
# Mean Pearson r over all cell types and MCPs (despite its name, `r2s` holds r,
# not r squared).
np.mean(r2s)

0.9552230338301797