# DE analysis for CellPhnoDB

### With `limma`

MFI atlas: all donors invading trophoblast

Here using cells/nuclei assignment as a categorical covariate as it is the biggest source of gene coverage variation

**The DE gene lists obtained here are further used in CellPhoneDB analysis and TF analysis**

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import seaborn as sns
import scanpy as sc
from glob import iglob
import anndata
import os
#import sklearn
from sklearn.linear_model import LogisticRegression
import matplotlib as mpl
import scipy
import matplotlib.pyplot as plt
import pickle

In [2]:
np.random.seed(0)

In [3]:
%%bash

pip freeze

absl-py==0.12.0
aiohttp==3.7.4.post0
airr==1.3.1
alabaster==0.7.12
alembic @ file:///home/conda/feedstock_root/build_artifacts/alembic_1613901514078/work
anndata @ file:///home/conda/feedstock_root/build_artifacts/anndata_1605539061264/work
annoy @ file:///home/conda/feedstock_root/build_artifacts/python-annoy_1610271511811/work
anyio @ file:///home/conda/feedstock_root/build_artifacts/anyio_1614388751160/work/dist
arboreto==0.1.6
argon2-cffi @ file:///home/conda/feedstock_root/build_artifacts/argon2-cffi_1610522574055/work
async-generator==1.10
async-timeout==3.0.1
attrs @ file:///home/conda/feedstock_root/build_artifacts/attrs_1605083924122/work
Babel @ file:///home/conda/feedstock_root/build_artifacts/babel_1605182336601/work
backcall @ file:///home/conda/feedstock_root/build_artifacts/backcall_1592338393461/work
backports.functools-lru-cache==1.6.1
bbknn @ file:///opt/conda/conda-bld/bbknn_1616434096000/work
bleach @ file:///home/conda/feedstock_root/build_artifacts/bleach_16122134

In [4]:
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) yields small inline figures



-----
anndata     0.7.5
scanpy      1.7.1
sinfo       0.3.1
-----
PIL                 8.1.2
anndata             0.7.5
anyio               NA
attr                20.3.0
babel               2.9.0
backcall            0.2.0
brotli              NA
cairo               1.20.0
certifi             2021.10.08
cffi                1.14.5
chardet             4.0.0
cloudpickle         1.6.0
colorama            0.4.4
cycler              0.10.0
cython_runtime      NA
cytoolz             0.11.0
dask                2021.03.1
dateutil            2.8.1
decorator           4.4.2
fsspec              0.8.7
get_version         2.1
google              NA
h5py                3.1.0
idna                2.10
igraph              0.8.3
ipykernel           5.5.0
ipython_genutils    0.2.0
ipywidgets          7.6.3
jedi                0.18.0
jinja2              2.11.3
joblib              1.0.1
json5               NA
jsonschema          3.2.0
jupyter_server      1.4.1
jupyterlab_server   2.3.0
kiwisolver          1.3.1


In [5]:
save_path = '/lustre/scratch117/cellgen/team292/aa22/adata_objects/202111_MFI_sc_sn_new_scVI_analysis/'

## Reading raw filtered data

In [6]:
adata = sc.read(save_path + 'adata_raw_filtered.h5ad')

In [7]:
# read in final annotations, subset to only inv troph + VCT + VCT_p and all the final 'cell_type' labels
final_annot = pd.read_csv(save_path + 'final_annotation_all_cells_and_nulcei_20211123.csv', index_col=0)
final_annot['cell_type'].value_counts()

dS2               65746
dS1               42586
SCT               28177
VCT               23472
uSMC              17043
dEpi_secretory    16834
dM1               14099
dS3               13197
dNK2              12487
HOFB              10733
dNK1              10450
fF1               10420
PV                 9980
dT_cells           9740
dM2                9221
Endo_M             6877
VCT_p              6679
EVT_1              5289
NK                 5187
iEVT               3676
VCT_CCC            3373
MO                 3050
dNK3               2952
ILC3               2511
EVT_2              2358
T_cells            2126
VCT_fusing         1971
dT_regs            1862
Endo_L             1761
Endo_F             1466
fF2                1396
M3                 1299
B_cells             774
DC                  697
dDC                 694
Plasma              255
Granulocytes        195
dEpi_lumenal        135
eEVT                 28
GC                   19
Name: cell_type, dtype: int64

In [8]:
adata = adata[final_annot.index,:]
adata.obs['cell_type'] = final_annot.loc[adata.obs_names,'cell_type']

  res = method(*args, **kwargs)
Trying to set attribute `.obs` of view, copying.


In [9]:
# subset to only cell states of interest to test DEGs
adata = adata[adata.obs['cell_type'].isin(['VCT','VCT_p','VCT_CCC','EVT_1','EVT_2','eEVT','iEVT','GC'])]

In [10]:
adata

View of AnnData object with n_obs × n_vars = 44894 × 30800
    obs: 'n_genes', 'sample', 'technology', 'tissue', 'dev_age', 'donor', 'dataset', 'run', 'number_of_individuals_multiplexed', 'batch', 'percent_mito', 'n_counts', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'is_doublet', 'cell_type'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-10', 'feature_types-10', 'genome-10', 'n_cells-10', 'gene_ids-11', 'feature_types-11', 'genome-11', 'n_cells-11', 'gene_ids-12', 'feature_types-12', 'genome-12', 'n_cells-12', 'gene_ids-13', 'feature_types-13', 'genome-13', 'n_cells-13', 'gene_ids-14', 'feature_types-14', 'genome-14', 'n_cells-14', 'gene_ids-15', 'feature_types-15', 'genome-15', 'n_cells-15', 'gene_ids-16', 'feature_types-16', 'genome-16', 'n_cells-16', 'gene_ids-17', 'feature_types-17', 'genome-17', 'n_cells-17', 'gene_ids-18', 'feature_types-18', 'genome-18', 'n_cells-18', 'gene_ids-1

_____________________________________________________________________________________________________________________________________________________________

# DE with `limma`

Separately for the invading trophoblast in a retrograde manner for the trajectory:
- VCT_CCC vs [VCT_p + VCT]
- EVT_1 vs VCT_CCC
- EVT_2 vs EVT_1
- eEVT vs EVT_2
- iEVT vs EVT_2
- GC vs iEVT


In [11]:
adata.obs['cell_type'].value_counts()

VCT        23472
VCT_p       6679
EVT_1       5289
iEVT        3676
VCT_CCC     3373
EVT_2       2358
eEVT          28
GC            19
Name: cell_type, dtype: int64

In [12]:
# normalise and log transform values for limma
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
sc.pp.log1p(adata)

# saving normalised and log transformed values into the raw attribute to later be able to plot
adata.raw = adata.copy()

adata.var_names_make_unique()
adata.obs_names_make_unique()

normalizing by total count per cell
Trying to set attribute `.obs` of view, copying.
    finished (0:00:04): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)


In [13]:
len(np.unique(adata.obs['cell_type']))

8

In [14]:
# for the purpose of first comparison here uniting VCT and VCT_p
adata.obs['cell_type'] = ['VCT' if ct in ['VCT','VCT_p'] else ct for ct in adata.obs['cell_type']]

In [15]:
np.unique(adata.obs['cell_type'], return_counts=True)

(array(['EVT_1', 'EVT_2', 'GC', 'VCT', 'VCT_CCC', 'eEVT', 'iEVT'],
       dtype=object),
 array([ 5289,  2358,    19, 30151,  3373,    28,  3676]))

In [16]:
# downsample 'VCT' about tenfold - otherwise limma will break
VCT_indices = list(adata[adata.obs['cell_type'] == 'VCT'].obs_names)
adata_downsampled_VCT = adata[VCT_indices[:3000],:].copy()
adata_no_VCT = adata[adata.obs['cell_type'] != 'VCT'].copy()

  res = method(*args, **kwargs)


In [17]:
len(adata_no_VCT)

14743

In [18]:
adata = adata_no_VCT.concatenate(adata_downsampled_VCT, join='outer')

In [19]:
adata.obs

Unnamed: 0_level_0,n_genes,sample,technology,tissue,dev_age,donor,dataset,run,number_of_individuals_multiplexed,batch,percent_mito,n_counts,scrublet_score,scrublet_cluster_score,bh_pval,is_doublet,cell_type
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
CTGATAGAGCCCAGCT-1_FCA7167219-0,4070,FCA7167219,10X_3'_scRNA-seq,decidua_immune,13_PCW,6,Vento_Nature,,1,0,0.027002,14851.0,0.186441,0.118367,0.857143,False,EVT_2
ATTACTCTCTGCGGCA-1_FCA7167221-0,655,FCA7167221,10X_3'_scRNA-seq,decidua_immune,11_PCW,7,Vento_Nature,,1,0,0.003621,1657.0,0.175000,0.167267,0.242677,False,iEVT
CTGATAGAGCCCAGCT-1_FCA7167221-0,412,FCA7167221,10X_3'_scRNA-seq,decidua_immune,11_PCW,7,Vento_Nature,,1,0,0.044892,646.0,0.090426,0.167267,0.242677,False,EVT_2
CAGAATCCACTCTGTC-1_FCA7167222-0,1320,FCA7167222,10X_3'_scRNA-seq,decidua_non_immune,11_PCW,7,Vento_Nature,,1,0,0.000524,3820.0,0.051118,0.055810,0.874083,False,EVT_1
CCTTTCTAGATAGGAG-1_FCA7167222-0,1076,FCA7167222,10X_3'_scRNA-seq,decidua_non_immune,11_PCW,7,Vento_Nature,,1,0,0.012456,3372.0,0.060870,0.055810,0.874083,False,iEVT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ACATCAGCAGCGTAAG-1_FCA7474064-1,4031,FCA7474064,10X_3'_scRNA-seq,placenta,7_PCW,10,Vento_Nature,25892.0,1,1,0.033059,20932.0,0.094972,0.131034,0.824150,False,VCT
ACATCAGCAGGGAGAG-1_FCA7474064-1,2755,FCA7474064,10X_3'_scRNA-seq,placenta,7_PCW,10,Vento_Nature,25892.0,1,1,0.053909,9516.0,0.098712,0.111111,0.874976,False,VCT
ACATCAGCATCGGAAG-1_FCA7474064-1,4900,FCA7474064,10X_3'_scRNA-seq,placenta,7_PCW,10,Vento_Nature,25892.0,1,1,0.047498,29938.0,0.209756,0.139812,0.824150,False,VCT
ACATCAGGTCAGCTAT-1_FCA7474064-1,3507,FCA7474064,10X_3'_scRNA-seq,placenta,7_PCW,10,Vento_Nature,25892.0,1,1,0.082652,12474.0,0.047161,0.142857,0.824150,False,VCT


In [20]:
np.unique(adata.obs['cell_type'], return_counts=True)

(array(['EVT_1', 'EVT_2', 'GC', 'VCT', 'VCT_CCC', 'eEVT', 'iEVT'],
       dtype=object),
 array([5289, 2358,   19, 3000, 3373,   28, 3676]))

In [25]:
np.unique(adata.obs['technology'])

array(["10X_3'_scRNA-seq", "10X_3'_snRNA-seq", '10X_multiome'],
      dtype=object)

In [26]:
# going to use origin (SN or SC) as covariate here
adata.obs['limma_cov_sn_or_sc'] = ['sn' if technology in ["10X_3'_snRNA-seq", '10X_multiome'] else 'sc' for technology in adata.obs['technology']]


In [27]:
for ct in np.unique(adata.obs['cell_type']):
    print(ct)
    curr_subset = adata[adata.obs['cell_type'] == ct]
    print(curr_subset.obs['limma_cov_sn_or_sc'].value_counts())
    print('\n')

EVT_1
sc    3947
sn    1342
Name: limma_cov_sn_or_sc, dtype: int64


EVT_2
sn    1210
sc    1148
Name: limma_cov_sn_or_sc, dtype: int64


GC
sn    19
Name: limma_cov_sn_or_sc, dtype: int64


VCT
sc    3000
Name: limma_cov_sn_or_sc, dtype: int64


VCT_CCC
sc    2268
sn    1105
Name: limma_cov_sn_or_sc, dtype: int64


eEVT
sn    26
sc     2
Name: limma_cov_sn_or_sc, dtype: int64


iEVT
sn    2597
sc    1079
Name: limma_cov_sn_or_sc, dtype: int64




In [28]:
# subset adatas by comparison
# and repeating this for others..

adata_subset = adata[adata.obs['cell_type'].isin(['VCT','VCT_CCC'])].copy()

In [91]:
# marker calling

#t = adata_hvg.X.toarray().T
t = adata_subset.X.toarray().T
df = pd.DataFrame(data=t, columns= adata_subset.obs.index, index=adata_subset.var_names)

meta_df = pd.DataFrame(data={'Cell':list(adata_subset.obs.index),
                             'cell_type':[ str(i) for i in adata_subset.obs['cell_type']],
                             'sn_or_sc':[ str(i) for i in adata_subset.obs['limma_cov_sn_or_sc']],
                             #'sample':[ str(i) for i in adata_subset.obs['sample']]
                            })
meta_df.set_index('Cell', inplace=True)

In [92]:
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [93]:
outpath = './limma_DEGs/20211123/'

In [94]:
%%R
library(limma)
library(edgeR)

In [95]:
%%R

sessionInfo()

R version 4.0.4 (2021-02-15)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.1 LTS

Matrix products: default
BLAS/LAPACK: /opt/conda/lib/libopenblasp-r0.3.12.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] tools     stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
[1] edgeR_3.32.1 limma_3.46.0

loaded via a namespace (and not attached):
[1] compiler_4.0.4  Rcpp_1.0.7      grid_4.0.4      locfit_1.5-9.4 
[5] lattice_0.20-41


In [96]:
meta_df.reset_index(inplace=True)

In [97]:
len(set(df.columns) & set(meta_df['Cell']))

6373

In [98]:
np.unique(meta_df['cell_type'], return_counts=True)

(array(['VCT', 'VCT_CCC'], dtype=object), array([3000, 3373]))

In [99]:
case = 'VCT_CCC'
ctrl = 'VCT'

In [100]:
%%R -i df -i meta_df -i outpath -i ctrl -i case 

library(limma)
library(edgeR)

# Format
ex_mat=as.matrix(df)
rownames(meta_df) = meta_df$Cell

# subset meta
meta_df = subset(meta_df, cell_type %in% unlist(c(ctrl, case)) )
print(unique(meta_df$cell_type))

# Shared cells
shared_cells = intersect(rownames(meta_df), colnames(ex_mat))
message(length(shared_cells), ' shared cells')
ex_mat = ex_mat[, shared_cells]
meta_df = meta_df[shared_cells,]

#print(dim(ex_mat))

# Filter lowly expressed genes
keep = rowSums(ex_mat, na.rm=T) > 0.1
ex_mat = ex_mat[ keep, ]
keep = aveLogCPM(ex_mat) > 0.1
ex_mat = ex_mat[ keep, ]

#print(dim(ex_mat))

# Extract celltypes
cells = rownames(meta_df)
celltypes = unique(meta_df$cell_type)
covariates = meta_df$sn_or_sc

# Extract cells in cluster and rest
cells_case = rownames(subset(meta_df, cell_type == case))
cells_ctrl = rownames(subset(meta_df, cell_type == ctrl)) # changed from control to ctrl

# build cluster_type vector
cluster_type = rep(0, length(cells))
names(cluster_type) = cells
cluster_type[ cells_case ] = 'case'
cluster_type[ cells_ctrl ] = 'ctrl'

print(unique(cluster_type))

design.matrix <- model.matrix(~ 0 + cluster_type + covariates)
#design.matrix <- model.matrix(~ 0 + cluster_type)

# Now tell limma how do you want to compare (i.e. case vs control)
contrast.matrix <- makeContrasts(caseVScontrol = cluster_typecase - cluster_typectrl, levels = design.matrix)

# Make model and run contrasts
fit <- lmFit(ex_mat, design.matrix)
fit <- contrasts.fit(fit, contrast.matrix)
fit <- eBayes(fit)

# Make a dataframe containing the important data
results = topTable(fit, adjust="fdr", number = nrow(ex_mat), coef = 'caseVScontrol')

# Add and filter needed data
results$Gene = rownames(results)
results = results[ , c('Gene', 'logFC', 'P.Value', 'adj.P.Val')]
results$AveExpr_cluster = apply(ex_mat[ results$Gene, cells_case], 1, mean)
results$AveExpr_rest = apply(ex_mat[ results$Gene, cells_ctrl], 1, mean)
results$percentExpr_cluster = apply(ex_mat[ results$Gene, cells_case], 1, function(x) sum(c(x > 0)+0) ) / length(cells_case)
results$percentExpr_rest = apply(ex_mat[ results$Gene, cells_ctrl], 1, function(x) sum(c(x > 0)+0) ) / length(cells_ctrl)

results$AveExpr_cluster = round(results$AveExpr_cluster, 6)
results$AveExpr_rest = round(results$AveExpr_rest, 6)
results$percentExpr_cluster = round(results$percentExpr_cluster, 6)
results$percentExpr_rest = round(results$percentExpr_rest, 6)
# and store it as csv file
write.csv(results, file = paste0(outpath, 
                                 '20211123_', case, '_vs_', ctrl, '_limma_DEGs.csv'), row.names = F, col.names = T, quote = F)

[1] "VCT_CCC" "VCT"    


R[write to console]: 6373 shared cells



[1] "case" "ctrl"


# Read in limma tables, filter genes by adj.P.Val (taking all with < 0.05) and format according to what CellPhone needs for further use in cell-cell comm analysis

In [6]:
DEG_tables = {}

comparisons = ['VCT_CCC_vs_VCT',
              'EVT_1_vs_VCT_CCC',
              'EVT_2_vs_EVT_1',
              'eEVT_vs_EVT_2',
              'iEVT_vs_EVT_2',
              'GC_vs_iEVT']


for comparison in comparisons:
    print(comparison)
    
    # main call type for which this DE list is
    main_ct = comparison.split('_vs_')[0]
    
    DEG_tables[comparison] = pd.read_csv('./limma_DEGs/20211123/20211123_' + comparison + '_limma_DEGs.csv', #index_col=0
                                        )
    # filter by FDR
    DEG_tables[comparison] = DEG_tables[comparison][DEG_tables[comparison]['adj.P.Val'] < 0.05]
    # filter by logFC (>0)
    DEG_tables[comparison] = DEG_tables[comparison][DEG_tables[comparison]['logFC'] > 0]
    
    # add 'cluster' labels
    DEG_tables[comparison]['cluster'] = [main_ct for idx in DEG_tables[comparison].index]
    
    DEG_tables[comparison].columns = ['gene' if column == 'Gene' else column for column in DEG_tables[comparison].columns]
    
    DEG_tables[comparison] = DEG_tables[comparison].loc[:,['cluster','gene','logFC', 'P.Value', 'adj.P.Val', 'AveExpr_cluster',
       'AveExpr_rest', 'percentExpr_cluster', 'percentExpr_rest']]

    # additionally (02.12.2021): filtering genes that are expr in <10% of cells in cluster (to be consistent with cellphone)
    #DEG_tables[comparison] = DEG_tables[comparison][DEG_tables[comparison]['percentExpr_cluster'] >= 0.1]
    
    
    print(DEG_tables[comparison].shape[0],'significant upreg DEGs here \n')

VCT_CCC_vs_VCT
4058 significant upreg DEGs here 

EVT_1_vs_VCT_CCC
4603 significant upreg DEGs here 

EVT_2_vs_EVT_1
2513 significant upreg DEGs here 

eEVT_vs_EVT_2
2148 significant upreg DEGs here 

iEVT_vs_EVT_2
4876 significant upreg DEGs here 

GC_vs_iEVT
560 significant upreg DEGs here 



In [13]:
DEG_tables = {}

comparisons = ['VCT_CCC_vs_VCT',
              'EVT_1_vs_VCT_CCC',
              'EVT_2_vs_EVT_1',
              'eEVT_vs_EVT_2',
              'iEVT_vs_EVT_2',
              'GC_vs_iEVT']


for comparison in comparisons:
    print(comparison)
    
    # main call type for which this DE list is
    main_ct = comparison.split('_vs_')[0]
    
    DEG_tables[comparison] = pd.read_csv('./limma_DEGs/20211123/20211123_' + comparison + '_limma_DEGs.csv', #index_col=0
                                        )
    # filter by FDR
    DEG_tables[comparison] = DEG_tables[comparison][DEG_tables[comparison]['adj.P.Val'] < 0.05]
    # filter by logFC (>0)
    DEG_tables[comparison] = DEG_tables[comparison][DEG_tables[comparison]['logFC'] > 0]
    
    # add 'cluster' labels
    DEG_tables[comparison]['cluster'] = [main_ct for idx in DEG_tables[comparison].index]
    
    DEG_tables[comparison].columns = ['gene' if column == 'Gene' else column for column in DEG_tables[comparison].columns]
    
    DEG_tables[comparison] = DEG_tables[comparison].loc[:,['cluster','gene','logFC', 'P.Value', 'adj.P.Val', 'AveExpr_cluster',
       'AveExpr_rest', 'percentExpr_cluster', 'percentExpr_rest']]

    # additionally (02.12.2021): filtering genes that are expr in <10% of cells in cluster (to be consistent with cellphone)
    DEG_tables[comparison] = DEG_tables[comparison][DEG_tables[comparison]['percentExpr_cluster'] >= 0.1]
    
    
    print(DEG_tables[comparison].shape[0],'significant upreg DEGs here \n')

VCT_CCC_vs_VCT
2855 significant upreg DEGs here 

EVT_1_vs_VCT_CCC
3116 significant upreg DEGs here 

EVT_2_vs_EVT_1
1433 significant upreg DEGs here 

eEVT_vs_EVT_2
1479 significant upreg DEGs here 

iEVT_vs_EVT_2
3870 significant upreg DEGs here 

GC_vs_iEVT
266 significant upreg DEGs here 



In [14]:
DEG_tables['eEVT_vs_EVT_2'].columns

Index(['cluster', 'gene', 'logFC', 'P.Value', 'adj.P.Val', 'AveExpr_cluster',
       'AveExpr_rest', 'percentExpr_cluster', 'percentExpr_rest'],
      dtype='object')

In [15]:
DEG_tables['eEVT_vs_EVT_2']

Unnamed: 0,cluster,gene,logFC,P.Value,adj.P.Val,AveExpr_cluster,AveExpr_rest,percentExpr_cluster,percentExpr_rest
0,eEVT,EIF4E1B,0.589620,2.542969e-300,6.267909e-296,0.593823,0.001864,0.607143,0.002969
1,eEVT,NCAM1,3.614782,9.408643e-295,1.159521e-290,3.795461,0.105632,1.000000,0.066582
2,eEVT,SULT1A2,0.802209,4.286850e-250,3.522076e-246,0.810678,0.006605,0.642857,0.007634
3,eEVT,TNFRSF8,0.952954,1.627569e-239,1.002908e-235,0.968758,0.009481,0.750000,0.010178
4,eEVT,ADORA1,1.202695,2.035514e-233,1.003427e-229,1.227871,0.015473,0.785714,0.015691
...,...,...,...,...,...,...,...,...,...
2231,eEVT,ZNF552,0.160193,4.467550e-03,4.933521e-02,0.260883,0.072702,0.357143,0.078032
2232,eEVT,KLHDC10,0.281399,4.504171e-03,4.971733e-02,0.697828,0.261461,0.678571,0.230704
2233,eEVT,MTR,0.257660,4.512153e-03,4.978315e-02,0.553450,0.204410,0.607143,0.196353
2234,eEVT,SDCBP2,0.051753,4.515229e-03,4.978437e-02,0.060622,0.006639,0.107143,0.007634


In [16]:
# combine all tables and save for CellPhone

In [17]:
# without any filtering
joint_DE_table = pd.concat(DEG_tables.values())

In [18]:
joint_DE_table

Unnamed: 0,cluster,gene,logFC,P.Value,adj.P.Val,AveExpr_cluster,AveExpr_rest,percentExpr_cluster,percentExpr_rest
10,VCT_CCC,TPM1,1.456768,0.000000,0.000000,2.017393,0.758216,0.882597,0.594000
14,VCT_CCC,FXYD5,0.655155,0.000000,0.000000,0.518204,0.053708,0.510821,0.074667
21,VCT_CCC,LDHA,1.148616,0.000000,0.000000,1.727698,1.231422,0.743848,0.732667
24,VCT_CCC,FABP5,0.824986,0.000000,0.000000,0.645517,0.083226,0.454788,0.120000
26,VCT_CCC,SLC16A3,0.864479,0.000000,0.000000,0.897528,0.201183,0.640972,0.251667
...,...,...,...,...,...,...,...,...,...
626,GC,PGR,0.135100,0.001135,0.048031,0.167571,0.026268,0.105263,0.030468
630,GC,AC244090.1,0.112713,0.001160,0.048682,0.128027,0.033475,0.105263,0.067737
631,GC,EPHA1-AS1,0.177610,0.001162,0.048682,0.248932,0.052469,0.157895,0.066921
632,GC,MED31,0.178074,0.001162,0.048682,0.228414,0.081405,0.157895,0.143090


In [19]:
joint_DE_table.set_index('cluster', inplace=True)
joint_DE_table

Unnamed: 0_level_0,gene,logFC,P.Value,adj.P.Val,AveExpr_cluster,AveExpr_rest,percentExpr_cluster,percentExpr_rest
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
VCT_CCC,TPM1,1.456768,0.000000,0.000000,2.017393,0.758216,0.882597,0.594000
VCT_CCC,FXYD5,0.655155,0.000000,0.000000,0.518204,0.053708,0.510821,0.074667
VCT_CCC,LDHA,1.148616,0.000000,0.000000,1.727698,1.231422,0.743848,0.732667
VCT_CCC,FABP5,0.824986,0.000000,0.000000,0.645517,0.083226,0.454788,0.120000
VCT_CCC,SLC16A3,0.864479,0.000000,0.000000,0.897528,0.201183,0.640972,0.251667
...,...,...,...,...,...,...,...,...
GC,PGR,0.135100,0.001135,0.048031,0.167571,0.026268,0.105263,0.030468
GC,AC244090.1,0.112713,0.001160,0.048682,0.128027,0.033475,0.105263,0.067737
GC,EPHA1-AS1,0.177610,0.001162,0.048682,0.248932,0.052469,0.157895,0.066921
GC,MED31,0.178074,0.001162,0.048682,0.228414,0.081405,0.157895,0.143090


In [17]:
joint_DE_table.to_csv('/lustre/scratch117/cellgen/team292/aa22/with_Luz/202111_MFI_CellPhone/DEGs_list_inv_trophoblast_for_cellphone_20211202.tsv',sep='\t')