# Müllerian & Wolffian ducts analysis - prepare data for CellphoneDB

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import os
import sys

def MovePlots(plotpattern, subplotdir):
    """Move saved figures whose name matches ``*<plotpattern>*`` into a sub-folder.

    Parameters
    ----------
    plotpattern : str
        Glob fragment; every entry directly inside ``sc.settings.figdir`` whose
        name matches ``*<plotpattern>*`` is moved.
    subplotdir : str
        Sub-folder of ``sc.settings.figdir`` that receives the matches. It is
        created (including parents) when it does not exist yet.

    Notes
    -----
    Uses the standard library instead of shelling out to ``mkdir``/``mv``, so
    paths containing spaces or shell metacharacters are handled correctly and
    failures raise instead of being silently ignored.
    """
    # Function-scope imports keep this helper self-contained within the notebook.
    import glob
    import shutil

    figdir = str(sc.settings.figdir)
    # Plain concatenation (not os.path.join) so a leading '/' in subplotdir
    # still resolves inside figdir, as the shell version did.
    destination = os.path.normpath(figdir + '/' + subplotdir)
    os.makedirs(destination, exist_ok=True)

    destination_real = os.path.realpath(destination)
    for source in glob.glob(os.path.join(figdir, '*' + plotpattern + '*')):
        # The destination folder itself may match the pattern; never move it into itself.
        if os.path.realpath(source) == destination_real:
            continue
        # Give the full target path so an existing file of the same name is replaced, like `mv`.
        shutil.move(source, os.path.join(destination, os.path.basename(source)))

# Scanpy session settings.
sc.settings.verbosity = 1  # verbosity levels: 0 = errors, 1 = warnings, 2 = info, 3 = hints
sc.settings.figdir = './figures-cellphonedb/'  # folder used for saved figures (also read by MovePlots)
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) yields small inline figures

# Display the path of the Python interpreter running this kernel.
sys.executable

'/opt/conda/envs/cpdb/bin/python'

# Prepare INPUT

## Define microenvironments

In [2]:
# Map each microenvironment name to the list of cell types it contains.
microenvironments = {
    'Early': [
        'Müllerian Epi',
        'Wolffian Epi',
        'Müllerian Mese',
        'Wolffian/Mesonephros Mese',
    ],
}

microenvironments

{'Early': ['Müllerian Epi',
  'Wolffian Epi',
  'Müllerian Mese',
  'Wolffian/Mesonephros Mese']}

In [3]:
# Flatten the per-microenvironment lists into one list (order kept, duplicates kept).
celltypes_of_interest = [
    celltype
    for members in microenvironments.values()
    for celltype in members
]
set(celltypes_of_interest)

{'Müllerian Epi',
 'Müllerian Mese',
 'Wolffian Epi',
 'Wolffian/Mesonephros Mese'}

## Load raw counts AnnData

In [4]:
# Read the .h5ad file into an AnnData object; the bare name below displays its summary.
adataDown = sc.read('/nfs/team292/vl6/FetalReproductiveTract/pre_9pcw_mullerian_wolffian_cellphoneDB.h5ad')
adataDown

AnnData object with n_obs × n_vars = 6284 × 23295
    obs: 'n_counts', 'n_genes', 'sample', 'percent_mito', 'percent_ribo', 'doublet_scores', 'barcode', 'souporcell_classification', 'souporcell_assignment', 'leiden', 'leiden_R', 'leiden_2', 'broad_annotations', 'donor', 'stage_pcw', 'stage_cs', 'sex', 'location', 'multiplexing', 'notes', 'library', 'leiden_3', 'HTO2', 'HTO3', 'HTO4', 'total_hto_counts', 'fraction_HTO2', 'fraction_HTO3', 'fraction_HTO4', 'HTO3_HTO4_ratio', 'most_likely_hypothesis', 'cluster_feature', 'negative_hypothesis_probability', 'singlet_hypothesis_probability', 'doublet_hypothesis_probability', 'Classification', 'hashsolo_assignment', 'hash+lib', 'sample+donor', 'batch', 'celltype', 'phase'
    var: 'GeneID-0-0', 'GeneName-0-0', 'n_cells-0-0', 'GeneID-1-0', 'GeneName-1-0', 'n_cells-1-0', 'GeneID-10-0', 'GeneName-10-0', 'n_cells-10-0', 'GeneID-100-0', 'GeneName-100-0', 'n_cells-100-0', 'GeneID-101-0', 'GeneName-101-0', 'n_cells-101-0', 'GeneID-11-0', 'GeneName-11-

In [5]:
# Densify a 10x10 window (rows and columns 20-29) of X to eyeball the stored values.
adataDown.X[20:30, 20:30].toarray()

array([[0., 0., 1., 0., 3., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 1., 1., 0., 1., 0., 0.],
       [0., 0., 1., 2., 2., 1., 0., 3., 0., 0.],
       [4., 0., 2., 1., 2., 0., 0., 2., 0., 0.],
       [0., 0., 2., 1., 4., 0., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0., 1., 0., 3., 0., 0.],
       [0., 0., 0., 0., 3., 1., 0., 0., 0., 0.],
       [0., 0., 2., 0., 0., 0., 0., 0., 0., 0.],
       [2., 0., 2., 3., 0., 1., 0., 1., 0., 0.],
       [0., 1., 1., 1., 3., 2., 0., 0., 0., 0.]], dtype=float32)

# Save normalised gene expression counts

Generate the normalized counts

In [6]:
# Both calls modify adataDown in place: genes are filtered with min_cells=10, then
# each cell is scaled with counts_per_cell_after=1e4. No log-transform is applied here.
# NOTE(review): scanpy's docs appear to mark normalize_per_cell as deprecated in favour
# of normalize_total — confirm against the installed version before migrating.
sc.pp.filter_genes(adataDown, min_cells = 10)
sc.pp.normalize_per_cell(adataDown, counts_per_cell_after=1e4)
adataDown.X.shape

(6284, 20582)

In [7]:
# Number of cells per 'celltype' label; dropna=False also counts missing labels.
adataDown.obs['celltype'].value_counts(dropna = False)

celltype
Müllerian Epi                1571
Müllerian Mese               1571
Wolffian Epi                 1571
Wolffian/Mesonephros Mese    1571
Name: count, dtype: int64

In [8]:
# Number of cells per 'stage_pcw' value; dropna=False also counts missing values.
adataDown.obs['stage_pcw'].value_counts(dropna = False)

stage_pcw
6.6    1838
8.4    1709
7.4     640
7.0     512
5.6     486
8.8     389
7.8     373
8.0     170
6.2     167
Name: count, dtype: int64

In [9]:
# Save the gene-filtered, normalised object into the CellPhoneDB input folder.
adataDown.write('/nfs/team292/vl6/FetalReproductiveTract/CellPhoneDB/Mullerian_and_Wolffian_early/input/counts_normalised.h5ad')

In [10]:
def grouped_obs_percent(adata, group_key, layer=None, gene_symbols=None):
    """Fraction of cells per group in which each gene has a value above 0.01.

    Parameters
    ----------
    adata
        AnnData-like object; this function uses ``.obs``, ``.var``,
        ``.var_names``, ``.shape``, ``.X`` / ``.layers`` and row indexing.
    group_key : str
        Column of ``adata.obs`` used to group the cells.
    layer : str, optional
        Name of the layer to read instead of ``.X``.
    gene_symbols : str, optional
        Column of ``adata.var`` used to label the rows of the result;
        defaults to ``adata.var_names``.

    Returns
    -------
    pandas.DataFrame
        One row per gene and one column per group; each value is the fraction
        of the group's cells expressing the gene, rounded to 2 decimals.
    """
    if layer is not None:
        getX = lambda x: x.layers[layer]
    else:
        getX = lambda x: x.X
    # Row labels of the result: the requested var column, or var_names by default.
    if gene_symbols is not None:
        new_idx = pd.Index(adata.var[gene_symbols])
    else:
        new_idx = adata.var_names

    grouped = adata.obs.groupby(group_key)
    out = pd.DataFrame(
        np.zeros((adata.shape[1], len(grouped)), dtype=np.float64),
        columns=list(grouped.groups.keys()),
        index=new_idx
    )

    for group, idx in grouped.indices.items():
        X = getX(adata[idx])
        # A comparison works for dense arrays and scipy sparse matrices alike and
        # leaves X untouched (no overwriting of X.data).
        expressed = X > 0.01
        n_expressing = np.asarray(expressed.sum(axis=0)).reshape(-1)
        out[group] = np.round(n_expressing / X.shape[0], 2)
    return out


def grouped_obs_mean(adata, group_key, layer=None, gene_symbols=None):
    """Mean value of each gene within each group of cells.

    Parameters
    ----------
    adata
        AnnData-like object; this function uses ``.obs``, ``.var``,
        ``.var_names``, ``.shape``, ``.X`` / ``.layers`` and row indexing.
    group_key : str
        Column of ``adata.obs`` used to group the cells.
    layer : str, optional
        Name of the layer to read instead of ``.X``.
    gene_symbols : str, optional
        Column of ``adata.var`` used to label the rows of the result;
        defaults to ``adata.var_names``.

    Returns
    -------
    pandas.DataFrame
        One row per gene and one column per group, holding float64 means.
    """
    if layer is not None:
        getX = lambda x: x.layers[layer]
    else:
        getX = lambda x: x.X
    # Row labels of the result: the requested var column, or var_names by default.
    if gene_symbols is not None:
        new_idx = pd.Index(adata.var[gene_symbols])
    else:
        new_idx = adata.var_names

    grouped = adata.obs.groupby(group_key)
    out = pd.DataFrame(
        np.zeros((adata.shape[1], len(grouped)), dtype=np.float64),
        columns=list(grouped.groups.keys()),
        index=new_idx
    )

    for group, idx in grouped.indices.items():
        X = getX(adata[idx])
        # np.ravel flattens the 1 x n_genes result that matrix-like inputs return.
        out[group] = np.ravel(X.mean(axis=0, dtype=np.float64))
    return out

In [11]:
# Per-cell-type fraction of expressing cells for every gene, written to CSV.
percent = grouped_obs_percent(adataDown, group_key='celltype')
percent.to_csv('/nfs/team292/vl6/FetalReproductiveTract/CellPhoneDB/Mullerian_and_Wolffian_early/input/counts_percent.csv')

# Per-cell-type mean expression for every gene, written to CSV.
means = grouped_obs_mean(adataDown, group_key='celltype')
means.to_csv('/nfs/team292/vl6/FetalReproductiveTract/CellPhoneDB/Mullerian_and_Wolffian_early/input/counts_mean.csv')

# Save meta

In [12]:
# Two-column metadata table: cell barcode (used as the index) and its cell type.
df_meta = pd.DataFrame(
    {
        'Cell': adataDown.obs.index.tolist(),
        'cell_type': adataDown.obs['celltype'].tolist(),
    }
).set_index('Cell')
df_meta.to_csv(
    '/nfs/team292/vl6/FetalReproductiveTract/CellPhoneDB/Mullerian_and_Wolffian_early/input/meta.tsv',
    sep='\t',
)

# Save microenvironments

In [13]:
# One (microenvironment, cell type) pair per row.
mylist = [
    (environment, celltype)
    for environment, members in microenvironments.items()
    for celltype in members
]
df_microenvironments = pd.DataFrame(
    mylist,
    columns=['microenvironment', 'celltype'],
)
df_microenvironments

Unnamed: 0,microenvironment,celltype
0,Early,Müllerian Epi
1,Early,Wolffian Epi
2,Early,Müllerian Mese
3,Early,Wolffian/Mesonephros Mese


In [14]:
# Output column order: cell type (cluster) first, microenvironment second.
df_microenvironments = df_microenvironments.loc[:, ['celltype', 'microenvironment']]
df_microenvironments.to_csv(
    '/nfs/team292/vl6/FetalReproductiveTract/CellPhoneDB/Mullerian_and_Wolffian_early/input/microenvironments.tsv',
    sep='\t',
    index=False,
)

## Save pre-calculated DEGs

We previously estimated DEGs for each cell type within each lineage (hierarchical analysis).
Here we will load each DEG analysis (one per lineage) and assemble a single file with the upregulated genes in the cell types of interest.

In [15]:
# The file is read as tab-separated (sep='\t') despite its .csv extension; the first row is the header.
DEGs = pd.read_csv('/nfs/team292/vl6/FetalReproductiveTract/pre_9pcw_mullerian_wolffian_DEGs.csv', sep = '\t',header=0)
DEGs.head()

Unnamed: 0,p_val,avg_log2FC,pct.1,pct.2,p_val_adj,cluster,gene
0,0.0,1.292841,0.872,0.103,0.0,Müllerian Epi,PDLIM1
1,0.0,1.202645,0.93,0.133,0.0,Müllerian Epi,CDH2
2,0.0,1.139028,0.781,0.084,0.0,Müllerian Epi,PNOC
3,0.0,1.120034,0.854,0.303,0.0,Müllerian Epi,MAP1B
4,0.0,1.020191,0.596,0.067,0.0,Müllerian Epi,PCP4


In [16]:
# Filter significant DEGs: adjusted p-value below 0.01, log2 fold change above 0.5,
# and cluster among the cell types of interest.
cond1 = DEGs['p_val_adj'] < 0.01
cond2 = DEGs['avg_log2FC'] > 0.5
# Vectorised membership test instead of a Python-level loop over rows.
cond3 = DEGs['cluster'].isin(celltypes_of_interest)
# Element-wise AND of the three boolean Series (all share DEGs' index).
mask = cond1 & cond2 & cond3
fDEGs = DEGs[mask]

In [17]:
# Print the number of retained DEGs per cluster, then preview the filtered table.
print(fDEGs['cluster'].value_counts())
fDEGs.head()

cluster
Wolffian Epi                 183
Wolffian/Mesonephros Mese    107
Müllerian Epi                 91
Müllerian Mese                72
Name: count, dtype: int64


Unnamed: 0,p_val,avg_log2FC,pct.1,pct.2,p_val_adj,cluster,gene
0,0.0,1.292841,0.872,0.103,0.0,Müllerian Epi,PDLIM1
1,0.0,1.202645,0.93,0.133,0.0,Müllerian Epi,CDH2
2,0.0,1.139028,0.781,0.084,0.0,Müllerian Epi,PNOC
3,0.0,1.120034,0.854,0.303,0.0,Müllerian Epi,MAP1B
4,0.0,1.020191,0.596,0.067,0.0,Müllerian Epi,PCP4


### Format table to match cellphoneDB format

In [18]:
# Output column order: cluster first, gene second, followed by the statistics.
fDEGs = fDEGs.loc[
    :,
    ['cluster', 'gene', 'p_val_adj', 'p_val', 'avg_log2FC', 'pct.1', 'pct.2'],
]
fDEGs.to_csv(
    '/nfs/team292/vl6/FetalReproductiveTract/CellPhoneDB/Mullerian_and_Wolffian_early/input/DEGs_upregulated_genes.tsv',
    sep='\t',
    index=False,
)