# Müllerian & Wolffian ducts analysis - prepare data for CellphoneDB

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import os
import sys

def MovePlots(plotpattern, subplotdir):
    """Move figures whose file name contains ``plotpattern`` into a sub-folder.

    Both the source files and the destination live under
    ``sc.settings.figdir``.

    Parameters
    ----------
    plotpattern : str
        Glob fragment matched anywhere in the file name (``*<plotpattern>*``).
    subplotdir : str
        Sub-directory of the figure directory; created if it does not exist.

    Notes
    -----
    Uses ``os``/``glob``/``shutil`` rather than shell command strings, so paths
    containing spaces or shell metacharacters work and filesystem errors raise
    instead of being silently ignored.
    """
    import glob
    import shutil

    figdir = str(sc.settings.figdir)
    target = os.path.join(figdir, subplotdir)
    os.makedirs(target, exist_ok=True)

    target_abs = os.path.abspath(target)
    # Escape only the directory part; plotpattern keeps its glob meaning,
    # as it did when it was expanded by the shell.
    pattern = os.path.join(glob.escape(figdir), '*' + plotpattern + '*')
    for path in glob.glob(pattern):
        path_abs = os.path.abspath(path)
        # Never try to move the destination (or one of its parents) into itself.
        if target_abs == path_abs or target_abs.startswith(path_abs + os.sep):
            continue
        # Explicit destination file name so an existing figure is overwritten,
        # matching what `mv` did.
        shutil.move(path, os.path.join(target, os.path.basename(path)))

# Scanpy session configuration.
sc.settings.figdir = './figures-cellphonedb/'  # directory scanpy writes figures to
sc.settings.verbosity = 1  # 0 = errors, 1 = warnings, 2 = info, 3 = hints
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) keeps inline figures small

# Show which Python interpreter this kernel is running.
sys.executable

# Prepare INPUT

## Define microenvironments

In [None]:
# Microenvironment name -> cell types allowed to interact within it.
microenvironments = {
    'Early': [
        'MüllerianDuct_Epithelium',
        'WolffianDuct_Epithelium',
        'MüllerianDuct_Mesenchyme',
        'WolffianDuct_Mesenchyme',
    ],
}

microenvironments

In [None]:
# Flatten the per-microenvironment lists into one list of cell types.
celltypes_of_interest = [
    celltype
    for members in microenvironments.values()
    for celltype in members
]
# Display the unique cell types.
set(celltypes_of_interest)

## Load raw counts AnnData

In [None]:
# Load the AnnData object used as CellphoneDB input.
# NOTE(review): the file name suggests pre-9 PCW Müllerian/Wolffian cells with raw counts in X — confirm.
adataDown = sc.read('/nfs/team292/vl6/FetalReproductiveTract/pre_9pcw_mullerian_wolffian_cellphoneDB.h5ad')
# Display the object summary as the cell output.
adataDown

In [None]:
# Inspect a 10 x 10 window of the expression matrix (rows and columns 20-29).
# .toarray() assumes X is a scipy sparse matrix.
# NOTE(review): presumably a sanity check that X holds raw counts — confirm.
adataDown.X[20:30, 20:30].toarray()

# Save normalised counts gene expression 

Generate the normalized counts

In [None]:
# Keep only genes detected in at least 10 cells.
sc.pp.filter_genes(adataDown, min_cells=10)
# `sc.pp.normalize_per_cell` is deprecated in favour of `sc.pp.normalize_total`.
# The old call also dropped cells with fewer than 1 count (its default
# min_counts=1), so that filter is applied explicitly to keep the same cell set.
sc.pp.filter_cells(adataDown, min_counts=1)
# Scale every cell to a total of 10,000 counts; no log-transform is applied here.
sc.pp.normalize_total(adataDown, target_sum=1e4)
adataDown.X.shape

In [None]:
# Number of cells per 'celltype' label; dropna=False also counts missing labels.
adataDown.obs['celltype'].value_counts(dropna = False)

In [None]:
# Number of cells per 'stage_pcw' value; dropna=False also counts missing values.
adataDown.obs['stage_pcw'].value_counts(dropna = False)

In [None]:
# Save the AnnData object to the CellphoneDB input folder as counts_normalised.h5ad.
adataDown.write('/nfs/team292/vl6/FetalReproductiveTract/CellPhoneDB/Mullerian_and_Wolffian_early/input/counts_normalised.h5ad')

In [None]:
def grouped_obs_percent(adata, group_key, layer=None, gene_symbols=None, threshold=0.01):
    """Return, for each group of cells, the fraction of cells expressing each gene.

    Parameters
    ----------
    adata
        AnnData-like object; ``obs``, ``var``, ``var_names``, ``shape``, ``X``,
        ``layers`` and row indexing are used.
    group_key : str
        Column of ``adata.obs`` used to group the cells.
    layer : str, optional
        Name of the layer to read; ``adata.X`` is used when ``None``.
    gene_symbols : str, optional
        Column of ``adata.var`` to use as row labels of the result;
        ``adata.var_names`` is used when ``None``.
    threshold : float, default 0.01
        A cell counts as expressing a gene when its value is strictly greater
        than this.

    Returns
    -------
    pandas.DataFrame
        Genes x groups table of fractions, rounded to 2 decimals.
    """
    # Row labels: a column of `var` when requested, else the var names.
    if gene_symbols is not None:
        new_idx = pd.Index(adata.var[gene_symbols])
    else:
        new_idx = adata.var_names

    grouped = adata.obs.groupby(group_key)
    columns = list(grouped.groups.keys())
    out = pd.DataFrame(
        np.zeros((adata.shape[1], len(columns)), dtype=np.float64),
        columns=columns,
        index=new_idx,
    )

    for group, idx in grouped.indices.items():
        subset = adata[idx]
        X = subset.layers[layer] if layer is not None else subset.X
        # Comparing (rather than overwriting X.data) works for both sparse and
        # dense matrices and leaves the underlying data untouched.
        n_expressing = np.asarray((X > threshold).sum(axis=0)).reshape(-1)
        out[group] = np.round(n_expressing / X.shape[0], 2)
    return out


def grouped_obs_mean(adata, group_key, layer=None, gene_symbols=None):
    """Return the mean expression of each gene within each group of cells.

    Parameters
    ----------
    adata
        AnnData-like object; ``obs``, ``var``, ``var_names``, ``shape``, ``X``,
        ``layers`` and row indexing are used.
    group_key : str
        Column of ``adata.obs`` used to group the cells.
    layer : str, optional
        Name of the layer to read; ``adata.X`` is used when ``None``.
    gene_symbols : str, optional
        Column of ``adata.var`` to use as row labels of the result;
        ``adata.var_names`` is used when ``None``.

    Returns
    -------
    pandas.DataFrame
        Genes x groups table of float64 means.
    """
    # Row labels: a column of `var` when requested, else the var names.
    if gene_symbols is not None:
        new_idx = pd.Index(adata.var[gene_symbols])
    else:
        new_idx = adata.var_names

    grouped = adata.obs.groupby(group_key)
    columns = list(grouped.groups.keys())
    out = pd.DataFrame(
        np.zeros((adata.shape[1], len(columns)), dtype=np.float64),
        columns=columns,
        index=new_idx,
    )

    for group, idx in grouped.indices.items():
        subset = adata[idx]
        X = subset.layers[layer] if layer is not None else subset.X
        # Column-wise mean over the group's cells, flattened to 1-D.
        out[group] = np.ravel(X.mean(axis=0, dtype=np.float64))
    return out

In [None]:
# Per-cell-type summaries of the expression matrix, one column per 'celltype'.
percent = grouped_obs_percent(adataDown, 'celltype')
means = grouped_obs_mean(adataDown, 'celltype')

# Both helpers already return DataFrames, so they are written out directly.
percent.to_csv('/nfs/team292/vl6/FetalReproductiveTract/CellPhoneDB/Mullerian_and_Wolffian_early/input/counts_percent.csv')
means.to_csv('/nfs/team292/vl6/FetalReproductiveTract/CellPhoneDB/Mullerian_and_Wolffian_early/input/counts_mean.csv')

# Save meta

In [None]:
# Meta table: one row per cell, indexed by cell name ('Cell'), with its label in 'cell_type'.
df_meta = pd.DataFrame(
    {'cell_type': list(adataDown.obs['celltype'])},
    index=pd.Index(list(adataDown.obs.index), name='Cell'),
)
df_meta.to_csv(
    '/nfs/team292/vl6/FetalReproductiveTract/CellPhoneDB/Mullerian_and_Wolffian_early/input/meta.tsv',
    sep='\t',
)

# Save microenvironments

In [None]:
# One (microenvironment, cell type) pair per member of each microenvironment.
mylist = [
    (env_name, member)
    for env_name, members in microenvironments.items()
    for member in members
]
df_microenvironments = pd.DataFrame(
    mylist,
    columns=['microenvironment', 'celltype'],
)
df_microenvironments

In [None]:
# Output column order: 1st = cell type (cluster), 2nd = microenvironment.
df_microenvironments = df_microenvironments.loc[:, ['celltype', 'microenvironment']]
df_microenvironments.to_csv(
    '/nfs/team292/vl6/FetalReproductiveTract/CellPhoneDB/Mullerian_and_Wolffian_early/input/microenvironments.tsv',
    index=False,
    sep='\t',
)

## Save pre-calculated DEGs

We previously estimated DEGs for each cell type within each lineage (hierarchical analysis).
Here we load the DEG analyses (one per lineage) and assemble a single file with the upregulated genes in the cell types of interest.

In [None]:
# Read the pre-computed DEG table. Despite the .csv extension it is parsed as
# tab-separated, with the first row used as the header.
DEGs = pd.read_csv('/nfs/team292/vl6/FetalReproductiveTract/pre_9pcw_mullerian_wolffian_DEGs.csv', sep = '\t',header=0)
# Preview the first rows.
DEGs.head()

In [None]:
# Filter significant DEGs
cond1 = DEGs['p_val_adj'] < 0.01 
cond2 = DEGs['avg_log2FC'] > 0.5
cond3 = [i in celltypes_of_interest for i in DEGs['cluster']]
mask = [all(tup) for tup in zip(cond1, cond2, cond3)]
fDEGs = DEGs[mask]

In [None]:
# Number of retained DEGs per cluster, followed by a preview of the filtered table.
print(fDEGs['cluster'].value_counts())
fDEGs.head()

### Format table to match cellphoneDB format

In [None]:
# 1st column = cluster; 2nd column = gene 
fDEGs = fDEGs[['cluster', 'gene', 'p_val_adj', 'p_val', 'avg_log2FC', 'pct.1', 'pct.2']] 
fDEGs.to_csv('/nfs/team292/vl6/FetalReproductiveTract/CellPhoneDB/Mullerian_and_Wolffian_early/input/DEGs_upregulated_genes.tsv', index=False, sep='\t')