# Müllerian duct emergence

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import scrublet as scr
import seaborn as sns
import scipy.stats
import anndata
import os

import scipy as scipy
import scipy as sp
import pickle as pkl
import matplotlib.pyplot as plt
import re
from collections import defaultdict
from statsmodels.nonparametric.smoothers_lowess import lowess
from numpy import asarray as ar
from collections import Counter
import networkx as nx
import igraph
import glob

In [None]:
print(anndata.__version__)

In [None]:
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42

In [None]:
adata = sc.read('/nfs/team292/vl6/FetalReproductiveTract/all.202401.h5ad')
adata.shape

In [None]:
early_annots = pd.read_csv('/nfs/team292/vl6/FetalReproductiveTract/early_annots.csv', index_col = 0)
males_late_annots = pd.read_csv('/nfs/team292/vl6/FetalReproductiveTract/males_post10pcw_annots.csv', index_col = 0)
females_late_annots = pd.read_csv('/nfs/team292/vl6/FetalReproductiveTract/post_10pcw_females.csv', index_col = 0)
print(early_annots.shape, males_late_annots.shape, females_late_annots.shape)

In [None]:
# Stack the three annotation tables row-wise, in the same order as before.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps each table's barcode index.
annots = pd.concat([early_annots, males_late_annots, females_late_annots])

In [None]:
adata.obs['celltype'] = adata.obs_names.map(annots['celltype'])

In [None]:
# Drop cells without an annotation: once the column is cast to str, a missing
# label reads as the literal 'nan', which is what gets filtered out here.
adata.obs['celltype'] = adata.obs['celltype'].astype(str)
adata = adata[(adata.obs['celltype'] != 'nan').to_numpy()]

In [None]:
adata.shape

## Select coelomic epithelium, Müllerian duct epithelium and mesenchyme

In [None]:
# Restrict the object to the three populations named in the section heading.
adata = adata[adata.obs['celltype'].isin([
    'MüllerianDuct_Epithelium',
    'Coelomic_Epithelium',
    'MüllerianDuct_Mesenchyme',
]).to_numpy()]
adata.shape

In [None]:
adata.obs['celltype'].value_counts()

In [None]:
adata.obs['phase'] = adata.obs_names.map(annots['phase'])

In [None]:
# Keep only cells whose mapped cell-cycle phase is G1.
adata = adata[(adata.obs['phase'] == 'G1').to_numpy()]

In [None]:
coelepi = adata[[i == 'Coelomic_Epithelium' for i in adata.obs['celltype']]]
coelepi_remove = coelepi[[i > 8.0 for i in coelepi.obs['stage_pcw']]]
coelepi_remove.shape

In [None]:
# Remove every barcode present in `coelepi_remove`. One vectorised membership
# test replaces the per-cell scan, which rebuilt the barcode list and searched
# it linearly for each cell (quadratic in the number of cells).
adata = adata[~adata.obs_names.isin(coelepi_remove.obs_names)]

In [None]:
adata.shape

In [None]:
adata.obs['stage_pcw'].value_counts()

In [None]:
adata = adata[[i < 9 for i in adata.obs['stage_pcw']]]

In [None]:
adata.shape

In [None]:
sc.pp.filter_genes(adata, min_counts = 10)

In [None]:
adata.shape

In [None]:
## import reproductive tract utils functions
import sys
cwd = '/nfs/team292/vl6/Experiments/Utils/'
sys.path.append(cwd)

import reptract_utils
import reptract_genes

In [None]:
sc.set_figure_params(scanpy=True, dpi=80, dpi_save=150, frameon=True, vector_friendly=True, 
                         fontsize=14, figsize=(7, 7), color_map=None, format='pdf', facecolor=None, transparent=False)

In [None]:
adata.raw = adata.copy()

In [None]:
#adata = reptract_utils.per_gene_analysis(adata)

In [None]:
adata = reptract_utils.normalize_log_transform(adata)

In [None]:
adata = reptract_utils.hvgs_pca_umap(adata)

In [None]:
sc.tl.umap(adata, min_dist=0.6, spread = 0.8)

In [None]:
sc.pl.umap(adata, color=['stage_pcw', 'sample', 
                         'sex', 'donor', 'n_genes', 'doublet_scores',
                        'celltype'], ncols = 2, wspace = 0.3, 
          color_map = 'OrRd')

In [None]:
markers = ['UPK3B', 'LRRN4', 'KLK11', 'HOXA10', 'HOXA11', 'EMX2', 'EMX1', 'SP7', 'MSX2', 'DLX5', 'AMHR2', 'HOXC4', 'HOXC5', 'CNTN1', 'MMP28', 'ETV4', 'LYPD1', 'TMEM163', 'RUNX2', 'PCSK2', 'RGS5', 'BCAN', 'AR', 
          'PTGES', 'EGFL6', 'TMEM176B', 'LGR5', 'CAVIN2', 'ITGBL1', 'SPOCK1', 'ANKFN1', 'ITGA4', 'SEMA3A', 'PRDM1','ESR1',
           'HOXA10', 'HOXA11', 'HOXC10',  'DACT2', 'CLDN1', 
          ]

sc.pl.umap(adata, color = markers, color_map = 'OrRd', ncols = 3, use_raw = False)

In [None]:
markers = ['WNT7A', 'FGF20', 'CALCA', 'GDNF', 'PNOC', 'LYPD1', 'RXRG', 'ERP27', 'DAPL1', 'FOXL2'
          ]

sc.pl.umap(adata, color = markers, color_map = 'OrRd', ncols = 3, use_raw = False)

In [None]:
sc.pl.umap(adata, color = [ 'MMP2', 'SP7', 'DLX5', 'DLX6', 'BCAN', 'WIF1', 'PCSK2', 'RGS5', 'BCAN', 
                          ], wspace = 0.15, use_raw = False,
           color_map = 'OrRd', ncols = 2)

In [None]:
sc.pl.umap(adata, color = [ 'NKD1', 'NOTUM', 'ERP27', 'DAPL1', 'LAMP5', 'UCA1', 'DLX5', 'HOXA7', 'HOXC5', 
                          'HOXA9', 'HOXA10', 'HOXC10', 'HOXA13', 'GAP43', 'TNC', 'RDH10', 
                          'EMX2'], wspace = 0.15, use_raw = False,
           color_map = 'OrRd', ncols = 3)

In [None]:
sc.tl.leiden(adata, resolution = 4)
sc.pl.umap(adata, color = 'leiden', legend_loc = 'on data')

In [None]:
sc.pl.umap(adata, color = 'leiden', legend_loc = 'on data',
          groups = ['60', '58', '50', '8', '0', '15',  '20', '53', '28', '2', '22'])

In [None]:
# Discard the leiden clusters listed below.
adata = adata[~adata.obs['leiden'].isin(
    ['60', '58', '50', '8', '0', '15',  '20', '53', '28', '2', '22']
).to_numpy()]

In [None]:
adata.shape

In [None]:
import rpy2.rinterface_lib.callbacks
import logging
# Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
import anndata2ri
anndata2ri.activate()
%load_ext rpy2.ipython

In [None]:
adata = anndata.AnnData(X = adata.raw.X, var = adata.raw.var, obs = adata.obs)

In [None]:
adata.raw = adata.copy()
sc.pp.filter_genes(adata, min_counts=10)
#adata = reptract_utils.per_gene_analysis(adata)
adata = reptract_utils.normalize_log_transform(adata)
adata = reptract_utils.hvgs_pca_umap(adata)

In [None]:
sc.pl.umap(adata, color=['stage_pcw', 'sample', 
                         'sex', 'donor', 'n_genes', 'doublet_scores',
                        'celltype'], ncols = 2, wspace = 0.3, 
          color_map = 'OrRd')

In [None]:
sc.pl.umap(adata, color = ['MMP28', 'TMEM163', 'BCAN', 'BRINP1', 'HOXA11', 'HOXA10', 'HOXC10', 'CNTN1', 
                          'ECM1', 'WIF1', 'SP7','NOTUM'], use_raw = False, legend_loc = 'on data', color_map = 'OrRd')

In [None]:
sc.tl.leiden(adata, resolution = 4)
sc.pl.umap(adata, color = 'leiden', legend_loc = 'on data')

In [None]:
sc.pl.umap(adata, color = 'leiden', legend_loc = 'on data',
          groups = ['12', '52', '38', '31'])

In [None]:
# Discard the leiden clusters listed below.
adata = adata[~adata.obs['leiden'].isin(['12', '52', '38', '31']).to_numpy()]

In [None]:
adata = anndata.AnnData(X = adata.raw.X, var = adata.raw.var, obs = adata.obs)

In [None]:
adata.raw = adata.copy()
sc.pp.filter_genes(adata, min_counts=10)
#adata = reptract_utils.per_gene_analysis(adata)
adata = reptract_utils.normalize_log_transform(adata)
adata = reptract_utils.hvgs_pca_umap(adata)

In [None]:
sc.pl.umap(adata, color=['stage_pcw', 'sample', 
                         'sex', 'donor', 'n_genes', 'doublet_scores',
                        'celltype'], ncols = 2, wspace = 0.3, 
          color_map = 'OrRd')

### Harmony integration

In [None]:
# Extract PCA matrix and batch array
n_pcs = 13 # Principal components used to perform dimensionality reduction 
pca = adata.obsm['X_pca'][:, 0:(n_pcs)]
batch = adata.obs['donor'] # Batch ID, in this case donor

In [None]:
%%R -i pca -i batch -o hem

library(harmony)
library(magrittr)
set.seed(1000)
hem <- HarmonyMatrix(pca, batch, theta=0, lambda = 6, do_pca=FALSE, verbose = FALSE)  
hem = data.frame(hem)

In [None]:
# Add harmony values to the anndata object --> overwriting PCA 
adata.obsm['X_pca'] = hem.values 

In [None]:
sc.pp.neighbors(adata, n_pcs = n_pcs, random_state= 123)
sc.tl.umap(adata, min_dist=0.6, spread = 0.8)

In [None]:
sc.pl.umap(adata, color=['stage_pcw', 'sample', 
                         'sex', 'donor', 'n_genes', 'doublet_scores',
                        'celltype'], ncols = 2, wspace = 0.3, 
          color_map = 'OrRd')

In [None]:
sc.pl.umap(adata, color = ['MMP28', 'TMEM163', 'BCAN', 'BRINP1', 'HOXA11', 'HOXA10', 'HOXC10', 'CNTN1', 
                          'ECM1', 'WIF1', 'SP7','NOTUM', 'NKD1', 'APOA1', 'CRTAC1', 'GAL', 'COX7A1', 'ADRA2A'], use_raw = False, legend_loc = 'on data', color_map = 'OrRd')

In [None]:
sc.pp.neighbors(adata, n_neighbors=8, use_rep='X_pca')

In [None]:
sc.tl.draw_graph(adata)

In [None]:
sc.tl.leiden(adata, resolution = 0.5)
sc.pl.umap(adata, color = 'leiden', legend_loc = 'on data')

In [None]:
adata = adata[[i not in ['11'] for i in adata.obs['leiden']]]

In [None]:
sc.tl.leiden(adata, resolution = 0.4)
sc.pl.umap(adata, color = 'leiden', legend_loc = 'on data')

In [None]:
adata = adata[[i not in ['8'] for i in adata.obs['leiden']]]

In [None]:
sc.tl.leiden(adata, resolution = 0.5)
sc.pl.umap(adata, color = 'leiden', legend_loc = 'on data')

In [None]:
adata = adata[[i not in ['10'] for i in adata.obs['leiden']]]

In [None]:
# sc.tl.leiden(adata, resolution = 0.4)
# sc.pl.umap(adata, color = 'leiden', legend_loc = 'on data')

In [None]:
# adata = adata[[i not in ['8'] for i in adata.obs['leiden']]]

In [None]:
sc.pl.draw_graph(adata, color='leiden', legend_loc='on data')


In [None]:
sc.pl.draw_graph(adata, color='leiden', legend_loc='on data')


In [None]:
sc.pl.draw_graph(adata, color='n_genes', color_map='OrRd')


In [None]:
sc.tl.paga(adata, groups='leiden')


In [None]:
sc.pl.paga(adata, color=['leiden', 'UPK3B', 'GDNF', 'SP7'])


In [None]:
sc.tl.draw_graph(adata, init_pos='paga')


In [None]:
sc.pl.draw_graph(adata, color='leiden')


In [None]:
sc.pl.draw_graph(adata, color = [ 'MMP2', 'SP7', 'DLX5', 'DLX6', 'BCAN', 'WIF1', 'PCSK2', 'RGS5', 'BCAN'], wspace = 0.15, use_raw = False,
           color_map = 'OrRd', ncols = 2)

In [None]:
adata

## Save as RDS object

In [None]:
mull_R = anndata.AnnData(X = adata.raw.X, obs = adata.obs, var = adata.raw.var)
del mull_R.obsp

In [None]:
mull_R

In [None]:
mito_genes = [name for name in mull_R.var_names if name.startswith('MT-')]


In [None]:
# Drop unwanted gene families in a single pass over the gene names:
#   'MT-'  mitochondrial genes
#   'RP'   ribosomal genes
#          NOTE(review): this prefix matches every symbol starting with 'RP',
#          not only ribosomal proteins -- confirm that is intended.
#   'HSP'  heat shock protein genes
# plus the haemoglobin genes listed by name.
excluded_prefixes = ('MT-', 'RP', 'HSP')
haemoglobin_genes = ('HBA1', 'HBA2', 'HBB', 'HBG1', 'HBG2')
retained_genes = [
    gene for gene in mull_R.var_names
    if not gene.startswith(excluded_prefixes) and gene not in haemoglobin_genes
]
mull_R = mull_R[:, retained_genes]

# Then drop genes seen in fewer than 10 cells or with fewer than 1 total count.
sc.pp.filter_genes(mull_R, min_cells=10)
sc.pp.filter_genes(mull_R, min_counts=1)

In [None]:
mull_R.obsm = adata.obsm

In [None]:
mull_R

In [None]:
sc.pl.umap(mull_R, color = 'leiden', legend_loc = 'on data')

In [None]:
mull_R.write("/nfs/team292/vl6/FetalReproductiveTract/mullerian_early.h5ad")

In [None]:
mull_R = sc.read("/nfs/team292/vl6/FetalReproductiveTract/mullerian_early.h5ad")
mull_R.shape

In [None]:
%%R -i mull_R
mull_R

In [None]:
%%R 
saveRDS(mull_R, "/nfs/team292/vl6/FetalReproductiveTract/mullerian_early.rds")

## Prepare data for CellPhoneDB analysis between Wolffian and Müllerian ducts

### Import Müllerian duct object

In [None]:
mull = sc.read("/nfs/team292/vl6/FetalReproductiveTract/mullerian_early.h5ad")
mull.shape

In [None]:
mull.obs['celltype'].value_counts()

In [None]:
### Remove coelomic epithelium

In [None]:
# Keep every cell whose label is not coelomic epithelium.
mull = mull[~mull.obs['celltype'].isin(['Coelomic_Epithelium']).to_numpy()]
mull.shape

In [None]:
### Select Wolffian duct cells < 9 PCW
# `adata` cannot be the source here: it was restricted to the coelomic /
# Müllerian populations earlier in this notebook, so filtering it for Wolffian
# labels returns no cells. Reload the complete object and re-attach the
# annotations before selecting.
full_adata = sc.read('/nfs/team292/vl6/FetalReproductiveTract/all.202401.h5ad')
full_adata.obs['celltype'] = full_adata.obs_names.map(annots['celltype'])
wolff = full_adata[full_adata.obs['celltype'].isin(
    ['WolffianDuct_Epithelium', 'WolffianDuct_Mesenchyme']).to_numpy()].copy()
# The subset was copied, so the full object can be released.
del full_adata
wolff.shape

In [None]:
wolff = wolff[[i < 9.0 for i in wolff.obs['stage_pcw']]]
wolff.shape

In [None]:
sc.pp.filter_genes(wolff, min_counts = 10)

In [None]:
wolff.raw = wolff.copy()
wolff = reptract_utils.normalize_log_transform(wolff)
wolff = reptract_utils.hvgs_pca_umap(wolff)
sc.tl.umap(wolff, min_dist=0.6, spread = 0.8)
sc.pl.umap(wolff, color=['stage_pcw', 'sample', 
                         'sex', 'donor', 'n_genes', 'doublet_scores',
                        'celltype'], ncols = 2, wspace = 0.3, 
          color_map = 'OrRd')

In [None]:
markers = ['UPK3B', 'LRRN4', 'KLK11', 'HOXA10', 'HOXA11', 'EMX2', 'EMX1', 'SP7', 'MSX2', 'DLX5', 'AMHR2', 'HOXC4', 'HOXC5', 'CNTN1', 'MMP28', 'ETV4', 'LYPD1', 'TMEM163', 'RUNX2', 'PCSK2', 'RGS5', 'BCAN', 'AR', 
          'PTGES', 'EGFL6', 'TMEM176B', 'LGR5', 'CAVIN2', 'ITGBL1', 'SPOCK1', 'ANKFN1', 'ITGA4', 'SEMA3A', 'PRDM1','ESR1',
           'HOXA10', 'HOXA11', 'HOXC10',  'DACT2', 'CLDN1', 'WNT9B', 'MARCH11', 'GATA3', 'KRT17', 
           'PDZK1', 'PDZK1IP1', 'ESR1', 'GLYAT', 'SPINK2', 'PLAC1', 'HOXA7', 'HOXA9', 'ADH1B', 'ALDH1A1'
          ]

sc.pl.umap(wolff, color = markers, color_map = 'OrRd', ncols = 3, use_raw = False)

In [None]:
wolff.shape

In [None]:
# Build an object from the stored raw matrix and the current cell metadata.
wolff_raw = anndata.AnnData(X = wolff.raw.X, obs = wolff.obs, var = wolff.raw.var)
# Clear the pairwise slot on the object just built. The original deleted it
# from `wolff` instead, which left `wolff_raw` untouched and dropped the
# pairwise data from the source object.
del wolff_raw.obsp

In [None]:
wolff_raw.shape

In [None]:
### Filter genes
# Drop unwanted gene families in a single pass over the gene names:
#   'MT-'  mitochondrial genes
#   'RP'   ribosomal genes
#          NOTE(review): this prefix matches every symbol starting with 'RP',
#          not only ribosomal proteins -- confirm that is intended.
#   'HSP'  heat shock protein genes
# plus the haemoglobin genes listed by name.
excluded_prefixes = ('MT-', 'RP', 'HSP')
haemoglobin_genes = ('HBA1', 'HBA2', 'HBB', 'HBG1', 'HBG2')
retained_genes = [
    gene for gene in wolff_raw.var_names
    if not gene.startswith(excluded_prefixes) and gene not in haemoglobin_genes
]
wolff_raw = wolff_raw[:, retained_genes]

# Then drop genes seen in fewer than 10 cells or with fewer than 1 total count.
sc.pp.filter_genes(wolff_raw, min_cells=10)
sc.pp.filter_genes(wolff_raw, min_counts=1)

In [None]:
wolff_raw.shape

### Concatenate Müllerian and Wolffian objects

In [None]:
mull.X[20:25, 40:45].toarray()

In [None]:
wolff_raw.X[20:25, 40:45].toarray()

In [None]:
mull_wolff = mull.concatenate(wolff_raw, join='outer', index_unique=None)
mull_wolff.X = mull_wolff.X.tocsr()
mull_wolff.shape

In [None]:
mull_wolff.obs['celltype'].value_counts(dropna = False)

In [None]:
## Downsample per cell type 
import random
from itertools import chain
def downsample(adata, labels, n, seed=None):
    """Cap every group of ``adata.obs[labels]`` at ``n`` randomly chosen cells.

    Groups with at most ``n`` cells are kept whole, as are cells whose label
    is missing (``value_counts`` does not count them, so they never belong to
    a group that gets downsampled).

    Parameters
    ----------
    adata
        Object exposing ``obs`` (a DataFrame), ``obs_names`` (a pandas Index)
        and boolean-mask row indexing, e.g. an AnnData.
    labels
        Name of the ``obs`` column holding the group labels.
    n
        Maximum number of cells to keep per group.
    seed
        Optional seed for a private random generator, making the draw
        reproducible. With the default ``None`` the module-level ``random``
        state is used, as before.

    Returns
    -------
    ``adata`` restricted to the retained cells, in their original order.
    """
    group_of_cell = adata.obs[labels]
    group_sizes = group_of_cell.value_counts()

    # Find clusters with > n cells
    cl2downsample = group_sizes.index[group_sizes.values > n]

    rng = random if seed is None else random.Random(seed)

    # Barcodes of every cell outside the oversized clusters are kept as-is.
    in_large_cluster = group_of_cell.isin(cl2downsample).to_numpy()
    samples = set(adata.obs_names[~in_large_cluster])

    # Randomly sample n cells from each oversized cluster.
    for cl in cl2downsample:
        print(cl)
        in_cluster = (group_of_cell == cl).to_numpy()
        # random.sample needs a sequence (sets are rejected since Python 3.11);
        # dict.fromkeys removes repeated barcodes while keeping a stable order
        # so that a given seed always yields the same draw.
        cl_barcodes = list(dict.fromkeys(adata.obs_names[in_cluster]))
        samples.update(rng.sample(cl_barcodes, n))

    # Set membership keeps the final filter linear in the number of cells.
    return adata[adata.obs_names.isin(samples)]

In [None]:
mull_wolff_downsampled = downsample(mull_wolff, 'celltype', 1713)

In [None]:
mull_wolff_downsampled.shape

In [None]:
mull_wolff_downsampled.write('/nfs/team292/vl6/FetalReproductiveTract/pre_9pcw_mullerian_wolffian_cellphoneDB.h5ad')

### Differential expression with Seurat

In [None]:
%%R -o results
library(biomaRt)
mart <- useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl")
results <- getBM(attributes = c("chromosome_name", "hgnc_symbol"),
            filters = "chromosome_name", values = "Y", mart = mart, verbose = TRUE)

head(results)

In [None]:
results = results.set_index('hgnc_symbol')

In [None]:
mull_wolff.var['Y_chrom'] = mull_wolff.var_names.map(results['chromosome_name'].to_dict())

In [None]:
mull_wolff.var['Y_chrom'].value_counts(dropna = False)

In [None]:
# Genes flagged as Y-linked, then every remaining gene name in original order.
ychrom = mull_wolff.var.index[(mull_wolff.var['Y_chrom'] == 'Y').to_numpy()].to_list()
keep = mull_wolff.var_names[~mull_wolff.var_names.isin(ychrom)].to_list()
len(keep)

In [None]:
mull_wolff = mull_wolff[:, keep]

In [None]:
mull_wolff.shape

In [None]:
# Scale each cell to 10,000 total counts, then apply log1p.
# NOTE(review): scanpy's documentation lists normalize_per_cell as deprecated
# in favour of normalize_total -- confirm before upgrading scanpy.
sc.pp.normalize_per_cell(mull_wolff, counts_per_cell_after=1e4)
sc.pp.log1p(mull_wolff)

In [None]:
mull_wolff_degs = mull_wolff[mull_wolff_downsampled.obs_names.to_list(), :]

In [None]:
mull_wolff_degs.shape

In [None]:
# Strip the object down to its matrix plus the 'celltype' column of obs.
del mull_wolff_degs.var, mull_wolff_degs.obsm, mull_wolff_degs.uns
for column in [c for c in mull_wolff_degs.obs.columns if c != 'celltype']:
    del mull_wolff_degs.obs[column]

In [None]:
mull_wolff_degs

In [None]:
%%R -i mull_wolff_degs
mull_wolff_degs

In [None]:
%%R -o DEGs

library(Seurat)
so = as.Seurat(mull_wolff_degs, counts = "X", data = "X")
Idents(so) = so$celltype

# # Normalize
# so <- NormalizeData(so, normalization.method = "LogNormalize", scale.factor = 10000)
# so <- ScaleData(so, features = rownames(so))

# Extract DEGs for each cell_type
DEGs <- FindAllMarkers(so, 
                       verbose = F, 
                       only.pos = T, 
                       random.seed = 1, 
                       logfc.threshold = 0, 
                       min.pct = 0.1, 
                       return.thresh = 1)

DEGs$cluster = factor(DEGs$cluster, levels = sort(unique(DEGs$cluster)) )
# topgenes_DEGs = subset(DEGs, qval <= 0.05) %>%
#   group_by(cluster) %>%
#   group_map(~ head(.x, 30L)$gene) %>%
#   unlist(.)

In [None]:
DEGs

In [None]:
DEGs.to_csv('/nfs/team292/vl6/FetalReproductiveTract/pre_9pcw_mullerian_wolffian_DEGs.csv', index=False, sep='\t')

### Plot cellphoneDB results

In [None]:
mull_wolff.X[20:25, 40:45].toarray()

In [None]:
sc.pl.dotplot(mull_wolff, var_names = ['ALDH1A1', 'WNT9B', 'NTN4', 'KITLG', 
                                      'CXCL12'], 
             standard_scale = 'var', groupby = 'celltype', swap_axes = True, 
             save = '_wolffian_ligands')

In [None]:
# Dot plot of the listed genes per cell type, each gene scaled across groups.
# NOTE(review): these genes look like receptor partners of the ligands plotted
# in the '_wolffian_ligands' figure, yet this figure is saved with a
# '_mullerian_ligands' suffix -- confirm the intended file name.
sc.pl.dotplot(mull_wolff, var_names = ['RXRG', 'RARG', 'FZD10', 'LRP5',
                                    'UNC5B',  'KIT', 
                                      'CXCR4'], 
             standard_scale = 'var', groupby = 'celltype', swap_axes = True, 
             save = '_mullerian_ligands')