## Müllerian duct differentiation - epithelium

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import scrublet as scr
import seaborn as sns
import scipy.stats
import anndata
import os

import scipy as scipy
import scipy as sp
import pickle as pkl
import matplotlib.pyplot as plt
import re
from collections import defaultdict
from statsmodels.nonparametric.smoothers_lowess import lowess
from numpy import asarray as ar
from collections import Counter
import networkx as nx
import igraph
import glob

In [None]:
# Record the anndata version used when this notebook was run.
print(anndata.__version__)

In [None]:
# Load the input AnnData from a hard-coded path; adjust it when running on another filesystem.
# NOTE(review): the variable is called `females_post9pcw` while the file is named
# `post_10pcw_females` — confirm which stage cut-off the object actually reflects.
females_post9pcw = sc.read('/nfs/team292/vl6/FetalReproductiveTract/post_10pcw_females.20240326.h5ad')
# Displayed as the cell output: (number of cells, number of genes).
females_post9pcw.shape

In [None]:
# Global scanpy/matplotlib plotting defaults for the whole notebook:
# 80 dpi on screen, 150 dpi for saved figures, 6x6 panels, saved figures written as PDF.
sc.set_figure_params(scanpy=True, dpi=80, dpi_save=150, frameon=True, vector_friendly=True, 
                         fontsize=14, figsize=(6,6), color_map=None, format='pdf', facecolor=None, transparent=False)

In [None]:
import matplotlib as mpl
# Embed fonts as Type 42 (TrueType) in PDFs written by matplotlib.
mpl.rcParams['pdf.fonttype'] = 42

In [None]:
# UMAP of the full loaded object, coloured by the existing `celltype` annotation.
sc.pl.umap(females_post9pcw, color="celltype")

In [None]:
# Same embedding, highlighting only the three epithelial cell types that the next cell keeps.
sc.pl.umap(females_post9pcw, color="celltype", groups = ['FallopianTube_Epithelium', 
                'Uterus/Cervix_Epithelium', 'MüllerianVagina_Epithelium'
                                                                         ])

In [None]:
# Keep only the cells annotated as one of the three epithelial cell types.
females_post9pcw = females_post9pcw[
    females_post9pcw.obs['celltype'].isin([
        'FallopianTube_Epithelium',
        'Uterus/Cervix_Epithelium',
        'MüllerianVagina_Epithelium',
    ]).values
]

In [None]:
# Check the subset: only the three selected cell types should remain on the UMAP.
sc.pl.umap(females_post9pcw, color="celltype")

In [None]:
import anndata
# Start a fresh AnnData from the `.raw` matrix and its gene table, carrying over the per-cell
# metadata only; embeddings and other derived slots of `females_post9pcw` are not copied.
# NOTE(review): presumably `.raw` holds unnormalised counts (a count-based gene filter is
# applied later) — confirm against how the input file was produced.
adata = anndata.AnnData(X = females_post9pcw.raw.X, obs = females_post9pcw.obs, var = females_post9pcw.raw.var)
adata

In [None]:
# Keep only the cells whose `phase` annotation equals 'G1'.
adata = adata[(adata.obs['phase'] == 'G1').values]

In [None]:
# Number of remaining cells per `stage_pcw` value, including cells with a missing stage.
adata.obs['stage_pcw'].value_counts(dropna = False)

In [None]:
# Random downsampling per cell type 
import random
from itertools import chain
def downsample(adata, labels, n, seed=None):
    """Randomly cap the number of cells per group at ``n``.

    Groups (distinct values of ``adata.obs[labels]``) holding more than ``n``
    cells are reduced to ``n`` cells drawn without replacement. Smaller
    groups, and cells whose label is missing, are kept in full.

    Parameters
    ----------
    adata
        AnnData-like object exposing ``obs``, ``obs_names`` and boolean-mask
        indexing on its first axis.
    labels
        Name of the ``adata.obs`` column that defines the groups.
    n
        Maximum number of cells to keep per group.
    seed
        Optional seed for a private random generator, making the draw
        reproducible. With the default ``None`` the global ``random`` state
        is used.

    Returns
    -------
    The subset of ``adata`` restricted to the retained cells, in their
    original order.
    """
    rng = random if seed is None else random.Random(seed)

    group_of_cell = adata.obs[labels]
    group_sizes = group_of_cell.value_counts()
    oversized = group_sizes.index[group_sizes.values > n]

    # Cells from groups at or below the cap are always kept.
    keep = set(adata.obs_names[~group_of_cell.isin(oversized).values])

    for cl in oversized:
        print(cl)
        # Sample from an ordered list: random.sample() rejects sets on
        # Python >= 3.11, and set iteration order is not reproducible
        # between runs, which would defeat any seeding.
        members = list(adata.obs_names[(group_of_cell == cl).values])
        keep.update(rng.sample(members, n))

    # Set membership keeps the final filter linear in the number of cells.
    return adata[adata.obs_names.isin(keep)]

In [None]:
## import reproductive tract utils functions
import sys
cwd = '/nfs/team292/vl6/Experiments/Utils/'
sys.path.append(cwd)

import reptract_utils
import reptract_genes

In [None]:
# Dimensions before gene filtering.
adata.shape

In [None]:
# Snapshot the current (unfiltered, untransformed) object in `.raw`; later cells rebuild
# new AnnData objects from `adata.raw.X` / `adata.raw.var`.
adata.raw = adata.copy()

In [None]:
# Drop genes with fewer than 10 total counts across the remaining cells.
sc.pp.filter_genes(adata, min_counts=10)

In [None]:
#adata = reptract_utils.per_gene_analysis(adata)

In [None]:
# Project helper from reptract_utils; judging by its name it normalises and log-transforms
# the matrix — see the utils module for the exact steps.
adata = reptract_utils.normalize_log_transform(adata)

In [None]:
# Project helper; presumably selects highly variable genes and computes PCA and UMAP
# (later cells read `obsm['X_pca']` and plot a UMAP) — confirm in reptract_utils.
adata = reptract_utils.hvgs_pca_umap(adata)

In [None]:
# Sample metadata and per-cell QC metrics on the new embedding.
sc.pl.umap(adata, color=['stage_pcw', 'sample', 
                         'n_genes', 'percent_mito', 'percent_ribo', 'donor',  'doublet_scores',
                        'celltype', 'phase'], ncols = 2, wspace = 0.3, 
          color_map = 'OrRd')

In [None]:
# Expression of a hand-picked gene panel, read from the processed matrix (`use_raw = False`).
# 'PAX8' is listed twice and is therefore plotted twice.
sc.pl.umap(adata, color = [ 'FOXL2','ITGA4', 'LGR5', 'TMEM176B', 'HOXC10', 'HOXA10', 'PTGER3', 'ACTA2', 'DES', 'MYH11',
                          'HOXA13', 'ISL1', 'TMEM163', 'MMP28', 'CTNNB1',  'WNT4', 'EPCAM', 
                          'CDH5', 'PTPRC', 'DLX5', 'TP63', 'DNAH12', 'DAPL1', 'ERP27', 
                          'PAX8', 'PAX8', 'FGF20', 'FOXA1', 'RUNX3', 'DCN', 'PDGFRA', 'RHEX', 'C3', 'LMO3', 
                          'HOXC6', 'HOXA7', 'HOXB6', 'HOXB8', 'HOXC8', 'HOXC4', 'ATF3', 'NR4A1', 
                          'LYPD1', 'PNOC', 'IFI6', 'IFI27', 'UPK3B', 'KLK11'], wspace = 0.15, use_raw = False,
           color_map = 'OrRd', ncols = 2)

In [None]:
# High-resolution Leiden clustering (resolution 5), shown next to doublet scores and gene counts.
# NOTE(review): the cluster ids hard-coded in the following cells are tied to this exact
# clustering run; re-check them whenever the input data or package versions change.
sc.tl.leiden(adata, resolution = 5)
sc.pl.umap(adata, color = ['leiden', 'doublet_scores', 'n_genes'], legend_loc = 'on data', color_map = 'OrRd')

In [None]:
# Highlight the six clusters that the next cell removes.
sc.pl.umap(adata, color = 'leiden', 
          groups = ['46', '58', '35', '33', '47', '56'])

In [None]:
# Drop every cell belonging to one of the six listed Leiden clusters.
adata = adata[~adata.obs['leiden'].isin(['46', '58', '35', '33', '47', '56']).values]

In [None]:
# Cells per stage after removing the clusters above.
adata.obs['stage_pcw'].value_counts()

In [None]:
# adata = downsample(adata, 'stage_pcw', 1000)

In [None]:
# Rebuild the object from the `.raw` snapshot for the retained cells, keeping the current
# `obs`; the previous gene filter, normalisation, PCA and UMAP are discarded.
adata = anndata.AnnData(X = adata.raw.X, var = adata.raw.var, obs = adata.obs)

In [None]:
# Repeat the preprocessing on the cleaned subset: snapshot into `.raw`, filter lowly
# detected genes (< 10 total counts), then run the two project helpers
# (normalisation/log-transform and HVG/PCA/UMAP, per their names — see reptract_utils).
adata.raw = adata.copy()
sc.pp.filter_genes(adata, min_counts=10)
#adata = reptract_utils.per_gene_analysis(adata)
adata = reptract_utils.normalize_log_transform(adata)
adata = reptract_utils.hvgs_pca_umap(adata)

In [None]:
# Metadata and QC metrics on the recomputed embedding.
sc.pl.umap(adata, color=['stage_pcw', 'sample', 
                         'n_genes', 'donor',  'doublet_scores',
                        'celltype'], ncols = 2, wspace = 0.3, 
          color_map = 'viridis')

In [None]:
# Stage and donor side by side, to inspect the embedding before the donor-wise Harmony correction below.
sc.pl.umap(adata, color=['stage_pcw', 'donor'], ncols = 2, wspace = 0.3, 
          color_map = 'viridis')

In [None]:
# Gene panel from the processed matrix; 'SIX1' is listed twice and is plotted twice.
sc.pl.umap(adata, color = ['DLX5', 'WNT7A', 'ERP27', 'SIX1', 'RHEX', 'C3', 'C1S', 'MUC5B', 
                            'PTGS2', 'STRA6', 'LTF', 'BPIFB1', 'KRT5', 'PLAT', 'LMO3', 'SIX1', 'ATF3', 'NR4A1', 
                           'PDGFRA'
                          ], wspace = 0.15, use_raw = False,
           color_map = 'OrRd', ncols = 4)

In [None]:
# adata = anndata.AnnData(X = adata.raw.X, var = adata.raw.var, obs = adata.obs)

In [None]:
# adata.raw = adata.copy()
# sc.pp.filter_genes(adata, min_counts=5)
# #adata = reptract_utils.per_gene_analysis(adata)
# adata = reptract_utils.normalize_log_transform(adata)
# adata = reptract_utils.hvgs_pca_umap(adata)

In [None]:
# sc.pl.umap(adata, color=['stage', 'donor'], ncols = 2, wspace = 0.3, 
#           color_map = 'viridis')

In [None]:
import rpy2.rinterface_lib.callbacks
import logging
# Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
import anndata2ri
# Turn on anndata2ri's automatic conversion for objects passed between Python and R.
anndata2ri.activate()
# Enable the %%R cell magic used by the R cells below.
%load_ext rpy2.ipython

In [None]:
# Extract PCA matrix and batch array
n_pcs = 16 # Number of leading principal components passed to Harmony and reused for the neighbour graph
pca = adata.obsm['X_pca'][:, 0:(n_pcs)]
batch = adata.obs['donor'] # Batch ID: the correction is performed per donor

In [None]:
%%R -i pca -i batch -o hem

# Harmony batch correction: `pca` and `batch` are passed in from Python, `hem` is returned to it.
library(harmony)
library(magrittr)
# Fixed seed so that the Harmony run is repeatable.
set.seed(1000)
# do_pca=FALSE: the input is already a PCA embedding and is used as given.
# NOTE(review): theta is the diversity penalty in the harmony documentation and theta=0
# switches it off — confirm that this weak setting is intended.
hem <- HarmonyMatrix(pca, batch, theta=0, lambda = 6, do_pca=FALSE, verbose = FALSE)  
# Convert to a data.frame for the transfer back to Python, where `.values` is read from it.
hem = data.frame(hem)

In [None]:
# Add harmony values to the anndata object --> overwriting PCA 
# The uncorrected PCA is no longer available in `obsm` after this assignment; `X_pca` now
# holds the Harmony output computed from the first `n_pcs` components only.
adata.obsm['X_pca'] = hem.values 

In [None]:
# Neighbour graph built on the corrected embedding now stored in X_pca, followed by a new UMAP.
sc.pp.neighbors(adata, n_pcs = n_pcs, random_state= 123)
sc.tl.umap(adata, min_dist=0.6, spread = 0.8)

In [None]:
# Gene panel on the batch-corrected UMAP, read from the processed matrix; 'PAX8' appears twice.
sc.pl.umap(adata, color = ['FOXL2','ITGA4', 'LGR5', 'TMEM176B', 'HOXA10', 'PTGER3', 'ACTA2', 'DES', 'MYH11',
                          'HOXA13', 'ISL1', 'TMEM163', 'MMP28', 'CTNNB1',  'WNT4', 'EPCAM', 
                          'CDH5', 'DLX5', 'TP63', 'DNAH12', 'DAPL1', 'ERP27', 
                          'PAX8', 'PAX8', 'FGF20', 'FOXA1', 'RUNX3', 'DCN', 'PDGFRA', 'CALCA', 'RXRG', 'UPK3B', 'CHL1', 'MUC5B'], wspace = 0.15, use_raw = False,
           color_map = 'OrRd', ncols = 2)

In [None]:
# Metadata and QC metrics on the batch-corrected UMAP; 'percent_mito' is listed twice and plotted twice.
sc.pl.umap(adata, color=['stage_pcw', 'sample', 
                         'n_genes', 'donor',  'doublet_scores', 'percent_mito',
                        'celltype', 'percent_mito'], ncols = 2, wspace = 0.3, 
          color_map = 'viridis')

In [None]:
# Coarse Leiden clustering (resolution 0.2); the resulting cluster ids are the keys of the
# `anno1` / `anno2` dictionaries defined further down.
sc.tl.leiden(adata, resolution = 0.2)
sc.pl.umap(adata, color = 'leiden', legend_loc = 'on data')

In [None]:
# Force-directed layout of the neighbour graph.
sc.tl.draw_graph(adata)

In [None]:
sc.pl.draw_graph(adata, color='leiden', legend_loc='on data')


In [None]:
sc.pl.draw_graph(adata, color='n_genes', color_map='OrRd')


In [None]:
# PAGA graph computed between the Leiden clusters.
sc.tl.paga(adata, groups='leiden')


In [None]:
# Plot the PAGA graph coloured by cluster and by three genes.
# NOTE(review): as far as the scanpy docs indicate, this plotting call also stores the node
# positions that `init_pos='paga'` relies on below — keep it before the re-layout.
sc.pl.paga(adata, color=['leiden', 'TMEM163', 'LGR5', 'ITGA4'])


In [None]:
# Recompute the force-directed layout, initialised from the PAGA layout.
sc.tl.draw_graph(adata, init_pos='paga')


In [None]:
sc.pl.draw_graph(adata, color='leiden', legend_loc='on data')


In [None]:
import anndata
# Separate object built from the `.raw` matrix and gene table plus the current `obs`
# (including the coarse `leiden` labels); it is the object handed to R for marker detection.
bdata = anndata.AnnData(X = adata.raw.X, var = adata.raw.var, obs = adata.obs)

In [None]:
# Same rpy2 / anndata2ri setup as the earlier R-setup cell, repeated so that this part of
# the notebook can be run on its own.
import rpy2.rinterface_lib.callbacks
import logging
# Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
import anndata2ri
anndata2ri.activate()
%load_ext rpy2.ipython

In [None]:
# Remove every metadata column except the cluster labels, in place, before passing `bdata` to R.
bdata.obs.drop(
    columns=[name for name in bdata.obs.columns if name != 'leiden'],
    inplace=True,
)

In [None]:
%%R -i bdata
# Print the object as received on the R side, to check the conversion.
bdata

In [None]:
%%R -o mrks

# Marker detection with SoupX::quickMarkers on the matrix stored in the "X" assay of `bdata`;
# the resulting table `mrks` is returned to Python.
library(SoupX)
counts <- assay(bdata, "X")
# Make sure the matrix carries the cell and gene names of `bdata`.
colnames(counts) <- colnames(bdata)
rownames(counts) <- rownames(bdata)
mrks = quickMarkers(counts, colData(bdata)$leiden, N = 100) # where clusters is the cell type assignment and 100 means I want the top 100 genes per cluster that pass the hypergeometric test

In [None]:
# Context for the marker inspection: stage, cluster and donor on the UMAP.
sc.pl.umap(adata, color = ['stage_pcw','leiden', 'donor'],  ncols = 2)

In [None]:
# First 15 genes listed by quickMarkers for cluster '0'; change the cluster id to inspect others.
# NOTE(review): the markers were computed on the unfiltered raw matrix, while `use_raw = False`
# looks genes up in the gene-filtered `adata`; a marker removed by `filter_genes` would not be found.
cluster = mrks[mrks['cluster'] == '0']['gene'].to_list()[0:15]
sc.pl.umap(adata, color = cluster, color_map = 'OrRd', ncols = 3, use_raw = False)

In [None]:
# Hand-picked gene panel from the processed matrix; 'LGR5', 'LYPD1' and 'PNOC' are listed
# twice and are therefore plotted twice.
sc.pl.umap(adata, color = ['MMP7', 'NDUFA4L2', 'NXPH4', 'OPRK1', 'DLX5', 'UCA1', 'MSX1', 'PAX8', 'RUNX3', 'RHEX', 'MUC5B', 'KRT5', 'FOXA1', 'TP63', 'WNT7A', 'LGR5', 'WNT11', 'FGF9', 
                          'SIX1', 'UNC5D', 'ATF3', 'NR4A1', 'KLF6', 'CREB5', 'KLF10', 'KLF4', 'NFKBIA', 'FGF20', 'PNOC', 'LYPD1', 
                          'LYPD1', 'PNOC', 'DMBT1', 'KLK8', 'TGM1',  'BCAT1',
                         'LTF', 'SCGB2A1', 'THSD4', 'RASD1', 'MUC6', 'DNAI1', 'NOTUM', 'CTNNA2',
                         'ST3GAL2', 'LGR5', 'OPRL1', 'TSPAN8'], color_map = 'OrRd', use_raw = False)

In [None]:
# Per-cell QC metrics on the same embedding.
sc.pl.umap(adata, color = ['n_genes', 'n_counts', 'percent_mito'], color_map = 'OrRd')

In [None]:
# Plot the requested genes that are present in the processed matrix.
# Membership is tested against `adata.var_names` (not `adata.raw.var_names`) because
# `use_raw = False` looks genes up in `adata.var`; a gene dropped by `filter_genes` but
# still present in `.raw` would otherwise raise a KeyError.
# NOTE(review): 'RARRES' and 'ACVR' look like incomplete gene symbols and are probably
# filtered out silently — confirm the intended genes.
sc.pl.umap(adata, color = [i for i in ['CALCA', 'GDNF', 'CHL1', 'JAG1', 'JAG2', 'WNT6', 'NCAM1', 'RARRES', 'CDH3', 'CDH2', 'BMP7', 'MDK', 'EFNA5', 'CMKLR1',
             'CADM1', 'ACVR', 'GATA6', 'WT1', 'BNC2', 'HOXD8', 'HOXC4', 'STAT1', 'LCORL', 'KLF8', 'FOSL2', 'MAFB'] if i in adata.var_names], ncols = 4, color_map = 'OrRd',
          use_raw = False)

In [None]:
# Plot the requested genes that are present in the processed matrix.
# Membership is tested against `adata.var_names` (not `adata.raw.var_names`) because
# `use_raw = False` looks genes up in `adata.var`; a gene dropped by `filter_genes` but
# still present in `.raw` would otherwise raise a KeyError.
# `dict.fromkeys` removes the repeated symbols ('CD47', 'NTF3') while keeping the order,
# so each gene is plotted once.
sc.pl.umap(adata, color = [i for i in dict.fromkeys(['GAS6', 'AR', 'CD47', 'NTF3', 'SOSTDC1', 'CD47', 'APOA1', 'NTF3', 'NR3C1', 'NOTCH2', 'FGF2', 'EFNB2',
           'DHCR24']) if i in adata.var_names], ncols = 4, color_map = 'OrRd',
          use_raw = False)

In [None]:
# Plot the requested genes that are present in the processed matrix.
# Membership is tested against `adata.var_names` (not `adata.raw.var_names`) because
# `use_raw = False` looks genes up in `adata.var`; a gene dropped by `filter_genes` but
# still present in `.raw` would otherwise raise a KeyError.
sc.pl.umap(adata, color = [i for i in ['YES1', 'TIAM1', 'SRC', 'ROCK1', 'VAV2', 'VAV3', 'CDC42', 'CRK', 'DNM1', 'GRB2', 'GRB7'] if i in adata.var_names], ncols = 4, color_map = 'OrRd',
          use_raw = False)

In [None]:
# Plot the requested genes that are present in the processed matrix.
# Membership is tested against `adata.var_names` (not `adata.raw.var_names`) because
# `use_raw = False` looks genes up in `adata.var`; a gene dropped by `filter_genes` but
# still present in `.raw` would otherwise raise a KeyError.
# `dict.fromkeys` removes the repeated symbols ('CD44', 'ALDH1A3') while keeping the order,
# so each gene is plotted once.
# NOTE(review): 'TCN', 'EPGB3' and 'IGFR1' look like misspelt or incomplete symbols
# (EPHB3? IGF1R is already listed) and are probably filtered out silently — confirm.
sc.pl.umap(adata, color = [i for i in dict.fromkeys(['ROR2', 'NTN4', 'FGFR2', 'ALDH1A3', 'VEGFA', 'SEMA3C', 'PTPRK', 'CD44', 'IGF1R', 'CD44', 'WNT7A',
          'WNT11', 'FGF9', 'TCN', 'SLIT2', 'LGALS9', 'LAMP1', 'LAMC1', 'HLA-C', 'LRP5', 'EGFR', 'CD74',
          'CD46', 'CD44', 'ACKR3', 'FZD10', 'ALDH1A1', 'ALDH1A2', 'ALDH1A3', 'HOXB5', 'HOXB6', 'HOXB7', 'HOXB8',
                                      'HOXB9', 'ATF2', 'ATF6', 'CUX2', 'BACH2', 'GATA6', 'WT1', 'ROR1', 'NRP1', 'NRP2', 'FGF7',
                                      'FGF2', 'EFNB2', 'EPHB4', 'EPGB3', 'EPHB1', 'EPHA4', 'IGF1', 'IGFR1', 'CDH2', 'BMP2', 'BMP4', 'BMP7']) if i in adata.var_names], ncols = 4, color_map = 'OrRd',
          use_raw = False)

In [None]:
# Reference plot of the coarse clusters that the dictionaries below annotate.
sc.pl.umap(adata, color = 'leiden', legend_loc = 'on data')

In [None]:
# Manual annotation of the coarse Leiden clusters, keyed by cluster id (as strings).
# anno1: high-resolution labels with an early/late suffix; clusters '2' and '5' share one label.
# NOTE(review): these keys are only valid for the clustering computed above — re-check them
# if the clustering is re-run.
anno1 = {'0' : 'Uterus/Cervix_Epithelium_early', '1' : 'FallopianTube_Epithelium_early', '2' : 'Uterus/Cervix_Epithelium_late', 
        '3' : 'FallopianTube_Epithelium_late', '4' : 'MüllerianVagina_Epithelium', 
        '5' : 'Uterus/Cervix_Epithelium_late'}

# anno2: low-resolution labels, i.e. the same assignment without the early/late suffix.
anno2 = {'0' : 'Uterus/Cervix_Epithelium', '1' : 'FallopianTube_Epithelium', '2' : 'Uterus/Cervix_Epithelium', 
        '3' : 'FallopianTube_Epithelium', '4' : 'MüllerianVagina_Epithelium', 
        '5' : 'Uterus/Cervix_Epithelium',}


In [None]:
# Translate cluster ids into labels; a cluster id missing from a dictionary would map to NaN.
adata.obs['epi_mullerian_lowres'] = adata.obs.leiden.map(anno2)
adata.obs['epi_mullerian_highres'] = adata.obs.leiden.map(anno1)

In [None]:
# Overview of stage, donor and both annotation levels; `save` makes scanpy also write the
# figure to disk with the given suffix.
sc.pl.umap(adata, color = ['stage_pcw', 'donor', 'epi_mullerian_lowres', 'epi_mullerian_highres'], 
          ncols = 2, save = '_epi_mullerian')

In [None]:
sc.pl.umap(adata, color = ['epi_mullerian_lowres'], 
         )

In [None]:
# Persist the annotated object to a hard-coded path.
# NOTE(review): this runs before the category reordering in the following cells, so the
# saved file does not carry that ordering.
adata.write('/nfs/team292/vl6/FetalReproductiveTract/mullerian_epi_late_post10pcw.h5ad')

In [None]:
# Fix the display order of the high-resolution labels.
# The list must contain exactly the labels assigned through `anno1`: `reorder_categories`
# raises a ValueError when the new categories differ from the existing ones.
adata.obs['epi_mullerian_highres'] = adata.obs['epi_mullerian_highres'].astype('category')
adata.obs['epi_mullerian_highres'] = adata.obs['epi_mullerian_highres'].cat.reorder_categories([
    'MüllerianVagina_Epithelium', 'FallopianTube_Epithelium_early', 'FallopianTube_Epithelium_late',
    'Uterus/Cervix_Epithelium_early', 'Uterus/Cervix_Epithelium_late'
])

In [None]:
# UMAP coloured by the high-resolution labels; `save` makes scanpy write the figure with this suffix.
sc.pl.umap(adata, color = ['epi_mullerian_highres'], save = '_epi_mull_highres'
         )

In [None]:
# Fix the display order of the low-resolution labels.
# The list must contain exactly the labels assigned through `anno2`: `reorder_categories`
# raises a ValueError when the new categories differ from the existing ones.
adata.obs['epi_mullerian_lowres'] = adata.obs['epi_mullerian_lowres'].astype('category')
adata.obs['epi_mullerian_lowres'] = adata.obs['epi_mullerian_lowres'].cat.reorder_categories([
    'MüllerianVagina_Epithelium', 'FallopianTube_Epithelium', 'Uterus/Cervix_Epithelium'
])

In [None]:
# UMAP coloured by the low-resolution labels; `save` makes scanpy write the figure with this suffix.
sc.pl.umap(adata, color = ['epi_mullerian_lowres'], save = '_epi_mull_lowres'
         )