In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import os
import sys
import scipy


def MovePlots(plotpattern, subplotdir):
    """Move saved figures whose file name contains ``plotpattern`` into a sub-folder.

    Parameters
    ----------
    plotpattern : str
        Fragment of the file name to match. It is wrapped in ``*`` wildcards,
        so any glob metacharacters it contains are honoured.
    subplotdir : str
        Folder, relative to ``sc.settings.figdir``, that receives the matching
        files. It is created (including parents) if it does not exist.

    Notes
    -----
    Uses the standard library instead of shelling out to ``mkdir``/``mv``, so
    paths containing spaces or shell metacharacters are handled safely and
    failures raise a Python exception instead of being silently ignored.
    """
    import glob
    import shutil

    figdir = str(sc.settings.figdir)
    target_dir = os.path.join(figdir, subplotdir)
    os.makedirs(target_dir, exist_ok=True)

    for src in glob.glob(os.path.join(figdir, '*' + plotpattern + '*')):
        # The destination folder itself can match the pattern; never move it into itself.
        if os.path.abspath(src) == os.path.abspath(target_dir):
            continue
        # Give the full destination path so an existing figure is overwritten, as `mv` would.
        shutil.move(src, os.path.join(target_dir, os.path.basename(src)))


sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.figdir = './genital_tubercle_figures_humans/'
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) yields small inline figures

sys.executable

In [None]:
adata = sc.read('/nfs/team292/vl6/FetalReproductiveTract/all.202401.h5ad')
adata.shape

In [None]:
## Load annotations from per-view integrations
# One CSV per integration view; the first column of each file becomes the row index.
annotation_csvs = (
    '/nfs/team292/vl6/FetalReproductiveTract/early_annots.csv',
    '/nfs/team292/vl6/FetalReproductiveTract/males_post10pcw_annots.csv',
    '/nfs/team292/vl6/FetalReproductiveTract/post_10pcw_females.csv',
)
early_annots, males_late_annots, females_late_annots = (
    pd.read_csv(csv_path, index_col=0) for csv_path in annotation_csvs
)
print(early_annots.shape, males_late_annots.shape, females_late_annots.shape)

In [None]:
# Stack the three per-view annotation tables (rows keep their original index).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
annots = pd.concat([early_annots, males_late_annots, females_late_annots])

In [None]:
annots.shape

In [None]:
# Copy the per-cell annotations onto the AnnData object, matching rows by obs name.
for annot_col in ('celltype', 'phase'):
    adata.obs[annot_col] = adata.obs_names.map(annots[annot_col])

In [None]:
adata.obs['phase'].value_counts()

In [None]:
# Keep only the cells whose `phase` annotation equals 'G1'.
in_g1 = (adata.obs['phase'] == 'G1').to_numpy()
adata = adata[in_g1]
adata.shape

In [None]:
adata.obs['stage_pcw'].hist()

## 1. Select the genital tubercle / external genitalia

In [None]:
list(adata.obs['celltype'].unique())

In [None]:
# Cell-type labels (as they appear in adata.obs['celltype']) retained for the
# genital tubercle / external genitalia analysis.
genital_celltypes = [
    'ExternalGenitalia_SubdermalPrepuce',
    'ExternalGenitalia_Glans',
    'Labial_Swelling',
    'ExternalGenitalia_Prepuce',
    'ExternalGenitalia_CorpusCavernosum',
    'ExternalGenitalia_CorpusSpongiosum',
    'LabioScrotal_Swelling',
    'Genital_Epidermis',
    'GenitalTubercle_Glans',
    'Urethral_Epithelium',
    'Preputial_Lamina',
    'GenitalTubercle_Prepuce',
    'GenitalTubercle_CorpusSpongiosum',
    'GenitalTubercle_CorpusCavernosum',
    'UrethralPlate',
    'Urethral_Plate',
]

In [None]:
# Restrict to cells annotated with one of the selected genital cell types.
is_genital = adata.obs['celltype'].isin(genital_celltypes).to_numpy()
adata = adata[is_genital]
adata.obs['celltype'].value_counts()

In [None]:
adata.shape

In [None]:
## Figure defaults: 7x7 in panels, 80 dpi inline, 150 dpi when saved, PDF output
## (this cell only configures plotting; no cells or clusters are removed here)
sc.set_figure_params(scanpy=True, dpi=80, dpi_save=150, 
                         frameon=True, vector_friendly=True, fontsize=14, figsize=[7,7], color_map=None, 
                         format='pdf', facecolor=None, transparent=False,)

In [None]:
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42

In [None]:
adata.obs['sample'].value_counts()

In [None]:
# Sample IDs (values of adata.obs['sample']) that are dropped in the next cell.
excl_samples = [
    'FCA_GND10375782',
    'HD_F_GON11389672',
    'HD_F_GON12893250',
    'HD_F_GON12893251_Hrv139',
    'HD_F_GON12944919_Hrv161',
    'HD_F_GON13657891_Hrv217',
    'HD_F_GON11389673',
    'HD_F_GON13074328',
    'HD_F_GON12944919_Hrv160',
    'HD_F_GON13174529',
    'HD_F_GON11282098',
    'HD_F_GON11282100',
    'HD_F_GON14449733_C167',
    'HD_F_GON12944919_C132',
    'HD_F_GON12129680',
    'HD_F_GON11151634',
    'HD_F_GON13174528',
    'HD_F_GON12873752',
    'HD_F_GON13247928_C152',
    'HD_F_GON13447622',
    'HD_F_GON13182001',
    'HD_F_GON12302487',
    'HD_F_GON12409311',
    'HD_F_GON13679794_C153',
]

In [None]:
# Drop every cell that comes from one of the excluded samples.
from_excluded_sample = adata.obs['sample'].isin(excl_samples).to_numpy()
adata = adata[~from_excluded_sample]

In [None]:
adata.obs[['sex', 'stage_pcw']].value_counts()

In [None]:
adata.obs['sample'].value_counts()

In [None]:
adata.obs['sex'].value_counts()

In [None]:
adata.X[20:30, 20:30].toarray()

## 2. Restrict to masculinisation programming window (8-14 PCW)

In [None]:
## import reproductive tract utils functions
cwd = '/nfs/team292/vl6/Experiments/Utils'
sys.path.append(cwd)

import reptract_utils
import reptract_genes

In [None]:
sc.set_figure_params(scanpy=True, dpi=80, dpi_save=150, 
                         frameon=True, vector_friendly=True, fontsize=14, figsize=[7,7], color_map=None, 
                         format='pdf', facecolor=None, transparent=False)


In [None]:
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42

In [None]:
path_to_data = '/nfs/team292/vl6/FetalReproductiveTract/'

In [None]:
import rpy2.rinterface_lib.callbacks
import logging
# Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
import anndata2ri
anndata2ri.activate()
%load_ext rpy2.ipython

In [None]:
%%R
library(SingleCellExperiment)
library(biomaRt)
library(data.table)
library(scater)
library(BiocParallel)

In [None]:
import anndata
mpw = adata.copy()

In [None]:
mpw = mpw[[8.0 <= x <= 14.0 for x in mpw.obs['stage_pcw']]]

In [None]:
mpw.shape

In [None]:
mpw.obs[['stage_pcw', 'donor']].value_counts()

In [None]:
mpw.obs[['sex']].value_counts()

In [None]:
sc.pp.filter_genes(mpw, min_counts = 10)

In [None]:
mpw.raw = mpw.copy()
# Normalize and log-transform 
mpw = reptract_utils.normalize_log_transform(mpw)
# HVGs, PCA, KNN & UMAP
mpw = reptract_utils.hvgs_pca_umap(mpw)

In [None]:
sc.pl.umap(mpw, color = ['phase', 'stage_pcw', 'sex', 'donor'], ncols = 2)
sc.pl.umap(mpw, color = 'celltype')

In [None]:
# Extract PCA matrix and batch array (inputs for the Harmony R cell below)
n_pcs = 16 # Number of leading principal components passed to Harmony and, later, to the neighbour graph
pca = mpw.obsm['X_pca'][:, 0:(n_pcs)]
batch = mpw.obs['donor'] # Batch ID: the donor each cell comes from

In [None]:
%%R -i pca -i batch -o hem

library(harmony)
library(magrittr)
set.seed(1000)
hem <- HarmonyMatrix(pca, batch, theta=0, lambda = 6, do_pca=FALSE, verbose = FALSE)  
hem = data.frame(hem)

In [None]:
# Add the Harmony output to the anndata object under its own key;
# the uncorrected PCA in obsm['X_pca'] is left untouched.
mpw.obsm['X_pca_harmony'] = hem.values 

In [None]:
sc.pp.neighbors(mpw, n_pcs = n_pcs, use_rep = 'X_pca_harmony', random_state= 123)
sc.tl.umap(mpw, min_dist=0.6, spread = 0.8)

In [None]:
sc.pl.umap(mpw, color = ['phase', 'stage_pcw', 'sex', 'donor'], ncols = 2)
sc.pl.umap(mpw, color = 'celltype')

In [None]:
# UMAP feature plots (one panel per marker gene), coloured from mpw.X rather than mpw.raw
sc.pl.umap(mpw, color = ['FOXA1', 'TP63', 'KRT5', 'DHRS2', 'UPK3A', 'PSCA', 
          'KRT1', 'KRTDAP', 'DSG1', 'KRT14', 'WNT3', 'WNT6','FRZB', 'MSX2', 'DLX5', 'FOXF1', 'SALL1', 'GRID2', 
          'SOX9', 'PRR16', 'ZFHX4', 'SHOX2', 'FAM162B', 'VGLL3',
          'GRIA4', 'IRX1', 'DKK1', 'FXYD7', 'MFAP5', 'GFRA1', 'FOXL2'], use_raw = False, color_map = 'OrRd')

In [None]:
sc.tl.leiden(mpw, resolution = 0.7)
sc.pl.umap(mpw, color = 'leiden')

In [None]:
sc.pl.umap(mpw, color = 'leiden', legend_loc = 'on data')

In [None]:
mpw = mpw[[i not in ['6', '16'] for i in mpw.obs['leiden']]]

In [None]:
# Manual annotation: Leiden cluster label (string) -> genital tubercle cell type.
# NOTE(review): the cluster numbers are only meaningful for the clustering run
# they were chosen from; if `sc.tl.leiden` yields different labels on a re-run,
# this mapping has to be revisited.
# NOTE(review): clusters '6' and '16' are filtered out of `mpw` in the preceding
# cell, so their entries below are not used by the `.map` call.
genitaltubercle_celltype = {
    '0' : 'Prepuce',
    '1' : 'Corpus Cavernosum', 
    '2' : 'Urethral Plate', 
    '3' : 'Glans', 
    '4' : 'Glans', 
    '5' : 'Corpus Spongiosum',
    '6' : 'Corpus Spongiosum', 
    '7' : 'Labio-Scrotal Swelling', 
    '8' : 'Subdermal Prepuce', 
    '9' : 'Corpus Cavernosum', 
    '10' : 'Prepuce', 
    '11' : 'Preputial Lamina',
    '12' : 'Glans', 
    '13' : 'Genital Epidermis', 
    '14' : 'Genital Epidermis', 
    '15' : 'Urethral Plate', 
    '16' : 'Glans'
    
    
}
# Clusters missing from the dict would map to NaN; the next cell's value_counts(dropna=False) exposes that.
mpw.obs['genitaltubercle_celltype'] = mpw.obs.leiden.map(genitaltubercle_celltype)

In [None]:
mpw.obs['genitaltubercle_celltype'].value_counts(dropna = False)

In [None]:
mpw.obs['genitaltubercle_celltype'] = mpw.obs['genitaltubercle_celltype'].astype('category')

In [None]:
# Display order of the cell-type categories (controls legend / dotplot row order).
celltype_display_order = [
    'Urethral Plate',
    'Genital Epidermis',
    'Preputial Lamina',
    'Glans',
    'Corpus Spongiosum',
    'Corpus Cavernosum',
    'Prepuce',
    'Subdermal Prepuce',
    'Labio-Scrotal Swelling',
]
mpw.obs['genitaltubercle_celltype'] = (
    mpw.obs['genitaltubercle_celltype']
    .astype('category')
    .cat.reorder_categories(celltype_display_order)
)

In [None]:
sc.set_figure_params(scanpy=True, dpi=80, dpi_save=150, 
                         frameon=True, vector_friendly=True, fontsize=14, figsize=[7,7], color_map=None, 
                         format='pdf', facecolor=None, transparent=False)

In [None]:
# Colour assigned to each genital tubercle cell type in the UMAP plots.
color_dict = {
    'Urethral Plate': 'purple',
    'Genital Epidermis': 'palevioletred',
    'Preputial Lamina': 'pink',
    'Glans': 'navy',
    'Corpus Spongiosum': 'deepskyblue',
    'Corpus Cavernosum': 'forestgreen',
    'Prepuce': 'yellowgreen',
    'Subdermal Prepuce': 'lightseagreen',
    'Labio-Scrotal Swelling': 'teal',
}

In [None]:
sc.pl.umap(mpw, color = 'genitaltubercle_celltype', 
           palette = color_dict,
           save = '_mpw.pdf')

In [None]:
sc.pl.umap(mpw, color = 'donor', 
           save = '_mpw_donor.pdf')

In [None]:
sc.pl.umap(mpw, color = 'stage_pcw', 
           save = '_mpw_stage_pcw.pdf')

In [None]:
sc.pl.umap(mpw, color = 'sex', palette = ['#F74F8A', '#0286FA'], 
           save = '_mpw_sex.pdf')

In [None]:
markers = {'Epithelial' : ['FOXA1', 'SHH','UPK3A', 
                          'PSCA',  'KRT1', 'KRTDAP', 'KRT14', 'WNT3', 
                          ], 
          'Mesenchymal' : ['DLX5', 'SP9', 'FOXF1', 'SALL1', 'GRID2', 'FOXL2', 
                          'PRR16', 'SOX9',  'SHOX2', 'SHOX', 'IRX1', 'GRIA4', 
                          'FXYD7', 'GFRA1']}
sc.pl.dotplot(mpw, var_names = markers, groupby = 'genitaltubercle_celltype', 
             standard_scale = 'var', color_map = 'OrRd', save = '_genitaltubercle_MPW.pdf')

In [None]:
mpw.raw.X[20:30, 20:30].toarray()

In [None]:
mpw.write('/nfs/team292/vl6/FetalReproductiveTract/genital_tubercle_mpw.h5ad')

In [None]:
mpw = sc.read('/nfs/team292/vl6/FetalReproductiveTract/genital_tubercle_mpw.h5ad')

In [None]:
mpw

## 3. Differential expression between sexes per cell type

In [None]:
import anndata
mpw_raw = anndata.AnnData(X = mpw.raw.X, var = mpw.raw.var, obs = mpw.obs)
mpw_raw.shape

In [None]:
# Drop three gene families, selected by gene-symbol prefix, from the raw-count object.

# remove mito genes ('MT-' prefix)
non_mito_genes = [name for name in mpw_raw.var_names if not name.startswith('MT-')]
mpw_raw = mpw_raw[:, non_mito_genes]

# remove ribo genes
# Match only the ribosomal-protein families RPS*/RPL*. The previous bare 'RP'
# prefix also discarded unrelated genes (e.g. RPA1, RPE65, RPTOR).
# NOTE(review): RPS6KA*/RPS6KB* kinases still match 'RPS' — confirm whether they should be kept.
non_ribo_genes = [name for name in mpw_raw.var_names if not name.startswith(('RPS', 'RPL'))]
mpw_raw = mpw_raw[:, non_ribo_genes]

# remove heat shock protein genes ('HSP' prefix)
non_hps_genes = [name for name in mpw_raw.var_names if not name.startswith('HSP')]
mpw_raw = mpw_raw[:, non_hps_genes]


In [None]:
import decoupler as dc

In [None]:
%%R -o results
library(biomaRt)
mart <- useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl")
results <- getBM(attributes = c("chromosome_name", "hgnc_symbol"),
            filters = "chromosome_name", values = "Y", mart = mart, verbose = TRUE)

head(results)

In [None]:
results.head()

In [None]:
results = results.set_index('hgnc_symbol')

In [None]:
mpw_raw.var['Y_chrom'] = mpw_raw.var_names.map(results['chromosome_name'].to_dict())

In [None]:
mpw_raw.var['Y_chrom'].value_counts(dropna = False)

In [None]:
ychrom = mpw_raw.var[mpw_raw.var['Y_chrom'] == 'Y'].index.to_list()

In [None]:
keep = [i for i in mpw_raw.var_names.to_list() if i not in ychrom]

In [None]:
len(keep)

In [None]:
mpw_raw = mpw_raw[:, keep]

In [None]:
mpw_raw.shape

In [None]:
mpw = mpw[:, keep]

In [None]:
mpw.layers["counts"] = mpw_raw.X.copy()

In [None]:
mpw.layers["counts"].toarray()[20:30, 20:30]

In [None]:
# Get pseudo-bulk profile
pdata = dc.get_pseudobulk(
    mpw,
    sample_col='donor',
    groups_col='genitaltubercle_celltype',
    layer='counts',
    mode='sum',
    min_cells=0,
    min_counts=0
)

In [None]:
dc.plot_psbulk_samples(pdata, groupby=['donor', 'genitaltubercle_celltype'], figsize=(12, 4))

In [None]:
# Get filtered pseudo-bulk profile
pdata = dc.get_pseudobulk(
    mpw,
    sample_col='donor',
    groups_col='genitaltubercle_celltype',
    layer='counts',
    mode='sum',
    min_cells=10,
    min_counts=1000
)
pdata

In [None]:
# Store raw counts in layers
pdata.layers['counts'] = pdata.X.copy()

# Normalize, scale and compute pca
sc.pp.normalize_total(pdata, target_sum=1e4)
sc.pp.log1p(pdata)
sc.pp.scale(pdata, max_value=10)
sc.tl.pca(pdata)

# Return raw counts to X
dc.swap_layer(pdata, 'counts', X_layer_key=None, inplace=True)

In [None]:
sc.pl.pca(pdata, color=['sex', 'genitaltubercle_celltype'], ncols=1, size=300)
sc.pl.pca_variance_ratio(pdata)

In [None]:
dc.get_metadata_associations(
    pdata,
    obs_keys = ['sex', 'genitaltubercle_celltype', 'psbulk_n_cells', 'psbulk_counts'],  # Metadata columns to associate to PCs
    obsm_key='X_pca',  # Where the PCs are stored
    uns_key='pca_anova',  # Where the results are stored
    inplace=True,
)

In [None]:
# Heatmap of PC scores next to the ANOVA results computed in the previous cell.
# (Title typo fixed: "Principle" -> "Principal".)
dc.plot_associations(
    pdata,
    uns_key='pca_anova',  # Summary statistics from the anova tests
    obsm_key='X_pca',  # where the PCs are stored
    stat_col='p_adj',  # Which summary statistic to plot
    obs_annotation_cols = ['sex', 'genitaltubercle_celltype'], # which sample annotations to plot
    titles=['Principal component scores', 'Adjusted p-values from ANOVA'],
    figsize=(7, 5),
    n_factors=10,
)

## Sexual dimorphism in the urethral plate

In [None]:
cs = pdata[pdata.obs['genitaltubercle_celltype'] == 'Urethral Plate'].copy()

In [None]:
dc.plot_filter_by_expr(cs, group='sex', min_count=50, min_total_count=60)

In [None]:
# Obtain genes that pass the thresholds
# NOTE(review): the diagnostic plot in the previous cell is drawn with
# min_total_count=60, but the filter here uses min_total_count=15 — confirm
# which value is intended so the plot reflects the filter actually applied.
genes = dc.filter_by_expr(cs, group='sex', min_count=50, min_total_count=15)

# Filter by these genes
cs = cs[:, genes].copy()
cs

In [None]:
# Import DESeq2
from pydeseq2.dds import DeseqDataSet, DefaultInference
from pydeseq2.ds import DeseqStats

In [None]:
# Build DESeq2 object
inference = DefaultInference(n_cpus=8)
dds = DeseqDataSet(
    adata=cs,
    design_factors='sex',
    ref_level=['sex', 'female'],
    refit_cooks=True,
    inference=inference,
)

In [None]:
# Compute LFCs
dds.deseq2()

In [None]:
# Extract contrast between males vs females
stat_res = DeseqStats(
    dds,
    contrast=["sex", 'male', 'female'],
    inference=inference,
)

In [None]:
# Compute Wald test
stat_res.summary()

In [None]:
# Extract results
results_df = stat_res.results_df
results_df

In [None]:
dc.plot_volcano_df(
    results_df,
    x='log2FoldChange',
    y='padj',
    lFCs_thr = 1.25,
    sign_thr = 0.05,
    top = 51,
    color_pos = 'deepskyblue',
    color_neg = 'pink',
    color_null='lightgray',
    lFCs_limit = 5,
    figsize=(7, 5), 
    save = '_urethralplateuroplakins_DE.pdf'
)

In [None]:
# Keep genes whose absolute log2 fold change exceeds 0.25 (either direction).
abs_lfc = results_df['log2FoldChange'].abs()
results_df = results_df[abs_lfc > 0.25]

In [None]:
results_df = results_df[results_df['padj'] < 0.05]

In [None]:
results_df.shape

In [None]:
pd.set_option('display.max_rows', 200)

In [None]:
results_df.sort_values('log2FoldChange')

In [None]:
results_df.to_csv('/nfs/team292/vl6/FetalReproductiveTract/genital_tubercle_urethral_plate_DE_025LogFC.csv')

## Sexual dimorphism in the corpus spongiosum

In [None]:
cs = pdata[pdata.obs['genitaltubercle_celltype'] == 'Corpus Spongiosum'].copy()

In [None]:
dc.plot_filter_by_expr(cs, group='sex', min_count=50, min_total_count=60)

In [None]:
# Obtain genes that pass the thresholds
# NOTE(review): the diagnostic plot in the previous cell is drawn with
# min_total_count=60, but the filter here uses min_total_count=15 — confirm
# which value is intended so the plot reflects the filter actually applied.
genes = dc.filter_by_expr(cs, group='sex', min_count=50, min_total_count=15)

# Filter by these genes
cs = cs[:, genes].copy()
cs

In [None]:
# Build DESeq2 object
inference = DefaultInference(n_cpus=8)
dds = DeseqDataSet(
    adata=cs,
    design_factors='sex',
    ref_level=['sex', 'female'],
    refit_cooks=True,
    inference=inference,
)

In [None]:
# Compute LFCs
dds.deseq2()

In [None]:
# Extract contrast between males vs females
stat_res = DeseqStats(
    dds,
    contrast=["sex", 'male', 'female'],
    inference=inference,
)

In [None]:
# Compute Wald test
stat_res.summary()


In [None]:
# Extract results
results_df = stat_res.results_df
results_df

In [None]:
results_df.loc['MAFB']

In [None]:
dc.plot_volcano_df(
    results_df,
    x='log2FoldChange',
    y='padj',
    lFCs_thr = 1.25,
    sign_thr = 0.05,
    top = 60,
    color_pos = 'deepskyblue',
    color_neg = 'pink',
    color_null='lightgray',
    lFCs_limit = 4,
    figsize=(8, 5), 
    save = '_corpusspongiosum_DE.pdf'
)

In [None]:
# Keep genes whose absolute log2 fold change exceeds 0.25 (either direction).
abs_lfc = results_df['log2FoldChange'].abs()
results_df = results_df[abs_lfc > 0.25]

In [None]:
results_df = results_df[results_df['padj'] < 0.05]

In [None]:
results_df.shape

In [None]:
results_df.sort_values('log2FoldChange')

In [None]:
results_df.to_csv('/nfs/team292/vl6/FetalReproductiveTract/genital_tubercle_corpus_spongiosum_DE_025LogFC.csv')

## DEGs for CellPhoneDB with Seurat 

In [None]:
path2data = '/nfs/team292/vl6/FetalReproductiveTract/'
adata_cpdb = sc.read(path2data + 'genital_tubercle_mpw.h5ad')
adata_cpdb.X.shape

In [None]:
adata_cpdb.obs['genitaltubercle_celltype'] = adata_cpdb.obs['genitaltubercle_celltype'].astype(str)
adata_cpdb.obs['sex'] = adata_cpdb.obs['sex'].astype(str)
# Separate between male and female 
adata_cpdb.obs['genitaltubercle_celltype+sex'] = adata_cpdb.obs['genitaltubercle_celltype'] + '_' + adata_cpdb.obs['sex']
sc.pl.umap(adata_cpdb, color = 'genitaltubercle_celltype+sex')

In [None]:
celltypes_of_interest = ['Urethral Plate_female', 'Urethral Plate_male', 'Corpus Spongiosum_female', 
                        'Corpus Spongiosum_male']

In [None]:
adata_cpdb = adata_cpdb[[ i in celltypes_of_interest for i in adata_cpdb.obs['genitaltubercle_celltype+sex'] ]]
sc.pl.umap(adata_cpdb, color=['genitaltubercle_celltype+sex'])
print(adata_cpdb.obs['genitaltubercle_celltype+sex'].value_counts())

In [None]:
adata_cpdb.obs['celltype'] = adata_cpdb.obs['genitaltubercle_celltype+sex'].copy()

In [None]:
adata_cpdb.raw.X[20:25, 40:45].toarray()

In [None]:
import anndata
adata_cpdb = anndata.AnnData(X = adata_cpdb.raw.X, var = adata_cpdb.raw.var, obs = adata_cpdb.obs)
adata_cpdb.shape

In [None]:
### Filter genes
# Gene families are selected by gene-symbol prefix (or explicit symbol list).

# remove mito genes ('MT-' prefix)
non_mito_genes = [name for name in adata_cpdb.var_names if not name.startswith('MT-')]
adata_cpdb = adata_cpdb[:, non_mito_genes]

# remove ribo genes
# Match only the ribosomal-protein families RPS*/RPL*. The previous bare 'RP'
# prefix also discarded unrelated genes (e.g. RPA1, RPE65, RPTOR).
# NOTE(review): RPS6KA*/RPS6KB* kinases still match 'RPS' — confirm whether they should be kept.
non_ribo_genes = [name for name in adata_cpdb.var_names if not name.startswith(('RPS', 'RPL'))]
adata_cpdb = adata_cpdb[:, non_ribo_genes]

# remove heat shock protein genes ('HSP' prefix)
non_hps_genes = [name for name in adata_cpdb.var_names if not name.startswith('HSP')]
adata_cpdb = adata_cpdb[:, non_hps_genes]

# remove haemoglobin genes 
non_haemo_genes = [name for name in adata_cpdb.var_names if name not in ['HBA1', 'HBA2', 'HBB', 'HBG1', 'HBG2']]
adata_cpdb = adata_cpdb[:, non_haemo_genes]

# Finally drop genes detected in fewer than 10 cells, then any gene left with no counts.
sc.pp.filter_genes(adata_cpdb, min_cells=10)
sc.pp.filter_genes(adata_cpdb, min_counts=1)

In [None]:
adata_cpdb.shape

In [None]:
%%R -o results
library(biomaRt)
mart <- useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl")
results <- getBM(attributes = c("chromosome_name", "hgnc_symbol"),
            filters = "chromosome_name", values = "Y", mart = mart, verbose = TRUE)

head(results)

In [None]:
results = results.set_index('hgnc_symbol')

In [None]:
adata_cpdb.var['Y_chrom'] = adata_cpdb.var_names.map(results['chromosome_name'].to_dict())

In [None]:
adata_cpdb.var['Y_chrom'].value_counts(dropna = False)

In [None]:
ychrom = adata_cpdb.var[adata_cpdb.var['Y_chrom'] == 'Y'].index.to_list()
keep = [i for i in adata_cpdb.var_names.to_list() if i not in ychrom]
len(keep)

In [None]:
adata_cpdb = adata_cpdb[:, keep]

In [None]:
adata_cpdb.shape

In [None]:
# Scale every cell to 10,000 total counts, then log1p-transform.
# `sc.pp.normalize_per_cell` is deprecated in scanpy; `sc.pp.normalize_total`
# is its replacement and is what the pseudobulk section of this notebook uses.
# Unlike the old call it neither drops zero-count cells nor adds obs['n_counts'];
# every obs column except 'celltype' is deleted in the next cell anyway.
sc.pp.normalize_total(adata_cpdb, target_sum=1e4)
sc.pp.log1p(adata_cpdb)

In [None]:
# Strip the object down to X plus the 'celltype' label before handing it to R.
del adata_cpdb.var
del adata_cpdb.obsm
del adata_cpdb.uns
obs_cols_to_drop = [obs_col for obs_col in adata_cpdb.obs.columns if obs_col != 'celltype']
for obs_col in obs_cols_to_drop:
    del adata_cpdb.obs[obs_col]

In [None]:
adata_cpdb_cs = adata_cpdb[[i in ['Corpus Spongiosum_female', 'Corpus Spongiosum_male'] for i in adata_cpdb.obs['celltype']]]
adata_cpdb_cs.obs['celltype'].value_counts(dropna = False)

In [None]:
%%R -i adata_cpdb_cs
adata_cpdb_cs

In [None]:
%%R -o DEGs_cs

library(Seurat)
so = as.Seurat(adata_cpdb_cs, counts = "X", data = "X")
Idents(so) = so$celltype

# # Normalize
# so <- NormalizeData(so, normalization.method = "LogNormalize", scale.factor = 10000)
# so <- ScaleData(so, features = rownames(so))

# Extract DEGs for each cell_type
DEGs_cs <- FindAllMarkers(so, 
                       verbose = F, 
                       only.pos = T, 
                       random.seed = 1, 
                       logfc.threshold = 0, 
                       min.pct = 0.1, 
                       return.thresh = 1)

DEGs_cs$cluster = factor(DEGs_cs$cluster, levels = sort(unique(DEGs_cs$cluster)) )
# topgenes_DEGs = subset(DEGs, qval <= 0.05) %>%
#   group_by(cluster) %>%
#   group_map(~ head(.x, 30L)$gene) %>%
#   unlist(.)

In [None]:
DEGs_cs

In [None]:
DEGs_cs.to_csv('/nfs/team292/vl6/FetalReproductiveTract/genital_tubercle_cs_DEGs.csv', index=False, sep='\t')

In [None]:
DEGs_cs.loc['JAG1']

In [None]:
adata_cpdb_up = adata_cpdb[[i in ['Urethral Plate_female', 'Urethral Plate_male'] for i in adata_cpdb.obs['celltype']]]
adata_cpdb_up.obs['celltype'].value_counts(dropna = False)

In [None]:
%%R -i adata_cpdb_up
adata_cpdb_up

In [None]:
%%R -o DEGs_up

library(Seurat)
so = as.Seurat(adata_cpdb_up, counts = "X", data = "X")
Idents(so) = so$celltype

# # Normalize
# so <- NormalizeData(so, normalization.method = "LogNormalize", scale.factor = 10000)
# so <- ScaleData(so, features = rownames(so))

# Extract DEGs for each cell_type
DEGs_up <- FindAllMarkers(so, 
                       verbose = F, 
                       only.pos = T, 
                       random.seed = 1, 
                       logfc.threshold = 0, 
                       min.pct = 0.1, 
                       return.thresh = 1)

DEGs_up$cluster = factor(DEGs_up$cluster, levels = sort(unique(DEGs_up$cluster)) )
# topgenes_DEGs = subset(DEGs, qval <= 0.05) %>%
#   group_by(cluster) %>%
#   group_map(~ head(.x, 30L)$gene) %>%
#   unlist(.)

In [None]:
DEGs_up.loc['NOTCH2']

In [None]:
DEGs_up.loc['CD46']

In [None]:
DEGs_up.to_csv('/nfs/team292/vl6/FetalReproductiveTract/genital_tubercle_up_DEGs.csv', index=False, sep='\t')

In [None]:
# Stack the corpus spongiosum and urethral plate marker tables.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
DEGs = pd.concat([DEGs_cs, DEGs_up])

In [None]:
DEGs.to_csv('/nfs/team292/vl6/FetalReproductiveTract/genital_tubercle_cs_up_DEGs.csv', index=False, sep='\t')

## ATAC visualisations

In [None]:
outDir = '/lustre/scratch126/cellgen/team292/vl6/pycistopic/male_genital_tubercle/'

In [None]:
atac_annots = pd.read_csv(outDir + 'male_genital_tubercle_embedding.csv', index_col = 0)
atac_annots.head()

In [None]:
# Create fake matrix
# Placeholder all-zero X so the ATAC embedding can be wrapped in an AnnData
# object. The row count is taken from `atac_annots` (passed as `obs` below)
# instead of a hard-coded 6258, so it cannot drift out of sync with the CSV.
# The column count (20000) must equal the number of rows in `fake_vars`.
fake_matrix = np.zeros([atac_annots.shape[0], 20000])
fake_matrix.shape

In [None]:
fake_vars = pd.DataFrame({'n_genes' : [5] * 20000})

In [None]:
import anndata 
adata = anndata.AnnData(X = fake_matrix, var = fake_vars, obs = atac_annots)

In [None]:
adata.obsm['X_umap'] = atac_annots[['tsne1', 'tsne2']].to_numpy()

In [None]:
sc.set_figure_params(scanpy=True, dpi=80, dpi_save=150, 
                         frameon=True, vector_friendly=True, fontsize=14, figsize=[7,7], color_map=None, 
                         format='pdf', facecolor=None, transparent=False)

In [None]:
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42

In [None]:
adata.obs.columns

In [None]:
sc.pl.umap(adata, color = 'Stage_PCW', cmap = 'viridis', save = '_atac_male_genital_tubercle_stage')

In [None]:
color_dict = {'Glans': 'navy',
 'CorpusCavernosum': 'forestgreen',
 'Prepuce': 'yellowgreen',
 'CorpusSpongiosum': 'deepskyblue',
 'SubdermalPrepuce': 'lightseagreen',
 'LabioScrotalSwelling': 'teal',
 'GenitalEpidermis': 'palevioletred',
 'UrethralPlate': 'purple',
 'PreputialLamina': 'pink'}

In [None]:
sc.pl.umap(adata, color = 'genital_lowres', 
           palette = color_dict, save = '_atac_male_genital_tubercle_celltype')

## 4. Prepare for cross-species analysis

## 1. Human

In [None]:
mpw

In [None]:
import rpy2.rinterface_lib.callbacks
import logging
# Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
import anndata2ri
anndata2ri.activate()
%load_ext rpy2.ipython

In [None]:
%%R
library(SingleCellExperiment)
library(biomaRt)
library(data.table)
library(scater)
library(BiocParallel)

In [None]:
mpw.shape

In [None]:
human_gene_names = mpw.var_names.to_list()

In [None]:
len(human_gene_names)

In [None]:
%%R -i human_gene_names -o mouse_one2one

human_gene_names <- unlist(human_gene_names)
ensembl <- useEnsembl(
  biomart = "genes",
  dataset = "hsapiens_gene_ensembl",
  version = 111, 
    mirror = 'www'
)

mouse_ort <- getBM(
  attributes = c("ensembl_gene_id", "external_gene_name", "mmusculus_homolog_ensembl_gene",
                 "mmusculus_homolog_associated_gene_name",
                 "mmusculus_homolog_orthology_type", "mmusculus_homolog_orthology_confidence"), 
  filters = 'external_gene_name',
  values = human_gene_names,
  mart = ensembl
)

mouse_ort <- as.data.table(mouse_ort)
mouse_one2one <- mouse_ort[mmusculus_homolog_orthology_type=="ortholog_one2one"]


In [None]:
# mouse_one2one = mouse_one2one[mouse_one2one['mmusculus_homolog_orthology_confidence'] == 1]

In [None]:
mouse_one2one.shape

In [None]:
mouse_one2one

## Subset the human anndata object to the common genes with mouse 

In [None]:
mouse = sc.read('/nfs/team292/vl6/Mouse_RepTract/Amato2021/Amato2021_mese_epi_annotated.h5ad')
mouse

In [None]:
common_genes_human_mouse = mouse_one2one[mouse_one2one['mmusculus_homolog_associated_gene_name'].isin(mouse.var_names.to_list())]

In [None]:
common_genes_human_mouse.shape

In [None]:
import anndata
mpw_orthologs = anndata.AnnData(X = mpw.raw.X, var = mpw.raw.var, obs = mpw.obs)
mpw_orthologs.shape

In [None]:
mpw_orthologs = mpw_orthologs[:, common_genes_human_mouse['external_gene_name'].to_list()]

In [None]:
mpw_orthologs.layers["raw_counts"] = mpw_orthologs.X.copy()

In [None]:
# Normalize and log-transform 
# Keep the helper's return value, as done for `mpw` earlier in this notebook:
# if the helper returns a new object rather than modifying its argument, the
# un-assigned call would leave X un-normalised and the layer below would hold
# raw counts.
mpw_orthologs = reptract_utils.normalize_log_transform(mpw_orthologs)
mpw_orthologs.layers["log_normalised_counts"] = mpw_orthologs.X.copy()

In [None]:
# Find HVGs 
sc.pp.highly_variable_genes(mpw_orthologs, batch_key = 'donor', n_top_genes = 4000)

In [None]:
mpw_orthologs.var['highly_variable'].value_counts()

### MOUSE

In [None]:
mouse_orthologs = anndata.AnnData(X = mouse.raw.X, var = mouse.raw.var, obs = mouse.obs)
mouse_orthologs.shape

In [None]:
mouse_orthologs = mouse_orthologs[:, common_genes_human_mouse['mmusculus_homolog_associated_gene_name'].to_list()]
mouse_orthologs.shape

In [None]:
mouse_orthologs.layers["raw_counts"] = mouse_orthologs.X.copy()

In [None]:
# Normalize and log-transform 
# Keep the helper's return value, as done for `mpw` earlier in this notebook:
# if the helper returns a new object rather than modifying its argument, the
# un-assigned call would leave X un-normalised and the layer below would hold
# raw counts.
mouse_orthologs = reptract_utils.normalize_log_transform(mouse_orthologs)
mouse_orthologs.layers["log_normalised_counts"] = mouse_orthologs.X.copy()

In [None]:
# Find HVGs 
sc.pp.highly_variable_genes(mouse_orthologs, batch_key = 'donor', n_top_genes = 4000)

In [None]:
mouse_orthologs.var['highly_variable'].value_counts()

## Intersect HVGs from human and mouse

In [None]:
human_hvgs = mpw_orthologs[:, mpw_orthologs.var['highly_variable']].var_names.to_list()

In [None]:
mouse_hvgs = mouse_orthologs[:, mouse_orthologs.var['highly_variable']].var_names.to_list()

In [None]:
len(human_hvgs), len(mouse_hvgs)

In [None]:
# Convert mouse genes to human 
mouse_hvgs_human = common_genes_human_mouse[common_genes_human_mouse['mmusculus_homolog_associated_gene_name'].isin(mouse_hvgs)]['external_gene_name'].to_list()

In [None]:
hvgs_tot = human_hvgs.copy()
len(hvgs_tot)

In [None]:
hvgs_tot.extend(mouse_hvgs_human)

In [None]:
len(hvgs_tot)

In [None]:
hvgs_tot = list(np.unique(hvgs_tot))

In [None]:
len(hvgs_tot)

In [None]:
path_to_data = '/nfs/team292/vl6/FetalReproductiveTract/'

In [None]:
# Save the combined HVG list as plain text, one gene per line (file is overwritten).
with open(path_to_data + 'human_mouse_common_hvgs.txt', 'w') as hvg_file:
    hvg_file.writelines("%s\n" % gene for gene in hvgs_tot)

In [None]:
# Human HVGs that are also HVGs in mouse (after mapping mouse symbols to human),
# kept in `human_hvgs` order. A set makes each membership test O(1) instead of
# scanning the whole mouse list for every human gene.
mouse_hvgs_human_lookup = set(mouse_hvgs_human)
inters_hvgs = [gene for gene in human_hvgs if gene in mouse_hvgs_human_lookup]

In [None]:
len(inters_hvgs)

In [None]:
mpw_orthologs = mpw_orthologs[:, inters_hvgs]
mpw_orthologs.shape

In [None]:
mpw_orthologs.write(path_to_data + "human_genital_tubercle_mpw_orthologs.h5ad")

In [None]:
path_to_data

In [None]:
mpw_orthologs

In [None]:
inters_hvgs_mouse = common_genes_human_mouse[common_genes_human_mouse['external_gene_name'].isin(inters_hvgs)]['mmusculus_homolog_associated_gene_name'].to_list()


In [None]:
len(inters_hvgs_mouse)

In [None]:
mouse_orthologs = mouse_orthologs[:, inters_hvgs_mouse]
mouse_orthologs.shape

In [None]:
mouse_orthologs.write(path_to_data + "mouse_genital_tubercle_mpw_orthologs.h5ad")

In [None]:
# Save ortholog matching table 
feature_table = common_genes_human_mouse[common_genes_human_mouse['external_gene_name'].isin(inters_hvgs)]

In [None]:
feature_table = feature_table[['mmusculus_homolog_associated_gene_name', 'external_gene_name']]

In [None]:
feature_table.head()

In [None]:
feature_table.shape

In [None]:
feature_table.to_csv('/nfs/team292/vl6/Mouse_RepTract/Amato2021/feature_table.csv', index = False, header = False)

In [None]:
sc.pl.umap(mpw, color = ['SCGB1A1', 'SPIB', 'PTPRD', 'sex'], color_map = 'OrRd', use_raw = False)