In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import os
import sys
import scipy


def MovePlots(plotpattern, subplotdir):
    """Move saved figures whose name contains ``plotpattern`` into a sub-folder.

    Parameters
    ----------
    plotpattern : str
        Text that must appear in the file name; shell-style wildcards are
        honoured. Only entries directly inside ``sc.settings.figdir`` are
        considered.
    subplotdir : str
        Sub-directory of ``sc.settings.figdir`` that receives the files. It is
        created (including parents) when missing.

    Notes
    -----
    The move is done with the standard library instead of a shell command, so
    directories or file names containing spaces or shell metacharacters are
    handled correctly.
    """
    # Local imports keep the helper self-contained.
    import glob
    import shutil

    figdir = str(sc.settings.figdir)
    target_dir = os.path.normpath(figdir + '/' + subplotdir)
    os.makedirs(target_dir, exist_ok=True)

    for path in glob.glob(os.path.join(figdir, '*' + plotpattern + '*')):
        # The destination folder itself may match the pattern; never move it into itself.
        if os.path.abspath(path) == os.path.abspath(target_dir):
            continue
        # An explicit destination file name lets an existing file be replaced, like `mv`.
        shutil.move(path, os.path.join(target_dir, os.path.basename(path)))


# Global scanpy settings for this notebook
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
# Directory used by scanpy when a plot is saved through its `save=` argument
sc.settings.figdir = './genital_tubercle_figures/'
# Print the versions of the packages in use, for provenance
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) yields small inline figures

# Last expression of the cell: shows the path of the Python interpreter running the kernel
sys.executable

In [None]:
path_to_data = '/nfs/team292/vl6/Mouse_RepTract/Amato2021/'

In [None]:
# Sample sheet: 3 stages x 2 sexes x 2 replicates = 12 samples, with consecutive
# GEO accessions starting at GSM5324563 and one donor per sample.
# The long sample name (accession + stage + sex + replicate) becomes the index.
_design = [(age, sex, rep)
           for age in (14.5, 16.5, 18.5)
           for sex in ('male', 'female')
           for rep in (1, 2)]
metadata = pd.DataFrame(
    [
        {
            'sample': 'GSM{:d}'.format(5324563 + k),
            'long_sample': 'GSM{:d}_E{}_{}_genitalia_{:d}'.format(5324563 + k, age, sex, rep),
            'donor': 'donor{:d}'.format(k + 1),
            'age': age,
            'sex': sex,
            'genotype': 'Sf1cre_Rosa-tdTomato_Runx1-eGFP',
        }
        for k, (age, sex, rep) in enumerate(_design)
    ]
).set_index('long_sample')
metadata

In [None]:
## import reproductive tract utils functions
cwd = '/nfs/team292/vl6/Experiments/Utils'
sys.path.append(cwd)

import reptract_utils
import reptract_genes

In [None]:
# Load every sample listed in the sample sheet, apply a first lenient QC,
# annotate the cells with the sample-level metadata and draw per-sample QC plots.
holder = []
for sample in metadata.index:
    print(sample)
    # Load 10x data as AnnData (files in the sample folder carry the '<sample>_' prefix)
    sample_adata = sc.read_10x_mtx(path_to_data + sample, prefix = sample + '_', cache=True)
    holder.append(sample_adata)

    # Set names of observations as sample + _ + barcode (text after the first '-' is dropped)
    sample_adata.obs_names = [sample + '_' + i.split('-')[0] for i in sample_adata.obs_names]
    # Filter genes expressed in fewer than 3 cells
    sc.pp.filter_genes(sample_adata, min_cells=3)
    # Filter cells with fewer than 500 or more than 10000 genes expressed
    sc.pp.filter_cells(sample_adata, min_genes=500)
    sc.pp.filter_cells(sample_adata, max_genes=10000)
    # Add in metadata.
    # NOTE: the sample sheet has its own 'sample' column (short GSM accession), so the
    # long sample name assigned here is replaced by the accession in the loop below.
    sample_adata.obs['sample'] = sample
    for val in metadata.columns:
        sample_adata.obs[val] = metadata.loc[sample, val]
    # Extract mitochondrial genes (mouse symbols start with 'mt-')
    mito_genes = [name for name in sample_adata.var_names if name.startswith('mt-')]
    # Per-cell totals. Summing a sparse matrix returns an (n_cells, 1) np.matrix, so the
    # result is flattened to a plain 1-D array before being stored as an obs column.
    total_counts = np.asarray(sample_adata.X.sum(axis=1)).ravel()
    mito_counts = np.asarray(sample_adata[:, mito_genes].X.sum(axis=1)).ravel()
    # For each cell, fraction of counts in mito genes vs. all genes
    sample_adata.obs['percent_mito'] = mito_counts / total_counts
    # Total counts per cell
    sample_adata.obs['n_counts'] = total_counts
    print('Total number of cells: {:d}'.format(sample_adata.n_obs))
    print('Total number of genes: {:d}'.format(sample_adata.n_vars))
    # Make diagnostic plots for per-sample quality
    reptract_utils.qc_plots_sample(sample_adata, sample, '/home/jovyan/RepTract/GENITAL TUBERCLE/genital_tubercle_figures/')

## Preprocessing

In [None]:
# Report the sample id and (n_cells, n_genes) of every loaded object
for h5 in holder:
    # obs is indexed by barcode strings: use .iloc for the first row instead of the
    # deprecated positional fallback of Series[0]
    print(h5.obs['sample'].iloc[0], h5.shape)

In [None]:
# Filtering 
# Second, stricter QC pass applied to each per-sample object before merging.
# NOTE: the loop variable is called `adata`, the same name later used for the merged object.
holder_filtered = []
for adata in holder:
    # Keep cells with at least 1500 detected genes (modifies the object stored in `holder`)
    sc.pp.filter_cells(adata, min_genes = 1500) 
    # Keep genes detected in at least 3 of the remaining cells (also in place)
    sc.pp.filter_genes(adata, min_cells = 3)
    # Keep cells with less than 10% of counts in mitochondrial genes; slicing gives a view
    adata = adata[adata.obs['percent_mito'] < 0.1, :]
    holder_filtered.append(adata)

In [None]:
# confirm N samples
print(len(holder_filtered))
# merge datasets: outer join keeps the union of genes, genes absent from a sample are
# filled with 0, and index_unique=None keeps the '<sample>_<barcode>' obs names unchanged
adata = holder_filtered[0].concatenate(holder_filtered[1:],join='outer',index_unique=None, fill_value=0)
# store the matrix in Compressed Sparse Row format
# (`import scipy` alone does not guarantee that the `scipy.sparse` submodule is loaded)
import scipy.sparse
adata.X = scipy.sparse.csr_matrix(adata.X)
adata

In [None]:
# Peek at a 9x9 corner of the count matrix; slice the sparse matrix first so only
# that corner is densified instead of the whole cells x genes matrix
adata.X[1:10, 1:10].toarray()

In [None]:
adata.obs.head()

In [None]:
metadata

In [None]:
# Sample labels used as barcode prefixes in the authors' annotation tables (Male1..6 /
# Female1..6) mapped to the long sample names used as obs_names prefixes in this notebook.
# Values must match the sample names exactly (no stray whitespace), otherwise none of
# that sample's barcodes can be matched to the loaded cells.
barcode_equivalence = {
    'Male1' : 'GSM5324563_E14.5_male_genitalia_1', 'Female1' : 'GSM5324565_E14.5_female_genitalia_1',
    'Male2' : 'GSM5324564_E14.5_male_genitalia_2', 'Female2' : 'GSM5324566_E14.5_female_genitalia_2',
    'Male3' : 'GSM5324567_E16.5_male_genitalia_1', 'Female3' : 'GSM5324569_E16.5_female_genitalia_1',
    'Male4' : 'GSM5324568_E16.5_male_genitalia_2', 'Female4' : 'GSM5324570_E16.5_female_genitalia_2',
    'Male5' : 'GSM5324571_E18.5_male_genitalia_1', 'Female5' : 'GSM5324573_E18.5_female_genitalia_1',
    'Male6' : 'GSM5324572_E18.5_male_genitalia_2', 'Female6' : 'GSM5324574_E18.5_female_genitalia_2',
}


In [None]:
barcode_equivalence

### Load cell barcode to cell type correspondence shared by Ciro Amato

In [None]:
def load_barcode_celltype_correspondence(filename, equivalence_dict,
                                         base_dir='/home/jovyan/RepTract/GENITAL TUBERCLE/'):
    """Load a barcode-to-cell-type table and re-index it with this notebook's cell names.

    The CSV must contain a ``cell.barcode`` column formatted as
    ``<sample label>_<barcode>[-suffix]``. The sample label is translated through
    ``equivalence_dict`` and the table is indexed by ``<sample name>_<barcode>``.

    Parameters
    ----------
    filename : str
        Name of the CSV file inside ``base_dir``; its first column is read as the index.
    equivalence_dict : dict
        Maps the sample label found in ``cell.barcode`` to the sample name to use.
    base_dir : str, optional
        Directory holding the CSV file. Defaults to the project folder used originally.

    Returns
    -------
    pandas.DataFrame
        The table with the added columns ``sample_number``, ``cell_barcode`` and
        ``sample``, indexed by ``cell.barcode.new``.

    Warns
    -----
    UserWarning
        When some sample labels are missing from ``equivalence_dict``; the matching
        rows get a missing (NaN) index entry and cannot be matched to any cell.
    """
    import warnings

    barcodes = pd.read_csv(os.path.join(base_dir, filename), index_col = 0)
    # '<label>_<barcode>-<suffix>' -> label / barcode
    parts = barcodes['cell.barcode'].str.split('_')
    barcodes['sample_number'] = parts.str[0]
    barcodes['cell_barcode'] = parts.str[1].str.split('-').str[0]
    barcodes['sample'] = barcodes['sample_number'].map(equivalence_dict)

    # Surface labels with no translation instead of silently producing NaN index entries
    unmapped = barcodes.loc[barcodes['sample'].isna(), 'sample_number'].unique()
    if len(unmapped) > 0:
        warnings.warn('No sample name found for label(s): ' + ', '.join(map(str, unmapped)))

    barcodes['cell.barcode.new'] = barcodes['sample'] + '_' + barcodes['cell_barcode']
    barcodes = barcodes.set_index('cell.barcode.new')
    print(barcodes.shape)
    return barcodes

In [None]:
male_barcodes = load_barcode_celltype_correspondence('Amato2021_male_barcode.csv', barcode_equivalence)
female_barcodes = load_barcode_celltype_correspondence('Amato2021_female_barcode.csv', barcode_equivalence)

In [None]:
# Count how many annotated barcodes (female / male tables) are present among the loaded cells
obs_barcodes = set(adata.obs_names)
common_female_barcodes = list(obs_barcodes.intersection(female_barcodes.index))
common_male_barcodes = list(obs_barcodes.intersection(male_barcodes.index))
print(len(common_female_barcodes), len(common_male_barcodes))

In [None]:
# Stack the male and female annotation tables (male rows first).
# DataFrame.append was removed in pandas 2.0; pd.concat works across pandas versions.
tot_barcodes = pd.concat([male_barcodes, female_barcodes])

In [None]:
tot_barcodes.head()

In [None]:
adata.obs['celltype'] = adata.obs_names.map(tot_barcodes['cluster.name'].to_dict())

In [None]:
adata.obs['celltype'].value_counts(dropna = False)

In [None]:
adata.obs['celltype'] = adata.obs['celltype'].astype(str)

In [None]:
# Drop cells without an annotation (missing labels were turned into the string 'nan')
adata = adata[(adata.obs['celltype'] != 'nan').to_numpy()]

In [None]:
adata.obs['celltype'].value_counts(dropna = False)

In [None]:
adata.raw = adata.copy()

In [None]:
# Normalize and log-transform 
adata = reptract_utils.normalize_log_transform(adata)

In [None]:
# HVGs, PCA, KNN & UMAP
adata = reptract_utils.hvgs_pca_umap(adata)

In [None]:
## Figure settings for the plots below: 7x7 inch panels with frames, 80 dpi inline,
## saved as PDF at 150 dpi (this cell does not filter any cells)
sc.set_figure_params(scanpy=True, dpi=80, dpi_save=150, 
                         frameon=True, vector_friendly=True, fontsize=14, figsize=[7,7], color_map=None, 
                         format='pdf', facecolor=None, transparent=False,)

In [None]:
sc.pl.umap(adata, color = ['donor', 'age', 'sex', 'sample', 'percent_mito', 
                          'n_genes', 'celltype'], color_map = 'OrRd', ncols = 2)

In [None]:
# Cell cycle scoring
# Read the cell cycle gene list, one symbol per line; `with` closes the file handle
with open(file='/nfs/users/nfs_v/vl6/regev_lab_cell_cycle_genes.txt') as gene_file:
    cell_cycle_genes = [x.strip() for x in gene_file]
# The first 43 entries are used as the S-phase set (the remainder is used for G2/M)
s_genes = cell_cycle_genes[:43]
# Convert symbols to mouse-style capitalisation (first letter upper-case, rest lower-case).
# str.capitalize() is required here: str.title() also upper-cases letters that follow a
# digit (e.g. 'CASP8AP2' -> 'Casp8Ap2' instead of 'Casp8ap2', 'E2F8' stays 'E2F8'), and such
# symbols would then be lost when the list is matched against adata.var_names.
# NOTE(review): a case conversion is only an approximation of human-to-mouse orthology.
s_genes = [i.capitalize() for i in s_genes]
print(s_genes)

In [None]:
# Entries after the first 43 are used as the G2/M set
g2m_genes = cell_cycle_genes[43:]
# Mouse-style capitalisation. str.capitalize() rather than str.title(): title() upper-cases
# letters that follow a digit ('TOP2A' -> 'Top2A', 'UBE2C' -> 'Ube2C'), so those genes would
# never match the mouse symbols ('Top2a', 'Ube2c') in the filter below.
g2m_genes = [i.capitalize() for i in g2m_genes]
# Keep only the genes that are present in the dataset
s_genes = [x for x in s_genes if x in adata.var_names]
g2m_genes = [x for x in g2m_genes if x in adata.var_names]

In [None]:
sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes, random_state = 123)

In [None]:
sc.pl.umap(adata, color = 'phase')

In [None]:
adata.obs['celltype'] = adata.obs['celltype'].astype('str')

In [None]:
for i in np.unique(adata.obs['celltype'].to_list()):
    print(i)
    sc.pl.umap(adata, color = ['celltype'], groups = str(i))

In [None]:
# Expression of selected marker genes on the UMAP (normalised values, not .raw).
# 'Upk1a' was listed twice, which drew the same panel two times; it is now listed once.
sc.pl.umap(adata, color = ['Foxa1', 'Hoxa13', 'Nr2f2', 'Krt10', 'Isl1', 'Krt14', 
                           'Krt5', 'Upk1a', 'Foxf1', 'Foxf2', 'Sox9', 'Lgr5', 
                          'Wif1', 'Shh', 'Dlx5', 'Wnt2', 'Rdh10', 'Six1', 'Igf1', 'Irx1', 
                          'Irx5', 'Inhba', 'Gap43', 'Shox2', 'Cpa6', 'Wt1', 'Sall1', 'Foxl2', 
                          'Tcf21', 'Cdh8', 'Krtdap', 'Wnt3', 'Dhrs2', 'Upk3a'], color_map = 'OrRd', ncols = 3, use_raw = False)

In [None]:
np.unique(adata.obs['celltype'])

In [None]:
# Keep only epithelial and mesenchymal cells in G1
excluded_celltypes = ['Smooth muscle', 'Smooth Muscle', 'Macrophage', 'Melanocyte', 'Endothelium',
                      'Proliferative Mesenchyme']
# First drop the excluded annotations, then keep the cells scored as G1
adata = adata[(~adata.obs['celltype'].isin(excluded_celltypes)).to_numpy()]
adata = adata[(adata.obs['phase'] == 'G1').to_numpy()]
adata.shape

In [None]:
# Harmonise annotations where there are duplicates 
# The same cell type appears with two spellings; 'Sub-dermal Prepuce' is rewritten to
# 'Sub-Dermal Prepuce' and every other label is kept unchanged.
adata.obs['celltype'] = np.where(adata.obs['celltype'] == 'Sub-dermal Prepuce', 'Sub-Dermal Prepuce',
                                adata.obs['celltype'])

In [None]:
sc.pl.umap(adata, color = 'celltype')

In [None]:
sc.pl.umap(adata, color = 'donor')

## TF-IDF markers per cell type

In [None]:
import anndata
bdata = anndata.AnnData(X = adata.raw.X, var = adata.raw.var, obs = adata.obs)
bdata.shape

In [None]:
import rpy2.rinterface_lib.callbacks
import logging
# Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
import anndata2ri
anndata2ri.activate()
%load_ext rpy2.ipython

In [None]:
%%R -i bdata
bdata

In [None]:
%%R -o mrks

# Marker genes per cell type with SoupX::quickMarkers
library(SoupX)
# Matrix stored under the "X" assay of the object passed in from Python
counts <- assay(bdata, "X")
# Make sure the matrix carries the cell (column) and gene (row) names of the object
colnames(counts) <- colnames(bdata)
rownames(counts) <- rownames(bdata)
mrks = quickMarkers(counts, colData(bdata)$celltype, N = 50) # second argument is the cell type assignment of each cell; N = 50 requests the top 50 genes per cell type

In [None]:
cluster = mrks[mrks['cluster'] == 'Preputial gland']['gene'].to_list()[0:15]
sc.pl.umap(adata, color = cluster, color_map = 'OrRd', ncols = 3, use_raw = False)

The top markers of the 'Preputial gland' cluster are clearly immune markers, so this cell type is removed below. The rationale for the removal should be discussed in the Methods.

In [None]:
# Remove the cells annotated as 'Preputial gland'
adata = adata[(adata.obs['celltype'] != 'Preputial gland').to_numpy()]

In [None]:
adata.shape

In [None]:
np.sum(adata.var['gene_ids-0'].isna())

In [None]:
path_to_data

## Re-analyse mesenchymal and epithelial cells

In [None]:
import anndata
# Start again from the matrix kept in .raw (stored before normalisation), restricted to the
# cells still present, keeping only the obs annotations (embeddings and graphs are dropped)
adata = anndata.AnnData(X = adata.raw.X, var = adata.raw.var, obs = adata.obs)
adata.shape

In [None]:
sc.pp.filter_genes(adata, min_counts = 10)

In [None]:
adata.raw = adata.copy()

In [None]:
adata = reptract_utils.normalize_log_transform(adata)

In [None]:
adata = reptract_utils.hvgs_pca_umap(adata)

In [None]:
sc.pl.umap(adata, color = ['donor', 'age', 'sex', 'sample', 'percent_mito', 
                          'n_genes', 'celltype'], color_map = 'OrRd', ncols = 2)

## Harmony integration

In [None]:
# Extract PCA matrix and batch array (inputs for Harmony)
n_pcs = 15 # Number of leading principal components passed to Harmony and to the neighbour graph
pca = adata.obsm['X_pca'][:, 0:(n_pcs)]
batch = adata.obs['donor'] # Batch ID, in this case the donor (one donor per sample)

In [None]:
import rpy2.rinterface_lib.callbacks
import logging
# Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
import anndata2ri
anndata2ri.activate()
%load_ext rpy2.ipython

In [None]:
%%R -i pca -i batch -o hem

library(harmony)
library(magrittr)
set.seed(1000)
hem <- HarmonyMatrix(pca, batch, theta=0, lambda = 6, do_pca=FALSE, verbose = FALSE)  
hem = data.frame(hem)

In [None]:
# Add harmony values to the anndata object under a separate key ('X_pca' is left untouched)
adata.obsm['X_pca_harmony'] = hem.values 

In [None]:
sc.pp.neighbors(adata, n_pcs = n_pcs, use_rep = 'X_pca_harmony', random_state= 123)
sc.tl.umap(adata, min_dist=0.6, spread = 0.8)

In [None]:
sc.pl.umap(adata, color = ['donor', 'age', 'sex', 'sample', 'percent_mito', 
                          'n_genes', 'celltype'], color_map = 'OrRd', ncols = 2)

In [None]:
sc.pl.umap(adata, color = ['Foxa1', 'Hoxa13', 'Nr2f2', 'Krt10', 'Isl1', 'Krt14', 
                           'Krt5', 'Upk1a', 'Foxf1', 'Foxf2', 'Sox9', 'Lgr5', 
                          'Wif1', 'Shh', 'Dlx5', 'Wnt2', 'Rdh10', 'Six1', 'Igf1', 'Irx1', 
                          'Irx5', 'Inhba', 'Gap43', 'Shox2', 'Cpa6', 'Sall1', 'Foxl2', 'Tcf21'], color_map = 'OrRd', ncols = 3, use_raw = False)

In [None]:
sc.pl.umap(adata, color = ['Epcam', 'Cdh5', 'Lyve1', 'Cd68', 'Stmn2', 'Plp1', 'Rgs5', 'Myh11', 
                          ], color_map = 'OrRd', ncols = 3, use_raw = False)

In [None]:
for c in list(np.unique(adata.obs['celltype'])):
    sc.pl.umap(adata, color = ['celltype'], groups = c)

In [None]:
sc.tl.leiden(adata, resolution = 0.5)
sc.pl.umap(adata, color = 'leiden', legend_loc = 'on data')

In [None]:
sc.tl.leiden(adata, resolution = 0.2, restrict_to = ('leiden', ['0']))
sc.pl.umap(adata, color = 'leiden_R', legend_loc = 'on data')

In [None]:
sc.pl.umap(adata, color = ['leiden_R', 'celltype'], legend_loc = 'on data')

In [None]:
np.unique(adata.obs['celltype'])

In [None]:
# Harmonise annotations based on clustering 
# Keys are 'leiden_R' labels: '0,0' and '0,1' are the sub-clusters of Leiden cluster 0,
# the other keys are the original Leiden clusters.
# NOTE(review): a 'leiden_R' label missing from this dictionary becomes NaN in
# 'harmonised_celltype' - check the value counts after re-clustering.
# NOTE(review): 'Corpus Cavernsoum' is presumably a misspelling of 'Corpus Cavernosum'; the
# same spelling is used by the category order and the colour dictionary further down, so it
# has to be changed in all three places at once.
harmonised_annots = {
    '0,0' : 'Distal Dorsal Glanular Mesenchyme',
    '0,1' : 'Late Proximal Glanular Mesenchyme',
    '1' : 'Preputial Mesenchyme',
    '2' : 'Preputial Mesenchyme',
    '3' : 'Corpus Cavernsoum', 
    '4' : 'Ventral Glanular Mesenchyme', 
    '5' : 'Late Proximal Glanular Mesenchyme', 
    '6' : 'Late Proximal Glanular Mesenchyme', 
    '7' : 'Distal Dorsal Glanular Mesenchyme', 
    '8' : 'Early Proximal Glanular Mesenchyme',
    '9' : 'Ventral Glanular Mesenchyme', 
    '10' : 'Sub-Dermal Prepuce',
    '11' : 'Early Proximal Glanular Mesenchyme', 
    '12' : 'Epithelium'
}
adata.obs['harmonised_celltype'] = adata.obs['leiden_R'].map(harmonised_annots)

In [None]:
sc.pl.umap(adata, color = 'harmonised_celltype')

In [None]:
adata.write('/nfs/team292/vl6/Mouse_RepTract/Amato2021/Amato2021_mese_epi_annotated.h5ad')

In [None]:
np.unique(adata.obs['harmonised_celltype'])

In [None]:
# Fix the category order of the harmonised labels (used for legend order in the plots).
# The list must contain exactly the labels present in the column, spelled identically.
adata.obs['harmonised_celltype'] = adata.obs['harmonised_celltype'].astype('category')
adata.obs['harmonised_celltype'] = adata.obs['harmonised_celltype'].cat.reorder_categories([
     'Epithelium', 'Distal Dorsal Glanular Mesenchyme', 'Ventral Glanular Mesenchyme', 
    'Early Proximal Glanular Mesenchyme', 'Late Proximal Glanular Mesenchyme', 
    'Corpus Cavernsoum', 'Preputial Mesenchyme',
       'Sub-Dermal Prepuce',
])

In [None]:
color_dict = {
    'Epithelium' : 'mediumorchid', 'Distal Dorsal Glanular Mesenchyme' : 'navy', 
    'Ventral Glanular Mesenchyme' : 'cornflowerblue', 
    'Early Proximal Glanular Mesenchyme' : 'deepskyblue', 'Late Proximal Glanular Mesenchyme' : 'darkturquoise', 
    'Corpus Cavernsoum' : 'mediumspringgreen', 'Preputial Mesenchyme' : 'yellowgreen',
       'Sub-Dermal Prepuce' : 'olivedrab',
}

In [None]:
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42

In [None]:
sc.pl.umap(adata, color = 'harmonised_celltype', palette = color_dict, 
          save = '_amato2021_celltype.pdf')

In [None]:
sc.pl.umap(adata, color = 'donor', 
          save = '_amato2021_donor.pdf')

In [None]:
sc.pl.umap(adata, color = 'age', 
          save = '_amato2021_stage.pdf')