### Sample-by-sample analysis of fetal reproductive tract 

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import os
import sys
import warnings
warnings.filterwarnings('ignore')


def MovePlots(plotpattern, subplotdir):
    """Move saved figures whose name contains ``plotpattern`` into a sub-folder.

    Parameters
    ----------
    plotpattern : str
        Fragment of the file name to look for. Every entry of
        ``sc.settings.figdir`` matching ``*<plotpattern>*`` is moved.
    subplotdir : str
        Destination folder, relative to ``sc.settings.figdir``. It is created
        (including parents) when it does not exist yet.
    """
    # Local imports: only this helper needs them.
    import glob
    import shutil

    figdir = str(sc.settings.figdir)
    destination = os.path.join(figdir, subplotdir)
    os.makedirs(destination, exist_ok=True)

    # Escape the directory part so that only `plotpattern` is interpreted as
    # a glob; no shell is involved, so spaces in paths are harmless.
    pattern = os.path.join(glob.escape(figdir), '*' + plotpattern + '*')
    for path in glob.glob(pattern):
        # The destination folder itself may match the pattern; never try to
        # move it into itself.
        if os.path.abspath(path) == os.path.abspath(destination):
            continue
        # Passing the full target path lets an existing file of the same name
        # be replaced, as `mv` does.
        shutil.move(path, os.path.join(destination, os.path.basename(path)))


# Global scanpy configuration for this notebook.
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
# Folder scanpy writes figures to when a plot is called with `save=...`
sc.settings.figdir = './figures-samples/'
# Record the package versions of this session in the cell output
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) yields small inline figures

# Bare expression: shows the path of the running Python interpreter as cell output
sys.executable

### 1. Read in the files generated by Nextflow pipeline 

In [None]:
# Identifier of the sample analysed here; used to build input, figure and output file names.
sample = 'HD_F_GON12779691'

In [None]:
# Folder holding the per-sample .h5ad objects (read from and written to by this notebook)
data_dir = '/nfs/team292/vl6/FetalReproductiveTract/RNA_QC_CellBender/data/'
# NOTE(review): figures_dir is not referenced anywhere else in the visible notebook;
# saved plots go to sc.settings.figdir instead — confirm whether it is still needed.
figures_dir = '/nfs/team292/vl6/FetalReproductiveTract/RNA_QC_CellBender/figures/'

In [None]:
# Load the per-sample object and report whether it already carries a
# 'broad_annotations' column in its cell metadata.
adata = sc.read_h5ad(f'{data_dir}{sample}.h5ad')
has_annotations = 'broad_annotations' in adata.obs.columns
print('ANNOTATED' if has_annotations else 'NOT ANNOTATED')

# Bare expression: displays the object summary as cell output
adata

In [None]:
# Per-cell QC summaries, read from columns already present in adata.obs.
# NOTE(review): 'n_counts' presumably holds UMI counts rather than sequencing reads —
# confirm that the wording of the printed label is what is meant.
print('Median number of reads per cell: ', np.median(adata.obs['n_counts']))

In [None]:
print('Median number of genes per cell: ', np.median(adata.obs['n_genes']))

In [None]:
# Cast the doublet calls to integers; the doublet filter further down compares this column with 1
adata.obs['predicted_doublets'] = adata.obs['predicted_doublets'].astype(int)

In [None]:
# QC metrics on the UMAP embedding stored in the loaded object (no embedding is computed in this notebook)
sc.pl.umap(adata, color = ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo', 
                           'doublet_scores', 
                          ], ncols = 3, color_map = 'OrRd')

In [None]:
# (cells, genes) before any filtering
adata.shape

### 2. Load Souporcell results (if multiplexing)

In [None]:
# Per-barcode Souporcell output (clusters.tsv), indexed by its first column.
# NOTE(review): the path hard-codes 'HD_F_GON12893251', which differs from `sample`
# ('HD_F_GON12779691') — confirm this is the Souporcell run that belongs to this sample.
souporcell_res = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/tickets/tic-1313/results7/HD_F_GON12893251/soc/clusters.tsv', 
                            sep = '\t', index_col = 0)
souporcell_res.head()

In [None]:
# Distribution of the raw Souporcell assignments, missing values included
souporcell_res['assignment'].value_counts(dropna = False)

In [None]:
# Barcode = 4th '_'-separated field of every cell name. This raises IndexError when a
# name has fewer than four fields; it presumably matches the index of souporcell_res —
# TODO confirm the naming scheme of adata.obs_names.
adata.obs['barcode'] = [i.split('_')[3] for i in adata.obs_names]

In [None]:
# Look each barcode up in the Souporcell table; barcodes absent from it become NaN
adata.obs['souporcell_classification'] = adata.obs['barcode'].map(souporcell_res['status'].to_dict())

In [None]:
adata.obs['souporcell_assignment'] = adata.obs['barcode'].map(souporcell_res['assignment'].to_dict())

In [None]:
# Assignments after mapping onto the cells of this object, NaN included
adata.obs['souporcell_assignment'].value_counts(dropna = False)

In [None]:
# Cross-tabulate Souporcell status against assignment (rows with NaN in either column are not counted)
cross = pd.crosstab(adata.obs['souporcell_classification'], adata.obs['souporcell_assignment'])
cross

In [None]:
adata.obs['souporcell_assignment'] = adata.obs['souporcell_assignment'].astype(str)

In [None]:
adata.obs['souporcell_assignment'] = np.where(adata.obs['souporcell_assignment'].isin(['0/1', '1/0', '2/0', 
                                                                                      '2/1', '0/2', '1/2']), 'Doublet', adata.obs['souporcell_assignment'])

In [None]:
adata.obs['souporcell_assignment'].value_counts(dropna = False)

In [None]:
adata.obs['souporcell_assignment'] = np.where(adata.obs['souporcell_assignment'] == 'nan', 'Unassigned', adata.obs['souporcell_assignment'])

In [None]:
# QC overview including doublet calls and genotype assignment; also written to
# the scanpy figure folder with the sample name in the file name.
qc_panels = ['n_genes', 'n_counts', 'percent_mito',
             'predicted_doublets', 'doublet_scores', 'souporcell_assignment']
sc.pl.umap(adata, color=qc_panels, ncols=3, color_map='OrRd', save=f'_{sample}_qc.pdf')

In [None]:
# Remove transcriptome-predicted doublets: cells flagged as doublets (== 1)
# or with a doublet score of 0.4 or higher.
# The genotype-doublet filter is currently disabled:
# adata = adata[[i not in ['Doublet'] for i in adata.obs['souporcell_assignment']]]
keep_cells = (adata.obs['predicted_doublets'] != 1) & (adata.obs['doublet_scores'] < 0.4)
# One vectorised mask instead of two per-cell Python loops; .copy() yields a real
# AnnData object rather than a view of a view, so later writes to .obs do not
# trigger an implicit copy.
adata = adata[keep_cells.to_numpy()].copy()
adata.shape

### 3. Plot markers of major cell types 

In [None]:
# Make the project-specific marker module importable from its folder
utils_dir = '/nfs/team292/vl6/RepTract/'
sys.path.append(utils_dir)

import reptract_genes

In [None]:
# Display the marker collection; the next cell iterates over its keys
reptract_genes.markers

In [None]:
# One UMAP grid per entry of reptract_genes.markers, restricted to the genes
# that are present in this dataset.
for ct in reptract_genes.markers: 
    print(ct)
    # Test membership against the var_names index itself: calling .to_list()
    # inside the comprehension rebuilt and scanned a Python list for every gene.
    markers = [i for i in reptract_genes.markers[ct] if i in adata.var_names]
    if markers:
        sc.pl.umap(adata, color = markers, ncols = 3,
                   color_map = 'OrRd', use_raw = False)

In [None]:
# NOTE(review): `mesenchymal_markers` is not defined anywhere in the visible notebook,
# so this cell raises NameError unless the name comes from a cell that was removed —
# confirm where the list should come from.
# Membership is tested against the var_names index directly instead of a list
# rebuilt for every gene.
sc.pl.umap(adata, color = [i for i in mesenchymal_markers if i in adata.var_names], ncols = 3,
                   color_map = 'OrRd', use_raw = False)

In [None]:
# Expression of the 'Male_specific' marker set; the printed number is how many
# of those genes are present in this dataset.
print("Y chromosome markers")
# Index membership test instead of rebuilding var_names as a list for every gene
markers = [i for i in reptract_genes.markers['Male_specific'] if i in adata.var_names]
print(len(markers))
if markers:
    sc.pl.umap(adata, color = markers, ncols = 3,
                   color_map = 'OrRd', use_raw = False)

#### 4. Cluster manifold

In [None]:
# Square 7x7 canvas for the embedding plots that follow
sc.set_figure_params(scanpy=True, dpi=80, dpi_save=150, frameon=True, 
                     vector_friendly=True, fontsize=14, figsize=(7,7), color_map=None, 
                     format='pdf', facecolor=None, transparent=False, ipython_format='png2x')

In [None]:
# Leiden clustering with a fixed seed; no neighbour graph is computed in this notebook,
# so it presumably ships with the loaded object — TODO confirm.
sc.tl.leiden(adata, resolution = 1, random_state = 1996)
sc.pl.umap(adata, color = 'leiden', legend_loc = 'on data')

In [None]:
# Wider 8x6 canvas for the per-cluster violin plots
sc.set_figure_params(scanpy=True, dpi=80, dpi_save=150, frameon=True, 
                     vector_friendly=True, fontsize=14, figsize=(8,6), color_map=None, 
                     format='pdf', facecolor=None, transparent=False, ipython_format='png2x')


In [None]:
# QC metrics per Leiden cluster, one metric per cell
sc.pl.violin(adata, keys = ['percent_mito'], groupby = 'leiden', rotation = 90)

In [None]:
sc.pl.violin(adata, keys = ['n_genes'], groupby = 'leiden', rotation = 90)

In [None]:
sc.pl.violin(adata, keys = ['doublet_scores'], groupby = 'leiden', rotation = 90)

In [None]:
# Highlight clusters 25-27 only; these are the clusters removed in the next cell
sc.pl.umap(adata, color = 'leiden', legend_loc = 'on data', groups = [ '25', '26', '27'])

In [None]:
# Remove contaminants from the gonads and doublets (Leiden clusters 25-27).
contaminant_clusters = ['25', '26', '27']
# Vectorised membership test instead of a per-cell Python loop; .copy() turns
# the subset into a real AnnData object rather than a view.
adata = adata[~adata.obs['leiden'].isin(contaminant_clusters).to_numpy()].copy()

In [None]:
# (cells, genes) after removing the contaminant clusters
adata.shape

In [None]:
# Back to the square 7x7 canvas for embedding plots
sc.set_figure_params(scanpy=True, dpi=80, dpi_save=150, frameon=True, 
                     vector_friendly=True, fontsize=14, figsize=(7,7), color_map=None, 
                     format='pdf', facecolor=None, transparent=False, ipython_format='png2x')


In [None]:
sc.pl.umap(adata, color = 'leiden', legend_loc = 'on data')

In [None]:
# Re-cluster the remaining cells; this overwrites adata.obs['leiden'].
# NOTE(review): no random_state is passed here, unlike the first clustering call
# (random_state = 1996) — confirm this is intended, because the cluster ids
# produced here are hard-coded in later cells.
sc.tl.leiden(adata, resolution = 1)
sc.pl.umap(adata, color = 'leiden', legend_loc = 'on data')

In [None]:
# Sub-cluster cluster '18' only; the result is read back from adata.obs['leiden_R'] below
sc.tl.leiden(adata, resolution = 0.1, restrict_to = ('leiden', ['18']))
sc.pl.umap(adata, color = 'leiden_R')

#### 5. Find marker genes per cluster with TF-IDF

In [None]:
import anndata
# Separate AnnData object for marker detection: expression matrix and gene table
# come from adata.raw, cell metadata from adata.obs.
# NOTE(review): presumably adata.raw holds un-normalised counts, as SoupX::quickMarkers
# expects — confirm. This line fails if adata.raw is None.
bdata = anndata.AnnData(X = adata.raw.X, var = adata.raw.var, obs = adata.obs)

In [None]:
# Set up the Python <-> R bridge used by the %%R cells of this notebook.
import rpy2.rinterface_lib.callbacks
import logging
# Ignore R warning messages
# Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
import anndata2ri
# Turn on anndata2ri's automatic conversion of AnnData objects passed to R
anndata2ri.activate()
%load_ext rpy2.ipython

In [None]:
# Keep only the 'leiden_R' column in bdata.obs; every other cell-metadata
# column is deleted before the object is handed to R.
unwanted_cols = [name for name in bdata.obs.columns if name != 'leiden_R']
for name in unwanted_cols:
    del bdata.obs[name]

In [None]:
bdata

In [None]:
%%R -i bdata
bdata

In [None]:
%%R -o mrks

library(SoupX)
# Expression matrix of the converted object, labelled with its column and row names
counts <- assay(bdata, "X")
colnames(counts) <- colnames(bdata)
rownames(counts) <- rownames(bdata)
# The second argument is the cluster assignment of each cell (leiden_R); N = 50 keeps
# the top 50 genes per cluster that pass the hypergeometric test
mrks = quickMarkers(counts, colData(bdata)$leiden_R, N = 50)


In [None]:
# Last rows of the marker table returned by the R cell
mrks.tail()

In [None]:
# Cluster labels for which markers were returned
np.unique(mrks['cluster'])

In [None]:
# NOTE(review): `useful` is not referenced again in the visible notebook — confirm
# whether it is still needed.
useful = ['PITX1', 'THBS2', 'SPON2', 'PRDM8', 'PAX8', 'GATA4', 'GATA2', 'LHX9',
          'HTR2B', 'EMX2', 'SHH', 'DLX2', 'DLX5', 'DNAH12', 'C7', 'HOXB13', 'SNX31']

In [None]:
sc.pl.umap(adata, color = 'leiden', legend_loc = 'on data')

In [None]:
# First 15 marker genes listed for cluster '15' in the quickMarkers table
cluster = mrks[mrks['cluster'] == '15']['gene'][0:15]
# Index membership test instead of rebuilding var_names as a list for every gene
sc.pl.umap(adata, color = [i for i in cluster if i in adata.var_names],
           ncols = 4, color_map = 'OrRd', use_raw = False)

In [None]:
# Ad-hoc panel of genes to inspect on the UMAP.
# Many genes were listed more than once (e.g. 'WIF1', 'DNAH12', 'MYH11'), which
# produced repeated panels; dict.fromkeys drops the repeats while keeping the
# order of first occurrence.
# NOTE(review): 'CTCN1' and 'PAPP2' do not look like standard gene symbols
# (possibly 'CNTN1' / 'PAPPA2'?) — genes absent from adata.var_names are dropped
# silently when the panel is plotted, so a typo would go unnoticed. Confirm.
random_markers = list(dict.fromkeys([
    'SALL1', 'MYH11', 'TP63', 'DLX5', 'SOX9', 'SHOX', 'DNAH12', 'PDZK1IP1', 'SP7', 'WIF1',
    'NOTUM', 'NKD1', 'PSCA', 'KRTDAP', 'MFAP5', 'SFRP5', 'TCF21', 'FOXF1', 'ACTC1', 'OLFM3',
    'PCOLCE2', 'SIX2', 'FOXD1', 'SIX1', 'NPHS1', 'NPHS2', 'AXIN2', 'EMX2', 'MYH11', 'GATA4',
    'WIF1', 'SRD5A2', 'RGS5', 'BCAN', 'SP7', 'HOXC10', 'SERPINB7', 'CDH7', 'ASPN',
    'GATA2', 'LHX9', 'NR5A1', 'SOX9', 'WNT6', 'SHISA3', 'PLAC1', 'ADH1B', 'HTR2B', 'HOXC5',
    'COMP', 'CRLF1', 'CALY', 'ITGA4', 'SEMA3A', 'SFRP5', 'PRDM8', 'TMTC1',
    'TBX5', 'TBX4', 'ITGBL1', 'LGR5', 'TSPAN8',
    'EMX2', 'UPK3B', 'TBX18', 'MSC', 'ACTA2', 'RGS5', 'PITX2', 'PTGER3', 'RAI2',
    'DES', 'PDGFRA', 'DCN', 'HOXB13', 'SNX31', 'TP63', 'FOXA1', 'DNAH12', 'PITX2',
    'WIF1', 'SERPINB7', 'RORB', 'HOXA10', 'HOXA11', 'CTCN1', 'AMHR2', 'MMP28', 'ETV4',
    'GRIA4', 'TMEM163', 'ZAP70', 'RUNX2',
    'PAPP2', 'BRINP1', 'PTGES', 'ANO4', 'AR', 'HOXA13', 'ISL1', 'HOXD13', 'DLX5', 'DLX6',
    'DNAH12',
    'HSD17B2', 'HSD17B3', 'HSD17B7', 'KRT17', 'KRT14', 'KRT13', 'KRT5', 'HOXA9', 'HOXD9',
    'HOXA10', 'HOXD10',
    'HOXA11', 'HOXD11', 'HOXA7', 'HOXC6', 'HOXC8', 'AR', 'KLK3', 'SHBG', 'TMPRSS2', 'FKBP5',
    'NKX3-1', 'NCOA3', 'HHIP', 'RDH10', 'THBD', 'ALX4', 'SHOX2', 'PRDM8', 'IRX1',
]))

In [None]:
# Plot the ad-hoc gene panel, skipping genes that are absent from this dataset
sc.pl.umap(adata, color = [i for i in random_markers if i in adata.var_names], ncols = 4, color_map = 'OrRd', use_raw = False)

#### 6. Annotate clusters

In [None]:
# Clusters before and after sub-clustering, for reference while annotating
sc.pl.umap(adata, color = 'leiden', legend_loc = 'on data')

In [None]:
sc.pl.umap(adata, color = 'leiden_R', legend_loc = 'on data')

In [None]:
adata.shape 

In [None]:
# Map every leiden_R cluster id to a broad cell-type label; ids that are missing from
# the dict map to NaN.
# NOTE(review) — points to confirm:
#   * keys '12,0'..'12,2' and '15,0'/'15,1' assume clusters 12 and 15 were sub-clustered,
#     but the visible restrict_to call only splits cluster '18', so plain '12' and '15'
#     would stay unannotated;
#   * there is no entry for '38';
#   * 'CoelomicEpithelium' (17) and 'Coelomic_Epithelium' (25, 32) are spelled
#     differently and therefore end up as two separate labels.
annots = {'0' : 'Vaginal_SmoothMuscle', '1' : 'Uterus_Ligament', '2' : 'Uterus_Ligament', 
          '3' : 'Perivascular',  
          '4' : 'Endothelial', '5' : 'Pre-Perivascular', '6' : 'Uterus_Ligament', '7' : 'Vaginal_SmoothMuscle', 
          '8' : 'Perivascular',
          '9' : 'Uterus_Ligament',
          '10' : 'Uterus_SmoothMuscle', '11' : 'SchwannCell',
          '12,0' : 'CorpusSpongiosum',  '12,1' : 'Vaginal_Mesenchyme_SFRP5', '12,2' : 'CorpusSpongiosum', '13' : 'Uterus_Mesenchyme', 
          '14' : 'Perivascular', '15,0' : 'Genital_Epidermis', '15,1' : 'KRTDAP_Epithelium', 
          '16' : 'Uterus_SmoothMuscle',
          '17' : 'CoelomicEpithelium',
          '18,0' : 'VaginalPlate_Epithelium', '18,1' : 'Urothelium_UPKs', 
          '18,2' : 'MüllerianVagina_Epithelium', 
          '19' : 'Endothelial',
         '20' : 'Vaginal_SmoothMuscle', 
          '21' : 'Immune', '22' : 'Immune', 
          '23' : 'Endothelial_Lymphatic', '24' : 'Neural',  '25' : 'Coelomic_Epithelium', '26' : 'Endothelial_Lymphatic', 
          '27' : 'SchwannCell', '28' : 'Endothelial', '29' : 'Immune', '30' : 'CorpusSpongiosum',
          '31' : 'Immune', '32' : 'Coelomic_Epithelium', '33' : 'WolffianDuct_Mesenchyme', '34' : 'Kidney_Podocyte', 
          '35' : 'VasDeferens_Epithelium', '36' : 'SchwannCell', '37' : 'MüllerianDuct_Epithelium', '42' : 'Kidney_UretericBud', 
          '39' : 'Immune', '41' : 'Endothelial', '43' : 'Endothelial', '40' : 'Kidney_SShapedBody',
         } 
# Store the labels under the column name that is checked when the object is loaded
adata.obs['broad_annotations'] = adata.obs.leiden_R.map(annots)
sc.pl.umap(adata, color = ['broad_annotations'])


In [None]:
# Final overview: QC metrics next to the annotation, the clustering and the
# genotype assignment, two panels per row.
overview_panels = ['n_genes', 'percent_mito', 'doublet_scores', 'broad_annotations',
                   'leiden', 'souporcell_assignment']
sc.pl.umap(adata, color=overview_panels, ncols=2, color_map='OrRd')

### 7. Save annotated h5ad object

In [None]:
sc.pl.umap(adata, color = 'souporcell_assignment')

In [None]:
adata = adata[[i != 'Unassigned' for i in adata.obs['souporcell_assignment']]]

In [None]:
donor_dict = {'0' : 'Hrv152', '1' : 'Hrv139'}
adata.obs['donor'] = adata.obs['souporcell_assignment'].map(donor_dict)

In [None]:
sc.pl.umap(adata, color = 'donor')

In [None]:
Hrv152 = adata[[i == 'Hrv152' for i in adata.obs['donor']]]
Hrv152.shape

In [None]:
Hrv139 = adata[[i == 'Hrv139' for i in adata.obs['donor']]]
Hrv139.shape

In [None]:
# Output folder; same value as the data_dir set when the object was loaded
data_dir = '/nfs/team292/vl6/FetalReproductiveTract/RNA_QC_CellBender/data/'

In [None]:
# Display the sample id that goes into the output file names
sample

In [None]:
# NOTE(review): this is the same path the object was read from, so the input file is
# overwritten with the filtered, annotated object — confirm this is intended.
adata.write(data_dir + sample + '.h5ad')

In [None]:
# Per-donor objects, written next to the combined one with the donor id as suffix
Hrv139.write(data_dir + sample + '_Hrv139' + '.h5ad')

In [None]:
Hrv152.write(data_dir + sample + '_Hrv152' + '.h5ad')