In [None]:
import scanpy as sc
import anndata as an
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Scanpy session settings: verbosity level 1, figures saved under
# ./invivo/spatial/, white 8x8 figures exported at 300 dpi.
sc.settings.verbosity = 1
sc.settings.figdir = './invivo/spatial/'
sc.set_figure_params(
    dpi_save=300,
    figsize=(8, 8),
    facecolor="white",
)

In [None]:
# Root folder under which the per-set cell2location inputs are written.
path = '/nfs/team292/lg18/cell2location/'

# Dictionaries
# Visium sample IDs belonging to each analysis set.
Vsamples = {
    "proliferative": ["152810", "152806"],
    "secretory": ["152807", "152811"],
    "all": ["152807", "152810"],
}

# scRNA-seq cluster labels kept in the reference for each analysis set.
SCclusters = {
    "secretory": [
        "Endothelial ACKR1",
        "Endothelial SEMA3G",
        "Epithelial Ciliated",
        "Epithelial Glandular_secretory",
        "Epithelial Lumenal 1",
        "Epithelial SOX9",
        "Fibroblast C7",
        "Fibroblast dS",
        "Lymphoid",
        "Myeloid",
        "PV MYH11",
        "PV STEAP4",
        "uSMC",
    ],
    "proliferative": [
        "Endothelial ACKR1",
        "Endothelial SEMA3G",
        "Epithelial Ciliated",
        "Epithelial Pre-ciliated",
        "Epithelial SOX9",
        "Epithelial SOX9_prolif",
        "Epithelial SOX9_LGR5",
        "Fibroblast C7",
        "Fibroblast eS",
        "Lymphoid",
        "Myeloid",
        "PV MYH11",
        "PV STEAP4",
        "uSMC",
    ],
    "all": [
        "Endothelial ACKR1",
        "Endothelial SEMA3G",
        "Epithelial Glandular_secretory",
        "Epithelial Lumenal 1",
        "Epithelial Ciliated",
        "Epithelial Pre-ciliated",
        "Epithelial SOX9",
        "Epithelial SOX9_LGR5",
        "Epithelial SOX9_prolif",
        "Fibroblast C7",
        "Fibroblast eS",
        "Fibroblast dS",
        "Lymphoid",
        "Myeloid",
        "PV MYH11",
        "PV STEAP4",
        "uSMC",
    ],
}

# Reading the reference scRNA dataset


In [4]:
# Load the integrated-donors scRNA-seq reference and display the shape of its matrix.
adata = sc.read(
    '/lustre/scratch117/cellgen/team292/lh20/1gpunobooks/primary/N4-integrated_donors.h5ad'
)
adata.X.shape

KeyboardInterrupt: 

## Set to raw counts

In [None]:
# Rebuild the object from the .raw slot (matrix and gene table), keeping the
# per-cell annotations of the loaded object.
# NOTE: not re-runnable as-is: the rebuilt object is created without a .raw slot.
adata = an.AnnData(
    X=adata.raw.X,
    var=adata.raw.var,
    obs=adata.obs,
)

### Add cell type annotations from subanalysis as LABELS

In [None]:
## Add cell type annotations as LABELS
# Copy 'fine_celltypes' into a new 'labels' column via a plain Python list
# (presumably to drop a categorical dtype -- TODO confirm), then display the
# distinct labels present.
adata.obs['labels'] = adata.obs['fine_celltypes'].tolist() 
set(adata.obs['labels'])

In [None]:
# Exclude these clusters
# Drop cells labelled 'filtered'. The subset is copied explicitly so that
# `adata` is a real AnnData rather than a view: later cells edit `.var` in
# place, which on a view triggers an implicit copy and a warning.
adata = adata[adata.obs['labels'] != 'filtered'].copy()
set(adata.obs['labels'])

### Use Ensembl id as GENE 

In [None]:
# Re-index genes by Ensembl id (taken from the 'gene_ids' column) and keep the
# previous var_names in var['SYMBOL'].
# The conversion is guarded so the cell can be re-run: without the guard a
# second run would overwrite SYMBOL with Ensembl ids and then fail because
# 'gene_ids' has already been consumed.
if 'gene_ids' in adata.var.columns:
    adata.var['SYMBOL'] = adata.var_names
    adata.var.rename(columns={'gene_ids': 'ENSEMBL'}, inplace=True)
    adata.var_names = adata.var['ENSEMBL']
    adata.var.drop(columns='ENSEMBL', inplace=True)
elif 'SYMBOL' not in adata.var.columns:
    # Neither the source column nor the result of a previous run is present.
    raise KeyError("adata.var has neither 'gene_ids' nor 'SYMBOL'; "
                   "cannot switch var_names to Ensembl ids")
adata.var.head()

### Slice dataset per individual

In [None]:
# For every analysis set: subset the reference to that set's clusters, filter
# cells/genes, select informative genes and write <path>/<set>/<set>_scRNAseq.h5ad.
# NOTE: expects `adata` to be the scRNA-seq reference prepared in the cells above.
for sam in Vsamples.keys():
    print(sam)

    # Create the per-set output folder (the file below is written inside it).
    out_dir = os.path.join(path, sam)
    os.makedirs(out_dir, exist_ok=True)

    # Keep only cells whose label belongs to this set; copy so the in-place
    # filters below act on a real object instead of a view.
    in_set = adata.obs['labels'].isin(SCclusters[sam]).to_numpy()
    andata_f = adata[in_set].copy()

    # Remove cells and genes with 0 counts everywhere
    sc.pp.filter_cells(andata_f, min_genes=5)
    sc.pp.filter_genes(andata_f, min_cells=3)

    # calculate the mean of each gene across non-zero cells
    # (column sums are taken directly on the matrix to avoid densifying it)
    andata_f.var['n_cells'] = np.asarray((andata_f.X > 0).sum(0)).ravel()
    andata_f.var['nonz_mean'] = np.asarray(andata_f.X.sum(0)).ravel() / andata_f.var['n_cells']
    log_nonz_mean = np.log10(andata_f.var['nonz_mean'])
    log_n_cells = np.log10(andata_f.var['n_cells'])

    # Cut-offs, all expressed on the log10 scale used by the plot axes.
    nonz_mean_cutoff = 0.05
    cell_count_cutoff = np.log10(andata_f.shape[0] * 0.0001)
    cell_count_cutoff2 = np.log10(andata_f.shape[0] * 0.01)

    # One figure per set; without a fresh figure every iteration would draw
    # onto the same axes.
    plt.figure()
    plt.hist2d(log_nonz_mean,
               log_n_cells, bins=100,
               norm=mpl.colors.LogNorm(),
               range=[[0,0.5], [1,4.5]]);
    plt.vlines(nonz_mean_cutoff, cell_count_cutoff, cell_count_cutoff2, color='red');
    plt.hlines(cell_count_cutoff, nonz_mean_cutoff, 1, color='red');
    plt.hlines(cell_count_cutoff2, 0, nonz_mean_cutoff, color='red');
    plt.title(sam)
    plt.show()

    # select genes based on mean expression in non-zero cells:
    # (high non-zero mean OR detected in many cells) AND above the minimal
    # cell count AND having a non-missing gene id
    keep_gene = ((np.asarray(log_nonz_mean > nonz_mean_cutoff)
                  | np.asarray(log_n_cells > cell_count_cutoff2))
                 & np.asarray(log_n_cells > cell_count_cutoff)
                 & np.asarray(~andata_f.var.index.isna()))
    andata_ff = andata_f[:, keep_gene]

    andata_ff.write(os.path.join(out_dir, sam + '_scRNAseq.h5ad'))

    # Cells per label in the written reference
    freqs = andata_ff.obs.groupby('labels').size()
    print(freqs)

# Reading Visium data


The function read_visium returns an AnnData object that contains counts, images and spatial coordinates. We will calculate standard QC metrics with pp.calculate_qc_metrics and visualize them.

An anndata will be saved for cell2location.

In [None]:
def read_and_qc(sample_name, path):
    r""" Read one Visium sample and attach basic QC annotations.

    Loads ``filtered_feature_bc_matrix.h5`` (with images) from
    ``path + str(sample_name)``, re-indexes genes by the ``gene_ids`` column
    (previous var names are kept in ``var['SYMBOL']``), computes scanpy QC
    metrics, moves genes whose symbol starts with ``MT-`` out of the matrix
    into ``obsm['mt']`` and prefixes every spot name with the sample name.

    :param sample_name: sample folder name under ``path``; stored in ``obs['sample']``
    :param path: directory prefix, concatenated directly with ``sample_name``
    :return: AnnData without ``MT-`` genes, obs index named ``spot_id``
    """

    adata = sc.read_visium(path + str(sample_name),
                           count_file='filtered_feature_bc_matrix.h5', load_images=True)
    adata.obs['sample'] = sample_name
    adata.var['SYMBOL'] = adata.var_names
    adata.var.rename(columns={'gene_ids': 'ENSEMBL'}, inplace=True)
    adata.var_names = adata.var['ENSEMBL']
    adata.var.drop(columns='ENSEMBL', inplace=True)

    # Calculate QC metrics
    sc.pp.calculate_qc_metrics(adata, inplace=True)
    is_mt = np.array([gene.startswith('MT-') for gene in adata.var['SYMBOL']], dtype=bool)
    adata.var['mt'] = is_mt
    mt_counts = adata[:, is_mt].X
    # np.asarray(...).ravel() always yields one value per spot, whether the
    # row sum comes back as a matrix or as an array.
    adata.obs['mt_frac'] = np.asarray(mt_counts.sum(1)).ravel() / adata.obs['total_counts']

    # mitochondria-encoded (MT) genes should be removed for spatial mapping
    adata.obsm['mt'] = mt_counts.toarray()
    # Explicit copy: the object is modified below, which must not happen on a view.
    adata = adata[:, ~is_mt].copy()

    # add sample name to obs names
    adata.obs["sample"] = [str(i) for i in adata.obs['sample']]
    adata.obs_names = adata.obs["sample"] \
                          + '_' + adata.obs_names
    adata.obs.index.name = 'spot_id'

    return adata


def select_slide(adata, s, s_col='sample'):
    r""" Extract a single slide from a multi-sample spatial AnnData.

    Rows are kept where ``adata.obs[s_col]`` equals ``s``; ``uns['spatial']``
    of the result is reduced to the first entry whose key contains ``s``.

    :param adata: Anndata object with multiple spatial samples
    :param s: name of selected sample
    :param s_col: column in adata.obs listing sample name for each location
    :return: the subset for sample ``s`` with a single ``uns['spatial']`` entry
    """

    in_slide = adata.obs[s_col].isin([s])
    slide = adata[in_slide, :]

    # First spatial key whose name contains the sample name
    # (IndexError if there is none).
    library_ids = list(slide.uns['spatial'].keys())
    name_matches = [s in lib for lib in library_ids]
    s_spatial = np.array(library_ids)[name_matches][0]

    slide.uns['spatial'] = {s_spatial: slide.uns['spatial'][s_spatial]}

    return slide

In [None]:
# Folder holding one Space Ranger output directory per Visium sample.
visium_path = '/nfs/users/nfs_l/lg18/team292/lg18/cell2location/input/'
# Folder with the per-sample endometrium/myometrium spot annotation CSVs.
visium_annot_path = '/lustre/scratch117/cellgen/team292/aa22/with_Luz/region_spot_annotation_endometrium_myometrium/'

# For every analysis set: read and merge its Visium samples, filter, and write
# two files under <path>/<set>/ -- all spots, and endometrium spots only.
# NOTE: this loop rebinds the notebook-level name `adata` to Visium data.
for sam in Vsamples.keys():
    # Per-set output folder (created without going through a shell).
    out_dir = os.path.join(path, sam)
    os.makedirs(out_dir, exist_ok=True)

    # read first sample
    adata = read_and_qc(Vsamples[sam][0], path=visium_path)
    # read the remaining samples
    slides = {}
    for i in Vsamples[sam][1:]:
        adata_1 = read_and_qc(i, path=visium_path)
        slides[str(i)] = adata_1

    # combine individual samples
    adata = adata.concatenate(
        list(slides.values()),
        batch_key="sample",
        uns_merge="unique",
        batch_categories=Vsamples[sam],
        index_unique=None
    )
    adata.var_names_make_unique()

    sc.pp.filter_cells(adata, min_genes=500)
    sc.pp.filter_genes(adata, min_cells=10)
    adata.write(os.path.join(out_dir, sam + '_visium_with_spatial.h5ad'))
    print(adata.X.shape)

    # keep only endometrium
    Endo_barcodes = []
    for vi in Vsamples[sam]:
        meta = pd.read_csv(os.path.join(visium_annot_path,
                                        'region_annotation_endo_myo_ID_' + vi + '.csv'))
        # Barcodes annotated as endometrium, prefixed with the sample name so
        # they match the spot names produced by read_and_qc.
        endo_index = meta.loc[meta['endo_myom_regions'].eq('endometrium'), 'index']
        Endo_barcodes = Endo_barcodes + [str(vi) + '_' + str(i) for i in endo_index]

    # Hash-based membership test; copy so the filters act on a real object
    # instead of a view.
    adata = adata[adata.obs_names.isin(Endo_barcodes)].copy()
    sc.pp.filter_cells(adata, min_genes=500)
    sc.pp.filter_genes(adata, min_cells=10)
    adata.write(os.path.join(out_dir, sam + '_visium_nomyometrium.h5ad'))
    print(adata.X.shape)

In [None]:
# ## Anna's way to read old visiums

# visium_data_path = '/lustre/scratch117/cellgen/team292/aa22/endometrium_Visium_data_2019/new_data_20191209/Sanger_Visium_Beta_Phase_II_Manual_Align/'

# # images 152806 and 152810 are from donor A13 and are from Proliferative stage of the menstrual cycle
# # images 152807 and 152811 are from donor A30 and are from Secretory stage of the menstrual cycle
# filtered_matrix_paths_P = visium_data_path + '152810/152810_manual_filtered_feature_bc_matrix.h5' 
# filtered_matrix_paths_S = visium_data_path + '152807/152807_manual_filtered_feature_bc_matrix.h5' 
# spot_coord_paths_P = visium_data_path + '152810/152810_manual_spatial__tissue_positions_list.csv'
# spot_coord_paths_S = visium_data_path + '152807/152807_manual_spatial__tissue_positions_list.csv'

# # Proliferative
# adatas_visium = sc.read_10x_h5(filtered_matrix_paths_P)
# adatas_visium.var_names_make_unique()
# # reading spot coordinates
# spot_coord = pd.read_csv(spot_coord_paths_P, header=None)
# spot_coord.columns = ["barcode","tissue","row","col","imagerow","imagecol"]
# spot_coord.set_index('barcode', inplace=True)
# # adding spot coordinates to adata object
# adatas_visium.obs['spot_id'] = adatas_visium.obs.index
# # subsetting to only spots in adata aka filtered (under tissue)
# spot_coord = spot_coord.loc[list(adatas_visium.obs['spot_id']),:]
# print('new shape of spot_coord', spot_coord.shape)
# X_spatial = [[spot_coord['imagecol'][j],spot_coord['imagerow'][j]] for j in range(len(spot_coord))]
# adatas_visium.obsm['spatial'] = np.array(X_spatial)
# adatas_visium.obs['sample'] = 'proliferative'
# adatas_visium.obs['sample_id'] = '152810'
# # gene names
# adatas_visium.var['SYMBOL'] = adatas_visium.var_names
# adatas_visium.var.rename(columns={'gene_ids': 'ENSEMBL'}, inplace=True)
# adatas_visium.var_names = adatas_visium.var['ENSEMBL']
# adatas_visium.var.drop(columns='ENSEMBL', inplace=True)
# # Calculate QC metrics
# sc.pp.calculate_qc_metrics(adatas_visium, inplace=True)
# adatas_visium.var['mt'] = [gene.startswith('MT-') for gene in adatas_visium.var['SYMBOL']]
# adatas_visium.obs['mt_frac'] = adatas_visium[:, adatas_visium.var['mt'].tolist()].X.sum(1).A.squeeze()/adatas_visium.obs['total_counts']
# # mitochondria-encoded (MT) genes should be removed for spatial mapping
# adatas_visium.obsm['mt'] = adatas_visium[:, adatas_visium.var['mt'].values].X.toarray()
# adatas_visium = adatas_visium[:, ~adatas_visium.var['mt'].values]
# # add sample name to obs names
# adatas_visium.obs["sample"] = [str(i) for i in adatas_visium.obs['sample']]
# adatas_visium.obs_names = adatas_visium.obs["sample"] \
#                       + '_' + adatas_visium.obs_names
# # save
# os.system('mkdir -p '+ path+'/proliferative/')  
# adatas_visium.write(path + '/proliferative/proliferative_visium.h5ad')
    
    
# # Secretory
# adatas_visium = sc.read_10x_h5(filtered_matrix_paths_S)
# adatas_visium.var_names_make_unique()
# # reading spot coordinates
# spot_coord = pd.read_csv(spot_coord_paths_S, header=None)
# spot_coord.columns = ["barcode","tissue","row","col","imagerow","imagecol"]
# spot_coord.set_index('barcode', inplace=True)
# # adding spot coordinates to adata object
# adatas_visium.obs['spot_id'] = adatas_visium.obs.index
# # subsetting to only spots in adata aka filtered (under tissue)
# spot_coord = spot_coord.loc[list(adatas_visium.obs['spot_id']),:]
# print('new shape of spot_coord', spot_coord.shape)
# X_spatial = [[spot_coord['imagecol'][j],spot_coord['imagerow'][j]] for j in range(len(spot_coord))]
# adatas_visium.obsm['spatial'] = np.array(X_spatial)
# adatas_visium.obs['sample'] = 'secretory'
# adatas_visium.obs['sample_id'] = '152807'
# # gene names
# adatas_visium.var['SYMBOL'] = adatas_visium.var_names
# adatas_visium.var.rename(columns={'gene_ids': 'ENSEMBL'}, inplace=True)
# adatas_visium.var_names = adatas_visium.var['ENSEMBL']
# adatas_visium.var.drop(columns='ENSEMBL', inplace=True)
# # Calculate QC metrics
# sc.pp.calculate_qc_metrics(adatas_visium, inplace=True)
# adatas_visium.var['mt'] = [gene.startswith('MT-') for gene in adatas_visium.var['SYMBOL']]
# adatas_visium.obs['mt_frac'] = adatas_visium[:, adatas_visium.var['mt'].tolist()].X.sum(1).A.squeeze()/adatas_visium.obs['total_counts']
# # mitochondria-encoded (MT) genes should be removed for spatial mapping
# adatas_visium.obsm['mt'] = adatas_visium[:, adatas_visium.var['mt'].values].X.toarray()
# adatas_visium = adatas_visium[:, ~adatas_visium.var['mt'].values]
# # add sample name to obs names
# adatas_visium.obs["sample"] = [str(i) for i in adatas_visium.obs['sample']]
# adatas_visium.obs_names = adatas_visium.obs["sample"] \
#                       + '_' + adatas_visium.obs_names
# # save
# os.system('mkdir -p '+ path+'/secretory/')  
# adatas_visium.write(path + '/secretory/secretory_visium.h5ad')