## Stitch HCA_F_RepTsp13447720 & HCA_F_RepTsp13447721 (Uterus)

In [None]:
import scanpy as sc

# Use an 8 x 8 default figure size for scanpy plots.
sc.settings.set_figure_params(figsize=(8,8))

In [None]:
import matplotlib as mpl
# pdf.fonttype 42 makes matplotlib write TrueType (Type 42) fonts into PDFs,
# commonly set so that text stays editable in vector-graphics editors.
mpl.rcParams['pdf.fonttype'] = 42

Do a quick basic scanpy analysis of the stitched uterus sample to plot some clusters on it.

In [None]:
# Load the stitched object for slides 20 and 21 and keep only the spots
# flagged as in_tissue.
# .copy() materialises the subset: adata.var is edited in place further down,
# and doing that on a view forces anndata to copy implicitly (with a warning).
adata = sc.read('20_21_joint.h5ad')
adata = adata[adata.obs['in_tissue'] == 1].copy()
# Visual sanity checks: tissue mask and the overlap region between slides.
sc.pl.spatial(adata, color="in_tissue")
sc.pl.spatial(adata, color="is_overlap")


### 2. Use the overlapping spots to assess differences in sequencing depth 

In [None]:
# Keep the current var names as a SYMBOL column, then promote the
# 'gene_ids' column to be the var index (named ENSEMBL) and remove it from var.
adata.var['SYMBOL'] = adata.var_names
ensembl_ids = adata.var.pop('gene_ids').rename('ENSEMBL')
adata.var_names = ensembl_ids

In [None]:
# Keep only the spots flagged as belonging to the overlap between the slides.
# A vectorised boolean mask replaces the per-element Python loop, and .copy()
# detaches the subset so the in-place gene filtering / QC that follows does not
# write into a view of adata.
overlap = adata[adata.obs['is_overlap'] == 1].copy()
overlap

In [None]:
# Drop genes detected in fewer than 5 spots of the overlap subset (modifies overlap in place).
sc.pp.filter_genes(overlap, min_cells=5) 

In [None]:
# Calculate QC metrics for the overlap spots; inplace=True stores them on the
# object (e.g. the total_counts and n_genes_by_counts columns plotted below).
sc.pp.calculate_qc_metrics(overlap, inplace=True)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Side-by-side histograms of library size and detected genes per overlap spot,
# coloured by sample.
fig, axs = plt.subplots(1, 2, figsize=(15, 6.5))
sns.histplot(data=overlap.obs, x="total_counts", hue="sample", kde=False, ax=axs[0])
sns.histplot(data=overlap.obs, x="n_genes_by_counts", hue="sample", kde=False, bins=60, ax=axs[1])


In [None]:
# Switch scanpy's plotting defaults for the remaining figures: 6 x 6 figures,
# 80 dpi on screen, 150 dpi when saved, and PDF as the save format.
sc.set_figure_params(scanpy=True, dpi=80, dpi_save=150, 
                         frameon=True, vector_friendly=True, fontsize=14, figsize=[6,6], color_map=None, 
                         format='pdf', facecolor=None, transparent=False)

In [None]:
# Violin plots of the same two QC metrics, one violin per sample, one panel per metric.
sc.pl.violin(overlap, ['total_counts', 'n_genes_by_counts'], groupby = 'sample', multi_panel = True)

### What are scale factors?

In single-cell RNA sequencing (scRNA-seq), **scale factors are normalization factors used to adjust for differences in sequencing depth across cells**.

When performing scRNA-seq, the number of transcripts captured can vary across cells due to technical factors such as variation in *cell lysis efficiency*, *cDNA synthesis efficiency*, and *sequencing depth*. This can result in differences in the number of sequencing reads obtained for each cell, which can confound downstream analyses. To address this issue, the counts of each cell are divided by a cell-specific scale factor so that every cell ends up with the same total count.

Scanpy uses the **total-count normalization** method to compute scale factors. The total-count normalization method *assumes that the majority of genes are not differentially expressed between cells and adjusts for differences in sequencing depth by scaling the count data to a common library size*. The **scaling factor for each cell is the total count divided by the median count across all cells**.

The rationale for using the median instead of the mean to compute the normalization factor is to avoid bias from highly expressed genes that may be present in only a few cells.

### 3. Derive scale factors from overlapping spots 

Since we have a dataset where some cells/spots have "duplicated" measurements from two experiments, we can use the duplicated measurements to derive scale factors that can be applied to the other cells in the dataset. The basic idea is to use the duplicated measurements to estimate the technical variation between the two experiments and use this estimate to normalize the data.

One approach would be to use the duplicated measurements to **fit a linear regression model that relates the log-expression values of the *duplicated* cells/spots from the two experiments**. The **exp(slope) of the regression line provides an estimate of the technical variation between the two experiments**, which can be used as a scale factor to normalize the other cells.

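The reason for exponentiating the slopes is that the slope in the log space is equivalent to the fold change between the two conditions. Exponentiating the slope yields the ratio of expression levels between the two conditions on a linear scale. Therefore, by exponentiating the slopes, we obtain a factor that can be directly used to rescale the data to the same scale, making it comparable between samples. Note, however, that this reading is only a heuristic: in a log–log fit, a constant fold change between the two slides shows up in the intercept, whereas the slope describes how that ratio changes with expression level (a slope of 1 means the ratio is constant).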

Basically, exponentiating the slopes is a convenient way to convert the fold change between conditions, which is measured on a logarithmic scale, to a linear scale that can be directly used as a normalization factor for the data.

In [None]:
import numpy as np

In [None]:
import json
# Load the JSON mapping of overlapping barcodes; it is iterated further down as
# key -> value pairs of spot names.
with open("20_21_overlapping_barcodes.json", "r") as f:
    overlapping_barcodes = json.load(f)

In [None]:
# Display the summary of the overlap object.
overlap

Record the pairs of overlapping spots 

In [None]:
# Build spot_pairs: row position (in overlap) of each key barcode ->
# row position of the barcode it is paired with. Pairs where either barcode is
# absent from overlap are skipped.
#
# The name -> position table is built once, instead of converting obs_names to
# a list twice per pair and then scanning it again with np.where.
position_of = {}
for pos, name in enumerate(overlap.obs_names):
    # setdefault keeps the first occurrence, matching np.where(...)[0][0]
    position_of.setdefault(name, pos)

spot_pairs = {
    position_of[k]: position_of[v]
    for k, v in overlapping_barcodes.items()
    if k in position_of and v in position_of
}

Log transform the raw counts

In [None]:
# log1p-transform a copy of the overlap object (copy = True returns a new
# AnnData, so overlap itself keeps its untransformed matrix).
log_counts = sc.pp.log1p(overlap, copy = True)

In [None]:
# Shape of the log-count rows selected by the keys of spot_pairs (first member of each pair).
log_counts.X[list(spot_pairs.keys())].toarray().shape

In [None]:
# Shape of the log-count rows selected by the values of spot_pairs (second member of each pair).
log_counts.X[list(spot_pairs.values())].toarray().shape

In [None]:
from scipy.stats import linregress

In [None]:
# Per-gene linear regression between the paired spots, in log1p space:
# x = values at the key positions of spot_pairs, y = values at the paired
# (value) positions.
#
# `genes` must hold the real column index of every gene that was fitted.
# The previous counter only advanced when a gene passed the filter, so it
# produced 0..k-1 and mislabelled which genes the slopes belong to.
rows_x = list(spot_pairs.keys())
rows_y = list(spot_pairs.values())

slopes = []
genes = []
for gene_idx, gene_counts in enumerate(log_counts.X.toarray().T):
    x = gene_counts[rows_x]
    y = gene_counts[rows_y]
    # Skip genes with no signal at all on either side of the pairing.
    if (np.sum(x) != 0) and (np.sum(y) != 0):
        slopes.append(linregress(x, y).slope)
        genes.append(gene_idx)


Exponentiate the slopes (which were calculated in log-transformed space) to obtain the scale factors

In [None]:
# One scale factor per fitted gene: exp() of the corresponding regression slope.
scale_factors = np.exp(slopes)

In [None]:
# Histogram of the per-gene scale factors (100 bins)
hist_fig, hist_ax = plt.subplots()
hist_ax.hist(scale_factors, bins=100)
hist_ax.set_xlabel('Scale Factor')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution of Scale Factors')
plt.show()

In [None]:
# Var names of the genes whose column indices were collected in `genes`
genes_scale_factors = overlap.var_names[genes]

In [None]:
# Obs names of every paired spot: first all key positions, then all value positions
paired_positions = [*spot_pairs.keys(), *spot_pairs.values()]
spots_overlap = overlap.obs_names[paired_positions]

In [None]:
# Obs names at the key positions of spot_pairs
spots_overlap_20 = overlap.obs_names[list(spot_pairs)]

In [None]:
# Obs names at the value positions of spot_pairs
spots_overlap_21 = overlap.obs_names[[*spot_pairs.values()]]

### Plot average expression of genes vs scale factors 

In [None]:
# Mean of overlap.X per gene (over the overlap spots), restricted to the
# genes that received a scale factor
import pandas as pd
overlap_plot = overlap[:, genes_scale_factors]
avg_exp = overlap_plot.X.mean(axis=0)
# Flatten the matrix-valued mean with .A1 and index it by gene
df_avg_exp = pd.DataFrame({'avg_expression': avg_exp.A1}, index=overlap_plot.var_names)
df_avg_exp.head()

In [None]:
# Put each gene's scale factor next to its mean expression
df_avg_exp['scale_factors'] = scale_factors
# Order genes from lowest to highest mean expression
df_avg_exp_sorted = df_avg_exp.sort_values('avg_expression')
df_avg_exp_sorted.head()

In [None]:
# Scatter of per-gene mean expression (x) against scale factor (y)
scatter_fig, scatter_ax = plt.subplots()
scatter_ax.scatter(df_avg_exp_sorted['avg_expression'], df_avg_exp_sorted['scale_factors'])
scatter_ax.set_xlabel('Average Expression')
scatter_ax.set_ylabel('Scale Factors')
scatter_ax.set_title('Average Gene Expression vs Scale Factors')
plt.show()

In [None]:
from scipy.stats import trim_mean

# Calculate the 10% trimmed mean of the scale factors
# (trim_mean with 0.1 cuts that fraction of values from each tail before averaging)
trimmed_mean = trim_mean(scale_factors, 0.1)
trimmed_mean

In [None]:
# Median of the scale factors, shown for comparison with the trimmed mean.
median = np.median(scale_factors)
median

In [None]:
# Normalise overlapping spots from slide 21 and plot distribution of counts again for comparison 
# Only the rows listed in spots_overlap_21 are rescaled; all other rows keep their counts.

# Create a mask for the barcodes to normalise (True for rows whose obs name is in spots_overlap_21)
normalise_mask = np.isin(overlap_plot.obs.index, spots_overlap_21)

# Copy the original count matrix to a new matrix so overlap_plot.X is not modified
normalised_counts = overlap_plot.X.copy()

# Normalise the counts for the selected barcodes: divide the masked rows by the
# single trimmed-mean scale factor
normalised_counts[normalise_mask] /= trimmed_mean

# Create a new anndata object with the normalised counts, reusing the obs/var
# annotations of overlap_plot
import anndata
normalised_overlap = anndata.AnnData(X=normalised_counts, obs=overlap_plot.obs, var=overlap_plot.var)


In [None]:
# The obs table was taken over from overlap_plot and still carries the QC
# columns computed on the unscaled counts; remove them before recomputing.
stale_qc_columns = [
    'n_genes_by_counts',
    'log1p_n_genes_by_counts',
    'total_counts',
    'log1p_total_counts',
    'pct_counts_in_top_50_genes',
    'pct_counts_in_top_100_genes',
    'pct_counts_in_top_200_genes',
    'pct_counts_in_top_500_genes',
]
normalised_overlap.obs.drop(columns=stale_qc_columns, inplace=True)

# Recompute the QC metrics on the rescaled matrix
sc.pp.calculate_qc_metrics(normalised_overlap, inplace=True)

In [None]:
# Same two histograms as before, now on the rescaled overlap spots
fig, axs = plt.subplots(1, 2, figsize=(15, 6.5))
sns.histplot(data=normalised_overlap.obs, x="total_counts", hue="sample", kde=False, ax=axs[0])
sns.histplot(data=normalised_overlap.obs, x="n_genes_by_counts", hue="sample", kde=False, bins=60, ax=axs[1])

In [None]:
# Violin plots of the recomputed QC metrics per sample, after rescaling.
sc.pl.violin(normalised_overlap, ['total_counts', 'n_genes_by_counts'], groupby = 'sample', multi_panel = True)

### 4. Normalise all spots from slide 21 based on the scale factors computed from overlapping spots

In [None]:
# Names of all spots in adata that are not part of an overlapping pair.
# The overlap names go into a set once, so each membership test is O(1)
# instead of rebuilding and scanning a list for every spot.
overlap_names = set(spots_overlap)
spots_non_overlap = [name for name in adata.obs_names if name not in overlap_names]

# Split by slide using the sample prefix carried in each spot name.
spots_non_overlap_20 = [i for i in spots_non_overlap if i.startswith('HCA_F_RepTsp13447720_')]
print(f'Non-overlapping spots from slide 20: {len(spots_non_overlap_20)}')
spots_non_overlap_21 = [i for i in spots_non_overlap if i.startswith('HCA_F_RepTsp13447721_')]
print(f'Non-overlapping spots from slide 21: {len(spots_non_overlap_21)}')


In [None]:
# Overlapping slide-21 spots followed by the non-overlapping slide-21 spots
all_spots_21 = [*spots_overlap_21.to_list(), *spots_non_overlap_21]

In [None]:
# Number of slide-21 spots collected (overlapping + non-overlapping).
len(all_spots_21)

In [None]:
# Subset to the genes that have scale factors, then to the slide-21 spots.
# Index.isin gives a vectorised boolean mask (the list-membership loop was
# quadratic in the number of spots). .copy() makes slide_21 an independent
# object, because its X is reassigned in the next step and assigning to a
# view forces an implicit copy.
slide_21 = adata[:, genes_scale_factors]
slide_21 = slide_21[slide_21.obs_names.isin(all_spots_21)].copy()

In [None]:
# Display the summary of the slide-21 subset.
slide_21

In [None]:
# Normalize count data by scale factors: every slide-21 value is divided by the
# single trimmed-mean scale factor (not by per-gene factors).
slide_21.X = slide_21.X / trimmed_mean

In [None]:
# Peek at a 10 x 10 window of the rescaled slide-21 matrix.
slide_21.X.toarray()[20:30, 20:30]

### 5. Concatenate normalised slide 21 with non-overlapping spots of slide 20

In [None]:
# Slide 20 contributes only its non-overlapping spots, restricted to the genes
# that have scale factors. Index.isin gives a vectorised boolean mask in place
# of a quadratic list-membership loop.
slide_20 = adata[:, genes_scale_factors]
slide_20 = slide_20[slide_20.obs_names.isin(spots_non_overlap_20)]
slide_20

In [None]:
# Peek at the same 10 x 10 window of the (unscaled) slide-20 matrix.
slide_20.X.toarray()[20:30, 20:30]

In [None]:
# Stack slide 20 (first) and the rescaled slide 21 (second) into one object;
# index_unique=None is passed so that no suffix is appended to the obs names.
# NOTE(review): AnnData.concatenate is reported as deprecated in favour of
# anndata.concat in recent anndata releases; confirm before upgrading.
res = slide_20.concatenate(slide_21, index_unique=None)
res

In [None]:
# Keep a copy of the current (slide-rescaled, not yet library-normalised) data in res.raw.
res.raw = res.copy()

In [None]:
# Calculate QC metrics on the merged object and store them on it (inplace=True)
sc.pp.calculate_qc_metrics(res, inplace=True)

In [None]:
# Library size and detected genes per spot for the merged object, by sample
fig, axs = plt.subplots(1, 2, figsize=(15, 6.5))
sns.histplot(data=res.obs, x="total_counts", hue="sample", kde=False, ax=axs[0])
sns.histplot(data=res.obs, x="n_genes_by_counts", hue="sample", kde=False, bins=60, ax=axs[1])

In [None]:
# Library-size normalise every spot to a target of 1e4, leaving highly
# expressed genes out of the size-factor computation.
sc.pp.normalize_total(res, target_sum=1e4, exclude_highly_expressed = True)

# log1p-transform in place, then flag highly variable genes using the
# mean/dispersion cut-offs given here.
sc.pp.log1p(res)
sc.pp.highly_variable_genes(res, min_mean=0.0125, max_mean=3, min_disp=0.5)
res

In [None]:
# Largest value in res.X after normalisation and log1p.
np.max(res.X.toarray())

In [None]:
# Scale the genes (values capped at 10), run PCA with the ARPACK solver and
# plot the variance ratio per component on a log scale.
sc.pp.scale(res, max_value=10)
sc.tl.pca(res, svd_solver='arpack')
sc.pl.pca_variance_ratio(res, log=True)

In [None]:
# Largest value in res.X after scaling (expected to respect max_value=10 set above).
np.max(res.X)

In [None]:
# Re-create the spatial metadata slot with one empty entry for the "joint" library
library_id = "joint"
res.uns["spatial"] = {library_id: {}}

In [None]:
from matplotlib.image import imread

# Register the stitched image as the "hires" image of the joint library
res.uns["spatial"][library_id]['images'] = {
    "hires": imread("20_21_tissue_hires_image.png"),
}

In [None]:
import json

# Reuse the scalefactors JSON of slide HCA_F_RepTsp13447720 for the joint library.
with open("/nfs/team292/vl6/FetalReproductiveTract/VISIUM/data/HCA_F_RepTsp13447720/spatial/scalefactors_json.json", "r") as fid:
    res.uns["spatial"][library_id]['scalefactors'] = json.load(fid)

In [None]:
import pandas as pd

# Spot position tables for slides 20 and 21; header=None because column names
# are assigned manually afterwards.
p1 = pd.read_csv("20_tissue_positions_list.csv", header=None)
p2 = pd.read_csv("21_tissue_positions_list.csv", header=None)

In [None]:
# Both position tables get the same column names and are indexed by barcode.
# NOTE(review): the two pixel columns are labelled col-then-row here; confirm
# this matches the layout of the stitched position files.
position_columns = [
    'barcode',
    'in_tissue',
    'array_row',
    'array_col',
    'pxl_col_in_fullres',
    'pxl_row_in_fullres',
]

for position_table in (p1, p2):
    position_table.columns = list(position_columns)
    position_table.index = position_table['barcode']


In [None]:
# Merge the two position tables and keep only the barcode and pixel columns
positions = pd.concat([p1, p2]).drop(columns=['in_tissue', 'array_row', 'array_col'])

# Look the pixel coordinates up by spot name; the obs copy of the key column
# receives the 'L' suffix because both tables have a 'barcode' column
res.obs['barcode'] = res.obs_names
res.obs = res.obs.join(positions, how="inner", on = 'barcode', lsuffix = 'L')

# obsm['spatial'] ends up ordered as (pxl_col_in_fullres, pxl_row_in_fullres)
res.obsm['spatial'] = res.obs[
    ['pxl_col_in_fullres', 'pxl_row_in_fullres']
].to_numpy()

In [None]:
# Display the summary of the merged object with spatial information attached.
res

In [None]:
# Build the neighbour graph on 10 PCs, cluster with Leiden at its default
# resolution and show the clusters on the tissue image.
sc.pp.neighbors(res, n_pcs=10)
sc.tl.leiden(res)
sc.pl.spatial(res, color="leiden")

Hey that doesn't look half bad. Turn the resolution down for a clearer overview.

In [None]:
# Re-cluster at resolution 0.4 (the result is again plotted from the "leiden"
# key) and save the spatial plot with the given filename suffix.
sc.tl.leiden(res, resolution=0.4)
sc.pl.spatial(res, color="leiden", save = '_leiden_clustering')

In [None]:
# Spatial expression of selected marker genes, looked up through the SYMBOL
# column and taken from res.X rather than res.raw
marker_symbols = [
    'PAX8', 'DLX5', 'UCA1', 'LGR5', 'MSX1', 'ITGA4', 'MYH11',
    'DLK1', 'C7', 'PTGER3', 'ALAS2', 'LYVE1', 'PLP1', 'SRD5A2',
]
sc.pl.spatial(
    res,
    color=marker_symbols,
    gene_symbols='SYMBOL',
    ncols=2,
    cmap='jet',
    use_raw=False,
)

In [None]:
# PCA embedding coloured by sample, to see whether the two slides separate.
sc.pl.pca(res, color = 'sample')

In [None]:
# PCA embedding coloured by Leiden cluster.
sc.pl.pca(res, color = 'leiden')

In [None]:
# Save the merged, processed object to disk.
res.write('/lustre/scratch126/cellgen/team292/vl6/VISIUM/HCA_F_RepTsp13447720_HCA_F_RepTsp13447721/HCA_F_RepTsp13447720_HCA_F_RepTsp13447721_visium.h5ad')

In [None]:
# Reload the object saved above (same path), e.g. when resuming the notebook from here.
res = sc.read('/lustre/scratch126/cellgen/team292/vl6/VISIUM/HCA_F_RepTsp13447720_HCA_F_RepTsp13447721/HCA_F_RepTsp13447720_HCA_F_RepTsp13447721_visium.h5ad')

In [None]:
import pandas as pd

In [None]:
# Axis annotation table; the first CSV column is used as the index.
axis = pd.read_csv('/lustre/scratch126/cellgen/team292/vl6/VISIUM/HCA_F_RepTsp13447720_HCA_F_RepTsp13447721/UterineVaginalAxis.csv', index_col = 0)
axis.head()

In [None]:
# Look up each spot's 'UteroVaginal_Axis' value by obs name
axis_by_spot = axis['UteroVaginal_Axis'].to_dict()
res.obs['UteroCervical Axis'] = res.obs_names.map(axis_by_spot)

In [None]:
# Show the axis annotation on the tissue and save the figure with the '_axis' suffix.
sc.pl.spatial(res, color="UteroCervical Axis", cmap = 'jet', save = '_axis')

In [None]:
# Spatial expression of a second gene panel, looked up through the SYMBOL
# column and taken from res.X rather than res.raw
axis_gene_symbols = [
    'CFAP157', 'CCDC114', 'VPS13D', 'CFAP70', 'SPAG17', 'VWA3A', 'BMP7',
    'PIFO', 'CFAP45', 'NOTUM', 'CTNNA2', 'ID3', 'MRM3', 'RAMP2', 'CDK17',
    'TMEM183A', 'RHEX', 'SIX1', 'DLX5', 'MSX2', 'SLC14A1', 'SUSD2', 'NKD1',
    'HS3ST3A1', 'CDH2', 'WIF1', 'MUC5B', 'TFF3', 'EPAS1', 'CASTOR1', 'GPC3',
    'MMP7', 'CLCN5', 'IGF1', 'TMX2',
]
sc.pl.spatial(
    res,
    color=axis_gene_symbols,
    gene_symbols='SYMBOL',
    ncols=3,
    cmap='jet',
    use_raw=False,
)

In [None]:
# Load the object stored under the cell2location output directory.
# Note: this rebinds `adata`, replacing the stitched object loaded at the top.
adata = sc.read('/lustre/scratch126/cellgen/team292/vl6/VISIUM/cell2location_v0.1_merged/female_axis/predmodel/sp.h5ad')
adata

In [None]:
# Copy obsm['q05_cell_abundance_w_sf'] into obs, using the names listed in
# uns['mod']['factor_names'] as column names.
# NOTE(review): assumes the obsm columns are ordered like factor_names; confirm.
adata.obs[adata.uns['mod']['factor_names']] = adata.obsm['q05_cell_abundance_w_sf']

In [None]:
# Cell-type abundance columns to carry over from the cell2location object
# (adata) to the stitched object (res).
cell_type_columns = [
    'CoelomicEpithelial',
    'Epithelial_FallopianTube', 'Epithelial_Uterus', 'Epithelial_Ciliated',
    'Epithelial_LowerTract_1_SNX31', 'Epithelial_LowerTract_2_HOXB13',
    'Epithelial_LowerTract_3', 'Epithelial_LowerTract_4',
    'Mesenchymal_FallopianTube', 'Mesenchymal_Uterus',
    'Mesenchymal_LowerTract_1_SRD5A2', 'Mesenchymal_LowerTract_2_PGR',
    'Mesenchymal_LowerTract_3', 'SmoothMuscle_FallopianTube',
    'SmoothMuscle_Uterus', 'SmoothMuscle_LowerTract',
    'Ligament_FallopianTube', 'Ligament_Uterus',
    'Ligament_LowerTract_1_PRDM8', 'Ligament_LowerTract_2_TBX18',
    'Epoophron', 'PV', 'Pre_PV', 'Endothelial', 'Endothelial_lymphatic',
    'Immune', 'Neural_1_PLP1', 'Neural_2_STMN2',
]

# Look every spot of res up by name in adata; spots missing from adata get NaN.
# Mapping through obs_names makes the name-based lookup explicit instead of
# assigning a raw dict to a DataFrame column, whose handling has differed
# between pandas versions. Iterating the plain list also avoids building a
# DataFrame slice just to walk over its column names.
for col in cell_type_columns:
    res.obs[col] = res.obs_names.map(adata.obs[col].to_dict())

In [None]:
# Spatial maps of the estimated abundance of every cell type copied onto res.
sc.pl.spatial(res, cmap='jet',
                  # show all 28 cell-type abundance columns
                  color=['CoelomicEpithelial', 'Epithelial_FallopianTube', 'Epithelial_Uterus', 'Epithelial_Ciliated', 'Epithelial_LowerTract_1_SNX31', 
                         'Epithelial_LowerTract_2_HOXB13', 'Epithelial_LowerTract_3', 'Epithelial_LowerTract_4', 'Mesenchymal_FallopianTube', 
                         'Mesenchymal_Uterus', 'Mesenchymal_LowerTract_1_SRD5A2', 'Mesenchymal_LowerTract_2_PGR', 'Mesenchymal_LowerTract_3', 
                         'SmoothMuscle_FallopianTube', 'SmoothMuscle_Uterus', 'SmoothMuscle_LowerTract', 'Ligament_FallopianTube', 'Ligament_Uterus', 
                         'Ligament_LowerTract_1_PRDM8', 'Ligament_LowerTract_2_TBX18', 'Epoophron', 'PV', 'Pre_PV', 'Endothelial', 'Endothelial_lymphatic', 
                         'Immune', 'Neural_1_PLP1', 'Neural_2_STMN2'],
                  ncols=3, size=1.3,
                  #img_key='hires5K',
                  # limit color scale at 99.2% quantile of cell abundance
                  vmin=0, vmax='p99.2'
                 )