In [None]:
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

## VISIUM smoothers

In [None]:
# Load the Visium fitted-values table (per the file name: tradeSeq fitted values
# for the epithelium along the fallopian axis). The first CSV column becomes the
# row index, which later cells read as the list of genes.
visium_fitted_values_path = '/lustre/scratch126/cellgen/team292/vl6/VISIUM/fallopian_axis_visium_fitted_values_tradeseq_epithelium.csv'
visium_smoothers_df = pd.read_csv(visium_fitted_values_path, index_col=0)

# Report the table dimensions, then preview the first rows.
print(visium_smoothers_df.shape)
visium_smoothers_df.head()

## scRNA-seq anndata object for bona fide secretory epithelial genes

In [None]:
import scanpy as sc
import anndata

In [None]:
# Load the scRNA-seq AnnData object from disk.
scrnaseq_h5ad_path = '/nfs/team292/vl6/FetalReproductiveTract/post_10pcw_females.20240326.h5ad'
scrnaseq = sc.read(scrnaseq_h5ad_path)

# Number of cells per annotated cell type (column 'celltype' of .obs).
scrnaseq.obs['celltype'].value_counts()

In [None]:
# Restrict the scRNA-seq object to the epithelium of interest plus the cell
# types that might be contaminating the Visium signal.
cell_types_to_keep = [
    'FallopianTube_Mesenchyme', 'Ciliated_Epithelium', 'MesonephricTubules_Epithelium',
    'Endothelial_Lymphatic', 'Erythroid', 'Neural', 'Immune', 'Coelomic_Epithelium',
    'FallopianTube_SmoothMuscle', 'Pre-Perivascular', 'SchwannCell',
    'FallopianTube_Epithelium', 'Endothelial', 'Perivascular',
    'Epoophron_Mesenchyme', 'FallopianTube_Ligament',
]

# Vectorised boolean mask over cells: True where the cell's annotation is in
# the list above. This selects the same cells as a per-cell Python membership
# test, without looping over every cell in Python.
keep_cell_mask = scrnaseq.obs['celltype'].isin(cell_types_to_keep)
scrnaseq = scrnaseq[keep_cell_mask]

# (n_cells, n_genes) after subsetting.
scrnaseq.shape

## Compare expression of genes across cell types

In [None]:
# Candidate genes: every row label of the Visium fitted-values table.
genes = visium_smoothers_df.index.to_list()
# Number of candidate genes before intersecting with the scRNA-seq object.
len(genes)

In [None]:
# Keep only the Visium genes that are also present in the scRNA-seq object,
# preserving their original order. The gene names are collected into a set
# once so each membership test is O(1), rather than rebuilding and linearly
# scanning the full var_names list for every gene.
scrnaseq_gene_set = set(scrnaseq.var_names)
genes = [gene for gene in genes if gene in scrnaseq_gene_set]

In [None]:
# Number of Visium genes that are also present in scrnaseq.var_names.
len(genes)

In [None]:
# Cell-type label whose genes are being selected; the filtering steps below
# look it up as a row label of `average_expression`.
cell_type_of_interest = 'FallopianTube_Epithelium'

In [None]:
# Average expression per cell type, restricted to the genes of interest.
# The AnnData is subset to `genes` *before* calling .to_df() so that only those
# columns are materialised as a DataFrame and averaged, instead of converting
# the full cells x all-genes matrix and discarding most columns afterwards.
# NOTE(review): name-based AnnData indexing assumes var_names are unique.
genes_expression_df = scrnaseq[:, genes].to_df()
average_expression = genes_expression_df.groupby(scrnaseq.obs['celltype']).mean()

# Same table with the cell type as a regular 'celltype' column rather than the
# index; the filtered summary tables below are column subsets of it.
summary_table = average_expression.reset_index()

In [None]:
# First filtering step: keep a gene only if the cell type of interest is among
# the `top_n_cell_types` cell types with the highest average expression of it.
# NOTE(review): an earlier version of this comment said "top 3", but the code
# has always checked the top 4. The cutoff is kept at 4 to preserve results;
# change the constant below if 3 was the intended threshold.
top_n_cell_types = 4

filtered_genes = []
for gene in genes:
    # Cell types ordered from highest to lowest average expression of this gene
    sorted_expression = average_expression[gene].sort_values(ascending=False)

    # Retain the gene if the cell type of interest is within the top-ranked cell types
    if cell_type_of_interest in sorted_expression.index[:top_n_cell_types]:
        filtered_genes.append(gene)

# Filter the summary table to retain only the filtered genes
filtered_summary_table = summary_table[['celltype'] + filtered_genes]

In [None]:
# Display the per-cell-type averages for the genes that passed the first filter.
filtered_summary_table

In [None]:
# Second filtering step: compare the cell type of interest against a single
# comparison cell type and drop genes that are clearly higher in the latter.
comparison_cell_type = "Ciliated_Epithelium"

# A gene is kept when the comparison cell type's average expression is at most
# this multiple of the average expression in the cell type of interest
# (1.2 => the comparison cell type may be up to 20% higher).
# NOTE(review): an earlier comment described this threshold as "30%", which
# does not match the factor of 1.2 used by the code. The code's value is kept;
# confirm which threshold was intended.
max_comparison_ratio = 1.2

filtered_genes_step2 = []
for gene in filtered_genes:
    expression_interest = average_expression.loc[cell_type_of_interest, gene]
    expression_comparison = average_expression.loc[comparison_cell_type, gene]

    # Keep the gene if expression in the cell type of interest is strictly
    # higher than in the comparison cell type, or if the comparison cell type
    # exceeds it by no more than the allowed ratio.
    if (expression_interest > expression_comparison
            or expression_comparison <= max_comparison_ratio * expression_interest):
        filtered_genes_step2.append(gene)

# Filter the summary table to retain only the filtered genes from the second step
filtered_summary_table_step2 = filtered_summary_table[['celltype'] + filtered_genes_step2]


In [None]:
# Display the per-cell-type averages for the genes that passed both filters.
filtered_summary_table_step2

### Save remaining genes and plot their trends in TradeSeq 

In [None]:
# Print the column labels of the final table. Note that the first entry is the
# 'celltype' label column, not a gene; the remaining entries are the genes that
# passed both filtering steps.
print(filtered_summary_table_step2.columns.to_list())