In [None]:
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

## VISIUM

In [None]:
# Load the Visium fitted values (file name indicates tradeSeq smoothers for the
# epithelium). The first CSV column becomes the row index, which later cells
# treat as gene identifiers.
# NOTE(review): columns are presumably positions along the pseudospace axis — confirm.
visium_smoothers_df = pd.read_csv('/lustre/scratch126/cellgen/team292/vl6/VISIUM/femalereproductiveaxis_visium_downsampled_fitted_values_tradeseq_epithelium.csv', 
                              index_col = 0)
# Quick look at the dimensions and the first rows.
print(visium_smoothers_df.shape)
visium_smoothers_df.head()

## scRNA-seq

In [None]:
# Load the scRNA-seq fitted values (file name indicates tradeSeq smoothers for the
# epithelium). The first CSV column becomes the row index, which later cells
# treat as gene identifiers.
# NOTE(review): columns are presumably positions along the pseudospace axis — confirm.
scrnaseq_smoothers_df = pd.read_csv('/lustre/scratch126/cellgen/team292/vl6/VISIUM/epi_femalereproductiveaxis_scrnaseq_downsampled_fitted_values_tradeseq.csv', 
                               index_col = 0)
# Quick look at the dimensions and the first rows.
print(scrnaseq_smoothers_df.shape)
scrnaseq_smoothers_df.head()

## Comparison of smoothers

In [None]:
import scipy.stats

### How many genes are in common between the two? 

In [None]:
# Gene identifiers are the row index of each smoother table.
scrnaseq_genes = scrnaseq_smoothers_df.index.tolist()
visium_genes = visium_smoothers_df.index.tolist()

# Sets give O(1) membership tests (the lists would make every lookup a linear scan).
scrnaseq_gene_set = set(scrnaseq_genes)
visium_gene_set = set(visium_genes)

# Shared genes, de-duplicated and kept in scRNA-seq table order. Building the list
# straight from a set intersection would give a hash-dependent order that changes
# between kernel sessions and propagates into every downstream matrix.
common_genes = [gene for gene in dict.fromkeys(scrnaseq_genes) if gene in visium_gene_set]

# Genes present in only one of the two datasets (original table order preserved).
scrnaseq_unique = [gene for gene in scrnaseq_genes if gene not in visium_gene_set]
visium_unique = [gene for gene in visium_genes if gene not in scrnaseq_gene_set]

In [None]:
# Sizes of the shared gene set and of the scRNA-seq-only and Visium-only sets.
len(common_genes), len(scrnaseq_unique), len(visium_unique)

In [None]:
# Spot check: is PNOC among the Visium-only genes?
'PNOC' in visium_unique

In [None]:
# Spot check: is PNOC among the genes shared by both datasets?
'PNOC' in common_genes

In [None]:
import matplotlib_venn

In [None]:
import matplotlib as mpl
# Write text in saved PDFs as Type 42 (TrueType) fonts instead of matplotlib's
# default Type 3.
mpl.rcParams['pdf.fonttype'] = 42

In [None]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

# Region sizes are derived from the gene lists computed above rather than typed in
# by hand, so the figure cannot drift out of sync with the loaded data.
# venn2 expects (only-in-A, only-in-B, in-both).
set_visium = len(visium_unique)      # genes found only in Visium
set_scrnaseq = len(scrnaseq_unique)  # genes found only in scRNA-seq
intersection = len(common_genes)     # genes found in both

# Create the Venn diagram
venn = venn2(subsets=(set_visium, set_scrnaseq, intersection), set_labels=('Visium', 'scRNA-seq'))

# Colour and transparency per region: '10' = Visium only, '01' = scRNA-seq only,
# '11' = intersection.
region_styles = {
    '10': ('blue', 0.5),
    '01': ('orange', 0.5),
    '11': ('yellowgreen', 0.7),
}
for region_id, (region_colour, region_alpha) in region_styles.items():
    patch = venn.get_patch_by_id(region_id)
    if patch is None:
        # matplotlib-venn does not draw a patch for an empty region.
        continue
    patch.set_color(region_colour)
    patch.set_alpha(region_alpha)

# Save the plot as a PDF
pdf_filename = 'venn_diagram_epithelium.pdf'
plt.savefig(pdf_filename, format='pdf')

# Display the plot
plt.show()

In [None]:
# Keep only the shared genes from the scRNA-seq smoothers; rows come back in
# common_genes order.
scrnaseq_smoothers_df_common = scrnaseq_smoothers_df.loc[common_genes]

In [None]:
# Keep only the shared genes from the Visium smoothers; rows come back in
# common_genes order, matching the scRNA-seq subset row for row.
visium_smoothers_df_common = visium_smoothers_df.loc[common_genes]

In [None]:
# Plain NumPy matrices for the row-wise comparisons below. Row i of both matrices
# belongs to common_genes[i].
scrnaseq_smoothers_mtx_common = scrnaseq_smoothers_df_common.to_numpy()
visium_smoothers_mtx_common = visium_smoothers_df_common.to_numpy()

### 1. Non-parametric correlation between common genes (Spearman's rank correlation test)

In [None]:
# Spearman rank correlation between the scRNA-seq and Visium fitted profile of each
# shared gene (one coefficient per row; the p-value is discarded).
spearman_correlations = [
    scipy.stats.spearmanr(scrnaseq_profile, visium_profile)[0]
    for scrnaseq_profile, visium_profile in zip(
        scrnaseq_smoothers_mtx_common, visium_smoothers_mtx_common
    )
]


In [None]:
# One coefficient per shared gene is expected.
len(spearman_correlations)

In [None]:
import matplotlib.pyplot as plt

# Histogram of the per-gene Spearman coefficients.
fig, ax = plt.subplots()
ax.hist(spearman_correlations, bins=30, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Spearman Correlation Coefficients')
ax.set_xlabel('Correlation Coefficient')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Number of shared genes whose Spearman coefficient exceeds 0.7.
np.count_nonzero(np.array(spearman_correlations) > 0.7)

### 2. Cosine similarity 

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Cosine similarity between the scRNA-seq and Visium fitted profile of each shared
# gene. Both matrices have one row per gene in the same order.
num_genes = scrnaseq_smoothers_mtx_common.shape[0]

# cosine_similarity works on 2D inputs, so each profile is passed as a 1 x n row
# vector and the single entry of the resulting 1 x 1 matrix is kept.
gene_cosine_similarities = np.fromiter(
    (
        cosine_similarity(scrnaseq_profile[np.newaxis, :], visium_profile[np.newaxis, :])[0, 0]
        for scrnaseq_profile, visium_profile in zip(
            scrnaseq_smoothers_mtx_common, visium_smoothers_mtx_common
        )
    ),
    dtype=float,
    count=num_genes,
)



In [None]:
# One similarity value per shared gene is expected.
len(gene_cosine_similarities)

In [None]:
import matplotlib.pyplot as plt

# Histogram of the per-gene cosine similarities, saved as a PDF and then shown.
fig, ax = plt.subplots(figsize=(5, 4))
ax.hist(gene_cosine_similarities, bins=20, color='gainsboro', edgecolor='black')
ax.set_title('Distribution of Cosine Similarities')
ax.set_xlabel('Cosine similarity')
ax.set_ylabel('Frequency')

histogram_pdf = 'cosine_similarities_histogram_epithelium.pdf'
fig.savefig(histogram_pdf, format='pdf')

plt.show()


In [None]:
# Number of shared genes whose cosine similarity exceeds 0.9.
np.count_nonzero(np.array(gene_cosine_similarities) > 0.9)

In [None]:
# Names of the shared genes whose scRNA-seq and Visium profiles have cosine
# similarity > 0.9. gene_cosine_similarities is aligned with the rows of the
# common-gene subset, so the mask must be applied to that subset's index; applying
# the positions to the full scRNA-seq table would pick unrelated genes.
high_similarity_mask = np.asarray(gene_cosine_similarities) > 0.9
common_pattern_genes = scrnaseq_smoothers_df_common.index[high_similarity_mask].tolist()

## Select common genes + scRNA-seq specific genes 

In [None]:
# Candidate gene list: shared genes first, followed by the scRNA-seq-only genes.
tot_genes = [*common_genes, *scrnaseq_unique]

In [None]:
# Total number of candidate genes (shared + scRNA-seq-only).
len(tot_genes)

## Intersect prioritised genes with human TFs

In [None]:
# Load the transcription-factor annotation table.
# NOTE(review): the path suggests a human TF database extract (v1.01) — confirm provenance.
tfs = pd.read_csv('/nfs/team292/vl6/FetalReproductiveTract/humanTFs/DatabaseExtract_v_1.01.csv')

In [None]:
# Tally of the values in the 'Is TF?' column.
tfs['Is TF?'].value_counts()

In [None]:
# Tally of the values in the 'TF assessment' column.
tfs['TF assessment'].value_counts()

In [None]:
# Keep only entries flagged as TFs whose assessment is 'Known motif'.
is_flagged_tf = tfs['Is TF?'] == 'Yes'
has_known_motif = tfs['TF assessment'] == 'Known motif'
tfs = tfs[is_flagged_tf & has_known_motif]

In [None]:
# Reduce the filtered table to a plain list of HGNC symbols.
# NOTE: this rebinds `tfs` from a DataFrame to a list, so the filtering cell above
# cannot be re-run afterwards without reloading the table first.
tfs = tfs['HGNC symbol'].tolist()

In [None]:
# Number of TF symbols retained after filtering.
len(tfs)

In [None]:
# Spot check: is CD36 among the retained TF symbols?
'CD36' in tfs

In [None]:
# Candidate genes that are annotated TFs, kept in tot_genes order.
# The symbols are put in a set first so each membership test is O(1) instead of a
# scan through the whole TF list.
tf_symbol_set = set(tfs)
tfs_prioritised = [gene for gene in tot_genes if gene in tf_symbol_set]

In [None]:
# Number of candidate genes that are annotated TFs.
len(tfs_prioritised)

In [None]:
# tfs_prioritised = [i for i in tfs_prioritised if not i.startswith("HOX")]

In [None]:
# Count again; unchanged while the HOX filter in the previous cell stays commented out.
len(tfs_prioritised)

In [None]:
# Print the full list of prioritised TFs for manual inspection.
print(tfs_prioritised)

In [None]:
# Manually drop a fixed set of TFs from the prioritised list.
# NOTE(review): the reason for excluding these specific genes is not recorded here.
manually_excluded_tfs = {'CEBPD', 'SOX4', 'ZNF770', 'IRF1', 'EGR1', 'FOSL2', 'MAFB'}
tfs_prioritised = [gene for gene in tfs_prioritised if gene not in manually_excluded_tfs]

In [None]:
# Number of prioritised TFs left after the manual exclusion.
len(tfs_prioritised)

In [None]:
# scRNA-seq fitted values for the prioritised TFs; rows come back in
# tfs_prioritised order.
scrnaseq_smoothers_df_tfs = scrnaseq_smoothers_df.loc[tfs_prioritised]

In [None]:
# Same values as a NumPy matrix (one row per prioritised TF).
scrnaseq_smoothers_mtx_tfs = scrnaseq_smoothers_df_tfs.to_numpy()

In [None]:
# Display the final prioritised TF list.
tfs_prioritised

## Cluster TFs by spatial expression pattern

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the TF matrix before clustering.
# NOTE(review): StandardScaler centres and scales each COLUMN, i.e. each position
# across TFs, not each TF's own profile (row). If the aim is to cluster TFs by the
# shape of their profile regardless of expression level, row-wise scaling
# (fit_transform on the transpose, then transpose back) may be what is wanted — confirm.
scaler = StandardScaler()
scrnaseq_smoothers_mtx_tfs_scaled = scaler.fit_transform(scrnaseq_smoothers_mtx_tfs)

In [None]:
# Shape check: scaling must not change the matrix dimensions.
scrnaseq_smoothers_mtx_tfs_scaled.shape

In [None]:
# Perform hierarchical clustering
scrnaseq_smoothers_mtx_tfs_scaled_Z = linkage(scrnaseq_smoothers_mtx_tfs_scaled, method='ward', 
                                             optimal_ordering = True)

In [None]:
# Prioritised TFs that are also detected in Visium (i.e. belong to the shared gene
# set), kept in common_genes order.
prioritised_tf_lookup = set(tfs_prioritised)
common_tfs = [gene for gene in common_genes if gene in prioritised_tf_lookup]

In [None]:
# Print the prioritised TFs that go into the dendrogram below.
print(tfs_prioritised)

In [None]:
import matplotlib as mpl
# Write text in saved PDFs as Type 42 (TrueType) fonts. This repeats the setting
# made earlier in the notebook, so the cell is safe to run on its own.
mpl.rcParams['pdf.fonttype'] = 42

In [None]:
# Dendrogram of the prioritised TFs; TFs also present in the shared gene set are
# shown in bold. The figure is saved as a PDF and then displayed.
fig, ax = plt.subplots(figsize=(6, 2.5))
ax.set_title("Hierarchical clustering dendrogram of spatially-variable TFs")

# Draw the tree on the current axes, labelling each leaf with its TF name.
tf_labels = scrnaseq_smoothers_df_tfs.index.to_list()
dendro = dendrogram(
    scrnaseq_smoothers_mtx_tfs_scaled_Z,
    labels=tf_labels,
    leaf_rotation=45,
    leaf_font_size=10,
)

# Embolden the tick labels of TFs found in both datasets.
x_labels = ax.get_xmajorticklabels()
for tick_label in x_labels:
    if tick_label.get_text() in common_tfs:
        tick_label.set_fontweight('bold')

ax.set_xlabel("TFs")
ax.set_ylabel("Distance")

# bbox_inches='tight' keeps the rotated leaf labels inside the saved page.
dendrogram_pdf_path = 'hierarchical_clustering_dendrogram_epithelium.pdf'
fig.savefig(dendrogram_pdf_path, format='pdf', bbox_inches='tight')

plt.show()

In [None]:
from scipy.cluster.hierarchy import fcluster

In [None]:
# Cut the tree into flat clusters: with criterion='distance', observations in the
# same cluster have a cophenetic distance no greater than the cutoff.
distance_cutoff = 10  # example value, adjust based on your dendrogram
clusters = fcluster(scrnaseq_smoothers_mtx_tfs_scaled_Z, distance_cutoff, criterion='distance')

# clusters now contains the cluster ID for each gene, in the row order of the
# matrix that was passed to linkage

In [None]:
# Number of distinct flat clusters produced by the cutoff.
len(np.unique(clusters))

In [None]:
# ID of the cluster to inspect in the cells below; change it and re-run them to
# look at another cluster.
cluster_number = 1

In [None]:
# Row positions (within the TF matrix) of the genes assigned to the selected cluster.
gene_indices_in_cluster = np.flatnonzero(clusters == cluster_number).tolist()

In [None]:
# Fitted values of the TFs in the selected cluster.
scrnaseq_smoothers_df_tfs.iloc[gene_indices_in_cluster]

In [None]:
# Names of the TFs in the selected cluster, in matrix row order.
cluster_genes = scrnaseq_smoothers_df_tfs.iloc[gene_indices_in_cluster].index.to_list()

In [None]:
# Unscaled fitted values for those TFs; row k belongs to cluster_genes[k].
fitted_values_cluster = scrnaseq_smoothers_mtx_tfs[gene_indices_in_cluster, :]

In [None]:
# (number of TFs in the cluster, number of fitted points per TF).
fitted_values_cluster.shape

In [None]:
# First TF in the selected cluster.
cluster_genes[0]

In [None]:
# Plot the fitted scRNA-seq curve of every TF in the selected cluster and save the
# figure as a PDF named after that cluster.
plt.figure(figsize=(8, 7))

# x grid with one point per fitted-value column, so the plot keeps working if the
# smoothers are evaluated at a different number of points.
# NOTE(review): the [-1, 5] range is presumably the pseudospace span used when the
# smoothers were fitted — confirm.
pseudospace = np.linspace(-1, 5, fitted_values_cluster.shape[1])

# One semi-transparent line per TF, labelled with its gene name.
for gene_name, gene_fitted_values in zip(cluster_genes, fitted_values_cluster):
    plt.plot(pseudospace, gene_fitted_values, alpha=0.5, label=gene_name)

plt.title(f"Splines for Genes in Cluster {cluster_number}")
plt.xlabel("Müllerian longitudinal axis")
plt.ylabel("Fitted Values")
# Place the legend outside the axes, to the right of the plot.
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))

# The file name follows cluster_number, so plotting another cluster neither
# overwrites an earlier PDF nor saves under a stale cluster label.
# (Variable name kept as-is in case later cells refer to it.)
cluster3_tfs = f'cluster{cluster_number}_tfs.pdf'
plt.savefig(cluster3_tfs, format='pdf', bbox_inches='tight')  # bbox_inches='tight' ensures that labels are not cut off

plt.show()
