### Identify fetalization DARs in each cell type and compare their similarity across cell types

In [1]:
import pandas as pd 
import numpy as np
import pickle
import os
import itertools
import matplotlib.pyplot as plt 
import seaborn as sns
import matplotlib.colors as mcolors
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

In [2]:
# folder that the saved figure is written into
plots_dir = "../../../paper_figures/"

# folder holding the per-cell-type fetalization CSVs; created if it is missing
fetalization_genes_dir = "fetalization_peaks/"
os.makedirs(fetalization_genes_dir, exist_ok=True)

In [3]:
# cell types to analyse; Adipocytes are left out because they have no fetal cells
cell_types = [
    "Cardiomyocyte",
    "Endothelial",
    "Fibroblast",
    "Myeloid",
    "Pericyte",
]
len(cell_types)

5

In [6]:
def _significant_labels(results_df, log2FC_threshold, p_adj_threshold):
    '''Return (up, down) sets of index labels from one results table that pass both cutoffs.'''
    significant = results_df['padj'] < p_adj_threshold
    log2fc = results_df['log2FoldChange']
    # rows with a NaN padj or log2FoldChange compare False and are therefore excluded
    up = set(results_df[(log2fc > log2FC_threshold) & significant].index)
    down = set(results_df[(log2fc < -log2FC_threshold) & significant].index)
    return up, down


def run_ORA_analysis(cell_type, log2FC_threshold = 0.5, p_adj_threshold=0.05):
    '''For a particular cell type, load the disease and fetal differential results and return the
    features that change in the same direction in both contrasts (up in both, and down in both).

    Despite the name, no over-representation analysis is run here; the function only builds the
    two intersecting lists.

    Inputs:
    - cell_type: The cell type to analyse; selects "pydeseq2_results/<cell_type>_results_dict.pkl"
    - log2FC_threshold: A feature is "up" when log2FoldChange > threshold and "down" when
      log2FoldChange < -threshold
    - p_adj_threshold: A feature must also have padj below this value

    Returns:
    - [up_in_both, down_in_both]: two lists of index labels (presumably gene identifiers), each
      sorted so that the result is reproducible from run to run
    '''
    # open the results dictionary
    # NOTE: pickle can execute arbitrary code on load; only point this at files you produced
    results_path = os.path.join("pydeseq2_results", f"{cell_type}_results_dict.pkl")
    with open(results_path, "rb") as f:
        results_dict = pickle.load(f)

    # get the disease and fetal results df
    disease_results_df = results_dict['disease-binary_Y_vs_N']
    fetal_results_df = results_dict['age-group_fetal_vs_young']

    # get the up and down sets for the disease and fetal contrasts
    up_in_disease, down_in_disease = _significant_labels(
        disease_results_df, log2FC_threshold, p_adj_threshold)
    up_in_fetal, down_in_fetal = _significant_labels(
        fetal_results_df, log2FC_threshold, p_adj_threshold)

    # intersect per direction; sorting removes the arbitrary ordering of a set
    intersecting_disease_fetal_up_genes = sorted(up_in_disease & up_in_fetal)
    intersecting_disease_fetal_down_genes = sorted(down_in_disease & down_in_fetal)

    return [intersecting_disease_fetal_up_genes, intersecting_disease_fetal_down_genes]

In [7]:
# build the shared up/down lists for every cell type and write each one to its own csv
for cell_type in cell_types:

    # where the two lists for this cell type are saved
    fetal_up_path = os.path.join(fetalization_genes_dir, f"{cell_type}_up_fetalization_genes.csv")
    fetal_down_path = os.path.join(fetalization_genes_dir, f"{cell_type}_down_fetalization_genes.csv")

    shared_up, shared_down = run_ORA_analysis(cell_type=cell_type)

    # one-column frames, written without the row index
    intersecting_disease_fetalization_up_genes = pd.DataFrame(shared_up)
    intersecting_disease_fetalization_up_genes.to_csv(fetal_up_path, index=False)

    intersecting_disease_fetalization_down_genes = pd.DataFrame(shared_down)
    intersecting_disease_fetalization_down_genes.to_csv(fetal_down_path, index=False)

### Check degree of similarity between fetalization genes across all cell types

In [8]:
def jaccard_similarity(set1, set2):
    '''Return |set1 & set2| / |set1 | set2|, or 0 when both sets are empty.'''
    shared_count = len(set1 & set2)
    total_count = len(set1 | set2)
    # an empty union would divide by zero; report no similarity instead
    if total_count == 0:
        return 0
    return shared_count / total_count

In [9]:
def _load_gene_set(path, cell_type, direction):
    '''Read the first column of a saved gene-list csv into a set.

    Returns None when the file is missing or zero bytes (silently), or when pandas finds
    nothing to parse in it (after printing a warning).
    '''
    if not (os.path.exists(path) and os.path.getsize(path) > 0):
        return None
    try:
        return set(pd.read_csv(path).iloc[:, 0].dropna())
    except pd.errors.EmptyDataError:
        print(f"Warning: {cell_type} {direction} file is empty. Skipping...")
        return None


# load fetalization gene sets for all cell types
fetal_up_genes = {}
fetal_down_genes = {}

for cell_type in cell_types:
    up_path = os.path.join(fetalization_genes_dir, f"{cell_type}_up_fetalization_genes.csv")
    down_path = os.path.join(fetalization_genes_dir, f"{cell_type}_down_fetalization_genes.csv")

    # each direction is loaded on its own so an unreadable "up" file cannot hide the "down" file
    up_set = _load_gene_set(up_path, cell_type, "up")
    if up_set is not None:
        fetal_up_genes[cell_type] = up_set

    down_set = _load_gene_set(down_path, cell_type, "down")
    if down_set is not None:
        fetal_down_genes[cell_type] = down_set

# compute pairwise Jaccard similarity matrices
cell_type_pairs = list(itertools.combinations(cell_types, 2))

# start from the identity matrix: diagonal = 1, every other entry 0.0 until a pair is computed.
# (this avoids writing the diagonal through `.values`, which is read-only under copy-on-write)
jaccard_matrix_up = pd.DataFrame(np.eye(len(cell_types)), index=cell_types, columns=cell_types)
jaccard_matrix_down = pd.DataFrame(np.eye(len(cell_types)), index=cell_types, columns=cell_types)

for cell1, cell2 in cell_type_pairs:
    # pairs where either cell type has no loaded set keep their 0.0 entry
    if cell1 in fetal_up_genes and cell2 in fetal_up_genes:
        similarity_up = jaccard_similarity(fetal_up_genes[cell1], fetal_up_genes[cell2])
        jaccard_matrix_up.loc[cell1, cell2] = similarity_up
        jaccard_matrix_up.loc[cell2, cell1] = similarity_up
    if cell1 in fetal_down_genes and cell2 in fetal_down_genes:
        similarity_down = jaccard_similarity(fetal_down_genes[cell1], fetal_down_genes[cell2])
        jaccard_matrix_down.loc[cell1, cell2] = similarity_down
        jaccard_matrix_down.loc[cell2, cell1] = similarity_down

# round values
jaccard_matrix_up = jaccard_matrix_up.round(2)
jaccard_matrix_down = jaccard_matrix_down.round(2)

In [None]:
def _off_diagonal_vmax(matrix, headroom=1.5):
    '''Return the upper colour limit for a square similarity matrix.

    The limit is the largest entry that is not on the diagonal (selected by position, so an
    off-diagonal value of 1.0 still counts) multiplied by `headroom`. When there is no positive
    finite off-diagonal value, 1.0 is returned so the colour scale never collapses to vmin == vmax.
    '''
    values = matrix.to_numpy(dtype=float)
    off_diagonal = values[~np.eye(*values.shape, dtype=bool)]
    largest = off_diagonal.max() if off_diagonal.size else 0.0
    if not np.isfinite(largest) or largest <= 0:
        return 1.0
    return largest * headroom


# plot heatmaps
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

heatmap_panels = [
    (jaccard_matrix_up, "Blues", "Jaccard similarity for fetalization up genes across cell types"),
    (jaccard_matrix_down, "Reds", "Jaccard similarity for fetalization down genes across cell types"),
]

for ax, (matrix, cmap, title) in zip(axes, heatmap_panels):
    # scale colours to the off-diagonal values so the diagonal of 1s does not wash them out
    norm = mcolors.Normalize(vmin=0, vmax=_off_diagonal_vmax(matrix))
    sns.heatmap(matrix, annot=True, cmap=cmap, ax=ax, norm=norm)
    ax.set_title(title)

plt.tight_layout()
plt.savefig(plots_dir + "fetalization_DAR_Jaccard_similarity.pdf")
plt.show()