### Specifically determine the percentage of total # of fetal DARs that overlap with aging 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

#### Create function for assessing the percentage of overlapping DARs

In [2]:
def determine_DAR_proportional_overlap(overlap_df_path, disease_DAR_path, fetal_DAR_path):
    '''Loads in the three dfs from the prior DAR analysis to identify the percentage of fetal DARs that are in the unidirectional
    overlap categories, and the percentage of disease DARs that are in the unidirectional overlap categories. 

    Returns the percentage of (1) up shared DARs / total disease up; (2) up shared DARs / total fetal up;
    (3) down shared DARs / total disease down; (4) down shared DARs / total fetal down 
    '''
    
    overlap_df = pd.read_csv(overlap_df_path)
    # get the number of overlapping up and down genes
    up_both = overlap_df['up_both'].iloc[0]
    down_both = overlap_df['down_both'].iloc[0]

    # get the number of disease DARs
    disease_DARs = pd.read_csv(disease_DAR_path, index_col = 0)
    num_disease_up_DARs = disease_DARs[(disease_DARs['significant'] == True) & (disease_DARs['log2FoldChange'] > 0)].shape[0]
    num_disease_down_DARs = disease_DARs[(disease_DARs['significant'] == True) & (disease_DARs['log2FoldChange'] < 0)].shape[0]

    # get the number of fetal DARs
    fetal_DARs = pd.read_csv(fetal_DAR_path, index_col = 0)
    num_fetal_up_DARs = fetal_DARs[(fetal_DARs['significant'] == True) & (fetal_DARs['log2FoldChange'] > 0)].shape[0]
    num_fetal_down_DARs = fetal_DARs[(fetal_DARs['significant'] == True) & (fetal_DARs['log2FoldChange'] < 0)].shape[0]

    # get the overlap percentages
    perc_up_overlap_in_disease = up_both / num_disease_up_DARs
    perc_down_overlap_in_disease = down_both / num_disease_down_DARs

    perc_up_overlap_in_fetal = up_both / num_fetal_up_DARs
    perc_down_overlap_in_fetal = down_both / num_fetal_down_DARs

    return [perc_up_overlap_in_disease, perc_down_overlap_in_disease, perc_up_overlap_in_fetal, perc_down_overlap_in_fetal]

### Perform this for all cell types besides Adipocytes, which do not have the fetal contrast

In [3]:
cell_types = ["Cardiomyocyte", "Endothelial", "Fibroblast", "Lymphoid", "Myeloid", "Pericyte"]

In [4]:
proportion_list = []

for cell_type in cell_types: 
    overlap_df_path = "DAR_contrasts_intersection_plots/" + cell_type + "_fetal_disease_overlap_contrast_analysis.csv"
    disease_DAR_path = "pydeseq2_results/" + cell_type + "_disease-binary_Y_vs_N_results.csv"
    fetal_DAR_path = "pydeseq2_results/" + cell_type + "_age-group_fetal_vs_young_results.csv"
    
    results = determine_DAR_proportional_overlap(overlap_df_path = overlap_df_path, 
                                      disease_DAR_path = disease_DAR_path,
                                      fetal_DAR_path = fetal_DAR_path)
    proportion_list.append(results)

  perc_up_overlap_in_disease = up_both / num_disease_up_DARs
  perc_down_overlap_in_disease = down_both / num_disease_down_DARs


In [5]:
proportion_df = pd.DataFrame(proportion_list)
proportion_df.index = cell_types
proportion_df.columns = ["overlap_up_disease", "overlap_down_disease", "overlap_up_fetal", "overlap_down_fetal"]
proportion_df = proportion_df * 100 # get percentages
proportion_df = proportion_df.reset_index().rename(columns = {"index": "cell_type"})

In [7]:
# make this tidy

tidy_df = pd.melt(
    proportion_df, 
    id_vars=["cell_type"], 
    value_vars=["overlap_up_disease", "overlap_down_disease", "overlap_up_fetal", "overlap_down_fetal"],
    var_name="category", 
    value_name="value"
)

# Extract `direction` and `contrast` from the `category` column
tidy_df["direction"] = tidy_df["category"].str.extract(r"_(up|down)")
tidy_df["contrast"] = tidy_df["category"].str.extract(r"_(disease|fetal)")

tidy_df.to_csv("04B_overlap_percentage_fetal_disease.csv")

### Proceed to step 06C for producing the ggplot version since we want to use the same plotting and color scheme