In [None]:
import pandas as pd
import numpy as np
import re
from scipy import stats
import os

import statsmodels.api as sm
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sn
import matplotlib.mlab as mlab
from sklearn.preprocessing import StandardScaler
%matplotlib inline

# Base directory of the project. The notebook reads its inputs from
# "<root_dir>/data" and "<root_dir>/counterfactuals" and writes the figures to
# "<root_dir>/figures"; replace the placeholder before running.
root_dir = "/your/path"

In [None]:
# Create the output folders for the figures up front.
# makedirs(exist_ok=True) tolerates reruns (folder already present) and creates
# "figures" and "figures/dependent" in one call, while real failures such as a
# permission error are raised here instead of being silently swallowed and only
# surfacing later when a figure cannot be saved.
os.makedirs(f"{root_dir}/figures/dependent", exist_ok=True)

In [None]:
def get_original_features(root_dir):
    """Load the feature table and split it into one frame per tumor type.

    Reads ``{root_dir}/data/file.xlsx``, shortens the long tumor names in the
    ``TUMOR_TYPE`` column to MB, EP, PA and BG, and returns the matching rows
    as the tuple ``(MB, EP, PA, BG)``. The MB, PA and BG frames are cut to
    their first 25 rows; the EP frame is returned in full.
    """
    abbreviations = {
        "MEDULLOBLASTOMA": "MB",
        "EPENDYMOMA": "EP",
        "PILOCYTIC ASTROCYTOMA": "PA",
        "GLIOMA": "BG",
    }

    def abbreviate(label):
        # Labels without an abbreviation are passed through unchanged.
        return abbreviations.get(label, label)

    table = pd.read_excel(f"{root_dir}/data/file.xlsx")
    table["TUMOR_TYPE"] = table["TUMOR_TYPE"].apply(abbreviate)

    def rows_of(tumor):
        # All rows whose (abbreviated) tumor type equals `tumor`.
        return table[table["TUMOR_TYPE"] == tumor]

    return (
        rows_of("MB").head(25),
        rows_of("EP"),
        rows_of("PA").head(25),
        rows_of("BG").head(25),
    )

In [None]:
# Load the four per-tumor-type subsets.
(filtered_mb, filtered_ep, filtered_pa, filtered_bg) = get_original_features(root_dir)

# Attach the tumor abbreviation to each frame as a plain ``name`` attribute so
# it can be read back when the pairwise dataset names are built.
for _subset, _abbreviation in (
    (filtered_mb, "MB"),
    (filtered_ep, "EP"),
    (filtered_pa, "PA"),
    (filtered_bg, "BG"),
):
    _subset.name = _abbreviation

In [None]:
def change_labels(dataset):
    """Replace the numeric class codes in ``TUMOR_TYPE`` by their abbreviations.

    Codes 0, 1, 2 and 3 become "MB", "EP", "PA" and "BG"; every other value is
    left as it is. The column is overwritten on the frame that was passed in,
    and that same frame is returned.
    """
    code_to_label = ((0, "MB"), (1, "EP"), (2, "PA"), (3, "BG"))

    def relabel(value):
        # The first code that compares equal wins; unknown values pass through.
        for code, label in code_to_label:
            if value == code:
                return label
        return value

    dataset["TUMOR_TYPE"] = dataset["TUMOR_TYPE"].apply(relabel)
    return dataset

In [None]:
def get_best_features(root_dir, original, desired, top_n=3):
    """Return the first ``top_n`` feature names listed in a counterfactual summary.

    The summary is read from
    ``{root_dir}/counterfactuals/statistics_of_cfes/{original}/{original}_to_{desired}_summary.txt``.
    Every line of the form ``<feature>: <count> changes`` contributes one
    feature name; all other lines are ignored.

    Parameters
    ----------
    root_dir : str
        Base directory that contains the ``counterfactuals`` folder.
    original : str
        Label used for the sub-folder and the first part of the file name.
    desired : str
        Label used for the second part of the file name.
    top_n : int, optional
        How many feature names to return (default 3, the previous fixed value).

    Returns
    -------
    list of str
        Up to ``top_n`` feature names in the order they appear in the file.
        The order is not re-sorted here; the file is expected to list the
        features by descending count already.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    OSError
        If the summary file cannot be opened (e.g. it does not exist).
    """
    if top_n < 0:
        # A negative slice bound would silently drop entries from the end.
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    file_path = f'{root_dir}/counterfactuals/statistics_of_cfes/{original}/{original}_to_{desired}_summary.txt'

    with open(file_path, 'r') as file:
        file_content = file.read()

    # One match per "<feature>: <count> changes" line; the feature name is captured.
    pattern = r"^(.+?): \d+ changes$"
    matches = re.findall(pattern, file_content, flags=re.MULTILINE)

    return matches[:top_n]

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def get_gaussian_dists(root_dir, original_data_dict, original, desired, first_three_features):
    """Plot and save one KDE figure per (dataset, feature) combination.

    For every entry of ``original_data_dict`` and every feature name, a filled
    KDE of that feature, colored by tumor type, is drawn and written to
    ``{root_dir}/figures/dependent/{name}_{feature}.pdf`` and ``.png``.

    Parameters
    ----------
    root_dir : str
        Base directory; the figures are saved below ``figures/dependent``.
    original_data_dict : dict
        Maps a dataset name (used in the file names) to a DataFrame holding a
        ``TUMOR_TYPE`` column and the feature columns. The frames are not
        modified.
    original : str
        Not used by this function; kept so existing calls keep working.
    desired : str
        Only used in the progress message.
    first_three_features : iterable of str
        Names of the columns to plot; each must be convertible to numeric.
    """
    palette = "mako"
    common_norm = False
    alpha = 0.3
    linewidth = 0.9
    label_size = 18

    for name, data in original_data_dict.items():
        # rename() returns a new frame, so the numeric conversion below is
        # written to this working copy and never into the caller's DataFrame.
        plot_data = data.rename(columns={'TUMOR_TYPE': 'Tumor Type'})

        for feature in first_three_features:

            # Convert the column to a numeric data type
            plot_data[feature] = pd.to_numeric(plot_data[feature])

            ### KDE ###

            # Save the results
            filename_pdf = f'{root_dir}/figures/dependent/{name}_{feature}.pdf'
            filename_png = f'{root_dir}/figures/dependent/{name}_{feature}.png'

            print(f"Plotting {desired} {feature} KDE")
            fig = plt.figure(dpi=300)
            try:
                sns.set_style('dark', {'axes.grid' : False})
                with sns.plotting_context(rc={"axes.labelsize": label_size}):
                    sns.kdeplot(
                        data=plot_data, x=feature, hue="Tumor Type",
                        fill=True, palette=palette, common_norm=common_norm, alpha=alpha, linewidth=linewidth
                    )
                plt.tight_layout()
                plt.savefig(filename_pdf)
                plt.savefig(filename_png)
                # Render the figure in the notebook before it is released.
                plt.show()
            finally:
                # Free the figure right away; otherwise every figure of the
                # loop stays in memory until the cell has finished.
                plt.close(fig)


In [None]:
import itertools

# Class labels to start from and to move towards.
original_class = ["MB", "EP", "BG", "PA"]
desired_class = ["MB", "EP", "BG", "PA"]

original_data_list = [filtered_mb, filtered_ep, filtered_bg, filtered_pa]

# One concatenated frame per unordered pair of subsets, keyed "<first>_<second>"
# using the ``name`` attribute attached to each subset.
paired_datasets = {
    f"{first.name}_{second.name}": pd.concat([first, second], ignore_index=True)
    for first, second in itertools.combinations(original_data_list, 2)
}

# Visit every ordered (original, desired) combination of two different classes
# and plot the most frequently changed features of that transition.
for original, desired in itertools.product(original_class, desired_class):
    if original == desired:
        continue
    best_features = get_best_features(root_dir, original, desired)
    get_gaussian_dists(root_dir, paired_datasets, original, desired, best_features)
