In [None]:
import pandas as pd
import numpy as np
import re
from scipy import stats
import os

import statsmodels.api as sm
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sn
import matplotlib.mlab as mlab
from sklearn.preprocessing import StandardScaler
%matplotlib inline

# Project root: inputs are read from `data/` and `counterfactuals/` beneath it,
# and figures are written to `figures/distributions/`. Replace with your local path.
root_dir = "/your/path"

In [None]:
# Create the output folder for the distribution plots (and its parent
# `figures/` folder). `exist_ok=True` keeps re-runs of this cell harmless,
# while genuine failures (e.g. a permission error) are raised here instead of
# being swallowed and resurfacing later when a figure is saved.
os.makedirs(f"{root_dir}/figures/distributions", exist_ok=True)

In [None]:
def get_original_features(root_dir):
    """Load the original feature table and split it by tumour type.

    Reads ``{root_dir}/data/file.xlsx``, replaces the long tumour-type names in
    the ``TUMOR_TYPE`` column with their two-letter labels, and returns one
    frame per label.

    Parameters
    ----------
    root_dir : str
        Folder that contains ``data/file.xlsx``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(filtered_mb, filtered_ep, filtered_pa, filtered_bg)``. The MB, PA
        and BG frames hold at most the first 25 matching rows; the EP frame
        holds every matching row. Each frame is an independent copy, so
        callers can add or overwrite columns without touching the loaded table
        and without triggering ``SettingWithCopyWarning``.
    """
    short_labels = {
        "MEDULLOBLASTOMA": "MB",
        "EPENDYMOMA": "EP",
        "PILOCYTIC ASTROCYTOMA": "PA",
        "GLIOMA": "BG",
    }

    dataset = pd.read_excel(f"{root_dir}/data/file.xlsx")
    # Values that are not keys of the mapping are left unchanged.
    dataset["TUMOR_TYPE"] = dataset["TUMOR_TYPE"].replace(short_labels)

    def _rows_for(label, limit=None):
        # Select one tumour type, optionally capped at the first `limit` rows.
        rows = dataset[dataset["TUMOR_TYPE"] == label]
        if limit is not None:
            rows = rows.head(limit)
        return rows.copy()

    filtered_mb = _rows_for("MB", 25)
    filtered_ep = _rows_for("EP")  # EP is intentionally not capped, as before
    filtered_pa = _rows_for("PA", 25)
    filtered_bg = _rows_for("BG", 25)
    return filtered_mb, filtered_ep, filtered_pa, filtered_bg

In [None]:
# Load the four per-class frames and tag each one with its short class label.
# get_gaussian_dists reads this `.name` attribute to decide which frame to plot.
filtered_mb, filtered_ep, filtered_pa, filtered_bg = get_original_features(root_dir)
for class_frame, class_label in zip(
    (filtered_mb, filtered_ep, filtered_pa, filtered_bg),
    ("MB", "EP", "PA", "BG"),
):
    class_frame.name = class_label

In [None]:
def change_labels(dataset):
    """Replace numeric class codes in ``TUMOR_TYPE`` with their short labels.

    Codes 0, 1, 2 and 3 become "MB", "EP", "PA" and "BG" respectively; any
    other value is kept as it is. The column is overwritten on the frame that
    was passed in, and that same frame is returned.
    """
    code_to_label = ((0, "MB"), (1, "EP"), (2, "PA"), (3, "BG"))

    def _relabel(value):
        # First matching code wins; unmatched values pass through untouched.
        for code, label in code_to_label:
            if value == code:
                return label
        return value

    dataset["TUMOR_TYPE"] = dataset["TUMOR_TYPE"].apply(_relabel)
    return dataset

In [None]:
def _build_kde_frame(original_data, counterfactuals, feature):
    """Stack one feature of the original and generated rows into a long-form frame.

    The result has the columns ``index``, ``feature`` and ``Data``, where
    ``Data`` is "Original" or "Generated". ``pd.to_numeric`` returns new
    Series, so neither input frame is modified.
    """
    original_part = pd.DataFrame(pd.to_numeric(original_data[feature]))
    original_part["Data"] = "Original"
    generated_part = pd.DataFrame(pd.to_numeric(counterfactuals[feature]))
    generated_part["Data"] = "Generated"
    return pd.concat([original_part, generated_part]).reset_index()


def get_gaussian_dists(root_dir, original_data_list, counterfactuals, original, desired, first_three_features):
    """Plot KDEs comparing real samples of the desired class with generated counterfactuals.

    For every frame in ``original_data_list`` whose ``.name`` attribute equals
    ``desired``, and for every feature in ``first_three_features``, a filled
    KDE plot of that feature is drawn with one curve for the original rows and
    one for the rows in ``counterfactuals``. Each figure is saved as both PDF
    and PNG under ``{root_dir}/figures/distributions/``.

    Parameters
    ----------
    root_dir : str
        Project root; the ``figures/distributions`` folder must already exist.
    original_data_list : iterable of pandas.DataFrame
        Frames carrying a ``.name`` attribute with their class label.
    counterfactuals : pandas.DataFrame
        Generated rows; must contain every requested feature column.
    original, desired : str
        Class labels of the counterfactual direction; used in the file names.
    first_three_features : iterable of str
        Feature columns to plot.

    Returns
    -------
    None

    Notes
    -----
    Frames whose name equals ``original`` (but not ``desired``) were previously
    converted and concatenated without ever being plotted; they are now
    skipped. Input frames are no longer modified in place, and every figure is
    closed after saving so that looping over many class pairs does not
    accumulate open figures.
    """
    palette = "mako"
    common_norm = False
    alpha = 0.3
    linewidth = 0.9
    label_size = 18

    for original_data in original_data_list:
        # Only the desired class is ever plotted against the counterfactuals.
        if original_data.name != desired:
            continue

        for feature in first_three_features:
            kde_frame = _build_kde_frame(original_data, counterfactuals, feature)

            filename_stem = f'{root_dir}/figures/distributions/{original}_to_{desired}-vs-{desired}_{feature}'

            print(f"Plotting {desired} {feature} KDE")
            print(kde_frame.head())

            fig = plt.figure(dpi=300)
            sn.set_style('dark', {'axes.grid' : False})
            # The axes are created by kdeplot inside the context so that the
            # label size from the plotting context is applied to them.
            with sn.plotting_context(rc={"axes.labelsize": label_size}):
                sn.kdeplot(
                    data=kde_frame, x=feature, hue="Data",
                    fill=True, palette=palette, common_norm=common_norm,
                    alpha=alpha, linewidth=linewidth,
                )
            plt.tight_layout()
            fig.savefig(f'{filename_stem}.pdf')
            fig.savefig(f'{filename_stem}.png')

            # Render the figure next to its printout (inline backend), then
            # release it; otherwise every dpi-300 figure stays open until the
            # calling cell finishes.
            plt.show()
            plt.close(fig)

In [None]:
def get_best_features(root_dir, original, desired):
    """Return the first three feature names listed in a counterfactual summary file.

    The summary for the ``original`` -> ``desired`` direction is read from
    ``{root_dir}/counterfactuals/statistics_of_cfes/{original}/``. Every line
    of the form ``<feature>: <count> changes`` contributes its feature name,
    in file order, and the first three names are returned (fewer if the file
    lists fewer).
    """
    summary_path = f'{root_dir}/counterfactuals/statistics_of_cfes/{original}/{original}_to_{desired}_summary.txt'

    with open(summary_path, 'r') as summary_file:
        summary_text = summary_file.read()

    # One feature per matching line; the capture group is the feature name.
    feature_line = re.compile(r"^(.+?): \d+ changes$", re.MULTILINE)
    listed_features = [hit.group(1) for hit in feature_line.finditer(summary_text)]

    # The file is already ordered by change count, so the leading entries
    # are the most frequently changed features.
    return listed_features[:3]

In [None]:
original_class = ["MB", "EP", "PA", "BG"]
desired_class = ["MB", "EP", "PA", "BG"]

# Columns of interest in the merged counterfactual sheets
columns_of_interest = ['T2_Tumor', 'T2_Parenchyma', 'T2_Ratio', 'FLAIR_Tumor', 'FLAIR_Parenchyma',
                       'FLAIR_Ratio', 'DWI_Tumor', 'DWI_Parenchyma', 'DWI_Ratio', 'ADC_Tumor',
                       'ADC_Parenchyma', 'ADC_Ratio', 'T1_Tumor', 'T1_Parenchyma', 'T1_Ratio',
                       'T1CE_Tumor', 'T1CE_Parenchyma', 'T1CE_Ratio', 'TUMOR_TYPE', 'Baseline']

# The collected counterfactuals keep every column of interest except the
# trailing 'Baseline' flag.
cfe_columns = columns_of_interest[:-1]

# Rows whose index is a multiple of this value are treated as baseline rows;
# the rows in between are the counterfactuals generated from that baseline.
ROWS_PER_PATTERN = 6

# Per-class original data, each frame tagged with its class label in `.name`
original_data_list = [filtered_mb, filtered_ep, filtered_pa, filtered_bg]

for original in original_class:

    for desired in desired_class:
        read_file = f'{root_dir}/counterfactuals/formatted_cfe_from_json/{original}/{original}_to_{desired}_merged_data.xlsx'

        # Read the Excel file and turn numeric class codes into labels
        df = pd.read_excel(read_file, header=0)
        df = change_labels(df)

        # Create a new column to track the baseline rows
        df['Baseline'] = False

        # Not used further in this cell; kept because later cells may read them.
        change_counts = {column: 0 for column in columns_of_interest}
        pattern_count = 0

        # Collect the matching rows in a list and build the frame once:
        # concatenating inside the loop is quadratic and starts from an empty
        # frame, which pandas has deprecated.
        cfe_rows = []

        for i, row in df.iterrows():
            if i % ROWS_PER_PATTERN == 0:
                # Baseline row: remember it for the counterfactuals that follow
                baseline_row = row
                df.at[i, 'Baseline'] = True
                pattern_count += 1
            elif baseline_row["TUMOR_TYPE"] == original:
                # Keep the counterfactual only when its baseline was classified
                # as the original class by the ML model
                cfe_rows.append(row)

        cfes = pd.DataFrame(cfe_rows, columns=cfe_columns).reset_index(drop=True)

        best_features = get_best_features(root_dir, original, desired)

        get_gaussian_dists(root_dir, original_data_list, cfes, original, desired, best_features)