In [None]:
import pandas as pd
import re
from scipy import stats
import os

# Root of the project tree. Every path used in this notebook is built from it:
# the input workbook is read from {root_dir}/data and all counterfactual
# inputs/outputs live under {root_dir}/counterfactuals.
# Replace the placeholder with a real directory before running.
root_dir = "/your/path"

In [None]:
# Create the output folder for the t-test reports.
# makedirs(exist_ok=True) tolerates re-runs and creates missing parent folders,
# while genuine failures (e.g. permissions) now surface here instead of being
# silently swallowed and reappearing later as a confusing write error.
os.makedirs(f"{root_dir}/counterfactuals/3-ttests_only_changed_cfes", exist_ok=True)

In [None]:
def get_original_features(root_dir):
    """Load the original dataset and split it into one frame per tumor class.

    The workbook ``{root_dir}/data/file.xlsx`` is read and the full tumor
    names in its ``TUMOR_TYPE`` column are replaced by their abbreviations
    (MEDULLOBLASTOMA -> MB, EPENDYMOMA -> EP, PILOCYTIC ASTROCYTOMA -> PA,
    GLIOMA -> BG); any other value is kept unchanged.

    Returns:
        A tuple ``(mb, ep, pa, bg)`` of DataFrames. The MB, PA and BG frames
        hold at most the first 25 matching rows; the EP frame holds every
        matching row.
    """
    full_to_short = (
        ("MEDULLOBLASTOMA", "MB"),
        ("EPENDYMOMA", "EP"),
        ("PILOCYTIC ASTROCYTOMA", "PA"),
        ("GLIOMA", "BG"),
    )

    def _abbreviate(value):
        # First matching full name wins; unknown values pass through untouched.
        for full_name, short_name in full_to_short:
            if value == full_name:
                return short_name
        return value

    dataset = pd.read_excel(f"{root_dir}/data/file.xlsx")
    dataset["TUMOR_TYPE"] = dataset["TUMOR_TYPE"].apply(_abbreviate)

    tumor_type = dataset["TUMOR_TYPE"]
    filtered_mb = dataset[tumor_type == "MB"].head(25)
    filtered_ep = dataset[tumor_type == "EP"]  # not capped, unlike the other classes
    filtered_pa = dataset[tumor_type == "PA"].head(25)
    filtered_bg = dataset[tumor_type == "BG"].head(25)
    return filtered_mb, filtered_ep, filtered_pa, filtered_bg

In [None]:
# Load the per-class subsets of the original data and tag each frame with its
# class label; perform_ttest() reads this ``name`` attribute to pick the subset
# to compare against.
# NOTE: ``name`` is a plain Python attribute set on the object, so frames
# derived from these (copies, filters) should not be expected to carry it.
filtered_mb, filtered_ep, filtered_pa, filtered_bg = get_original_features(root_dir)
for _subset, _label in zip(
    (filtered_mb, filtered_ep, filtered_pa, filtered_bg),
    ("MB", "EP", "PA", "BG"),
):
    _subset.name = _label

In [None]:
def change_labels(dataset):
    """Replace numeric class codes in ``TUMOR_TYPE`` with their abbreviations.

    Codes 0, 1, 2 and 3 become "MB", "EP", "PA" and "BG" respectively; any
    other value is left as it is. The column is overwritten on the frame that
    was passed in, and that same frame is returned.
    """
    code_labels = ((0, "MB"), (1, "EP"), (2, "PA"), (3, "BG"))

    def _to_label(value):
        # First matching code wins; anything else passes through unchanged.
        for code, label in code_labels:
            if value == code:
                return label
        return value

    dataset["TUMOR_TYPE"] = dataset["TUMOR_TYPE"].apply(_to_label)
    return dataset

In [None]:
def get_best_features(root_dir, original, desired, top_n=3):
    """Return the first ``top_n`` feature names listed in a CFE summary file.

    Reads
    ``{root_dir}/counterfactuals/statistics_of_cfes/{original}/{original}_to_{desired}_summary.txt``
    and collects the feature name from every line of the exact form
    ``<feature>: <count> changes``; all other lines are ignored.

    The summary file is expected to list the features already sorted by their
    change counts, so this function does not sort: it keeps the order found
    in the file.

    Args:
        root_dir: Project root directory.
        original: Label of the original class (e.g. "MB").
        desired: Label of the desired class (e.g. "EP").
        top_n: How many feature names to return. Defaults to 3; ``None``
            returns every feature found in the file.

    Returns:
        A list with at most ``top_n`` feature names, in file order.

    Raises:
        FileNotFoundError: If the summary file does not exist.
    """
    file_path = f'{root_dir}/counterfactuals/statistics_of_cfes/{original}/{original}_to_{desired}_summary.txt'

    with open(file_path, 'r') as file:
        file_content = file.read()

    # One capture group: everything before ": <digits> changes" on a line.
    pattern = r"^(.+?): \d+ changes$"
    feature_names = re.findall(pattern, file_content, flags=re.MULTILINE)

    return feature_names[:top_n]

In [None]:
def _numeric_feature(frame, feature):
    """Return ``frame[feature]`` as numbers, skipping rows whose value is '-'."""
    column = frame[feature]
    # Filter and convert on a fresh Series instead of assigning back into a
    # filtered slice of the caller's frame (which triggers SettingWithCopy).
    return pd.to_numeric(column[column != '-'])


def perform_ttest(root_dir, original_data_list, counterfactuals, original, desired, first_three_features):
    """Compare generated counterfactuals with the real cases of the desired class.

    For every frame in ``original_data_list`` whose ``name`` attribute equals
    ``desired``, and for every feature in ``first_three_features``, a
    two-sided Welch t-test (``equal_var=False``) is run between the original
    values and the counterfactual values. Rows whose value is ``'-'`` are
    excluded on both sides.

    Results are written to two files under
    ``{root_dir}/counterfactuals/3-ttests_only_changed_cfes`` (created if
    missing):

    * ``CFE_{desired}.txt`` - a readable report, opened in append mode.
    * ``CFE_{desired}_testNo3.xlsx`` - one row per test; rows already in the
      workbook are kept and the new ones are added after them.

    NOTE: both outputs accumulate, so running the same comparison twice
    records its results twice.

    Args:
        root_dir: Project root directory.
        original_data_list: DataFrames of original cases, each carrying a
            ``name`` attribute with its class label.
        counterfactuals: DataFrame of generated counterfactual rows.
        original: Class label the counterfactuals were generated from.
        desired: Class label the counterfactuals were generated towards.
        first_three_features: Feature (column) names to test.

    Returns:
        None. The results are only written to disk.
    """
    result_columns = ['MRI Feature', 'Original Case', 'Generated Case', 'T-Statistic', 'P-Value', 'Statistical Significance']

    output_dir = f'{root_dir}/counterfactuals/3-ttests_only_changed_cfes'
    os.makedirs(output_dir, exist_ok=True)
    filename_txt = f'{output_dir}/CFE_{desired}.txt'
    filename_excel = f'{output_dir}/CFE_{desired}_testNo3.xlsx'

    records = []
    report_parts = []

    for original_data in original_data_list:
        # Only the real population of the desired class is tested; frames of
        # any other class produce no output, so they are skipped up front.
        if original_data.name != desired:
            continue

        for feature in first_three_features:
            original_feature = _numeric_feature(original_data, feature)
            counterfactual_feature = _numeric_feature(counterfactuals, feature)

            # Two-sided independent t-test without assuming equal variances.
            t_statistic, p_value = stats.ttest_ind(original_feature, counterfactual_feature, equal_var=False)
            p_value_str = "<0.0001" if p_value < 0.0001 else f"{p_value:.3f}"
            is_significant = p_value < 0.05

            result = f"T-Statistic for {feature}: {t_statistic}\n"
            result += f"P-Value for {feature}: {p_value_str}\n"
            if is_significant:
                result += f"The difference in {feature} between original {original_data.name} and generated {original} to {desired} CFE populations is statistically significant.\n"
            else:
                result += f"There is no statistically significant difference in {feature} between original {original_data.name} and generated {original} to {desired} CFE populations.\n"
            result += "\n"
            report_parts.append(result)

            records.append({
                'MRI Feature': feature,
                'Original Case': f"{original_data.name}",
                'Generated Case': f"{original} to {desired}",
                'T-Statistic': t_statistic,
                'P-Value': p_value_str,
                'Statistical Significance': "Statistically significant" if is_significant else "Not statistically significant"
            })

    # Append this run's report to the text file (nothing is written, and the
    # file is not created, when no test was performed).
    if report_parts:
        with open(filename_txt, 'a') as file:
            file.write("".join(report_parts))

    # Build the table once from the collected rows. DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic.
    results_df = pd.DataFrame(records, columns=result_columns)

    # Save the results to an Excel workbook, keeping rows from earlier runs.
    if os.path.isfile(filename_excel):
        existing_data = pd.read_excel(filename_excel)
        if records:
            results_df = pd.concat([existing_data, results_df], ignore_index=True)
        else:
            # Avoid concatenating with an empty frame; nothing new to add.
            results_df = existing_data
    results_df.to_excel(filename_excel, index=False)


In [None]:
original_class = ["MB", "EP", "PA", "BG"]
desired_class = ["MB", "EP", "PA", "BG"]

# Columns kept for every extracted counterfactual row: the 18 MRI measurements
# plus the class label.
columns_of_interest = ['T2_Tumor', 'T2_Parenchyma', 'T2_Ratio', 'FLAIR_Tumor', 'FLAIR_Parenchyma',
                       'FLAIR_Ratio', 'DWI_Tumor', 'DWI_Parenchyma', 'DWI_Ratio', 'ADC_Tumor',
                       'ADC_Parenchyma', 'ADC_Ratio', 'T1_Tumor', 'T1_Parenchyma', 'T1_Ratio',
                       'T1CE_Tumor', 'T1CE_Parenchyma', 'T1CE_Ratio', 'TUMOR_TYPE']

# The input workbooks are read in groups of this many consecutive rows: the
# first row of each group is the baseline case and the remaining rows are
# treated as its counterfactuals.
ROWS_PER_GROUP = 6

# The original populations do not change between iterations, so build the
# list once.
original_data_list = [filtered_mb, filtered_ep, filtered_pa, filtered_bg]

for original in original_class:

    # exist_ok makes re-runs safe; real failures (e.g. permissions) are no
    # longer silently ignored.
    os.makedirs(f"{root_dir}/counterfactuals/3-ttests_only_changed_cfes/{original}", exist_ok=True)

    for desired in desired_class:
        read_file = f'{root_dir}/counterfactuals/statistics_of_cfes/{original}/{original}_to_{desired}.xlsx'
        write_file = f'{root_dir}/counterfactuals/3-ttests_only_changed_cfes/{original}/{original}_to_{desired}.xlsx'

        # Read the Excel file and turn the numeric class codes into labels
        df = change_labels(pd.read_excel(read_file, header=0))

        # Keep the non-baseline rows of every group whose baseline row is
        # labelled with the original class. Rows are collected in a list and
        # turned into a frame once, instead of calling pd.concat per row.
        kept_rows = []
        baseline_type = None
        for i, row in df.iterrows():
            if i % ROWS_PER_GROUP == 0:
                baseline_type = row["TUMOR_TYPE"]
            elif baseline_type == original:
                kept_rows.append(row)

        cfes = pd.DataFrame(kept_rows, columns=columns_of_interest).reset_index(drop=True)

        # Save the extracted counterfactuals to a new Excel file
        cfes.to_excel(write_file, index=False)

        best_features = get_best_features(root_dir, original, desired)

        # Perform the two-tailed t-tests; results are written to disk by
        # perform_ttest, which returns nothing.
        perform_ttest(root_dir, original_data_list, cfes, original, desired, best_features)
