One original sample vs. each of its counterfactual explanations (CFEs), tested individually

In [None]:
import pandas as pd
import numpy as np
import re
from scipy import stats
import os

# Base directory that the file paths in this notebook are built from.
# "/your/path" is a placeholder: set it to the real location before running.
root_dir = "/your/path"

In [None]:
# Create the folder that receives the t-test reports.
# makedirs(exist_ok=True) also creates missing parent folders and does nothing
# when the folder already exists; any other failure (e.g. missing permissions)
# is raised here instead of being silently ignored and surfacing later on the
# first write.
os.makedirs(f"{root_dir}/counterfactuals/1-ttests_of_cfes", exist_ok=True)

In [None]:
def change_labels(dataset):
    """Replace the numeric tumor codes in ``TUMOR_TYPE`` with text labels.

    The mapping is 0 -> "MB", 1 -> "EP", 2 -> "PA", 3 -> "BG"; every other
    value is left untouched.

    Args:
        dataset: Table with a ``TUMOR_TYPE`` column. The column is overwritten
            on the object that was passed in.

    Returns:
        The same ``dataset`` object, with the relabelled column.
    """
    code_labels = ((0, "MB"), (1, "EP"), (2, "PA"), (3, "BG"))

    def relabel(value):
        # First matching code wins; unknown values pass through unchanged.
        for code, label in code_labels:
            if value == code:
                return label
        return value

    dataset["TUMOR_TYPE"] = dataset["TUMOR_TYPE"].apply(relabel)
    return dataset

In [None]:
def get_best_features(root_dir, original, desired):
    """Return the first five feature names listed in a CFE summary file.

    The file read is
    ``{root_dir}/counterfactuals/statistics_of_cfes/{original}/{original}_to_{desired}_summary.txt``.
    A feature line has the form ``<name>: <count> changes``; all other lines
    are ignored.

    Args:
        root_dir: Base directory that contains the ``counterfactuals`` folder.
        original: Label of the original class.
        desired: Label of the desired class.

    Returns:
        A list with the names from the first five feature lines, in file
        order (fewer when the file has fewer such lines). This is the "most
        changed" set only if the file lists features by descending count,
        which is not checked here.
    """
    summary_path = (
        f'{root_dir}/counterfactuals/statistics_of_cfes/'
        f'{original}/{original}_to_{desired}_summary.txt'
    )
    with open(summary_path, 'r') as summary:
        text = summary.read()

    # MULTILINE makes ^ and $ anchor on every line of the summary text.
    feature_line = re.compile(r"^(.+?): \d+ changes$", re.MULTILINE)
    names = [hit.group(1) for hit in feature_line.finditer(text)]
    return names[:5]

In [None]:
import openpyxl
from openpyxl.styles import Alignment

def adjust_column_width_and_center(excel_file):
    """Fit column widths to their content and center every cell of a workbook.

    The workbook is loaded from ``excel_file``, every sheet is processed, and
    the result is saved back to the same path.

    Args:
        excel_file: Path of the ``.xlsx`` file to modify in place.

    Returns:
        None.
    """
    workbook = openpyxl.load_workbook(excel_file)

    # The alignment is identical for every cell, so build it once.
    centered = openpyxl.styles.Alignment(horizontal='center', vertical='center', wrap_text=True)

    for sheet_name in workbook.sheetnames:
        worksheet = workbook[sheet_name]

        for column in worksheet.columns:
            # Longest rendered value in the column. Empty cells are skipped so
            # they are not measured as the four-character string "None".
            max_length = max(
                (len(str(cell.value)) for cell in column if cell.value is not None),
                default=0,
            )

            # Small padding plus a 1.2 scale factor on top of the text length.
            adjusted_width = (max_length + 2) * 1.2
            column_letter = column[0].column_letter
            worksheet.column_dimensions[column_letter].width = adjusted_width

            for cell in column:
                cell.alignment = centered

    workbook.save(excel_file)


In [None]:
def perform_ttest(root_dir, original_data, counterfactuals, original, desired, alpha=0.05):
    """Run a paired t-test between one original sample and each counterfactual.

    Only the features returned by ``get_best_features`` for the
    ``original`` -> ``desired`` pair are compared. For every counterfactual
    the test outcome and the running significance counts of this call are
    appended to ``CFE_{desired}.txt``; afterwards all rows of this call are
    appended to ``CFE_{original}_to_{desired}.xlsx`` (both files live in
    ``{root_dir}/counterfactuals/1-ttests_of_cfes``) and the workbook is
    reformatted with ``adjust_column_width_and_center``.

    Args:
        root_dir: Base directory that contains the ``counterfactuals`` folder.
        original_data: pandas object holding the original sample, indexable by
            feature name.
        counterfactuals: Table with one counterfactual per row and the feature
            names as columns.
        original: Label of the original class.
        desired: Label of the class the counterfactuals were generated for.
        alpha: p-value threshold below which a difference is reported as
            statistically significant.

    Returns:
        None. When ``original == desired`` nothing is read or written.
    """
    # A class paired with itself produces no report, so return before any
    # file is opened.
    if original == desired:
        return

    best_features = get_best_features(root_dir, original, desired)

    # Restrict both sides to the selected features, as float arrays.
    best_original_data = original_data[best_features].to_numpy(dtype=np.float64)
    best_counterfactuals = counterfactuals[best_features].to_numpy(dtype=np.float64)

    filename_txt = f'{root_dir}/counterfactuals/1-ttests_of_cfes/CFE_{desired}.txt'
    filename_excel = f'{root_dir}/counterfactuals/1-ttests_of_cfes/CFE_{original}_to_{desired}.xlsx'

    result_columns = [
        'Original Case', 'Generated Case', 'Features', 'T-Statistic', 'P-Value',
        'Statistical Significance', 'Significant', 'Not Significant',
    ]

    significant_count = 0
    not_significant_count = 0
    # Rows are collected in a list and turned into a DataFrame once;
    # DataFrame.append no longer exists in pandas 2.x.
    rows = []

    for counterfactual in best_counterfactuals:
        print(f"Original {original} and generated {original} to {desired} CFE populations are being tested...")
        print(f"best_original_data: {best_original_data}")
        print(f"counterfactual: {counterfactual}")
        t_statistic, p_value = stats.ttest_rel(best_original_data, counterfactual)

        # A NaN p-value compares False here and is therefore counted as
        # not significant.
        is_significant = p_value < alpha

        result = f"T-Statistic: {t_statistic}\n"
        result += f"P-Value: {p_value}\n"
        if is_significant:
            result += f"The difference between original {original} and generated {original} to {desired} CFE populations is statistically significant.\n"
            significant_count += 1
        else:
            result += f"There is no statistically significant difference between original {original} and generated {original} to {desired} CFE populations.\n"
            not_significant_count += 1

        # Append after every test so partial results survive a later failure.
        with open(filename_txt, 'a') as file:
            file.write(result)
            file.write("Significance Counts:\n")
            file.write(f"Significant: {significant_count}\n")
            file.write(f"Not Significant: {not_significant_count}\n\n")

        rows.append({
            'Original Case': original,
            'Generated Case': f"{original} to {desired}",
            'Features': best_features,
            'T-Statistic': t_statistic,
            'P-Value': p_value,
            'Statistical Significance': "Statistically significant" if is_significant else "Not statistically significant",
            'Significant': significant_count,
            'Not Significant': not_significant_count,
        })

    results_df = pd.DataFrame(rows, columns=result_columns)

    # Keep results written by earlier calls for the same class pair.
    if os.path.isfile(filename_excel):
        existing_data = pd.read_excel(filename_excel)
        results_df = pd.concat([existing_data, results_df], ignore_index=True)
    results_df.to_excel(filename_excel, index=False)
    adjust_column_width_and_center(filename_excel)
    

In [None]:
original_class = ["MB", "EP", "PA", "BG"]
desired_class = ["MB", "EP", "PA", "BG"]

# The merged sheets are read in groups of six rows: one baseline row followed
# by its five counterfactual rows.
rows_per_group = 6

for original in original_class:
    for desired in desired_class:
        # perform_ttest produces no output for a class paired with itself, so
        # skip the pair before loading its workbook.
        if original == desired:
            continue

        read_file = f'{root_dir}/counterfactuals/formatted_cfe_from_json/{original}/{original}_to_{desired}_merged_data.xlsx'
        # NOTE(review): write_file is not used anywhere in the visible code.
        write_file = f'{root_dir}/counterfactuals/1-ttests_of_cfes/{original}/{original}_to_{desired}.xlsx'

        df = pd.read_excel(read_file)
        df = change_labels(df)

        # Visit only the first row of every group (positions 0, 6, 12, ...).
        for start in range(0, len(df), rows_per_group):
            # Only groups whose baseline belongs to the original class are tested.
            if df['TUMOR_TYPE'].iloc[start] != original:
                continue

            # Both slices drop the last column -- presumably the label column;
            # TODO confirm against the merged-data layout.
            baseline_row = df.iloc[start, :-1]
            cfe_rows = df.iloc[start + 1:start + rows_per_group, :-1]
            perform_ttest(root_dir, baseline_row, cfe_rows, original, desired)