# Init

In [1]:
import os
import json
import pandas as pd
from scipy.stats import ttest_ind, f_oneway, chi2_contingency
import statsmodels.api as sm
from statsmodels.formula.api import ols
from itertools import combinations, permutations
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy import stats
from statsmodels.sandbox.stats.multicomp import multipletests
from const import Const

# --- Setup ---
# NOTE(review): "..results" contains no path separator, so this resolves to a
# folder literally named "..results" inside the working directory -- confirm
# whether "../results/latex_tables" was intended.
output_dir = "..results/latex_tables"

# Path of the JSON input that is opened below; must be set before running.
json_dir = ''

# Validate the configuration before touching the disk so that a misconfigured
# run does not wipe previously generated results.
if not json_dir:
    raise ValueError("please set json_dir")

os.makedirs(output_dir, exist_ok=True)

# Remove files left over from a previous run. Sub-directories are skipped
# because os.remove() raises when given a directory.
for file in os.listdir(output_dir):
    file_path = os.path.join(output_dir, file)
    if os.path.isfile(file_path):
        os.remove(file_path)

# --- Load data ---
# Open the configured path (the variable), not the literal string 'json_dir'.
with open(json_dir, 'r', encoding='utf-8') as f:
    data = json.load(f)
df = pd.DataFrame(data['num_faces']['combined'])

# --- Normalize mean depth (min-max scaling) ---
scaler = MinMaxScaler()
df[Const.mean_depth] = scaler.fit_transform(df[[Const.mean_depth]])


# Zone classification (3x3 grid): each face-centre coordinate is cut into
# three bins and the two labels are combined as "<row>-<column>".
df['x_zone'] = pd.cut(df['face_center_x'], bins=3, labels=['left','center','right'])
df['y_zone'] = pd.cut(df['face_center_y'], bins=3, labels=['top','middle','bottom'])
df['grid_zone'] = df['y_zone'].astype(str) + '-' + df['x_zone'].astype(str)

# Feature groups used by every analysis cell below.
categorical_features = [Const.gender, Const.race, Const.age_range, Const.emotion]
continuous_features = [Const.centrality, Const.mean_depth, Const.face_center_y]

ValueError: please set json_dir

In [None]:
# Subfigure counters, keyed first by figure .tex filename and then by figure
# label: num_img_file[latex_img_filename][img_label] -> subfigures written.
num_img_file = {}

def save_latex_table(df, caption, label, col_format=None, bold_header=True, filename="table.tex", mode='w', all=False, img_label='', latex_img_filename='', figure_caption = ''):
    """Write *df* as a LaTeX table, render it as an SVG and emit a subfigure.

    Three outputs are produced inside ``output_dir``:

    * a ``tabularx`` table written to ``filename``;
    * an SVG rendering of the table named after ``label``;
    * a ``subfigure`` snippet written to ``latex_img_filename``.

    Args:
        df: DataFrame to render; the index supplies the row names.
        caption: Table caption (underscores are replaced by spaces).
        label: Table label (underscores are removed); also used to build the
            SVG filename and the subfigure label.
        col_format: ``tabularx`` column spec; when ``None`` one ``l`` column
            plus one ``X`` column per data column is used.
        bold_header: Prefix column headers and row names with ``\\bf``.
        filename: Table ``.tex`` file, relative to ``output_dir``.
        mode: File mode used for BOTH ``.tex`` files ('w' or 'a').
        all: Selects the subfigure width (0.3 vs 0.48 of ``\\textwidth``) and
            the ``\\hfill`` rhythm (modulo 3 vs modulo 2). Shadows the
            builtin ``all`` inside this function.
        img_label: Label of the enclosing ``figure`` environment.
        latex_img_filename: Figure ``.tex`` file, relative to ``output_dir``.
        figure_caption: Caption of the subfigure.

    The ``figure`` environment is opened on the first call for a given
    (latex_img_filename, img_label) pair; the caller is responsible for
    writing the final ``\\end{figure}``.
    """
    # Make sure a counter exists for this (file, figure label) pair.
    label_counts = num_img_file.setdefault(latex_img_filename, {})
    label_counts.setdefault(img_label, 0)

    def _format_cell(value):
        # Python ints/floats get two decimals; anything else is stringified
        # with underscores turned into spaces.
        if isinstance(value, (int, float)):
            return f"{value:.2f}"
        return str(value).replace('_', ' ')

    if col_format is None:
        col_format = "| l | " + "X" * len(df.columns) + ' |'
    caption = caption.replace('_', ' ')
    label = label.replace('_', '')

    # ---- LaTeX table ----
    bold_prefix = "\\bf " if bold_header else ""
    header_cells = [bold_prefix + str(column) for column in df.columns]

    table_parts = [
        f"\\begin{{table}}[caption={{{caption}}}, label={label}]\n",
        "\t\\centering\n",
        f"\t\\begin{{tabularx}}{{\\textwidth}}{{{col_format}}}\n",
        "\t\t \\hline\n",
        "\t\t   & " + " & ".join(header_cells) + " \\\\ \n",
        "         \\hline\n",
    ]
    for idx in df.index:
        row_name = bold_prefix + str(idx)
        values = " & ".join([_format_cell(val) for val in df.loc[idx]])
        table_parts.append(f"         {row_name} & {values}\\\\\n")
    table_parts.append("         \\hline\n")
    table_parts.append("\t\\end{tabularx}\n")
    table_parts.append("\\end{table}\n")
    latex_str = "".join(table_parts)

    # Every underscore is stripped from the emitted LaTeX.
    with open(os.path.join(output_dir, filename), mode) as f:
        f.write(latex_str.replace('_', ''))
        if mode == 'a':
            f.write("\n\n")  # keep appended tables visually separated

    # ---- SVG rendering of the table ----
    image_filename = os.path.join(output_dir, f"{label.replace(':', '_')}.svg")
    rows, cols = df.shape
    row_height = 0.2  # approximate inches per row
    col_width = 1.2   # approximate inches per column
    fig_height = max(2, rows * row_height)
    fig_width = max(5, cols * col_width)

    fig, ax = plt.subplots(figsize=(fig_width, fig_height))
    ax.set_facecolor('#00000000')
    fig.patch.set_facecolor('#00000000')
    ax.axis('off')

    cell_text = [[_format_cell(x) for x in row] for row in df.values]

    table = ax.table(cellText=cell_text, colLabels=df.columns, loc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.auto_set_column_width(list(range(len(df.columns))))

    plt.tight_layout()
    plt.savefig(image_filename, bbox_inches='tight')
    plt.close()

    # ---- LaTeX subfigure snippet ----
    emitted = label_counts[img_label]
    img_parts = []
    if emitted == 0:
        # A new figure label in a file that already holds another figure:
        # close the previous figure before opening this one.
        if len(label_counts) > 1:
            img_parts.append("\\end{figure}\n\n\n")
        img_parts.append(f"\\begin{{figure}}[label={{{img_label}}}, caption={{{caption}}}]\n\t\\centering\n")

    if all:
        img_parts.append("\t\\begin{subfigure}{0.3\\textwidth}\n")
    else:
        img_parts.append("\t\\begin{subfigure}{0.48\\textwidth}\n")

    # NOTE(review): image_filename already contains output_dir, so the include
    # path becomes 'figures/results/<output_dir>/<name>.svg', and the
    # underscore stripping below also rewrites it -- confirm this matches the
    # LaTeX project layout.
    include_path = os.path.join('figures/results/', image_filename)
    if '.svg' in image_filename:
        img_parts.append(f"\t\t\\includesvg[width=\\textwidth]{{{include_path}}}\n")
    else:
        img_parts.append(f"\t\t\\includegraphics[width=\\textwidth]{{{include_path}}}\n")
    img_parts.append(f"\t\t\\caption{{{figure_caption}}}\n")
    img_parts.append(f"\t\t\\label{{fig:{label}:{emitted}}}\n")
    img_parts.append("\t\\end{subfigure}\n")

    # Horizontal filler after every subfigure whose counter is not a multiple
    # of the row length (3 when `all` is set, 2 otherwise).
    per_row = 3 if all else 2
    if emitted % per_row != 0:
        img_parts.append('\t\\hfill\n')

    with open(os.path.join(output_dir, latex_img_filename), mode) as f:
        f.write("".join(img_parts).replace('_', ''))

    label_counts[img_label] += 1








In [None]:
# ============== Two way =======================

all_results_filename = 'twoway_cor_all.tex'
relevant_results_filename = 'twoway_cor_relevant.tex'
all_results_img_filename = 'Twoway_figures_all.tex'
relevant_results_img_filename = 'Twoway_figures_sig.tex'

# --- Two-Way Correlations ---
# Start every output file afresh with the subsection heading.
subsection_latex = "\\subsection{Two-Way Correlations}\n\n"
for target in (
    all_results_filename,
    relevant_results_filename,
    all_results_img_filename,
    relevant_results_img_filename,
):
    with open(os.path.join(output_dir, target), 'w') as f:
        f.write(subsection_latex)

# Fixed display order used for the emotion/race tables.
emotion_order = ["Angry", "Disgust", "Fear", "Happy", "Neutral", "Sad", "Surprise"]
race_order = ["Asian", "Black", "Indian", "Latino Hispanic", "Middle Eastern", "White"]

for f1, f2 in permutations(categorical_features, 2):
    # Row-normalised percentages: rows are values of f1, columns values of f2.
    crosstab_percentage = pd.crosstab(df[f1], df[f2], normalize='index') * 100
    if {f1, f2} == {"emotion", "race"}:
        # permutations() yields both (emotion, race) and (race, emotion). The
        # ordering lists must follow the axis they describe; applying the
        # emotion labels to a race index matches nothing and reindex() would
        # return an all-NaN table.
        if f1 == "emotion":
            crosstab_percentage = crosstab_percentage.reindex(index=emotion_order, columns=race_order)
        else:
            crosstab_percentage = crosstab_percentage.reindex(index=race_order, columns=emotion_order)

    col_format = "| l | " + "X" * len(crosstab_percentage.columns) + ' |'

    subsection_title = f"{f1.title()} and {f2.title()}"
    subsection_latex = f"\\subsubsection{{{subsection_title.replace('_', ' ')}}}\n\n"

    with open(os.path.join(output_dir, all_results_filename), 'a') as f:
        f.write(subsection_latex)
    save_latex_table(
        crosstab_percentage.round(2),
        caption=f"All Correlated Results - {f1.title()} & {f2.title()}",
        label=f"tab:corstat:{f1}{f2}",
        filename=all_results_filename,
        mode='a',
        col_format=col_format,
        img_label="fig:corstat",
        latex_img_filename=all_results_img_filename,
        figure_caption=f"All Correlated Results - {f1.title()} & {f2.title()}",
        all=True
    )

    # For relevant results, we'll include all correlation tables as they are descriptive
    with open(os.path.join(output_dir, relevant_results_filename), 'a') as f:
        f.write(subsection_latex)
    save_latex_table(
        crosstab_percentage.round(2),
        caption=f"Significant Correlated Results - {f1.title()} & {f2.title()}",
        label=f"tab:corstatsig:{f1}{f2}",
        filename=relevant_results_filename,
        mode='a',
        col_format=col_format,
        img_label="fig:corstatsig",
        latex_img_filename=relevant_results_img_filename,
        figure_caption=f"Significant Correlated Results - {f1.title()} & {f2.title()}"
    )

# save_latex_table() opens the figure environment; close it here.
with open(os.path.join(output_dir, all_results_img_filename), 'a') as f:
    f.write('\\end{figure}\n\n')
with open(os.path.join(output_dir, relevant_results_img_filename), 'a') as f:
    f.write('\\end{figure}\n\n')


In [None]:
# ============== Three way =======================

all_results_filename = 'threeway_cor_all.tex'
relevant_results_filename = 'threeway_cor_relevant.tex'
all_results_img_filename = 'threeway_fig_all.tex'
relevant_results_img_filename = 'threeway_fig_relevant.tex'

# Seed the two table files and the two figure files with the heading.
subsection_latex = "\\subsection{Three-Way Correlations}\n\n"
for target in (
    all_results_filename,
    relevant_results_filename,
    all_results_img_filename,
    relevant_results_img_filename,
):
    with open(os.path.join(output_dir, target), 'w') as f:
        f.write(subsection_latex)

for f1, f2, f3 in permutations(categorical_features, 3):
    # Share of each f3 value within every (f1, f2) group, as a percentage;
    # combinations that never occur are filled with 0.
    within_group_share = df.groupby([f1, f2])[f3].value_counts(normalize=True)
    grouped_percentage = within_group_share.unstack().fillna(0) * 100
    rounded_percentage = grouped_percentage.round(2)

    col_format = "| l | " + "X" * len(grouped_percentage.columns) + ' |'

    subsection_title = f"{f1.title()}, {f2.title()}, {f3.title()}"
    subsection_latex = f"\\subsubsection{{{subsection_title.replace('_', ' ')}}}\n\n"

    # --- full results ---
    with open(os.path.join(output_dir, all_results_filename), 'a') as f:
        f.write(subsection_latex)
    all_caption = f"All Three-way Correlation: {f1.title()}, {f2.title()}, {f3.title()}"
    save_latex_table(
        rounded_percentage,
        caption=all_caption,
        label=f"tab:corstat3:{f1}{f2}{f3}",
        filename=all_results_filename,
        mode='a',  # the heading is already in the file
        col_format=col_format,
        img_label="fig:corstat3",
        latex_img_filename=all_results_img_filename,
        figure_caption=all_caption,
        all=True
    )

    # --- "relevant" results: the same descriptive table is repeated ---
    with open(os.path.join(output_dir, relevant_results_filename), 'a') as f:
        f.write(subsection_latex)
    save_latex_table(
        rounded_percentage,
        caption=f"Significant Three-way Correlation: {f1.title()}, {f2.title()}, {f3.title()}",
        label=f"tab:corstat3sig:{f1}{f2}{f3}",
        filename=relevant_results_filename,
        mode='a',  # the heading is already in the file
        col_format=col_format,
        img_label="fig:corstat3sig",
        latex_img_filename=relevant_results_img_filename,
        figure_caption=f"Significant All Three-way Correlation: {f1.title()}, {f2.title()}, {f3.title()}",
    )

# Close the figure environments opened by save_latex_table().
for target in (all_results_img_filename, relevant_results_img_filename):
    with open(os.path.join(output_dir, target), 'a') as f:
        f.write('\\end{figure}\n\n')

In [None]:
# --- T-tests ---
# Results are stored as {continuous feature: {categorical feature: [rows]}}.
t_results = {Const.centrality:{}, Const.mean_depth: {} , Const.face_center_y : {}}
significant_t_results = {Const.centrality:{}, Const.mean_depth: {}, Const.face_center_y : {}}

all_results_filename = 'ttest_all.tex'
relevant_results_filename = 'ttest_relevant.tex'
all_results_img_filename = 'ttest_figure_all.tex'
relevant_results_img_filename = 'ttest_figure_relevant.tex'


# Seed all four files with this cell's own heading. The figure files used to
# receive `subsection_latex`, a stale value left behind by whichever cell ran
# before this one.
section_latex = "\\section{T-tests}\n\n"
for target in (
    all_results_filename,
    relevant_results_filename,
    all_results_img_filename,
    relevant_results_img_filename,
):
    with open(os.path.join(output_dir, target), 'w') as f:
        f.write(section_latex)


for cat_feature in categorical_features:
    unique_values = df[cat_feature].unique()
    for val1, val2 in combinations(unique_values, 2):
        # Row selection does not depend on the measure; do it once per pair.
        rows1 = df[df[cat_feature] == val1]
        rows2 = df[df[cat_feature] == val2]
        for cont_feature in continuous_features:
            g1 = rows1[cont_feature].dropna()
            g2 = rows2[cont_feature].dropna()
            # Each group needs at least two observations.
            if len(g1) <= 1 or len(g2) <= 1:
                continue
            ttest_result = ttest_ind(g1, g2)
            result_row = [cat_feature, val1, val2, cont_feature, g1.mean(), g2.mean(), ttest_result.statistic, ttest_result.pvalue]
            t_results[cont_feature].setdefault(cat_feature, []).append(result_row)
            if ttest_result.pvalue < 0.05:  # significance level of 0.05
                significant_t_results[cont_feature].setdefault(cat_feature, []).append(result_row)

t_columns = ["Category", "Group1", "Group2", "Measure", "Mean1", "Mean2", "t-stat", "p-value"]

# Track whether a figure environment was actually opened in each figure file
# so that '\end{figure}' is only written when there is something to close.
wrote_all_figure = False
wrote_relevant_figure = False

for key in t_results:
    subsection_latex = f"\\subsection{{{key.replace('_', ' ').title()}}}\n\n"

    with open(os.path.join(output_dir, all_results_filename), 'a') as f:
        f.write(subsection_latex)

    with open(os.path.join(output_dir, relevant_results_filename), 'a') as f:
        f.write(subsection_latex)

    for cat in categorical_features:
        # No pair of groups had enough data for this category: nothing to
        # report (indexing t_results[key][cat] would raise KeyError).
        if cat not in t_results[key]:
            continue

        subsubsection_latex = f"\\subsubsection{{{cat.replace('_', ' ').title()}}}\n\n"

        with open(os.path.join(output_dir, all_results_filename), 'a') as f:
            f.write(subsubsection_latex)

        t_df = pd.DataFrame(t_results[key][cat], columns=t_columns)
        save_latex_table(
            t_df,
            caption=f"Full t-test Results - {key.replace('_', ' ').title()} for {cat.replace('_', '').title()}",
            label=f"tab:ttests:{key.replace('_', '').title()}_{cat.replace('_', '').title()}",
            bold_header=True,
            filename=all_results_filename,
            mode='a',
            img_label="fig:ttests",
            latex_img_filename=all_results_img_filename,
            figure_caption=f"Full t-test Results - {key.replace('_', ' ').title()} for {cat.replace('_', '').title()}",
            all=True)
        wrote_all_figure = True

        if cat in significant_t_results[key]:

            with open(os.path.join(output_dir, relevant_results_filename), 'a') as f:
                f.write(subsubsection_latex)
            significant_t_df = pd.DataFrame(significant_t_results[key][cat], columns=t_columns)
            save_latex_table(
                significant_t_df,
                caption=f"Significant T-test Results - {key.replace('_', ' ').title()} for {cat.replace('_', '').title()}",
                label=f"tab:ttestssig:_{key.replace('_', '').title()}_{cat.replace('_', '').title()}",
                bold_header=True,
                filename=relevant_results_filename,
                mode='a',
                img_label="fig:ttestssig",
                latex_img_filename=relevant_results_img_filename,
                figure_caption=f"Significant t-test Results - {key.replace('_', ' ').title()} for {cat.replace('_', '').title()}",
                )
            wrote_relevant_figure = True


if wrote_all_figure:
    with open(os.path.join(output_dir, all_results_img_filename), 'a') as f:
        f.write('\\end{figure}\n\n')
if wrote_relevant_figure:
    with open(os.path.join(output_dir, relevant_results_img_filename), 'a') as f:
        f.write('\\end{figure}\n\n')

In [None]:
# --- One-Way ANOVA ---
anova_results = []
significant_anova_results = []

all_results_filename = 'anova_one_all.tex'
relevant_results_filename = 'anova_one_relevant.tex'

all_results_img_filename = 'anova_one_figure_all.tex'
relevant_results_img_filename = 'anova_one_figure_relevant.tex'

# Seed the table files and the figure files with the heading.
subsection_latex = "\\subsection{One-Way ANOVA}\n\n"
for target in (
    all_results_filename,
    relevant_results_filename,
    all_results_img_filename,
    relevant_results_img_filename,
):
    with open(os.path.join(output_dir, target), 'w') as f:
        f.write(subsection_latex)


for cat_feature in categorical_features:
    # Drop missing category values: NaN never compares equal, so it would
    # yield an empty group and make the size check below reject the whole
    # feature.
    category_values = df[cat_feature].dropna().unique()
    for cont_feature in continuous_features:
        groups = [df[cont_feature][df[cat_feature] == val].dropna() for val in category_values]
        # Need at least two groups, each with at least two observations.
        if len(groups) > 1 and all(len(g) > 1 for g in groups):
            f_statistic, p_value = f_oneway(*groups)
            result_row = [cat_feature, cont_feature, f_statistic, p_value]
            anova_results.append(result_row)
            if p_value < 0.05:  # significance level of 0.05
                significant_anova_results.append(result_row)

anova_columns = ["Category", "Measure", "F-stat", "p-value"]

# '\end{figure}' is written only where save_latex_table() actually opened a
# figure; otherwise the .tex file would contain an unmatched \end{figure}.
if anova_results:
    anova_df = pd.DataFrame(anova_results, columns=anova_columns)
    save_latex_table(
        anova_df,
        caption="Full One-Way ANOVA Results",
        label="tab:anova1",
        filename=all_results_filename,
        mode='a',
        img_label="fig:anova1",
        latex_img_filename=all_results_img_filename,
        figure_caption="Full One-Way ANOVA Results",
        all=True)
    with open(os.path.join(output_dir, all_results_img_filename), 'a') as f:
        f.write('\\end{figure}\n\n')

if significant_anova_results:
    significant_anova_df = pd.DataFrame(significant_anova_results, columns=anova_columns)
    save_latex_table(
        significant_anova_df,
        caption="Significant One-Way ANOVA Results",
        label="tab:anova1_sig",
        filename=relevant_results_filename,
        mode='a',
        img_label="fig:anova1_sig",
        latex_img_filename=relevant_results_img_filename,
        figure_caption="Significant One-Way ANOVA Results")
    with open(os.path.join(output_dir, relevant_results_img_filename), 'a') as f:
        f.write('\\end{figure}\n\n')

In [None]:
# --- Two-Way ANOVA ---
all_results_filename = 'anova_two_all.tex'
relevant_results_filename = 'anova_two_relevant.tex'

all_results_img_filename = 'anova_two_figure_all.tex'
relevant_results_img_filename = 'anova_two_figure_relevant.tex'

# Seed the table files and the figure files with the heading.
subsection_latex = "\\subsection{Two-Way ANOVA}\n\n"
for target in (
    all_results_filename,
    relevant_results_filename,
    all_results_img_filename,
    relevant_results_img_filename,
):
    with open(os.path.join(output_dir, target), 'w') as f:
        f.write(subsection_latex)

# Track whether a figure environment was opened in each figure file so that
# '\end{figure}' is only written when there is something to close.
wrote_all_figure = False
wrote_relevant_figure = False

for f1, f2 in combinations(categorical_features, 2):
    # Every observed (f1, f2) cell must hold more than one row. The check
    # does not depend on the measure, so it is evaluated once per pair.
    if not (df.groupby([f1, f2]).size().min() > 1):
        continue

    f1_title = f1.replace('_', ' ').title()
    f2_title = f2.replace('_', ' ').title()

    for cont_feature in continuous_features:
        cont_title = cont_feature.replace('_', ' ').title()

        # Main effects plus interaction, Type II sums of squares.
        formula = f"{cont_feature} ~ C({f1}) * C({f2})"
        model = ols(formula, data=df).fit()
        anova_table = sm.stats.anova_lm(model, typ=2)
        anova_table = anova_table.reset_index().rename(columns={"index": "Source"})

        subsubsection_latex = f"\\subsubsection{{{f1_title} and {f2_title} on {cont_title}}}\n\n"
        with open(os.path.join(output_dir, all_results_filename), 'a') as f:
            f.write(subsubsection_latex)
        save_latex_table(
            anova_table.round(4),
            caption=f"Full two-Way ANOVA: {f1_title} and {f2_title} on {cont_title}",
            label=f"tab:anova2:{f1.replace('_', '')}{f2.replace('_', '')}{cont_feature.replace('_', '')}",
            filename=all_results_filename,
            mode='a',
            img_label="fig:anova2",
            latex_img_filename=all_results_img_filename,
            figure_caption=f"Full two-Way ANOVA: {f1_title} and {f2_title} on {cont_title}",
            all=True
        )
        wrote_all_figure = True

        # Check for significant interaction term (p-value < 0.05)
        interaction_p_value = anova_table[anova_table['Source'] == f'C({f1}):C({f2})']['PR(>F)'].iloc[0]
        if interaction_p_value < 0.05:
            with open(os.path.join(output_dir, relevant_results_filename), 'a') as f:
                f.write(subsubsection_latex)
            save_latex_table(
                anova_table.round(4),  # the full table is saved when the interaction is significant
                caption=f"Two-Way ANOVA: {f1_title} and {f2_title} on {cont_title} (Significant Interaction)",
                # Underscores are removed (not turned into spaces) so the label
                # and the SVG filename derived from it contain no blanks,
                # matching the "all" label above.
                label=f"tab:anova2sig:int{f1.replace('_', '')}{f2.replace('_', '')}{cont_feature.replace('_', '')}",
                filename=relevant_results_filename,
                mode='a',
                img_label="fig:anova2sig",
                latex_img_filename=relevant_results_img_filename,
                figure_caption=f"Two-Way ANOVA: {f1_title} and {f2_title} on {cont_title} (Significant Interaction)",
            )
            wrote_relevant_figure = True

if wrote_all_figure:
    with open(os.path.join(output_dir, all_results_img_filename), 'a') as f:
        f.write('\\end{figure}\n\n')
if wrote_relevant_figure:
    with open(os.path.join(output_dir, relevant_results_img_filename), 'a') as f:
        f.write('\\end{figure}\n\n')

In [None]:
# --- 1. Spatial bias ---

all_results_filename = 'spatial_all.tex'
relevant_results_filename = 'spatial_relevant.tex'

all_results_img_filename = 'spatial_figure_all.tex'
relevant_results_img_filename = 'spatial_figure_relevant.tex'

# All four files are (re)created with 'w', like in the other analysis cells,
# so re-running this cell does not pile duplicate headings and tables onto
# the table files.
subsection_latex = "\\subsection*{Centrality Bias by Demographic}\n\n"
for target in (
    all_results_filename,
    relevant_results_filename,
    all_results_img_filename,
    relevant_results_img_filename,
):
    with open(os.path.join(output_dir, target), 'w') as f:
        f.write(subsection_latex)

bias_columns = ["Demographic", "Test", "Statistic", "p-value"]

# Track whether a figure environment was opened in each figure file so that
# '\end{figure}' is only written when there is something to close.
wrote_all_figure = False
wrote_relevant_figure = False

for cont in continuous_features:
    cont_title = cont.replace('_', ' ').title()
    centrality_results_all = []
    centrality_results_relevant = []
    for col in categorical_features:
        # Missing measurements are dropped, as in the t-test / ANOVA cells;
        # a NaN inside a group would turn the statistic into NaN.
        groups = [g[cont].dropna().values for _, g in df.groupby(col)]

        # ANOVA if >2 groups, t-test for exactly 2, nothing otherwise.
        if len(groups) > 2:
            test_name = 'ANOVA'
            statistic, p_val = f_oneway(*groups)
        elif len(groups) == 2:
            test_name = 't-test'
            statistic, p_val = ttest_ind(groups[0], groups[1])
        else:
            continue

        result_row = [col, test_name, statistic, p_val]
        centrality_results_all.append(result_row)
        if p_val < 0.05:
            centrality_results_relevant.append(result_row)

    # Save the results for this measure to LaTeX tables
    if centrality_results_all:
        centrality_df_all = pd.DataFrame(centrality_results_all, columns=bias_columns)
        save_latex_table(
            centrality_df_all.round(4),
            caption=f"{cont_title} Bias Statistical Tests (All)",
            label=f"tab:biasall:{cont.replace('_', '').title()}all",
            filename=all_results_filename,
            mode='a',
            img_label="fig:biasall",
            latex_img_filename=all_results_img_filename,
            figure_caption=f"{cont_title} Bias Statistical Tests (All)",
            all=True)
        wrote_all_figure = True

    if centrality_results_relevant:
        centrality_df_relevant = pd.DataFrame(centrality_results_relevant, columns=bias_columns)
        save_latex_table(
            centrality_df_relevant.round(4),
            caption=f"{cont_title} Bias Statistical Tests (Relevant)",
            label=f"tab:biassig:{cont.replace('_', '').title()}relevant",
            filename=relevant_results_filename,
            mode='a',
            img_label="fig:biassig",
            latex_img_filename=relevant_results_img_filename,
            figure_caption=f"{cont_title} Bias Statistical Tests (Relevant)",)
        wrote_relevant_figure = True


if wrote_all_figure:
    with open(os.path.join(output_dir, all_results_img_filename), 'a') as f:
        f.write('\\end{figure}\n\n')
if wrote_relevant_figure:
    with open(os.path.join(output_dir, relevant_results_img_filename), 'a') as f:
        f.write('\\end{figure}\n\n')

In [None]:
# --- 2. Zone bias ---

all_results_filename = 'chi_all.tex'
relevant_results_filename = 'chi_relevant.tex'

all_results_img_filename = 'chi_figure_all.tex'
relevant_results_img_filename = 'chi_figure_relevant.tex'

zone_results_all = []
zone_results_relevant = []

# Seed the table files and the figure files with the heading.
subsection_latex = "\\subsection{Zone Bias (Chi-square)}\n\n"
for target in (
    all_results_filename,
    relevant_results_filename,
    all_results_img_filename,
    relevant_results_img_filename,
):
    with open(os.path.join(output_dir, target), 'w') as f:
        f.write(subsection_latex)


# Chi-square test of independence between each demographic and the grid zone.
for col in categorical_features:
    contingency = pd.crosstab(df[col], df['grid_zone'])
    chi2, p, dof, _ = chi2_contingency(contingency)  # expected frequencies are not used
    result_row = [col, chi2, p, dof]
    zone_results_all.append(result_row)
    if p < 0.05:
        zone_results_relevant.append(result_row)

zone_columns = ["Demographic", "Chi2-stat", "p-value", "DOF"]

# Save zone bias results to LaTeX tables. '\end{figure}' is written only
# where save_latex_table() actually opened a figure; otherwise the .tex file
# would contain an unmatched \end{figure}.
if zone_results_all:
    zone_df_all = pd.DataFrame(zone_results_all, columns=zone_columns)
    save_latex_table(
        zone_df_all.round(4),
        caption="Zone Bias Chi-square Tests (All)",
        label="tab:zonebiasall",
        filename=all_results_filename,
        mode='a',
        img_label="fig:zonebiasall",
        latex_img_filename=all_results_img_filename,
        figure_caption="Zone Bias Chi-square Tests (All)",
        all=True)
    with open(os.path.join(output_dir, all_results_img_filename), 'a') as f:
        f.write('\\end{figure}\n\n')

if zone_results_relevant:
    zone_df_relevant = pd.DataFrame(zone_results_relevant, columns=zone_columns)
    save_latex_table(
        zone_df_relevant.round(4),
        caption="Zone Bias Chi-square Tests (Relevant)",
        label="tab:zonebiasrelevant",
        filename=relevant_results_filename,
        mode='a',
        img_label="fig:zonebiasrelevant",
        latex_img_filename=relevant_results_img_filename,
        figure_caption="Zone Bias Chi-square Tests (Relevant)")
    with open(os.path.join(output_dir, relevant_results_img_filename), 'a') as f:
        f.write('\\end{figure}\n\n')

In [None]:
# --- 3. Horizontal and vertical bias ---

all_results_filename = 'horizontal_all.tex'
relevant_results_filename = 'horizontal_relevant.tex'

all_results_img_filename = 'horizontal_figure_all.tex'
# The relevant figures get their own file; this used to repeat the "all"
# filename, so the relevant output truncated and then mixed into that file.
relevant_results_img_filename = 'horizontal_figure_relevant.tex'

pos_results_all = []
pos_results_relevant = []

# Seed the table files and the figure files with the heading.
subsection_latex = "\\subsection{Horizontal / Vertical Position Bias}\n\n"
for target in (
    all_results_filename,
    relevant_results_filename,
    all_results_img_filename,
    relevant_results_img_filename,
):
    with open(os.path.join(output_dir, target), 'w') as f:
        f.write(subsection_latex)


for axis, axis_name in [('face_center_x','horizontal'), ('face_center_y','vertical')]:
    for col in ['gender','race','emotion','age_range']:
        # Missing coordinates are dropped; a NaN inside a group would turn
        # the statistic into NaN.
        groups = [g[axis].dropna().values for _, g in df.groupby(col)]

        # ANOVA if >2 groups, t-test for exactly 2, nothing otherwise.
        if len(groups) > 2:
            test_name = 'ANOVA'
            statistic, p_val = f_oneway(*groups)
        elif len(groups) == 2:
            test_name = 't-test'
            statistic, p_val = ttest_ind(groups[0], groups[1])
        else:
            continue

        result_row = [axis_name, col, test_name, statistic, p_val]
        pos_results_all.append(result_row)
        if p_val < 0.05:
            pos_results_relevant.append(result_row)

pos_columns = ["Axis", "Demographic", "Test", "Statistic", "p-value"]

# Save position bias results to LaTeX tables. '\end{figure}' is written only
# where save_latex_table() actually opened a figure; otherwise the .tex file
# would contain an unmatched \end{figure}.
if pos_results_all:
    pos_df_all = pd.DataFrame(pos_results_all, columns=pos_columns)
    save_latex_table(
        pos_df_all.round(4),
        caption="Position Bias Statistical Tests (All)",
        label="tab:posbiasall",
        filename=all_results_filename,
        mode='a',
        img_label="fig:posbiasall",
        latex_img_filename=all_results_img_filename,
        figure_caption="Position Bias Statistical Tests (All)",
        all=True)
    with open(os.path.join(output_dir, all_results_img_filename), 'a') as f:
        f.write('\\end{figure}\n\n')

if pos_results_relevant:
    pos_df_relevant = pd.DataFrame(pos_results_relevant, columns=pos_columns)
    save_latex_table(
        pos_df_relevant.round(4),
        caption="Position Bias Statistical Tests (Relevant)",
        label="tab:posbiasrelevant",
        filename=relevant_results_filename,
        mode='a',
        img_label="fig:posbiasrelevant",
        latex_img_filename=relevant_results_img_filename,
        figure_caption="Position Bias Statistical Tests (Relevant)", )
    with open(os.path.join(output_dir, relevant_results_img_filename), 'a') as f:
        f.write('\\end{figure}\n\n')

In [None]:
# --- 4. Heatmaps for each demographic ---

def plot_heatmap(subset, title):
    """Save a 2D-histogram heatmap of face-centre positions as a PNG file.

    The histogram is binned over the same coordinate range that is used to
    label the axes (from 0, or the global minimum if that is negative, up to
    the global maximum of the notebook-level ``df``). Binning over the
    subset's own min/max while labelling the image with the global range
    would draw the hot spots at the wrong coordinates.

    Args:
        subset: DataFrame with ``face_center_x`` and ``face_center_y``
            columns holding the points to bin.
        title: Plot title. Spaces and ``=`` are replaced by ``_`` to build
            the output file name ``heatmap_<title>.png`` inside
            ``output_dir``.
    """
    # One shared range for every subset keeps the heatmaps comparable.
    x_range = [min(0, df['face_center_x'].min()), df['face_center_x'].max()]
    y_range = [min(0, df['face_center_y'].min()), df['face_center_y'].max()]

    heatmap_data, xedges, yedges = np.histogram2d(
        subset['face_center_x'],
        subset['face_center_y'],
        bins=20,
        range=[x_range, y_range],
    )

    plt.figure(figsize=(8, 6))
    # histogram2d puts x along the first axis; imshow expects rows to be y,
    # hence the transpose. The extent comes from the actual bin edges so the
    # axis labels always match the binning.
    plt.imshow(heatmap_data.T, origin='lower', cmap='hot',
               extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]])
    plt.title(title)
    plt.xlabel('X position')
    plt.ylabel('Y position')
    plt.colorbar(label='Count')
    # Save heatmap as an image file
    plt.savefig(os.path.join(output_dir, f"heatmap_{title.replace(' ', '_').replace('=', '_')}.png"))
    plt.close()  # Close the figure to free memory

# Render one heatmap for every observed value of every categorical feature.
for col in categorical_features:
    for val in df[col].unique():
        # Keep only the rows of this group that have both coordinates.
        subset = df.loc[df[col] == val].dropna(subset=['face_center_x', 'face_center_y'])
        if subset.empty:
            continue
        plot_heatmap(subset, f"Heatmap for {col}={val}")

In [None]:
# --- Post hoc ---

# Output files of the post-hoc section: a table file and a companion figure
# file for each of the "all" and "relevant" result sets.
all_results_filename = 'post_all.tex'
relevant_results_filename = 'post_relevant.tex'
all_results_img_filename = 'post_figure_all.tex'
relevant_results_img_filename = 'post_figure_relevant.tex'

section_latex_all = "\\section{Post-hoc Test Results (All)}\n\n"
section_latex_relevant = "\\section{Post-hoc Test Results (Relevant)}\n\n"

# Start each table file from scratch with its section heading.
table_headings = (
    (all_results_filename, section_latex_all),
    (relevant_results_filename, section_latex_relevant),
)
for table_filename, heading in table_headings:
    with open(os.path.join(output_dir, table_filename), 'w') as f:
        f.write(heading)

# Start each figure file from scratch.
# NOTE(review): `subsection_latex` is not assigned in this cell before this
# point, so the figure files open with whatever an earlier cell left in that
# variable -- confirm this is intended.
for img_filename in (all_results_img_filename, relevant_results_img_filename):
    with open(os.path.join(output_dir, img_filename), 'w') as f:
        f.write(subsection_latex)



# Smallest absolute mean difference a Tukey pair must reach to be reported as
# "relevant"; measures not listed here get 0, i.e. no extra filtering.
mean_diff_thresholds = {
    Const.centrality: 20,
    Const.mean_depth: 0.06,
    Const.face_center_y: 80,
}


def _title_case(name):
    """Return a snake_case column name as a Title Case phrase."""
    return name.replace('_', ' ').title()


for cont_feature in continuous_features:
    significant_interactions = []

    # Two-way ANOVA for every pair of categorical features; remember the pairs
    # whose interaction term is significant at the 5% level.
    for f1, f2 in combinations(categorical_features, 2):
        # Require more than one sample in every observed (f1, f2) cell.
        if not df.groupby([f1, f2]).size().min() > 1:
            continue

        formula = f"{cont_feature} ~ C({f1}) * C({f2})"
        interaction_term = f'C({f1}):C({f2})'
        try:
            model = ols(formula, data=df).fit()
            anova_table = sm.stats.anova_lm(model, typ=2)
        except Exception as e:
            # Best effort: a model that cannot be fitted must not stop the
            # remaining feature pairs from being analysed.
            print(f"ANOVA failed for '{formula}': {e}")
            continue

        if interaction_term not in anova_table.index:
            continue
        p_value = anova_table.loc[interaction_term, 'PR(>F)']
        if p_value < 0.05:
            significant_interactions.append((f1, f2, cont_feature, p_value))

    print(cont_feature)
    print(significant_interactions)

    # Post-hoc Tukey HSD for every significant interaction.
    for f1, f2, measure, _p_val in significant_interactions:
        # Treat each (f1, f2) combination as one group. Plain assignment
        # overwrites the column if this cell is run more than once.
        combined_col_name = f'{f1}_{f2}'
        df[combined_col_name] = df[f1].astype(str) + '_' + df[f2].astype(str)

        # Use only rows where the measure and both factors are present.
        # Otherwise astype(str) turns a missing factor into a pseudo-group
        # such as "nan_x", which the formula-based ANOVA above never saw
        # because it drops incomplete rows.
        complete = df.dropna(subset=[measure, f1, f2])

        if complete[combined_col_name].nunique() <= 1:
            print(f"Skipping Tukey's HSD for {combined_col_name} on {measure}: fewer than two groups")
            continue

        heading = f"{_title_case(f1)} and {_title_case(f2)} on {_title_case(measure)}"

        try:
            tukey_results = pairwise_tukeyhsd(
                endog=complete[measure],
                groups=complete[combined_col_name],
                alpha=0.05,
            )

            # summary() is the public accessor for the results table; its
            # first row holds the column headers.
            tukey_table = tukey_results.summary().data
            tukey_df_all = pd.DataFrame(data=tukey_table[1:], columns=tukey_table[0])
            tukey_df_all = tukey_df_all.rename(columns={'meandiff': 'Mean Diff', 'p-adj': 'p-adj (Tukey)', 'lower': 'Lower CI', 'upper': 'Upper CI'})

            # Save all Tukey results to LaTeX
            subsection_latex = f"\\subsection{{Tukey's HSD for {heading} (All)}}\n\n"
            with open(os.path.join(output_dir, all_results_filename), 'a') as f:
                f.write(subsection_latex)
            save_latex_table(
                tukey_df_all.round(4),
                caption=f"Tukey's HSD Results (All): {heading}",
                label=f"tab:tukey_all:{f1}{f2}{measure}",
                filename=all_results_filename,
                mode='a',
                img_label="fig:tukey_all",
                latex_img_filename=all_results_img_filename,
                figure_caption=f"Tukey's HSD Results (All): {heading}",
                all=True
            )

            # Keep only pairs that are both statistically significant and
            # have a mean difference of at least the per-measure threshold.
            mean_diff_threshold = mean_diff_thresholds.get(measure, 0)
            is_relevant = tukey_df_all['reject'] & (tukey_df_all['Mean Diff'].abs() >= mean_diff_threshold)
            tukey_df_relevant = tukey_df_all[is_relevant].copy()

            if not tukey_df_relevant.empty:
                subsection_latex = f"\\subsection{{Tukey's HSD for {heading} (Relevant)}}\n\n"
                with open(os.path.join(output_dir, relevant_results_filename), 'a') as f:
                    f.write(subsection_latex)
                save_latex_table(
                    tukey_df_relevant.round(4),
                    caption=f"Tukey's HSD Results (Relevant): {heading}",
                    label=f"tab:tukey_relevant:{f1.replace('_', '')}{f2.replace('_', '')}{measure}",
                    filename=relevant_results_filename,
                    mode='a',
                    img_label="fig:tukey_relevant",
                    latex_img_filename=relevant_results_img_filename,
                    figure_caption=f"Tukey's HSD Results (Relevant): {heading}",
                )
        except ValueError as e:
            # Best effort: report the problem and continue with the next pair.
            print(e)

            # # Perform pairwise t-tests with Bonferroni correction

            # combined_groups = df[combined_col_name].unique()
            # p_values = []
            # comparisons = []
            # t_stats = [] # Store t-statistics
            # mean_diffs = [] # Store mean differences

            # for i in range(len(combined_groups)):
            #     for j in range(i+1, len(combined_groups)):
            #         group1 = df[df[combined_col_name] == combined_groups[i]][measure].dropna()
            #         group2 = df[df[combined_col_name] == combined_groups[j]][measure].dropna()
            #         if len(group1) > 1 and len(group2) > 1:
            #             t_stat, p_val = stats.ttest_ind(group1, group2)
            #             t_stats.append(t_stat) # Append t-statistic
            #             p_values.append(p_val)
            #             comparisons.append(f"{combined_groups[i]} vs {combined_groups[j]}")
            #             mean_diffs.append(group1.mean() - group2.mean()) # Append mean difference

            # if p_values:
            #     # Apply Bonferroni correction
            #     if len(p_values) > 0:
            #         reject, corrected_p_values, _, _ = multipletests(p_values, alpha=0.05, method='bonferroni')

            #         # Convert pairwise t-test results to DataFrame for LaTeX
            #         pairwise_ttest_df_all = pd.DataFrame({
            #             'Comparison': comparisons,
            #             't-stat': t_stats, # Include t-statistics
            #             'Original p-value': p_values,
            #             'Bonferroni corrected p-value': corrected_p_values,
            #             'Reject H0 (alpha=0.05)': reject,
            #             'Mean Diff': mean_diffs # Include mean differences
            #         })

            #         #


            #         # Save all pairwise t-test results to LaTeX
            #         subsection_latex = f"\\subsection{{Pairwise T-tests (Bonferroni) for {f1.replace('_', ' ').title()} and {f2.replace('_', ' ').title()} on {measure.replace('_', ' ').title()} (All)}}\n\n"
            #         with open(os.path.join(output_dir, all_results_filename), 'a') as f:
            #             f.write(subsection_latex)
            #         save_latex_table(
            #             pairwise_ttest_df_all.round(4),
            #             caption=f"Pairwise T-tests (Bonferroni) Results (All): {f1.replace('_', ' ').title()} and {f2.replace('_', ' ').title()} on {measure.replace('_', ' ').title()}",
            #             label=f"tab:pairwise_ttest_all:{f1.replace('_', '')}{f2.replace('_', '')}{measure.replace('_', '')}",
            #             filename=all_results_filename,
            #             mode='a'
            #         )

            #         # Save significant pairwise t-test results to LaTeX
            #         # Filter for both statistical significance AND a meaningful difference in means
            #         mean_diff_threshold = 0 # You can adjust this threshold
            #         pairwise_ttest_df_relevant = pairwise_ttest_df_all[(pairwise_ttest_df_all['Reject H0 (alpha=0.05)']) & (abs(pairwise_ttest_df_all['Mean Diff']) >= mean_diff_threshold)].copy()

            #         if not pairwise_ttest_df_relevant.empty:
            #              subsection_latex = f"\\subsection{{Pairwise T-tests (Bonferroni) for {f1.replace('_', ' ').title()} and {f2.replace('_', ' ').title()} on {measure.replace('_', ' ').title()} (Relevant)}}\n\n"
            #              with open(os.path.join(output_dir, relevant_results_filename), 'a') as f:
            #                  f.write(subsection_latex)
            #              save_latex_table(
            #                  pairwise_ttest_df_relevant.round(4),
            #                  caption=f"Pairwise T-tests (Bonferroni) Results (Relevant): {f1.replace('_', ' ').title()} and {f2.replace('_', ' ').title()} on {measure.replace('_', ' ').title()}",
            #                  label=f"tab:pairwise_ttest_relevant:{f1.replace('_', '')}{f2.replace('_', '')}{measure.replace('_', '')}",
            #                  filename=relevant_results_filename,
            #                  mode='a'
            #              )


            #     else:
            #         print('else')
            # else:
            #     print('else')


# Append the closing figure tag to both post-hoc figure files.
for img_filename in (all_results_img_filename, relevant_results_img_filename):
    with open(os.path.join(output_dir, img_filename), 'a') as f:
        f.write('\\end{figure}\n\n')

In [None]:
# Group the generated image files by report section.
# os.listdir returns entries in arbitrary order, so sort to make the order of
# the figures in the generated LaTeX reproducible.
image_files = sorted(
    name for name in os.listdir(output_dir)
    if name.endswith(('.svg', '.png'))
)

section_images = {}
for image_file in image_files:
    parts = image_file.split('_')
    if 'heatmap' in parts[0]:
        # Heatmap files carry their section name as the first token.
        section = parts[0]
    elif len(parts) > 1:
        # Otherwise the section is the second underscore-separated token.
        section = parts[1]
    else:
        # No underscore in the name: fall back to the file stem rather than
        # failing with an IndexError.
        section = os.path.splitext(image_file)[0]

    section_images.setdefault(section, []).append(image_file)


In [None]:
# Write one "<section>_figures.tex" file per section, with the section's
# images laid out as subfigures in figure environments of up to 8 images.
for section, section_files in section_images.items():
    section_title = section.replace('_', ' ').title()
    section_key = section.replace('_', '')

    tex_path = os.path.join(output_dir, f"{section}_figures.tex")
    with open(tex_path, 'w') as tex:
        tex.write(f"\\section{{{section_title} Figures}}\n\n")

        for page_no, start in enumerate(range(0, len(section_files), 8)):
            # Open the figure environment for this batch of images.
            tex.write(f"\\begin{{figure}}[label={{fig:{section_key}:{page_no}}}, caption={{All {section_title} Results}}]\n")
            tex.write("\\centering\n")

            for position, image in enumerate(section_files[start:start + 8], start=1):
                # Derive caption and label from the file name without the
                # section prefix and the file extension.
                stem = image.replace(f"{section}_", "").replace(".svg", "").replace(".png", "")
                label_suffix = stem.replace(":", "").replace("-", "").replace("_", "")
                caption_text = stem.replace("_", " ").replace(":", " - ")

                include_cmd = "\\includesvg" if '.svg' in image else "\\includegraphics"
                image_path = os.path.join('figures/results/', image)

                tex.write("\\begin{subfigure}{0.48\\textwidth}\n")
                tex.write(f"{include_cmd}[width=\\textwidth]{{{image_path}}}\n")
                tex.write(f"\\caption{{{caption_text}}}\n")
                tex.write(f"\\label{{fig:{section_key}_{label_suffix}}}\n")
                tex.write("\\end{subfigure}\n")
                # Emit a horizontal fill after every second subfigure.
                if position % 2 == 0:
                    tex.write("\\hfill\n")

            # Close the figure environment.
            tex.write("\\end{figure}\n")