# Statistical Analyses of Counts
NOTE: Requires `scipy>=1.15.0` to use Fisher's Exact Test for r $\times$ c contingency tables greater than $2 \times 2$

In [20]:
from collections import defaultdict
import glob
import os
from pathlib import PurePath
import re

import numpy as np
import pandas as pd
from scipy.stats import fisher_exact, MonteCarloMethod, PermutationMethod
from scipy.stats import false_discovery_control

In [2]:
notebook_save_name = "1.1-outcome-subgroup-stats:"

In [3]:
output_dir = 'output/'
output_files = glob.glob(f'{output_dir}*_counts.csv')
output_files

['output/1.0-subgroup-comparison:maternal_race_ethnicity_counts.csv',
 'output/1.0-subgroup-comparison:maternal_race_ethnicity_topk_counts.csv',
 'output/1.0-subgroup-comparison:paternal_race_ethnicity_counts.csv',
 'output/1.0-subgroup-comparison:paternal_race_ethnicity_topk_counts.csv',
 'output/1.0-subgroup-comparison:infant_sex_counts.csv',
 'output/1.0-subgroup-comparison:infant_sex_topk_counts.csv',
 'output/1.0-subgroup-comparison:maternal_age_counts.csv',
 'output/1.0-subgroup-comparison:maternal_age_topk_counts.csv']

In [4]:
compare_types = ['maternal_race_ethnicity', 'paternal_race_ethnicity', 'infant_sex', 'maternal_age']
outcome_compare = dict()
topk_compare = dict()

for f in output_files:
    if 'topk' in f:
        topk_compare[f] = pd.read_csv(f)
    else:
        outcome_compare[f] = pd.read_csv(f)

In [5]:
# Test case with outcome
df = outcome_compare['output/1.0-subgroup-comparison:maternal_race_ethnicity_counts.csv']

In [None]:
contingency_table = pd.crosstab(
    index=df['outcome_label'],
    columns=df['maternal_race_ethnicity'],
    values=df['count'],
    aggfunc='sum'
)

# Replace NA with the average value of the column to be able to do the exact test
contingency_table.fillna(contingency_table.mean(), inplace=True)
# contingency_table

In [None]:
# stat is probablility density of table under null hypothesis + fixed marginals
stat, p_val = fisher_exact(contingency_table)
# stat, p_val

In [None]:
def apply_fisher_exact(df, compare_group_col, outcome_label_col='outcome_label', count_col='count') -> pd.DataFrame:
        contingency_table = pd.crosstab(
            index=df[outcome_label_col],
            columns=df[compare_group_col],
            values=df[count_col],
            aggfunc='sum'
        )

        if contingency_table.shape != (2, 2):
            rng = np.random.default_rng(seed=101)
            method = MonteCarloMethod(rng=rng)
        else:
            method = None

        if contingency_table.isnull().any().any():
            contingency_table = contingency_table.fillna(contingency_table.mean())
            stat, pval = fisher_exact(contingency_table.to_numpy(), method=method)
        else:
            stat, pval = fisher_exact(contingency_table.to_numpy(), method=method)

        return pd.DataFrame(
        {'comparison': [compare_group_col],
            'Fisher_exact_statistic': [stat],
            'Fisher_exact_pval': [pval]
        })

In [9]:
# Use groupby and apply to return one stat and P value pr outcome and comparison column (e.g., maternal_race_ethnicity)
df.groupby('outcome').apply(lambda x: apply_fisher_exact(x, compare_group_col='maternal_race_ethnicity'), include_groups=False).reset_index().drop(columns=['level_1'])

Unnamed: 0,outcome,comparison,Fisher_exact_statistic,Fisher_exact_pval
0,BPD,maternal_race_ethnicity,4.275457e-11,0.0454
1,IVH,maternal_race_ethnicity,3.16712e-12,0.0053
2,NEC,maternal_race_ethnicity,1.669181e-20,0.0001
3,ROP,maternal_race_ethnicity,3.257577e-12,0.0083


In [11]:
all_comparisons = []

for file_name, df in outcome_compare.items():
    compare_column = re.search(':(.*)_counts.csv', file_name, re.IGNORECASE).group(1)
    df = (df.groupby('outcome')
          .apply(lambda x: apply_fisher_exact(x, compare_group_col=compare_column), include_groups=False)
          .reset_index()
          .drop(columns=['level_1']))
    all_comparisons.append(df)

all_comparisons = pd.concat(all_comparisons, ignore_index=True)

In [None]:
all_comparisons['Bonferroni_adjusted_pval'] = np.minimum(all_comparisons['Fisher_exact_pval'] * len(all_comparisons), 1.0)
# all_comparisons

In [13]:
all_comparisons.to_csv(f'output/{notebook_save_name}outcome_comparison_counts_pvals.csv', index=False)

## Comparisons of within top-K groups, within outcomes

In [None]:
topk_df = topk_compare['output/1.0-subgroup-comparison:maternal_age_topk_counts.csv']
topk_df.in_top_k_subgroups = topk_df.in_top_k_subgroups.astype(int)
# topk_df

In [None]:
res = (topk_df.groupby(['outcome', 'outcome_label'])
       .apply(lambda x: apply_fisher_exact(x, compare_group_col='maternal_age', outcome_label_col='in_top_k_subgroups'), include_groups=False)
       .reset_index()
       .drop(columns='level_2'))
# res

In [16]:
all_topk_comparisons = []

for file_name, df in topk_compare.items():
    compare_column = re.search(':(.*)_topk_counts.csv', file_name, re.IGNORECASE).group(1)
    df = (df.groupby(['outcome', 'outcome_label'])
          .apply(lambda x: apply_fisher_exact(x, compare_group_col=compare_column, outcome_label_col='in_top_k_subgroups'), include_groups=False)
          .reset_index()
          .drop(columns=['level_2']))
    all_topk_comparisons.append(df)

all_topk_comparisons = pd.concat(all_topk_comparisons, ignore_index=True)

In [None]:
all_topk_comparisons['Bonferroni_adjusted_pval'] = np.minimum(all_topk_comparisons['Fisher_exact_pval'] * len(all_topk_comparisons), 1.0)
# all_topk_comparisons

In [19]:
all_topk_comparisons.to_csv(f'output/{notebook_save_name}outcome_comparison_topk_counts_pvals.csv', index=False)