<a href="https://colab.research.google.com/github/tomheston/A-Comparison-of-the-Relative-Risk-Index-with-Unit-Fragilty/blob/main/UFI_vs_FI_vs_RRI_for_Significant_P_Values.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#########################################################################
# This program calculates the correlations between the fragility index, #
# the unit fragility index, the relative risk index, and the p-value    #
# for a standard 2x2 contingency table where the base p-value is < 0.05 #
#########################################################################
# Thomas F. Heston                                                      #
# GNU GPL v3.0                                                          #
# Please provide a citation to this program or the associated           #
# manuscript if you find this useful. Thank you!                        #
# https://faculty.washington.edu/theston                                #
#########################################################################

import pandas as pd
from scipy.stats import fisher_exact, pearsonr
from scipy.stats import chi2_contingency
from scipy.stats import norm
import numpy as np

# Simulation settings, read as module-level globals by generate_data_and_pvalues
lowran = 20 # smallest value a cell of the 2x2 table can take (inclusive)
highestran = 480 # exclusive cap on each table's random upper bound; every cell stays below this value
toppv=0.05 # baseline p-value must be strictly below this; also the threshold the FI/UFI searches must reach
lowpv=0.000999 # baseline p-value must be strictly above this
cases = 20 # default number of accepted 2x2 tables (rows) to generate

# Function to generate data and compute p-values
def generate_data_and_pvalues(rows=cases):
    """Simulate significant 2x2 tables and compare FI, UFI and RRI with the p-value.

    Random 2x2 tables [[ao, bo], [co, do]] are drawn until `rows` of them have
    been accepted. A table is accepted when:

    * all four cells are different (so the smallest cell is unambiguous),
    * its Fisher exact p-value lies strictly between the module-level
      `lowpv` and `toppv`,
    * both the fragility index (FI) and the unit fragility index (UFI) reach
      a p-value >= `toppv` within 99 steps without any cell becoming zero.

    For every accepted table the FI, UFI, their sample-size-corrected
    quotients (FQ1, FQ2), the relative risk index (RRI) and the percent RRI
    (pRRI) are recorded. Pearson correlations of each index with the baseline
    p-value, plus the column averages, are printed.

    Reads the module-level settings `lowran`, `highestran`, `lowpv` and
    `toppv`.

    Args:
        rows: Number of accepted tables to generate.

    Returns:
        A pandas DataFrame with one row per accepted table and the columns
        listed in `columns` below.
    """
    data = []
    while len(data) < rows:
        highran = np.random.randint(lowran+1, highestran)
        ao, bo, co, do = [int(x) for x in np.random.randint(lowran, highran, 4)]

        # A weakness of the UFI and FI is the confusion over which cell
        # to modify if more than one cell = min or max, therefore
        # we will only analyze tables with nonequal cell values.
        if len({ao, bo, co, do}) < 4:
            continue

        # use Fisher's exact test to calculate the baseline p-value
        _, pv = fisher_exact([[ao, bo], [co, do]])
        if not (lowpv < pv < toppv):
            continue

        base = (ao, bo, co, do)
        smallest = min(base)

        # FI: move one unit at a time into the smallest cell from the other
        # cell of the same row.
        if smallest == ao:
            fi_step = (1, -1, 0, 0)
        elif smallest == bo:
            fi_step = (-1, 1, 0, 0)
        elif smallest == co:
            fi_step = (0, 0, 1, -1)
        else:
            fi_step = (0, 0, -1, 1)
        fi_result = _steps_to_nonsignificance(base, fi_step, toppv)
        if fi_result is None:
            continue
        a1, b1, c1, d1, pv1, fi = fi_result

        # UFI: change all four cells by one unit per step, increasing the
        # diagonal that holds the smallest cell.
        if smallest in (ao, do):
            ufi_step = (1, -1, -1, 1)
        else:
            ufi_step = (-1, 1, 1, -1)
        ufi_result = _steps_to_nonsignificance(base, ufi_step, toppv)
        if ufi_result is None:
            continue
        a2, b2, c2, d2, pv2, ufi = ufi_result

        ppv1 = ao / (ao + bo)
        ppv2 = co / (co + do)
        total = ao + bo + co + do
        size = total
        FQ1 = fi / total # correct fi for sample size
        FQ2 = ufi / total # correct ufi for sample size

        # RRI calculations - note how much simpler this calculation is!
        RRI = abs((bo*co-ao*do)/total) # this is the relative risk index formula
        pRRI = RRI / total # percent RRI = the RRI corrected for sample size

        # The following are used to manually verify the database: shift RRI
        # events from the row with the higher proportion to the other row.
        # When ppv1 == ppv2 the RRI is 0, so either branch leaves the
        # proportions unchanged.
        if ppv1 > ppv2:
            ppv3a = (ao - RRI) / (ao + bo)
            ppv3b = (co + RRI) / (co + do)
        else:
            ppv3a = (ao + RRI) / (ao + bo)
            ppv3b = (co - RRI) / (co + do)

        # One value per name in `columns`; ao2..do2 repeat the original cells.
        data.append([
            ao, bo, co, do, pv,
            a1, b1, c1, d1, pv1, fi, FQ1,
            a2, b2, c2, d2, pv2, ufi, FQ2,
            ao, bo, co, do,
            ppv1, ppv2, ppv3a, ppv3b,
            RRI, pRRI, size,
        ])

    columns = ['ao', 'bo', 'co', 'do', 'pv', 'a1', 'b1', 'c1', 'd1', 'pv1', 'fi', 'FQ1', 'a2', 'b2', 'c2', 'd2', 'pv2', 'ufi', 'FQ2', 'ao2', 'bo2', 'co2', 'do2', 'ppv1', 'ppv2', 'ppv3a', 'ppv3b', 'RRI','pRRI','size']
    df = pd.DataFrame(data, columns=columns)
    df['pv'] = df['pv'].round(7)
    df['pv1'] = df['pv1'].round(7)

    # Correlations with p-value
    corr_fi_pv = _corr_with_pv(df, 'fi')
    corr_FQ1_pv = _corr_with_pv(df, 'FQ1')
    corr_ufi_pv = _corr_with_pv(df, 'ufi')
    corr_FQ2_pv = _corr_with_pv(df, 'FQ2')
    corr_RRI_pv = _corr_with_pv(df, 'RRI')
    corr_pRRI_pv = _corr_with_pv(df, 'pRRI')

    # Calculate the averages
    avgfi = df['fi'].mean()
    avgufi = df['ufi'].mean()
    avgFQ1 = df['FQ1'].mean()
    avgFQ2 = df['FQ2'].mean()
    avgRRI = df['RRI'].mean()
    avgpRRI = df['pRRI'].mean()
    avgsize = df['size'].mean()

    # Print out base settings
    print("BASE SETTINGS")
    print("Total Cases Tested:", rows)
    print("Lowest number in 2x2 table:", lowran)
    print("Highest number in 2x2 table:", highestran)
    print("Lowest p-value:", lowpv)
    print("Highest p-value:", toppv)
    print("Average Sample Size:", avgsize)
    print()

    # Print out correlation results with p-values
    print("CORRELATIONS")
    print("Correlation between FI and pv:", corr_fi_pv, "avg:", avgfi)
    print("Correlation between UFI and pv:", corr_ufi_pv, "avg:", avgufi)
    print("Correlation between FQ1 and pv:", corr_FQ1_pv, "avg:", avgFQ1)
    print("Correlation between FQ2 and pv:", corr_FQ2_pv, "avg:", avgFQ2)
    print("Correlation between RRI and pv:", corr_RRI_pv, "avg:", avgRRI)
    print("Correlation between pRRI and pv:", corr_pRRI_pv, "avg:", avgpRRI)

    return df


def _steps_to_nonsignificance(table, step, alpha, max_steps=99):
    """Count the unit steps needed to push a 2x2 table's p-value to >= alpha.

    At step k the table is `table + k * step` (cell by cell). Returns
    `(a, b, c, d, p, k)` for the first k whose Fisher exact p-value is
    >= alpha, or None when a cell reaches zero first or no such k exists
    within `max_steps`.
    """
    for k in range(1, max_steps + 1):
        a, b, c, d = (cell + k * delta for cell, delta in zip(table, step))
        # if a cell goes to zero then UFI/FI calculations
        # break, so throw out these cases
        if 0 in (a, b, c, d):
            return None
        _, p = fisher_exact([[a, b], [c, d]])
        if p >= alpha:
            return a, b, c, d, p, k
    return None


def _corr_with_pv(df, column):
    """Return the Pearson r between df[column] and df['pv'], rounded to 5 places.

    Returns NaN when there are fewer than two rows, because pearsonr needs at
    least two observations.
    """
    if len(df) < 2:
        return float("nan")
    r, _ = pearsonr(df[column], df['pv'])
    return round(r, 5)

# Generate the DataFrame
df = generate_data_and_pvalues()

# Write the DataFrame to a CSV file in the working directory
df.to_csv('2x2_tables.csv', index=False)

# Export from Colab (optional): offer the CSV as a download when running in
# Google Colab. Outside Colab the google.colab package cannot be imported, so
# the download step is skipped instead of crashing after the file is written.
try:
    from google.colab import files
except ImportError:
    pass
else:
    files.download('2x2_tables.csv')


BASE SETTINGS
Total Cases Tested: 20
Lowest number in 2x2 table: 20
Highest number in 2x2 table: 480
Lowest p-value: 0.000999
Highest p-value: 0.05
Average Sample Size: 588.05

CORRELATIONS
Correlation between FI and pv: -0.83754 avg: 6.05
Correlation between UFI and pv: -0.85507 avg: 3.7
Correlation between FQ1 and pv: -0.63863 avg: 0.014470478299147582
Correlation between FQ2 and pv: -0.6833 avg: 0.00871500776131191
Correlation between RRI and pv: -0.312 avg: 14.272926397727128
Correlation between pRRI and pv: -0.42271 avg: 0.0305892862327517


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>